diff --git a/finetune/CLAUDE.md b/finetune/CLAUDE.md index 9d7f45a..dd02a84 100644 --- a/finetune/CLAUDE.md +++ b/finetune/CLAUDE.md @@ -38,11 +38,11 @@ The schema is enforced by `dataset/schema.py:TrainingExample` (Pydantic model). | Repository | Purpose | |------------|---------| -| `tobil/qmd-query-expansion-1.7B` | Final merged model (SFT + GRPO) | +| `tobil/qmd-query-expansion-1.7B` | Final merged model (SFT baseline) | | `tobil/qmd-query-expansion-1.7B-gguf` | GGUF quantized versions for deployment | | `tobil/qmd-query-expansion-1.7B-sft` | SFT adapter checkpoint (intermediate) | -| `tobil/qmd-query-expansion-1.7B-grpo` | GRPO adapter checkpoint (intermediate) | | `tobil/qmd-query-expansion-train` | Prepared training dataset | +| `tobil/qmd-query-expansion-1.7B-grpo` | Experimental GRPO adapter (optional) | **Rules:** - No versioned repos (`-v1`, `-v2`, `-v4`, etc.) - update in place @@ -80,14 +80,14 @@ uv run train.py sft --config configs/sft.yaml hf jobs uv run --flavor a10g-large --secrets HF_TOKEN --timeout 2h jobs/sft.py ``` -### Stage 2: GRPO +### Stage 2: (Experimental) GRPO ```bash -# Local (requires CUDA) -uv run train.py grpo --config configs/grpo.yaml +# Local (optional; experimental) +uv run train.py grpo --config experiments/grpo/grpo.yaml -# Cloud (HuggingFace Jobs) -hf jobs uv run --flavor a10g-large --secrets HF_TOKEN --timeout 4h jobs/grpo.py +# Experimental script +HF_TOKEN=${HF_TOKEN} uv run experiments/grpo/grpo.py ``` ### HuggingFace Jobs @@ -102,9 +102,9 @@ hf jobs cancel # Cancel a job ### Evaluation ```bash -uv run eval.py --model ./outputs/grpo -uv run eval.py --model tobil/qmd-query-expansion-1.7B -uv run eval.py --model ./outputs/grpo -o eval_results.json +uv run eval.py ./outputs/sft +uv run eval.py tobil/qmd-query-expansion-1.7B +uv run eval.py ./outputs/sft -o eval_results.json ``` ## Quality Scoring @@ -126,6 +126,9 @@ experiments/ ├── lfm2/ # LiquidAI LFM2-1.2B (hybrid architecture, faster inference) │ ├── sft_lfm2.yaml │ └── 
sft_lfm2.py +├── grpo/ # Experimental GRPO recipe and config +│ ├── grpo.py +│ └── grpo.yaml └── gepa/ # DSPy-based prompt optimization (GEPA) ├── dspy_gepa.py └── ... @@ -138,7 +141,7 @@ These are not part of the main training pipeline. ``` finetune/ ├── reward.py # Scoring function (single source of truth) -├── train.py # Unified SFT + GRPO training +├── train.py # SFT training entrypoint ├── eval.py # Generate and score expansions ├── convert_gguf.py # GGUF conversion ├── SCORING.md # Detailed scoring rubric @@ -147,8 +150,8 @@ finetune/ ├── data/ # All training JSONL files (strict schema) ├── dataset/ # Schema + data tools (Pydantic-based) ├── jobs/ # Self-contained HuggingFace Jobs scripts -├── configs/ # Training configs (sft.yaml, grpo.yaml) +├── configs/ # Training configs (sft.yaml) ├── evals/ # Test queries -├── experiments/ # Experimental configs (LFM2, GEPA) +├── experiments/ # Experimental configs (LFM2, GEPA, GRPO) └── outputs/ # Local training outputs (gitignored) ``` diff --git a/finetune/Justfile b/finetune/Justfile index 1563f87..0cd101d 100644 --- a/finetune/Justfile +++ b/finetune/Justfile @@ -26,6 +26,9 @@ train-local: HF_TOKEN=${HF_TOKEN} uv run torchrun --standalone --nproc_per_node auto \ train.py sft --config configs/sft_local.yaml |& tee /tmp/qmd-sft-train.log -grpo-local: - CUDA_VISIBLE_DEVICES=1,2,3 HF_TOKEN=${HF_TOKEN} uv run torchrun --standalone --nproc_per_node 3 \ - train.py grpo --config configs/grpo.yaml |& tee /tmp/qmd-grpo-train.log +# Experimental GRPO training is in finetune/experiments/grpo and not part of +# the default pipeline. +# +# grpo-local: +# HF_TOKEN=${HF_TOKEN} uv run train.py grpo --config experiments/grpo/grpo.yaml |& tee /tmp/qmd-grpo-train.log + diff --git a/finetune/README.md b/finetune/README.md index a339c96..a845bcc 100644 --- a/finetune/README.md +++ b/finetune/README.md @@ -40,22 +40,23 @@ These feed into QMD's three search backends: # 1. 
SFT: teach the model the output format (~45 min on A10G, ~$1.50) hf jobs uv run --flavor a10g-large --secrets HF_TOKEN --timeout 2h jobs/sft.py -# 2. GRPO: RL refinement on top of SFT (~20 min on A10G, ~$0.50) -hf jobs uv run --flavor a10g-large --secrets HF_TOKEN --timeout 4h jobs/grpo.py +# 2. Evaluate against test queries (needs local GPU or use eval job) +uv run eval.py tobil/qmd-query-expansion-1.7B -# 3. Evaluate against test queries (needs local GPU or use eval job) -uv run eval.py --model tobil/qmd-query-expansion-1.7B-grpo \ - --sft-model tobil/qmd-query-expansion-1.7B-sft - -# 4. Convert to GGUF for local deployment (Ollama, llama.cpp) +# 3. Convert to GGUF for local deployment (Ollama, llama.cpp) uv run convert_gguf.py --size 1.7B + +# NOTE: GRPO is experimental and has moved to finetune/experiments/grpo. +# To run it manually, use: uv run experiments/grpo/grpo.py ``` ### Local training (if you have a GPU) ```bash uv run train.py sft --config configs/sft.yaml -uv run train.py grpo --config configs/grpo.yaml + +# Experimental GRPO +uv run train.py grpo --config experiments/grpo/grpo.yaml ``` ### Monitoring HF Jobs @@ -85,19 +86,19 @@ direct `lex:/vec:/hyde:` output without `` blocks. 
``` finetune/ ├── reward.py # Scoring/reward function (single source of truth) -├── train.py # Unified SFT + GRPO training (two subcommands) +├── train.py # SFT training entrypoint ├── eval.py # Generate expansions and score them ├── convert_gguf.py # GGUF conversion for Ollama/llama.cpp ├── jobs/ │ ├── sft.py # Self-contained SFT for HuggingFace Jobs -│ ├── grpo.py # Self-contained GRPO for HuggingFace Jobs │ ├── eval.py # Self-contained eval for HuggingFace Jobs │ └── eval_common.py # Shared eval utilities ├── configs/ -│ ├── sft.yaml # SFT hyperparameters for Qwen3-1.7B -│ └── grpo.yaml # GRPO hyperparameters for Qwen3-1.7B +│ └── sft.yaml # SFT hyperparameters for Qwen3-1.7B ├── evals/ │ └── queries.txt # 31 test queries across 8 categories +├── experiments/ +│ └── grpo/ # Experimental GRPO configuration and script (optional) ├── data/ # Training JSONL files (all concatenated for training) ├── dataset/ │ ├── prepare_data.py # Format for Qwen3 chat template, dedup, split @@ -130,29 +131,14 @@ uv run train.py sft --config configs/sft.yaml uv run train.py sft --config configs/sft.yaml --dry-run # preview config ``` -### Stage 2: GRPO (Group Relative Policy Optimization) +### Stage 2: (Experimental) GRPO -Reinforcement learning on top of the merged SFT weights. The model generates -multiple expansions per query, they are scored by the reward function, and the -model is updated to prefer higher-scoring outputs. - -| Parameter | Value | -|-----------|-------| -| Base | Merged SFT checkpoint | -| Method | LoRA (rank 4, alpha 8) — smaller for RL stability | -| Target modules | q_proj, v_proj only | -| Reward | `reward.py` (rule-based, 5 dimensions) | -| KL beta | 0.04 — prevents drift from SFT checkpoint | -| Generations per prompt | 4 | -| Max steps | 200 | -| Learning rate | 5e-7 | - -**Important:** `beta > 0` is critical. With `beta=0` the model experiences -catastrophic drift and scores drop to 0%. 
+GRPO is currently treated as experimental and kept under `experiments/grpo/`. +It is not part of the default production path for this repository. ```bash -uv run train.py grpo --config configs/grpo.yaml -uv run train.py grpo --config configs/grpo.yaml --dry-run # test reward function +# Optional experimental GRPO run +uv run train.py grpo --config experiments/grpo/grpo.yaml ``` ## Evaluation @@ -160,24 +146,26 @@ uv run train.py grpo --config configs/grpo.yaml --dry-run # test reward functio `eval.py` generates expansions from a model and scores them against test queries: ```bash -# Evaluate an SFT model +# Evaluate an SFT model uv run eval.py --model tobil/qmd-query-expansion-1.7B-sft -# Evaluate a GRPO model (needs SFT adapter merged first) -uv run eval.py --model tobil/qmd-query-expansion-1.7B-grpo \ - --sft-model tobil/qmd-query-expansion-1.7B-sft +# Evaluate an SFT output dir +uv run eval.py outputs/sft # Verbose output with deduction details -uv run eval.py --model tobil/qmd-query-expansion-1.7B-sft -v +uv run eval.py tobil/qmd-query-expansion-1.7B -v + +# Optional: evaluate GRPO experimental output (if run) +uv run eval.py outputs/grpo # Save detailed scores to JSON -uv run eval.py --model tobil/qmd-query-expansion-1.7B-sft -o scores.json +uv run eval.py tobil/qmd-query-expansion-1.7B -o scores.json ``` ## Reward Function -`reward.py` is the single source of truth for scoring. It is used both as the -GRPO reward signal during training and for evaluation. +`reward.py` is the single source of truth for scoring. It is used for evaluation +and (optionally) as the GRPO reward signal in the experimental path. 
Five scoring dimensions (max 120 without hyde, 140 with): @@ -201,8 +189,8 @@ uv run reward.py ## GGUF Conversion -Merges base + SFT + GRPO adapters into a single model and produces -quantized GGUF files for deployment: +Merges base + SFT and (optionally) GRPO adapters into a single model, then +produces quantized GGUF files for deployment: ```bash # Use preset for 1.7B @@ -240,15 +228,14 @@ just validate ## Architecture Notes -The two-stage training approach (SFT -> GRPO) is standard for structured-output models: +The production training approach is currently **SFT-only**: 1. **SFT** establishes format compliance and basic query understanding. It uses a large LoRA (rank 16, all projection layers) because it needs to learn a new output format from scratch. -2. **GRPO** refines quality within the learned format. It uses a small LoRA - (rank 4, q/v only) and KL regularization to make incremental improvements - without losing what SFT taught. +2. **GRPO** exists as an optional experimental path under `experiments/grpo/` + and is not in the production training pipeline. The reward function is entirely rule-based (no LLM judge) which makes it fast, deterministic, and suitable as an RL signal. See `SCORING.md` for the full rubric. @@ -266,20 +253,12 @@ deterministic, and suitable as an RL signal. See `SCORING.md` for the full rubri | Epochs | 5 | | Hardware | A10G (24 GB VRAM) | -### GRPO - -| Metric | Value | -|--------|-------| -| Mean reward | 0.757 | -| Final loss | 0.0005 | -| KL divergence | 0.00048 | -| Mean completion length | ~58 tokens | -| Training time | ~19 min (200 steps) | -| Hardware | A10G (24 GB VRAM) | - ### Evaluation Scores | Model | Average Score | Excellent (30) | |-------|--------------|-----------------| | SFT | 92.0% | 30/30 | -| GRPO | 91.7% | 30/30 | + +> GRPO scores are not tracked in this branch; see `experiments/grpo/` for historical +> experimental results. 
+ diff --git a/finetune/experiments/grpo/README.md b/finetune/experiments/grpo/README.md new file mode 100644 index 0000000..3308850 --- /dev/null +++ b/finetune/experiments/grpo/README.md @@ -0,0 +1,26 @@ +# GRPO (Experimental) + +This folder contains the **experimental** GRPO training path for query expansion. +It is not part of the default production pipeline. + +## Files + +- `grpo.yaml` – experimental GRPO hyperparameters +- `grpo.py` – standalone GRPO training script + +## Run + +```bash +# Recommended default: run from repo root +cd "$(git rev-parse --show-toplevel)" +uv run finetune/experiments/grpo/grpo.py + +# Or use the unified entrypoint from finetune/ (currently a stub that prints a pointer to this folder and exits): +(cd finetune && uv run train.py grpo --config experiments/grpo/grpo.yaml) +``` + +## Notes + +- Current mainline focuses on SFT-only quality and benchmarks. +- Keep this workflow isolated unless you are explicitly experimenting with + reinforcement-learning refinement. diff --git a/finetune/jobs/grpo.py b/finetune/experiments/grpo/grpo.py similarity index 94% rename from finetune/jobs/grpo.py rename to finetune/experiments/grpo/grpo.py index 1edb645..4493859 100644 --- a/finetune/jobs/grpo.py +++ b/finetune/experiments/grpo/grpo.py @@ -14,8 +14,10 @@ """ GRPO training for QMD query expansion (Qwen3-1.7B). -Runs on top of merged SFT weights. Self-contained for HuggingFace Jobs: - hf jobs uv run --flavor a10g-large --secrets HF_TOKEN --timeout 4h jobs/grpo.py +Experimental recipe run on top of merged SFT weights. Self-contained runner: + uv run experiments/grpo/grpo.py + +(If using HF Jobs, run this script as the job entrypoint.) 
""" import os @@ -42,7 +44,7 @@ if not os.path.exists(_eval_common_path): sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) from eval_common import QMDRewardFunction, run_eval -# --- Config (inlined from configs/grpo.yaml) --- +# --- Config (inlined from experiments/grpo/grpo.yaml) --- BASE_MODEL = "Qwen/Qwen3-1.7B" SFT_MODEL = "tobil/qmd-query-expansion-1.7B-sft" OUTPUT_MODEL = "tobil/qmd-query-expansion-1.7B-grpo" diff --git a/finetune/configs/grpo.yaml b/finetune/experiments/grpo/grpo.yaml similarity index 95% rename from finetune/configs/grpo.yaml rename to finetune/experiments/grpo/grpo.yaml index db99207..c5b5aab 100644 --- a/finetune/configs/grpo.yaml +++ b/finetune/experiments/grpo/grpo.yaml @@ -1,7 +1,7 @@ # GRPO Training Config for QMD Query Expansion # Target: Qwen3-1.7B, trained on top of merged SFT weights # -# Usage: uv run train.py grpo --config configs/grpo.yaml +# Usage: uv run train.py grpo --config experiments/grpo/grpo.yaml # # The reward function (reward.py) scores expansions on format compliance, # diversity, hyde quality, content quality, and named entity preservation. diff --git a/finetune/train.py b/finetune/train.py index dc77ffb..ce5612d 100644 --- a/finetune/train.py +++ b/finetune/train.py @@ -18,14 +18,14 @@ """ Unified training script for QMD query expansion models. -Supports two stages: +Primary pipeline is SFT-only: sft - Supervised fine-tuning on labeled examples - grpo - Group Relative Policy Optimization (RL) on top of merged SFT weights + +GRPO was moved to `experiments/grpo/` and is not part of the main training +pipeline by default. 
Usage: uv run train.py sft --config configs/sft.yaml - uv run train.py grpo --config configs/grpo.yaml - uv run train.py grpo --config configs/grpo.yaml --dry-run """ import argparse @@ -412,6 +412,15 @@ def cmd_sft(args): def cmd_grpo(args): """Run GRPO reinforcement learning on top of merged SFT weights.""" + print( + "GRPO is not part of the main training pipeline and has been moved to `experiments/grpo/`." + ) + print("To run experimental GRPO, use:") + print(" cd finetune && uv run python experiments/grpo/grpo.py") + print("Or, if you have local config wiring ready:") + print(" uv run train.py grpo --config experiments/grpo/grpo.yaml") + return + import torch import torch.distributed as dist import os @@ -645,8 +654,6 @@ def main(): epilog=""" Examples: uv run train.py sft --config configs/sft.yaml - uv run train.py grpo --config configs/grpo.yaml - uv run train.py grpo --config configs/grpo.yaml --dry-run """, ) sub = parser.add_subparsers(dest="stage", required=True) @@ -657,7 +664,10 @@ Examples: "--dry-run", action="store_true", help="Print config and exit" ) - grpo_parser = sub.add_parser("grpo", help="GRPO reinforcement learning") + grpo_parser = sub.add_parser( + "grpo", + help="Experimental: GRPO reinforcement learning (moved to experiments/grpo/)", + ) grpo_parser.add_argument("--config", required=True, help="Path to GRPO config YAML") grpo_parser.add_argument( "--dry-run", action="store_true", help="Print config, test reward, and exit"