Move GRPO training out of default finetune pipeline

This commit is contained in:
Tobi Lütke 2026-02-22 15:26:23 -05:00
parent cbeeb1f89b
commit 189916d6fb
No known key found for this signature in database
7 changed files with 108 additions and 85 deletions

View File

@ -38,11 +38,11 @@ The schema is enforced by `dataset/schema.py:TrainingExample` (Pydantic model).
| Repository | Purpose | | Repository | Purpose |
|------------|---------| |------------|---------|
| `tobil/qmd-query-expansion-1.7B` | Final merged model (SFT + GRPO) | | `tobil/qmd-query-expansion-1.7B` | Final merged model (SFT baseline) |
| `tobil/qmd-query-expansion-1.7B-gguf` | GGUF quantized versions for deployment | | `tobil/qmd-query-expansion-1.7B-gguf` | GGUF quantized versions for deployment |
| `tobil/qmd-query-expansion-1.7B-sft` | SFT adapter checkpoint (intermediate) | | `tobil/qmd-query-expansion-1.7B-sft` | SFT adapter checkpoint (intermediate) |
| `tobil/qmd-query-expansion-1.7B-grpo` | GRPO adapter checkpoint (intermediate) |
| `tobil/qmd-query-expansion-train` | Prepared training dataset | | `tobil/qmd-query-expansion-train` | Prepared training dataset |
| `tobil/qmd-query-expansion-1.7B-grpo` | Experimental GRPO adapter (optional) |
**Rules:** **Rules:**
- No versioned repos (`-v1`, `-v2`, `-v4`, etc.) - update in place - No versioned repos (`-v1`, `-v2`, `-v4`, etc.) - update in place
@ -80,14 +80,14 @@ uv run train.py sft --config configs/sft.yaml
hf jobs uv run --flavor a10g-large --secrets HF_TOKEN --timeout 2h jobs/sft.py hf jobs uv run --flavor a10g-large --secrets HF_TOKEN --timeout 2h jobs/sft.py
``` ```
### Stage 2: GRPO ### Stage 2: (Experimental) GRPO
```bash ```bash
# Local (requires CUDA) # Local (optional; experimental)
uv run train.py grpo --config configs/grpo.yaml uv run train.py grpo --config experiments/grpo/grpo.yaml
# Cloud (HuggingFace Jobs) # Experimental script
hf jobs uv run --flavor a10g-large --secrets HF_TOKEN --timeout 4h jobs/grpo.py HF_TOKEN=${HF_TOKEN} uv run experiments/grpo/grpo.py
``` ```
### HuggingFace Jobs ### HuggingFace Jobs
@ -102,9 +102,9 @@ hf jobs cancel <job-id> # Cancel a job
### Evaluation ### Evaluation
```bash ```bash
uv run eval.py --model ./outputs/grpo uv run eval.py ./outputs/sft
uv run eval.py --model tobil/qmd-query-expansion-1.7B uv run eval.py tobil/qmd-query-expansion-1.7B
uv run eval.py --model ./outputs/grpo -o eval_results.json uv run eval.py ./outputs/sft -o eval_results.json
``` ```
## Quality Scoring ## Quality Scoring
@ -126,6 +126,9 @@ experiments/
├── lfm2/ # LiquidAI LFM2-1.2B (hybrid architecture, faster inference) ├── lfm2/ # LiquidAI LFM2-1.2B (hybrid architecture, faster inference)
│ ├── sft_lfm2.yaml │ ├── sft_lfm2.yaml
│ └── sft_lfm2.py │ └── sft_lfm2.py
├── grpo/ # Experimental GRPO recipe and config
│ ├── grpo.py
│ └── grpo.yaml
└── gepa/ # DSPy-based prompt optimization (GEPA) └── gepa/ # DSPy-based prompt optimization (GEPA)
├── dspy_gepa.py ├── dspy_gepa.py
└── ... └── ...
@ -138,7 +141,7 @@ These are not part of the main training pipeline.
``` ```
finetune/ finetune/
├── reward.py # Scoring function (single source of truth) ├── reward.py # Scoring function (single source of truth)
├── train.py # Unified SFT + GRPO training ├── train.py # SFT training entrypoint
├── eval.py # Generate and score expansions ├── eval.py # Generate and score expansions
├── convert_gguf.py # GGUF conversion ├── convert_gguf.py # GGUF conversion
├── SCORING.md # Detailed scoring rubric ├── SCORING.md # Detailed scoring rubric
@ -147,8 +150,8 @@ finetune/
├── data/ # All training JSONL files (strict schema) ├── data/ # All training JSONL files (strict schema)
├── dataset/ # Schema + data tools (Pydantic-based) ├── dataset/ # Schema + data tools (Pydantic-based)
├── jobs/ # Self-contained HuggingFace Jobs scripts ├── jobs/ # Self-contained HuggingFace Jobs scripts
├── configs/ # Training configs (sft.yaml, grpo.yaml) ├── configs/ # Training configs (sft.yaml)
├── evals/ # Test queries ├── evals/ # Test queries
├── experiments/ # Experimental configs (LFM2, GEPA) ├── experiments/ # Experimental configs (LFM2, GEPA, GRPO)
└── outputs/ # Local training outputs (gitignored) └── outputs/ # Local training outputs (gitignored)
``` ```

View File

@ -26,6 +26,9 @@ train-local:
HF_TOKEN=${HF_TOKEN} uv run torchrun --standalone --nproc_per_node auto \ HF_TOKEN=${HF_TOKEN} uv run torchrun --standalone --nproc_per_node auto \
train.py sft --config configs/sft_local.yaml |& tee /tmp/qmd-sft-train.log train.py sft --config configs/sft_local.yaml |& tee /tmp/qmd-sft-train.log
grpo-local: # Experimental GRPO training is in finetune/experiments/grpo and not part of
CUDA_VISIBLE_DEVICES=1,2,3 HF_TOKEN=${HF_TOKEN} uv run torchrun --standalone --nproc_per_node 3 \ # the default pipeline.
train.py grpo --config configs/grpo.yaml |& tee /tmp/qmd-grpo-train.log #
# grpo-local:
# HF_TOKEN=${HF_TOKEN} uv run train.py grpo --config experiments/grpo/grpo.yaml |& tee /tmp/qmd-grpo-train.log

View File

@ -40,22 +40,23 @@ These feed into QMD's three search backends:
# 1. SFT: teach the model the output format (~45 min on A10G, ~$1.50) # 1. SFT: teach the model the output format (~45 min on A10G, ~$1.50)
hf jobs uv run --flavor a10g-large --secrets HF_TOKEN --timeout 2h jobs/sft.py hf jobs uv run --flavor a10g-large --secrets HF_TOKEN --timeout 2h jobs/sft.py
# 2. GRPO: RL refinement on top of SFT (~20 min on A10G, ~$0.50) # 2. Evaluate against test queries (needs local GPU or use eval job)
hf jobs uv run --flavor a10g-large --secrets HF_TOKEN --timeout 4h jobs/grpo.py uv run eval.py tobil/qmd-query-expansion-1.7B
# 3. Evaluate against test queries (needs local GPU or use eval job) # 3. Convert to GGUF for local deployment (Ollama, llama.cpp)
uv run eval.py --model tobil/qmd-query-expansion-1.7B-grpo \
--sft-model tobil/qmd-query-expansion-1.7B-sft
# 4. Convert to GGUF for local deployment (Ollama, llama.cpp)
uv run convert_gguf.py --size 1.7B uv run convert_gguf.py --size 1.7B
# NOTE: GRPO is experimental and has moved to finetune/experiments/grpo.
# To run it manually, use: uv run python experiments/grpo/grpo.py
``` ```
### Local training (if you have a GPU) ### Local training (if you have a GPU)
```bash ```bash
uv run train.py sft --config configs/sft.yaml uv run train.py sft --config configs/sft.yaml
uv run train.py grpo --config configs/grpo.yaml
# Experimental GRPO
uv run train.py grpo --config experiments/grpo/grpo.yaml
``` ```
### Monitoring HF Jobs ### Monitoring HF Jobs
@ -85,19 +86,19 @@ direct `lex:/vec:/hyde:` output without `<think>` blocks.
``` ```
finetune/ finetune/
├── reward.py # Scoring/reward function (single source of truth) ├── reward.py # Scoring/reward function (single source of truth)
├── train.py # Unified SFT + GRPO training (two subcommands) ├── train.py # SFT training entrypoint
├── eval.py # Generate expansions and score them ├── eval.py # Generate expansions and score them
├── convert_gguf.py # GGUF conversion for Ollama/llama.cpp ├── convert_gguf.py # GGUF conversion for Ollama/llama.cpp
├── jobs/ ├── jobs/
│ ├── sft.py # Self-contained SFT for HuggingFace Jobs │ ├── sft.py # Self-contained SFT for HuggingFace Jobs
│ ├── grpo.py # Self-contained GRPO for HuggingFace Jobs
│ ├── eval.py # Self-contained eval for HuggingFace Jobs │ ├── eval.py # Self-contained eval for HuggingFace Jobs
│ └── eval_common.py # Shared eval utilities │ └── eval_common.py # Shared eval utilities
├── configs/ ├── configs/
│ ├── sft.yaml # SFT hyperparameters for Qwen3-1.7B │ └── sft.yaml # SFT hyperparameters for Qwen3-1.7B
│ └── grpo.yaml # GRPO hyperparameters for Qwen3-1.7B
├── evals/ ├── evals/
│ └── queries.txt # 31 test queries across 8 categories │ └── queries.txt # 31 test queries across 8 categories
├── experiments/
│ └── grpo/ # Experimental GRPO configuration and script (optional)
├── data/ # Training JSONL files (all concatenated for training) ├── data/ # Training JSONL files (all concatenated for training)
├── dataset/ ├── dataset/
│ ├── prepare_data.py # Format for Qwen3 chat template, dedup, split │ ├── prepare_data.py # Format for Qwen3 chat template, dedup, split
@ -130,29 +131,14 @@ uv run train.py sft --config configs/sft.yaml
uv run train.py sft --config configs/sft.yaml --dry-run # preview config uv run train.py sft --config configs/sft.yaml --dry-run # preview config
``` ```
### Stage 2: GRPO (Group Relative Policy Optimization) ### Stage 2: (Experimental) GRPO
Reinforcement learning on top of the merged SFT weights. The model generates GRPO is currently treated as experimental and kept under `experiments/grpo/`.
multiple expansions per query, they are scored by the reward function, and the It is not part of the default production path for this repository.
model is updated to prefer higher-scoring outputs.
| Parameter | Value |
|-----------|-------|
| Base | Merged SFT checkpoint |
| Method | LoRA (rank 4, alpha 8) — smaller for RL stability |
| Target modules | q_proj, v_proj only |
| Reward | `reward.py` (rule-based, 5 dimensions) |
| KL beta | 0.04 — prevents drift from SFT checkpoint |
| Generations per prompt | 4 |
| Max steps | 200 |
| Learning rate | 5e-7 |
**Important:** `beta > 0` is critical. With `beta=0` the model experiences
catastrophic drift and scores drop to 0%.
```bash ```bash
uv run train.py grpo --config configs/grpo.yaml # Optional experimental GRPO run
uv run train.py grpo --config configs/grpo.yaml --dry-run # test reward function uv run train.py grpo --config experiments/grpo/grpo.yaml
``` ```
## Evaluation ## Evaluation
@ -160,24 +146,26 @@ uv run train.py grpo --config configs/grpo.yaml --dry-run # test reward functio
`eval.py` generates expansions from a model and scores them against test queries: `eval.py` generates expansions from a model and scores them against test queries:
```bash ```bash
# Evaluate an SFT model # Evaluate an SFT model
uv run eval.py --model tobil/qmd-query-expansion-1.7B-sft uv run eval.py tobil/qmd-query-expansion-1.7B-sft
# Evaluate a GRPO model (needs SFT adapter merged first) # Evaluate an SFT output dir
uv run eval.py --model tobil/qmd-query-expansion-1.7B-grpo \ uv run eval.py outputs/sft
--sft-model tobil/qmd-query-expansion-1.7B-sft
# Verbose output with deduction details # Verbose output with deduction details
uv run eval.py --model tobil/qmd-query-expansion-1.7B-sft -v uv run eval.py tobil/qmd-query-expansion-1.7B -v
# Optional: evaluate GRPO experimental output (if run)
uv run eval.py outputs/grpo
# Save detailed scores to JSON # Save detailed scores to JSON
uv run eval.py --model tobil/qmd-query-expansion-1.7B-sft -o scores.json uv run eval.py tobil/qmd-query-expansion-1.7B -o scores.json
``` ```
## Reward Function ## Reward Function
`reward.py` is the single source of truth for scoring. It is used both as the `reward.py` is the single source of truth for scoring. It is used for evaluation
GRPO reward signal during training and for evaluation. and (optionally) as the GRPO reward signal in the experimental path.
Five scoring dimensions (max 120 without hyde, 140 with): Five scoring dimensions (max 120 without hyde, 140 with):
@ -201,8 +189,8 @@ uv run reward.py
## GGUF Conversion ## GGUF Conversion
Merges base + SFT + GRPO adapters into a single model and produces Merges base + SFT and (optionally) GRPO adapters into a single model, then
quantized GGUF files for deployment: produces quantized GGUF files for deployment:
```bash ```bash
# Use preset for 1.7B # Use preset for 1.7B
@ -240,15 +228,14 @@ just validate
## Architecture Notes ## Architecture Notes
The two-stage training approach (SFT -> GRPO) is standard for structured-output models: The production training approach is currently **SFT-only**:
1. **SFT** establishes format compliance and basic query understanding. It uses 1. **SFT** establishes format compliance and basic query understanding. It uses
a large LoRA (rank 16, all projection layers) because it needs to learn a a large LoRA (rank 16, all projection layers) because it needs to learn a
new output format from scratch. new output format from scratch.
2. **GRPO** refines quality within the learned format. It uses a small LoRA 2. **GRPO** exists as an optional experimental path under `experiments/grpo/`
(rank 4, q/v only) and KL regularization to make incremental improvements and is not in the production training pipeline.
without losing what SFT taught.
The reward function is entirely rule-based (no LLM judge) which makes it fast, The reward function is entirely rule-based (no LLM judge) which makes it fast,
deterministic, and suitable as an RL signal. See `SCORING.md` for the full rubric. deterministic, and suitable as an RL signal. See `SCORING.md` for the full rubric.
@ -266,20 +253,12 @@ deterministic, and suitable as an RL signal. See `SCORING.md` for the full rubri
| Epochs | 5 | | Epochs | 5 |
| Hardware | A10G (24 GB VRAM) | | Hardware | A10G (24 GB VRAM) |
### GRPO
| Metric | Value |
|--------|-------|
| Mean reward | 0.757 |
| Final loss | 0.0005 |
| KL divergence | 0.00048 |
| Mean completion length | ~58 tokens |
| Training time | ~19 min (200 steps) |
| Hardware | A10G (24 GB VRAM) |
### Evaluation Scores ### Evaluation Scores
| Model | Average Score | Excellent (30) | | Model | Average Score | Excellent (30) |
|-------|--------------|-----------------| |-------|--------------|-----------------|
| SFT | 92.0% | 30/30 | | SFT | 92.0% | 30/30 |
| GRPO | 91.7% | 30/30 |
> GRPO scores are not tracked in this branch; see `experiments/grpo/` for historical
> experimental results.

View File

@ -0,0 +1,26 @@
# GRPO (Experimental)
This folder contains the **experimental** GRPO training path for query expansion.
It is not part of the default production pipeline.
## Files
- `grpo.yaml`: experimental GRPO hyperparameters
- `grpo.py`: standalone GRPO training script
## Run
```bash
# Recommended default: run from repo root
cd /path/to/qmd  # replace with the location of your local checkout
uv run finetune/experiments/grpo/grpo.py
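# Or use the unified entrypoint (deprecated in the main pipeline; the `grpo`
# subcommand of train.py currently prints a pointer to the script above and
# returns without training):
uv run finetune/train.py grpo --config finetune/experiments/grpo/grpo.yaml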
```
## Notes
- Current mainline focuses on SFT-only quality and benchmarks.
- Keep this workflow isolated unless you are explicitly experimenting with
reinforcement-learning refinement.

View File

@ -14,8 +14,10 @@
""" """
GRPO training for QMD query expansion (Qwen3-1.7B). GRPO training for QMD query expansion (Qwen3-1.7B).
Runs on top of merged SFT weights. Self-contained for HuggingFace Jobs: Experimental recipe run on top of merged SFT weights. Self-contained runner:
hf jobs uv run --flavor a10g-large --secrets HF_TOKEN --timeout 4h jobs/grpo.py uv run experiments/grpo/grpo.py
(If using HF Jobs, run this script as the job entrypoint.)
""" """
import os import os
@ -42,7 +44,7 @@ if not os.path.exists(_eval_common_path):
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from eval_common import QMDRewardFunction, run_eval from eval_common import QMDRewardFunction, run_eval
# --- Config (inlined from configs/grpo.yaml) --- # --- Config (inlined from experiments/grpo/grpo.yaml) ---
BASE_MODEL = "Qwen/Qwen3-1.7B" BASE_MODEL = "Qwen/Qwen3-1.7B"
SFT_MODEL = "tobil/qmd-query-expansion-1.7B-sft" SFT_MODEL = "tobil/qmd-query-expansion-1.7B-sft"
OUTPUT_MODEL = "tobil/qmd-query-expansion-1.7B-grpo" OUTPUT_MODEL = "tobil/qmd-query-expansion-1.7B-grpo"

View File

@ -1,7 +1,7 @@
# GRPO Training Config for QMD Query Expansion # GRPO Training Config for QMD Query Expansion
# Target: Qwen3-1.7B, trained on top of merged SFT weights # Target: Qwen3-1.7B, trained on top of merged SFT weights
# #
# Usage: uv run train.py grpo --config configs/grpo.yaml # Usage: uv run train.py grpo --config experiments/grpo/grpo.yaml
# #
# The reward function (reward.py) scores expansions on format compliance, # The reward function (reward.py) scores expansions on format compliance,
# diversity, hyde quality, content quality, and named entity preservation. # diversity, hyde quality, content quality, and named entity preservation.

View File

@ -18,14 +18,14 @@
""" """
Unified training script for QMD query expansion models. Unified training script for QMD query expansion models.
Supports two stages: Primary pipeline is SFT-only:
sft - Supervised fine-tuning on labeled examples sft - Supervised fine-tuning on labeled examples
grpo - Group Relative Policy Optimization (RL) on top of merged SFT weights
GRPO was moved to `experiments/grpo/` and is not part of the main training
pipeline by default.
Usage: Usage:
uv run train.py sft --config configs/sft.yaml uv run train.py sft --config configs/sft.yaml
uv run train.py grpo --config configs/grpo.yaml
uv run train.py grpo --config configs/grpo.yaml --dry-run
""" """
import argparse import argparse
@ -412,6 +412,15 @@ def cmd_sft(args):
def cmd_grpo(args): def cmd_grpo(args):
"""Run GRPO reinforcement learning on top of merged SFT weights.""" """Run GRPO reinforcement learning on top of merged SFT weights."""
print(
"GRPO is not part of the main training pipeline and has been moved to `experiments/grpo/`."
)
print("To run experimental GRPO, use:")
print(" cd finetune && uv run python experiments/grpo/grpo.py")
print("Or, if you have local config wiring ready:")
print(" uv run train.py grpo --config experiments/grpo/grpo.yaml")
return
import torch import torch
import torch.distributed as dist import torch.distributed as dist
import os import os
@ -645,8 +654,6 @@ def main():
epilog=""" epilog="""
Examples: Examples:
uv run train.py sft --config configs/sft.yaml uv run train.py sft --config configs/sft.yaml
uv run train.py grpo --config configs/grpo.yaml
uv run train.py grpo --config configs/grpo.yaml --dry-run
""", """,
) )
sub = parser.add_subparsers(dest="stage", required=True) sub = parser.add_subparsers(dest="stage", required=True)
@ -657,7 +664,10 @@ Examples:
"--dry-run", action="store_true", help="Print config and exit" "--dry-run", action="store_true", help="Print config and exit"
) )
grpo_parser = sub.add_parser("grpo", help="GRPO reinforcement learning") grpo_parser = sub.add_parser(
"grpo",
help="Experimental: GRPO reinforcement learning (moved to experiments/grpo/)",
)
grpo_parser.add_argument("--config", required=True, help="Path to GRPO config YAML") grpo_parser.add_argument("--config", required=True, help="Path to GRPO config YAML")
grpo_parser.add_argument( grpo_parser.add_argument(
"--dry-run", action="store_true", help="Print config, test reward, and exit" "--dry-run", action="store_true", help="Print config, test reward, and exit"