Move GRPO training out of default finetune pipeline
This commit is contained in:
parent
cbeeb1f89b
commit
189916d6fb
@ -38,11 +38,11 @@ The schema is enforced by `dataset/schema.py:TrainingExample` (Pydantic model).
|
||||
|
||||
| Repository | Purpose |
|
||||
|------------|---------|
|
||||
| `tobil/qmd-query-expansion-1.7B` | Final merged model (SFT + GRPO) |
|
||||
| `tobil/qmd-query-expansion-1.7B` | Final merged model (SFT baseline) |
|
||||
| `tobil/qmd-query-expansion-1.7B-gguf` | GGUF quantized versions for deployment |
|
||||
| `tobil/qmd-query-expansion-1.7B-sft` | SFT adapter checkpoint (intermediate) |
|
||||
| `tobil/qmd-query-expansion-1.7B-grpo` | GRPO adapter checkpoint (intermediate) |
|
||||
| `tobil/qmd-query-expansion-train` | Prepared training dataset |
|
||||
| `tobil/qmd-query-expansion-1.7B-grpo` | Experimental GRPO adapter (optional) |
|
||||
|
||||
**Rules:**
|
||||
- No versioned repos (`-v1`, `-v2`, `-v4`, etc.) - update in place
|
||||
@ -80,14 +80,14 @@ uv run train.py sft --config configs/sft.yaml
|
||||
hf jobs uv run --flavor a10g-large --secrets HF_TOKEN --timeout 2h jobs/sft.py
|
||||
```
|
||||
|
||||
### Stage 2: GRPO
|
||||
### Stage 2: (Experimental) GRPO
|
||||
|
||||
```bash
|
||||
# Local (requires CUDA)
|
||||
uv run train.py grpo --config configs/grpo.yaml
|
||||
# Local (optional; experimental)
|
||||
uv run train.py grpo --config experiments/grpo/grpo.yaml
|
||||
|
||||
# Cloud (HuggingFace Jobs)
|
||||
hf jobs uv run --flavor a10g-large --secrets HF_TOKEN --timeout 4h jobs/grpo.py
|
||||
# Experimental script
|
||||
HF_TOKEN=${HF_TOKEN} uv run experiments/grpo/grpo.py
|
||||
```
|
||||
|
||||
### HuggingFace Jobs
|
||||
@ -102,9 +102,9 @@ hf jobs cancel <job-id> # Cancel a job
|
||||
### Evaluation
|
||||
|
||||
```bash
|
||||
uv run eval.py --model ./outputs/grpo
|
||||
uv run eval.py --model tobil/qmd-query-expansion-1.7B
|
||||
uv run eval.py --model ./outputs/grpo -o eval_results.json
|
||||
uv run eval.py ./outputs/sft
|
||||
uv run eval.py tobil/qmd-query-expansion-1.7B
|
||||
uv run eval.py ./outputs/sft -o eval_results.json
|
||||
```
|
||||
|
||||
## Quality Scoring
|
||||
@ -126,6 +126,9 @@ experiments/
|
||||
├── lfm2/ # LiquidAI LFM2-1.2B (hybrid architecture, faster inference)
|
||||
│ ├── sft_lfm2.yaml
|
||||
│ └── sft_lfm2.py
|
||||
├── grpo/ # Experimental GRPO recipe and config
|
||||
│ ├── grpo.py
|
||||
│ └── grpo.yaml
|
||||
└── gepa/ # DSPy-based prompt optimization (GEPA)
|
||||
├── dspy_gepa.py
|
||||
└── ...
|
||||
@ -138,7 +141,7 @@ These are not part of the main training pipeline.
|
||||
```
|
||||
finetune/
|
||||
├── reward.py # Scoring function (single source of truth)
|
||||
├── train.py # Unified SFT + GRPO training
|
||||
├── train.py # SFT training entrypoint
|
||||
├── eval.py # Generate and score expansions
|
||||
├── convert_gguf.py # GGUF conversion
|
||||
├── SCORING.md # Detailed scoring rubric
|
||||
@ -147,8 +150,8 @@ finetune/
|
||||
├── data/ # All training JSONL files (strict schema)
|
||||
├── dataset/ # Schema + data tools (Pydantic-based)
|
||||
├── jobs/ # Self-contained HuggingFace Jobs scripts
|
||||
├── configs/ # Training configs (sft.yaml, grpo.yaml)
|
||||
├── configs/ # Training configs (sft.yaml)
|
||||
├── evals/ # Test queries
|
||||
├── experiments/ # Experimental configs (LFM2, GEPA)
|
||||
├── experiments/ # Experimental configs (LFM2, GEPA, GRPO)
|
||||
└── outputs/ # Local training outputs (gitignored)
|
||||
```
|
||||
|
||||
@ -26,6 +26,9 @@ train-local:
|
||||
HF_TOKEN=${HF_TOKEN} uv run torchrun --standalone --nproc_per_node auto \
|
||||
train.py sft --config configs/sft_local.yaml |& tee /tmp/qmd-sft-train.log
|
||||
|
||||
grpo-local:
|
||||
CUDA_VISIBLE_DEVICES=1,2,3 HF_TOKEN=${HF_TOKEN} uv run torchrun --standalone --nproc_per_node 3 \
|
||||
train.py grpo --config configs/grpo.yaml |& tee /tmp/qmd-grpo-train.log
|
||||
# Experimental GRPO training is in finetune/experiments/grpo and not part of
|
||||
# the default pipeline.
|
||||
#
|
||||
# grpo-local:
|
||||
# HF_TOKEN=${HF_TOKEN} uv run train.py grpo --config experiments/grpo/grpo.yaml |& tee /tmp/qmd-grpo-train.log
|
||||
|
||||
|
||||
@ -40,22 +40,23 @@ These feed into QMD's three search backends:
|
||||
# 1. SFT: teach the model the output format (~45 min on A10G, ~$1.50)
|
||||
hf jobs uv run --flavor a10g-large --secrets HF_TOKEN --timeout 2h jobs/sft.py
|
||||
|
||||
# 2. GRPO: RL refinement on top of SFT (~20 min on A10G, ~$0.50)
|
||||
hf jobs uv run --flavor a10g-large --secrets HF_TOKEN --timeout 4h jobs/grpo.py
|
||||
# 2. Evaluate against test queries (needs local GPU or use eval job)
|
||||
uv run eval.py tobil/qmd-query-expansion-1.7B
|
||||
|
||||
# 3. Evaluate against test queries (needs local GPU or use eval job)
|
||||
uv run eval.py --model tobil/qmd-query-expansion-1.7B-grpo \
|
||||
--sft-model tobil/qmd-query-expansion-1.7B-sft
|
||||
|
||||
# 4. Convert to GGUF for local deployment (Ollama, llama.cpp)
|
||||
# 3. Convert to GGUF for local deployment (Ollama, llama.cpp)
|
||||
uv run convert_gguf.py --size 1.7B
|
||||
|
||||
# NOTE: GRPO is currently experimental and moved to finetune/experiments/grpo
|
||||
# if you want to run it manually, use uv run python experiments/grpo/grpo.py
|
||||
```
|
||||
|
||||
### Local training (if you have a GPU)
|
||||
|
||||
```bash
|
||||
uv run train.py sft --config configs/sft.yaml
|
||||
uv run train.py grpo --config configs/grpo.yaml
|
||||
|
||||
# Experimental GRPO
|
||||
uv run train.py grpo --config experiments/grpo/grpo.yaml
|
||||
```
|
||||
|
||||
### Monitoring HF Jobs
|
||||
@ -85,19 +86,19 @@ direct `lex:/vec:/hyde:` output without `<think>` blocks.
|
||||
```
|
||||
finetune/
|
||||
├── reward.py # Scoring/reward function (single source of truth)
|
||||
├── train.py # Unified SFT + GRPO training (two subcommands)
|
||||
├── train.py # SFT training entrypoint
|
||||
├── eval.py # Generate expansions and score them
|
||||
├── convert_gguf.py # GGUF conversion for Ollama/llama.cpp
|
||||
├── jobs/
|
||||
│ ├── sft.py # Self-contained SFT for HuggingFace Jobs
|
||||
│ ├── grpo.py # Self-contained GRPO for HuggingFace Jobs
|
||||
│ ├── eval.py # Self-contained eval for HuggingFace Jobs
|
||||
│ └── eval_common.py # Shared eval utilities
|
||||
├── configs/
|
||||
│ ├── sft.yaml # SFT hyperparameters for Qwen3-1.7B
|
||||
│ └── grpo.yaml # GRPO hyperparameters for Qwen3-1.7B
|
||||
│ └── sft.yaml # SFT hyperparameters for Qwen3-1.7B
|
||||
├── evals/
|
||||
│ └── queries.txt # 31 test queries across 8 categories
|
||||
├── experiments/
|
||||
│ └── grpo/ # Experimental GRPO configuration and script (optional)
|
||||
├── data/ # Training JSONL files (all concatenated for training)
|
||||
├── dataset/
|
||||
│ ├── prepare_data.py # Format for Qwen3 chat template, dedup, split
|
||||
@ -130,29 +131,14 @@ uv run train.py sft --config configs/sft.yaml
|
||||
uv run train.py sft --config configs/sft.yaml --dry-run # preview config
|
||||
```
|
||||
|
||||
### Stage 2: GRPO (Group Relative Policy Optimization)
|
||||
### Stage 2: (Experimental) GRPO
|
||||
|
||||
Reinforcement learning on top of the merged SFT weights. The model generates
|
||||
multiple expansions per query, they are scored by the reward function, and the
|
||||
model is updated to prefer higher-scoring outputs.
|
||||
|
||||
| Parameter | Value |
|
||||
|-----------|-------|
|
||||
| Base | Merged SFT checkpoint |
|
||||
| Method | LoRA (rank 4, alpha 8) — smaller for RL stability |
|
||||
| Target modules | q_proj, v_proj only |
|
||||
| Reward | `reward.py` (rule-based, 5 dimensions) |
|
||||
| KL beta | 0.04 — prevents drift from SFT checkpoint |
|
||||
| Generations per prompt | 4 |
|
||||
| Max steps | 200 |
|
||||
| Learning rate | 5e-7 |
|
||||
|
||||
**Important:** `beta > 0` is critical. With `beta=0` the model experiences
|
||||
catastrophic drift and scores drop to 0%.
|
||||
GRPO is currently treated as experimental and kept under `experiments/grpo/`.
|
||||
It is not part of the default production path for this repository.
|
||||
|
||||
```bash
|
||||
uv run train.py grpo --config configs/grpo.yaml
|
||||
uv run train.py grpo --config configs/grpo.yaml --dry-run # test reward function
|
||||
# Optional experimental GRPO run
|
||||
uv run train.py grpo --config experiments/grpo/grpo.yaml
|
||||
```
|
||||
|
||||
## Evaluation
|
||||
@ -160,24 +146,26 @@ uv run train.py grpo --config configs/grpo.yaml --dry-run # test reward functio
|
||||
`eval.py` generates expansions from a model and scores them against test queries:
|
||||
|
||||
```bash
|
||||
# Evaluate an SFT model
|
||||
# Evaluate a SFT model
|
||||
uv run eval.py --model tobil/qmd-query-expansion-1.7B-sft
|
||||
|
||||
# Evaluate a GRPO model (needs SFT adapter merged first)
|
||||
uv run eval.py --model tobil/qmd-query-expansion-1.7B-grpo \
|
||||
--sft-model tobil/qmd-query-expansion-1.7B-sft
|
||||
# Evaluate an SFT output dir
|
||||
uv run eval.py outputs/sft
|
||||
|
||||
# Verbose output with deduction details
|
||||
uv run eval.py --model tobil/qmd-query-expansion-1.7B-sft -v
|
||||
uv run eval.py tobil/qmd-query-expansion-1.7B -v
|
||||
|
||||
# Optional: evaluate GRPO experimental output (if run)
|
||||
uv run eval.py outputs/grpo
|
||||
|
||||
# Save detailed scores to JSON
|
||||
uv run eval.py --model tobil/qmd-query-expansion-1.7B-sft -o scores.json
|
||||
uv run eval.py tobil/qmd-query-expansion-1.7B -o scores.json
|
||||
```
|
||||
|
||||
## Reward Function
|
||||
|
||||
`reward.py` is the single source of truth for scoring. It is used both as the
|
||||
GRPO reward signal during training and for evaluation.
|
||||
`reward.py` is the single source of truth for scoring. It is used for evaluation
|
||||
and (optionally) as the GRPO reward signal in the experimental path.
|
||||
|
||||
Five scoring dimensions (max 120 without hyde, 140 with):
|
||||
|
||||
@ -201,8 +189,8 @@ uv run reward.py
|
||||
|
||||
## GGUF Conversion
|
||||
|
||||
Merges base + SFT + GRPO adapters into a single model and produces
|
||||
quantized GGUF files for deployment:
|
||||
Merges base + SFT and (optionally) GRPO adapters into a single model, then
|
||||
produces quantized GGUF files for deployment:
|
||||
|
||||
```bash
|
||||
# Use preset for 1.7B
|
||||
@ -240,15 +228,14 @@ just validate
|
||||
|
||||
## Architecture Notes
|
||||
|
||||
The two-stage training approach (SFT -> GRPO) is standard for structured-output models:
|
||||
The production training approach is currently **SFT-only**:
|
||||
|
||||
1. **SFT** establishes format compliance and basic query understanding. It uses
|
||||
a large LoRA (rank 16, all projection layers) because it needs to learn a
|
||||
new output format from scratch.
|
||||
|
||||
2. **GRPO** refines quality within the learned format. It uses a small LoRA
|
||||
(rank 4, q/v only) and KL regularization to make incremental improvements
|
||||
without losing what SFT taught.
|
||||
2. **GRPO** exists as an optional experimental path under `experiments/grpo/`
|
||||
and is not in the production training pipeline.
|
||||
|
||||
The reward function is entirely rule-based (no LLM judge) which makes it fast,
|
||||
deterministic, and suitable as an RL signal. See `SCORING.md` for the full rubric.
|
||||
@ -266,20 +253,12 @@ deterministic, and suitable as an RL signal. See `SCORING.md` for the full rubri
|
||||
| Epochs | 5 |
|
||||
| Hardware | A10G (24 GB VRAM) |
|
||||
|
||||
### GRPO
|
||||
|
||||
| Metric | Value |
|
||||
|--------|-------|
|
||||
| Mean reward | 0.757 |
|
||||
| Final loss | 0.0005 |
|
||||
| KL divergence | 0.00048 |
|
||||
| Mean completion length | ~58 tokens |
|
||||
| Training time | ~19 min (200 steps) |
|
||||
| Hardware | A10G (24 GB VRAM) |
|
||||
|
||||
### Evaluation Scores
|
||||
|
||||
| Model | Average Score | Excellent (30) |
|
||||
|-------|--------------|-----------------|
|
||||
| SFT | 92.0% | 30/30 |
|
||||
| GRPO | 91.7% | 30/30 |
|
||||
|
||||
> GRPO scores are not tracked in this branch; see `experiments/grpo/` for historical
|
||||
> experimental results.
|
||||
|
||||
|
||||
26
finetune/experiments/grpo/README.md
Normal file
26
finetune/experiments/grpo/README.md
Normal file
@ -0,0 +1,26 @@
|
||||
# GRPO (Experimental)
|
||||
|
||||
This folder contains the **experimental** GRPO training path for query expansion.
|
||||
It is not part of the default production pipeline.
|
||||
|
||||
## Files
|
||||
|
||||
- `grpo.yaml` – experimental GRPO hyperparameters
|
||||
- `grpo.py` – standalone GRPO training script
|
||||
|
||||
## Run
|
||||
|
||||
```bash
|
||||
# Recommended default: run from repo root
|
||||
cd /home/tobi/qmd
|
||||
uv run finetune/experiments/grpo/grpo.py
|
||||
|
||||
# Or use unified entrypoint (deprecated in main pipeline):
|
||||
uv run train.py grpo --config finetune/experiments/grpo/grpo.yaml
|
||||
```
|
||||
|
||||
## Notes
|
||||
|
||||
- Current mainline focuses on SFT-only quality and benchmarks.
|
||||
- Keep this workflow isolated unless you are explicitly experimenting with
|
||||
reinforcement-learning refinement.
|
||||
@ -14,8 +14,10 @@
|
||||
"""
|
||||
GRPO training for QMD query expansion (Qwen3-1.7B).
|
||||
|
||||
Runs on top of merged SFT weights. Self-contained for HuggingFace Jobs:
|
||||
hf jobs uv run --flavor a10g-large --secrets HF_TOKEN --timeout 4h jobs/grpo.py
|
||||
Experimental recipe run on top of merged SFT weights. Self-contained runner:
|
||||
uv run experiments/grpo/grpo.py
|
||||
|
||||
(If using HF Jobs, run this script as the job entrypoint.)
|
||||
"""
|
||||
|
||||
import os
|
||||
@ -42,7 +44,7 @@ if not os.path.exists(_eval_common_path):
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
from eval_common import QMDRewardFunction, run_eval
|
||||
|
||||
# --- Config (inlined from configs/grpo.yaml) ---
|
||||
# --- Config (inlined from experiments/grpo/grpo.yaml) ---
|
||||
BASE_MODEL = "Qwen/Qwen3-1.7B"
|
||||
SFT_MODEL = "tobil/qmd-query-expansion-1.7B-sft"
|
||||
OUTPUT_MODEL = "tobil/qmd-query-expansion-1.7B-grpo"
|
||||
@ -1,7 +1,7 @@
|
||||
# GRPO Training Config for QMD Query Expansion
|
||||
# Target: Qwen3-1.7B, trained on top of merged SFT weights
|
||||
#
|
||||
# Usage: uv run train.py grpo --config configs/grpo.yaml
|
||||
# Usage: uv run train.py grpo --config experiments/grpo/grpo.yaml
|
||||
#
|
||||
# The reward function (reward.py) scores expansions on format compliance,
|
||||
# diversity, hyde quality, content quality, and named entity preservation.
|
||||
@ -18,14 +18,14 @@
|
||||
"""
|
||||
Unified training script for QMD query expansion models.
|
||||
|
||||
Supports two stages:
|
||||
Primary pipeline is SFT-only:
|
||||
sft - Supervised fine-tuning on labeled examples
|
||||
grpo - Group Relative Policy Optimization (RL) on top of merged SFT weights
|
||||
|
||||
GRPO was moved to `experiments/grpo/` and is not part of the main training
|
||||
pipeline by default.
|
||||
|
||||
Usage:
|
||||
uv run train.py sft --config configs/sft.yaml
|
||||
uv run train.py grpo --config configs/grpo.yaml
|
||||
uv run train.py grpo --config configs/grpo.yaml --dry-run
|
||||
"""
|
||||
|
||||
import argparse
|
||||
@ -412,6 +412,15 @@ def cmd_sft(args):
|
||||
|
||||
def cmd_grpo(args):
|
||||
"""Run GRPO reinforcement learning on top of merged SFT weights."""
|
||||
print(
|
||||
"GRPO is not part of the main training pipeline and has been moved to `experiments/grpo/`."
|
||||
)
|
||||
print("To run experimental GRPO, use:")
|
||||
print(" cd finetune && uv run python experiments/grpo/grpo.py")
|
||||
print("Or, if you have local config wiring ready:")
|
||||
print(" uv run train.py grpo --config experiments/grpo/grpo.yaml")
|
||||
return
|
||||
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
import os
|
||||
@ -645,8 +654,6 @@ def main():
|
||||
epilog="""
|
||||
Examples:
|
||||
uv run train.py sft --config configs/sft.yaml
|
||||
uv run train.py grpo --config configs/grpo.yaml
|
||||
uv run train.py grpo --config configs/grpo.yaml --dry-run
|
||||
""",
|
||||
)
|
||||
sub = parser.add_subparsers(dest="stage", required=True)
|
||||
@ -657,7 +664,10 @@ Examples:
|
||||
"--dry-run", action="store_true", help="Print config and exit"
|
||||
)
|
||||
|
||||
grpo_parser = sub.add_parser("grpo", help="GRPO reinforcement learning")
|
||||
grpo_parser = sub.add_parser(
|
||||
"grpo",
|
||||
help="Experimental: GRPO reinforcement learning (moved to experiments/grpo/)",
|
||||
)
|
||||
grpo_parser.add_argument("--config", required=True, help="Path to GRPO config YAML")
|
||||
grpo_parser.add_argument(
|
||||
"--dry-run", action="store_true", help="Print config, test reward, and exit"
|
||||
|
||||
Loading…
Reference in New Issue
Block a user