Consolidate ~2,800 lines of duplicated code across 12 files into 5 clean, well-documented files targeting Qwen3-1.7B end-to-end.

Key changes:
- Extract reward function into single source of truth (reward.py). Previously duplicated 3x with divergent bugs across rl.py, train_1.7B_grpo.py, and train_4B_grpo.py
- Unify training into one script with sft/grpo subcommands (train.py). Replaces train.py + rl.py + train_1.7B_grpo.py + train_4B_grpo.py
- Merge eval generate+score into single eval.py. Replaces evals/run.py + evals/score.py
- Parameterize GGUF conversion by --size (convert_gguf.py). Replaces convert_1.7B_gguf.py + convert_4B_gguf.py
- Fix critical bug: rl.py silently ignored beta/temperature from config, causing the exact catastrophic drift its own comments warned about
- Fix prompt consistency: all files use /no_think chat template format
- Retarget configs from 0.6B to 1.7B
- Comprehensive README documenting the full pipeline

Removed: rl.py, train_1.7B_grpo.py, train_4B_grpo.py, convert_1.7B_gguf.py, convert_4B_gguf.py, tui.py, evals/run.py, evals/score.py

Net: -3,429 lines, +382 lines

Co-Authored-By: Claude (claude-fudge-eap-cc) <noreply@anthropic.com>
222 lines
7.9 KiB
Python
222 lines
7.9 KiB
Python
#!/usr/bin/env python3
|
|
# /// script
|
|
# requires-python = ">=3.10"
|
|
# dependencies = [
|
|
# "transformers>=4.36.0",
|
|
# "peft>=0.7.0",
|
|
# "torch>=2.0.0",
|
|
# "accelerate>=0.24.0",
|
|
# "huggingface_hub>=0.20.0",
|
|
# "sentencepiece>=0.1.99",
|
|
# "protobuf>=3.20.0",
|
|
# "numpy",
|
|
# "gguf",
|
|
# ]
|
|
# ///
|
|
"""
|
|
Convert QMD query expansion model to GGUF format.
|
|
|
|
Loads the base model, merges SFT and GRPO adapters, then converts to
|
|
GGUF with multiple quantizations for use with Ollama/llama.cpp/LM Studio.
|
|
|
|
Usage:
|
|
uv run convert_gguf.py --size 1.7B
|
|
uv run convert_gguf.py --size 4B --skip-quantize
|
|
uv run convert_gguf.py --base Qwen/Qwen3-1.7B \
|
|
--sft tobil/qmd-query-expansion-1.7B-sft \
|
|
--grpo tobil/qmd-query-expansion-1.7B-grpo \
|
|
--output tobil/qmd-query-expansion-1.7B-gguf
|
|
"""
|
|
|
|
import argparse
|
|
import os
|
|
import subprocess
|
|
import sys
|
|
|
|
import torch
|
|
from huggingface_hub import HfApi, login
|
|
from peft import PeftModel
|
|
from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
|
|
# Per-size defaults selected with --size. Every entry follows the same naming
# scheme (base checkpoint, SFT adapter, GRPO adapter, GGUF output repo) and
# additionally carries an Ollama model name, so the table is generated from
# (size tag, ollama name) pairs instead of being spelled out twice.
PRESETS = {
    size_tag: {
        "base": f"Qwen/Qwen3-{size_tag}",
        "sft": f"tobil/qmd-query-expansion-{size_tag}-sft",
        "grpo": f"tobil/qmd-query-expansion-{size_tag}-grpo",
        "output": f"tobil/qmd-query-expansion-{size_tag}-gguf",
        "ollama_name": ollama_tag,
    }
    for size_tag, ollama_tag in (
        ("1.7B", "qmd-expand"),
        ("4B", "qmd-expand-4b"),
    )
}
|
|
|
|
|
|
def run_cmd(cmd, description):
    """Run an external command quietly and report whether it succeeded.

    The command is executed directly from an argv list (no shell) with its
    output captured, so nothing is echoed unless the command fails.

    Args:
        cmd: Argument list; ``cmd[0]`` is the executable.
        description: Short label printed before the command starts.

    Returns:
        True if the command exited with status 0. False if it exited non-zero
        (the command line and the first 500 characters of stderr are printed)
        or if the executable could not be found.
    """
    print(f" {description}...")
    try:
        subprocess.run(cmd, check=True, capture_output=True, text=True)
    except FileNotFoundError:
        print(f" Command not found: {cmd[0]}")
        return False
    except subprocess.CalledProcessError as err:
        print(f" FAILED: {' '.join(cmd)}")
        captured = err.stderr
        if captured:
            print(f" {captured[:500]}")
        return False
    return True
|
|
|
|
|
|
def main():
    """Merge the SFT and GRPO adapters into the base model and export GGUF files.

    Pipeline:
      1. Resolve model and repo names from the ``--size`` preset and/or the
         explicit ``--base/--sft/--grpo/--output`` flags.
      2. Best-effort install of build tools through apt-get, when available.
      3. Load the base model, merge the SFT adapter, then the GRPO adapter.
      4. Save the merged model and convert it to an FP16 GGUF with llama.cpp.
      5. Unless ``--skip-quantize`` is given, build ``llama-quantize`` and
         produce Q4_K_M, Q5_K_M and Q8_0 files.
      6. Unless ``--no-upload`` is given, upload every GGUF file plus a
         generated README to the Hugging Face Hub.

    Exits with status 1 if llama.cpp cannot be cloned or the FP16 conversion
    fails. Invalid argument combinations exit through ``parser.error``.
    Quantization is best-effort: a failed build or a failed quantization only
    reduces the set of files produced.
    """
    # Local import: only needed for the apt-get availability check below.
    import shutil

    parser = argparse.ArgumentParser(description="Convert QMD model to GGUF")
    parser.add_argument("--size", choices=PRESETS.keys(), help="Use preset config for model size")
    parser.add_argument("--base", help="Base model (overrides preset)")
    parser.add_argument("--sft", help="SFT adapter (overrides preset)")
    parser.add_argument("--grpo", help="GRPO adapter (overrides preset)")
    parser.add_argument("--output", help="Output HF repo (overrides preset)")
    parser.add_argument("--skip-quantize", action="store_true", help="Only produce FP16 GGUF")
    parser.add_argument("--no-upload", action="store_true", help="Don't upload to HF Hub")
    args = parser.parse_args()

    # Resolve config: explicit flags win over the preset; without a preset all
    # four values must be supplied on the command line.
    preset = PRESETS[args.size] if args.size else {}
    base_model = args.base or preset.get("base")
    sft_model = args.sft or preset.get("sft")
    grpo_model = args.grpo or preset.get("grpo")
    output_repo = args.output or preset.get("output")
    if not (base_model and sft_model and grpo_model and output_repo):
        parser.error("Either --size or all of --base/--sft/--grpo/--output are required")

    model_name = output_repo.split("/")[-1].replace("-gguf", "")
    print(f"QMD GGUF Conversion: {model_name}")
    print("=" * 60)

    # Install build tools (for Colab/cloud environments). This stays
    # best-effort: exit codes are ignored. The apt-get lookup prevents a
    # FileNotFoundError crash on systems that do not ship apt-get.
    if shutil.which("apt-get"):
        print("\nInstalling build dependencies...")
        subprocess.run(["apt-get", "update", "-qq"], capture_output=True)
        subprocess.run(
            ["apt-get", "install", "-y", "-qq", "build-essential", "cmake", "git"],
            capture_output=True,
        )
    else:
        print("\napt-get not found; assuming build-essential, cmake and git are already installed.")

    # Login (only when a token is provided through the environment)
    hf_token = os.environ.get("HF_TOKEN")
    if hf_token:
        print("Logging in to HuggingFace...")
        login(token=hf_token)

    # Steps 1-3: Load the base model and merge both adapters in order
    print(f"\nStep 1: Loading base model {base_model}...")
    model = AutoModelForCausalLM.from_pretrained(
        base_model, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True,
    )

    print(f"Step 2: Merging SFT adapter {sft_model}...")
    model = PeftModel.from_pretrained(model, sft_model)
    model = model.merge_and_unload()

    print(f"Step 3: Merging GRPO adapter {grpo_model}...")
    model = PeftModel.from_pretrained(model, grpo_model)
    model = model.merge_and_unload()

    tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

    # Step 4: Save merged model (weights and tokenizer side by side)
    merged_dir = "/tmp/merged_model"
    print(f"\nStep 4: Saving merged model to {merged_dir}...")
    model.save_pretrained(merged_dir, safe_serialization=True)
    tokenizer.save_pretrained(merged_dir)

    # Step 5: Setup llama.cpp
    llama_dir = "/tmp/llama.cpp"
    print("\nStep 5: Setting up llama.cpp...")
    if not os.path.exists(llama_dir):
        cloned = run_cmd(
            ["git", "clone", "--depth", "1", "https://github.com/ggerganov/llama.cpp.git", llama_dir],
            "Cloning llama.cpp",
        )
        if not cloned:
            # Nothing downstream can work without the converter script.
            sys.exit(1)
    subprocess.run(
        [sys.executable, "-m", "pip", "install", "-q", "-r", f"{llama_dir}/requirements.txt"],
        capture_output=True,
    )

    # Step 6: Convert to FP16 GGUF
    gguf_dir = "/tmp/gguf_output"
    os.makedirs(gguf_dir, exist_ok=True)
    gguf_file = f"{gguf_dir}/{model_name}-f16.gguf"

    print("\nStep 6: Converting to FP16 GGUF...")
    converted = run_cmd(
        [sys.executable, f"{llama_dir}/convert_hf_to_gguf.py",
         merged_dir, "--outfile", gguf_file, "--outtype", "f16"],
        "Converting",
    )
    if not converted:
        sys.exit(1)

    size_mb = os.path.getsize(gguf_file) / (1024 * 1024)
    print(f" FP16: {size_mb:.1f} MB")

    # Steps 7-8: Build llama-quantize and quantize (best-effort)
    quantized_files = []
    if not args.skip_quantize:
        print("\nStep 7: Building quantize tool...")
        build_dir = f"{llama_dir}/build"
        os.makedirs(build_dir, exist_ok=True)
        run_cmd(["cmake", "-B", build_dir, "-S", llama_dir, "-DGGML_CUDA=OFF"],
                "CMake configure")
        run_cmd(["cmake", "--build", build_dir, "--target", "llama-quantize", "-j", "4"],
                "Building llama-quantize")
        quantize_bin = f"{build_dir}/bin/llama-quantize"

        if not os.path.isfile(quantize_bin):
            # Say why there are no quantized files instead of letting every
            # quantization attempt fail with "Command not found".
            print(" WARNING: llama-quantize was not built; skipping quantization (FP16 only).")
        else:
            print("\nStep 8: Quantizing...")
            for quant_type, desc in [("Q4_K_M", "4-bit"), ("Q5_K_M", "5-bit"), ("Q8_0", "8-bit")]:
                qfile = f"{gguf_dir}/{model_name}-{quant_type.lower()}.gguf"
                if run_cmd([quantize_bin, gguf_file, qfile, quant_type], f"{quant_type} ({desc})"):
                    qsize = os.path.getsize(qfile) / (1024 * 1024)
                    print(f" {quant_type}: {qsize:.1f} MB")
                    quantized_files.append((qfile, quant_type))

    # Step 9: Upload
    if args.no_upload:
        # Nothing was pushed, so point at the local output instead of a repo URL.
        print(f"\nDone! Upload skipped; GGUF files are in {gguf_dir}")
        return

    print(f"\nStep 9: Uploading to {output_repo}...")
    api = HfApi()
    api.create_repo(repo_id=output_repo, repo_type="model", exist_ok=True)

    api.upload_file(path_or_fileobj=gguf_file,
                    path_in_repo=f"{model_name}-f16.gguf", repo_id=output_repo)
    for qfile, qtype in quantized_files:
        api.upload_file(path_or_fileobj=qfile,
                        path_in_repo=f"{model_name}-{qtype.lower()}.gguf", repo_id=output_repo)

    # Upload README (model card; the text starts at column 0 because it is
    # uploaded verbatim and begins with YAML front matter)
    readme = f"""---
base_model: {base_model}
tags: [gguf, llama.cpp, quantized, query-expansion, qmd]
---
# {model_name} (GGUF)

GGUF conversion of the QMD Query Expansion model.

## Details
- **Base:** {base_model}
- **SFT:** {sft_model}
- **GRPO:** {grpo_model}
- **Task:** Query expansion (lex/vec/hyde format)

## Prompt Format
```
<|im_start|>user
/no_think Expand this search query: your query here<|im_end|>
<|im_start|>assistant
```
"""
    api.upload_file(path_or_fileobj=readme.encode(),
                    path_in_repo="README.md", repo_id=output_repo)

    print(f"\nDone! Repository: https://huggingface.co/{output_repo}")
|
|
|
|
|
|
# Run the conversion pipeline only when this file is executed as a script,
# so importing the module has no side effects.
if __name__ == "__main__":
    main()
|