Add experimental LiquidAI LFM2-1.2B support for query-expansion fine-tuning.

Files touched:
  finetune/README.md             new "Alternative Base Models" section
  finetune/configs/sft_lfm2.yaml new SFT config (LoRA targets for LFM2 layer names)
  finetune/jobs/sft_lfm2.py      new self-contained HF Jobs training script
  src/llm.ts                     exported URIs for the stock LFM2 GGUF builds

Notes:
  - jobs/sft_lfm2.py passes SFTConfig(max_length=...), so its inline
    dependency floor for trl is 0.16.0 (older releases call it max_seq_length).
  - learning_rate in the YAML is spelled 2.0e-4 so YAML 1.1 loaders read a float.

diff --git a/finetune/README.md b/finetune/README.md
index fec6a1e..99685e9 100644
--- a/finetune/README.md
+++ b/finetune/README.md
@@ -297,3 +297,42 @@ deterministic, and suitable as an RL signal. See `SCORING.md` for the full rubri
 |-------|--------------|-----------------|
 | SFT | 92.0% | 30/30 |
 | GRPO | 91.7% | 30/30 |
+
+## Alternative Base Models
+
+### LiquidAI LFM2 (Experimental)
+
+[LFM2](https://www.liquid.ai/blog/liquid-foundation-models-v2-our-second-series-of-generative-ai-models)
+is a hybrid architecture from Liquid AI optimized for on-device inference. It uses
+a novel combination of convolutions and attention that achieves 2x faster decode
+and prefill speed compared to standard transformers.
+
+**Why LFM2 for query expansion:**
+- **Faster inference**: Lower latency for real-time search applications
+- **Memory efficient**: Smaller memory footprint than equivalent transformers
+- **Edge-optimized**: Can run on mobile devices and embedded systems
+- **Good at agentic tasks**: LiquidAI recommends LFM2 for RAG and data extraction
+
+**Training with LFM2:**
+
+```bash
+# SFT with LFM2-1.2B base model
+uv run train.py sft --config configs/sft_lfm2.yaml
+
+# Evaluate the trained model
+uv run eval.py --model outputs/sft-lfm2
+
+# Convert to GGUF for deployment
+uv run convert_gguf.py --base LiquidAI/LFM2-1.2B \
+    --sft outputs/sft-lfm2 \
+    --output tobil/qmd-query-expansion-lfm2-gguf
+```
+
+**Key differences from Qwen3:**
+- Different LoRA target modules: `q_proj, k_proj, v_proj, out_proj, in_proj, w1, w2, w3`
+- Recommended generation parameters: `temp=0.3, min_p=0.15, repetition_penalty=1.05`
+- Requires transformers >= 4.55.0 for architecture support
+
+**Pre-trained GGUF models:**
+- Base: `hf:LiquidAI/LFM2-1.2B-GGUF/LFM2-1.2B-Q4_K_M.gguf` (~731 MB)
+- Instruct: `hf:LiquidAI/LFM2.5-1.2B-Instruct-GGUF/LFM2.5-1.2B-Instruct-Q4_K_M.gguf` (~731 MB)
diff --git a/finetune/configs/sft_lfm2.yaml b/finetune/configs/sft_lfm2.yaml
new file mode 100644
index 0000000..7ece2f5
--- /dev/null
+++ b/finetune/configs/sft_lfm2.yaml
@@ -0,0 +1,60 @@
+# SFT Training Config for QMD Query Expansion with LiquidAI LFM2
+# Target: LFM2-1.2B with LoRA (hybrid architecture: convolutions + attention)
+#
+# LFM2 is optimized for on-device inference with fast decode/prefill.
+# Recommended for: agentic tasks, data extraction, RAG, creative writing.
+#
+# Usage: uv run train.py sft --config configs/sft_lfm2.yaml
+#
+# Requirements:
+# - transformers >= 4.55.0 (LFM2 architecture support)
+# - May need: pip install -U transformers
+
+model:
+  base: "LiquidAI/LFM2-1.2B"
+  output: "outputs/sft-lfm2" # Local training output (push to HF manually after eval)
+
+dataset:
+  # Local: run `uv run dataset/prepare_data.py` first, then use "data/train/"
+  # HuggingFace: use "tobil/qmd-query-expansion-train" (already prepared)
+  name: "data/train/"
+  text_field: "text"
+  split: "train"
+  eval_split: 0.1
+
+training:
+  epochs: 5
+  batch_size: 4
+  gradient_accumulation_steps: 4
+  learning_rate: 2.0e-4 # explicit decimal point: YAML 1.1 loaders (e.g. PyYAML) read "2e-4" as a string
+  max_length: 512
+  warmup_ratio: 0.03
+  lr_scheduler: "cosine"
+
+lora:
+  rank: 16
+  alpha: 32
+  dropout: 0.0
+  # LFM2 uses different architecture than standard transformers:
+  # - Attention layers: q_proj, k_proj, v_proj, out_proj
+  # - Input projection: in_proj
+  # - FFN/MLP gates: w1, w2, w3 (SwiGLU activation)
+  target_modules:
+    - "q_proj"
+    - "k_proj"
+    - "v_proj"
+    - "out_proj"
+    - "in_proj"
+    - "w1"
+    - "w2"
+    - "w3"
+
+tracking:
+  project: "qmd-query-expansion"
+  run_name: "sft-lfm2-1.2B"
+
+# LFM2-specific generation settings (recommended by LiquidAI)
+generation:
+  temperature: 0.3
+  min_p: 0.15
+  repetition_penalty: 1.05
diff --git a/finetune/jobs/sft_lfm2.py b/finetune/jobs/sft_lfm2.py
new file mode 100644
index 0000000..35a2b24
--- /dev/null
+++ b/finetune/jobs/sft_lfm2.py
@@ -0,0 +1,106 @@
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "trl>=0.16.0",  # SFTConfig(max_length=...) below; older trl names it max_seq_length
+#     "peft>=0.7.0",
+#     "transformers>=4.55.0",
+#     "accelerate>=0.24.0",
+#     "huggingface_hub>=0.20.0",
+#     "datasets",
+#     "bitsandbytes",
+#     "torch",
+# ]
+# ///
+"""
+SFT training for QMD query expansion with LiquidAI LFM2-1.2B.
+
+LFM2 is a hybrid architecture optimized for edge/on-device inference.
+Uses different LoRA target modules than standard transformers.
+
+Self-contained script for HuggingFace Jobs:
+    hf jobs uv run --flavor a10g-large --secrets HF_TOKEN --timeout 2h jobs/sft_lfm2.py
+"""
+
+import os
+from huggingface_hub import login
+
+# --- Config (inlined from configs/sft_lfm2.yaml) ---
+BASE_MODEL = "LiquidAI/LFM2-1.2B"
+OUTPUT_MODEL = "tobil/qmd-query-expansion-lfm2-sft"
+DATASET = "tobil/qmd-query-expansion-train"
+
+hf_token = os.environ.get("HF_TOKEN")
+if hf_token:
+    login(token=hf_token)
+
+from datasets import load_dataset
+from peft import LoraConfig
+from transformers import AutoTokenizer
+from trl import SFTTrainer, SFTConfig
+
+# Load and split dataset
+print(f"Loading dataset: {DATASET}...")
+dataset = load_dataset(DATASET, split="train")
+print(f"Dataset loaded: {len(dataset)} examples")
+
+split = dataset.train_test_split(test_size=0.1, seed=42)
+train_dataset = split["train"]
+eval_dataset = split["test"]
+print(f" Train: {len(train_dataset)}, Eval: {len(eval_dataset)}")
+
+# SFT config
+config = SFTConfig(
+    output_dir="qmd-query-expansion-lfm2-sft",
+    push_to_hub=True,
+    hub_model_id=OUTPUT_MODEL,
+    hub_strategy="every_save",
+
+    num_train_epochs=5,
+    per_device_train_batch_size=4,
+    gradient_accumulation_steps=4,
+    learning_rate=2e-4,
+    max_length=512,
+
+    logging_steps=10,
+    save_strategy="steps",
+    save_steps=200,
+    save_total_limit=2,
+    eval_strategy="steps",
+    eval_steps=200,
+
+    warmup_ratio=0.03,
+    lr_scheduler_type="cosine",
+    bf16=True,
+
+    report_to="none",
+)
+
+# LoRA config for LFM2 architecture
+# LFM2 uses different layer names than standard transformers:
+# - Attention: q_proj, k_proj, v_proj, out_proj
+# - Input projection: in_proj
+# - FFN/MLP gates (SwiGLU): w1, w2, w3
+peft_config = LoraConfig(
+    r=16,
+    lora_alpha=32,
+    lora_dropout=0.0,
+    bias="none",
+    task_type="CAUSAL_LM",
+    target_modules=["q_proj", "k_proj", "v_proj", "out_proj", "in_proj", "w1", "w2", "w3"],
+)
+
+print("Initializing SFT trainer...")
+trainer = SFTTrainer(
+    model=BASE_MODEL,
+    train_dataset=train_dataset,
+    eval_dataset=eval_dataset,
+    args=config,
+    peft_config=peft_config,
+)
+
+print("Starting SFT training (LFM2-1.2B)...")
+trainer.train()
+
+print("Pushing to Hub...")
+trainer.push_to_hub()
+print(f"Done! Model: https://huggingface.co/{OUTPUT_MODEL}")
diff --git a/src/llm.ts b/src/llm.ts
index ab39c86..b065935 100644
--- a/src/llm.ts
+++ b/src/llm.ts
@@ -178,6 +178,12 @@ const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-re
 // const DEFAULT_GENERATE_MODEL = "hf:ggml-org/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf";
 const DEFAULT_GENERATE_MODEL = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";
 
+// Alternative generation models for query expansion:
+// LiquidAI LFM2 - hybrid architecture optimized for edge/on-device inference
+// Stock LiquidAI GGUF builds; to fine-tune LFM2-1.2B see finetune/configs/sft_lfm2.yaml
+export const LFM2_GENERATE_MODEL = "hf:LiquidAI/LFM2-1.2B-GGUF/LFM2-1.2B-Q4_K_M.gguf";
+export const LFM2_INSTRUCT_MODEL = "hf:LiquidAI/LFM2.5-1.2B-Instruct-GGUF/LFM2.5-1.2B-Instruct-Q4_K_M.gguf";
+
 export const DEFAULT_EMBED_MODEL_URI = DEFAULT_EMBED_MODEL;
 export const DEFAULT_RERANK_MODEL_URI = DEFAULT_RERANK_MODEL;
 export const DEFAULT_GENERATE_MODEL_URI = DEFAULT_GENERATE_MODEL;