qmd/finetune/benchmark.py

#!/usr/bin/env python3
"""Benchmark QMD query expansion: LFM2.5 vs Qwen3 finetuned models."""

import json
import time
import torch
from pathlib import Path
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
from peft import PeftModel

QUERIES = [
    "kubernetes pod networking",
    "best practices for React server components",
    "how to optimize PostgreSQL queries for large tables",
    "what is retrieval augmented generation",
    "python async await concurrency patterns",
    "nginx reverse proxy load balancing",
    "git rebase vs merge workflow",
    "rust ownership and borrowing explained",
    "docker compose multi-stage builds",
    "elasticsearch full text search performance",
    "shopify liquid template customization",
    "machine learning feature engineering techniques",
    "aws lambda cold start optimization",
    "typescript generics and utility types",
    "redis caching strategies for web apps",
]

def load_model(base_name, adapter_dir, device, trust_remote=False):
    tokenizer = AutoTokenizer.from_pretrained(base_name, trust_remote_code=trust_remote)
    base = AutoModelForCausalLM.from_pretrained(
        base_name, dtype=torch.bfloat16, device_map=device, trust_remote_code=trust_remote
    )
    model = PeftModel.from_pretrained(base, adapter_dir, local_files_only=True)
    model = model.merge_and_unload()
    model.eval()

    gen_config_path = Path(adapter_dir) / "generation_config.json"
    if gen_config_path.exists():
        gen_config = GenerationConfig.from_pretrained(adapter_dir)
    else:
        gen_config = GenerationConfig(
            temperature=0.1, top_k=50, top_p=0.1,
            repetition_penalty=1.05, do_sample=True, max_new_tokens=300,
        )
    return model, tokenizer, gen_config

def run_inference(model, tokenizer, gen_config, query, device):
    messages = [{"role": "user", "content": f"Expand this search query: {query}"}]
    text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
    inputs = tokenizer(text, return_tensors="pt").to(device)

    start = time.perf_counter()
    with torch.no_grad():
        out = model.generate(**inputs, generation_config=gen_config, max_new_tokens=300)
    elapsed = time.perf_counter() - start

    new_tokens = out.shape[-1] - inputs["input_ids"].shape[-1]
    result = tokenizer.decode(out[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
    return result, elapsed, new_tokens

def score_output(output):
    """Simple quality scoring: check for lex/vec/hyde presence and specificity."""
    score = 0
    lines = output.strip().split("\n")
    has_lex = has_vec = has_hyde = False
    hyde_text = ""

    for line in lines:
        l = line.strip()
        if l.startswith("lex:"):
            has_lex = True
            score += 1
        elif l.startswith("vec:"):
            has_vec = True
            score += 1
        elif l.startswith("hyde:"):
            has_hyde = True
            hyde_text = l[5:].strip()
            score += 2  # hyde is worth more

    # Bonus for hyde length in sweet spot (80-200 chars)
    if hyde_text:
        hlen = len(hyde_text)
        if 80 <= hlen <= 200:
            score += 2
        elif 50 <= hlen <= 250:
            score += 1

    # Penalty for generic/template hyde
    generic_phrases = ["comprehensive guide", "everything you need to know", "beginners and advanced users"]
    for phrase in generic_phrases:
        if phrase in hyde_text.lower():
            score -= 1

    return score, {"has_lex": has_lex, "has_vec": has_vec, "has_hyde": has_hyde, "hyde_len": len(hyde_text)}

def main():
    device = "cuda:0"

    models = {
        "LFM2.5-1.2B (finetuned)": {
            "base": "LiquidAI/LFM2.5-1.2B-Instruct",
            "adapter": "outputs/sft-lfm2",
            "trust_remote": True,
        },
        "Qwen3-1.7B (finetuned)": {
            "base": "Qwen/Qwen3-1.7B",
            "adapter": "outputs/sft",
            "trust_remote": False,
        },
    }

    results = {}

    for name, cfg in models.items():
        print(f"\n{'='*60}")
        print(f"Loading {name}...")
        model, tokenizer, gen_config = load_model(
            cfg["base"], cfg["adapter"], device, cfg["trust_remote"]
        )

        model_results = []
        total_time = 0
        total_tokens = 0
        total_score = 0

        for query in QUERIES:
            output, elapsed, n_tokens = run_inference(model, tokenizer, gen_config, query, device)
            score, details = score_output(output)

            model_results.append({
                "query": query,
                "output": output,
                "time_s": round(elapsed, 3),
                "tokens": n_tokens,
                "score": score,
                "details": details,
            })
            total_time += elapsed
            total_tokens += n_tokens
            total_score += score

            tok_s = n_tokens / elapsed if elapsed > 0 else 0
            print(f"  [{score:2d}] {query[:40]:<40} {elapsed:.2f}s {n_tokens:3d}tok {tok_s:.0f}tok/s")

        avg_time = total_time / len(QUERIES)
        avg_score = total_score / len(QUERIES)
        avg_toks = total_tokens / total_time if total_time > 0 else 0

        results[name] = {
            "queries": model_results,
            "avg_time_s": round(avg_time, 3),
            "avg_score": round(avg_score, 2),
            "avg_tok_s": round(avg_toks, 1),
            "total_score": total_score,
        }

        print(f"\n  Summary: avg_score={avg_score:.2f} avg_time={avg_time:.2f}s avg_tok/s={avg_toks:.0f}")

        # Free GPU memory
        del model
        torch.cuda.empty_cache()

    # Print comparison
    print(f"\n{'='*60}")
    print("COMPARISON")
    print(f"{'='*60}")
    for name, r in results.items():
        print(f"\n{name}:")
        print(f"  Total Score: {r['total_score']} / {len(QUERIES) * 8}")  # max ~8 per query
        print(f"  Avg Score:   {r['avg_score']}")
        print(f"  Avg Time:    {r['avg_time_s']}s")
        print(f"  Throughput:  {r['avg_tok_s']} tok/s")

    # Save full results
    with open("outputs/benchmark_results.json", "w") as f:
        json.dump(results, f, indent=2)
    print("\nFull results saved to outputs/benchmark_results.json")

if __name__ == "__main__":
    main()