Resolve conflicts: combine AST chunking args (filepath, chunkStrategy) with abort signal parameter from #458. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
183 lines
6.3 KiB
Python
183 lines
6.3 KiB
Python
#!/usr/bin/env python3
|
|
"""Benchmark QMD query expansion: LFM2.5 vs Qwen3 finetuned models."""
|
|
|
|
import json
|
|
import time
|
|
import torch
|
|
from pathlib import Path
|
|
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
|
|
from peft import PeftModel
|
|
|
|
# Benchmark prompts: each string is sent to every model as the query to expand.
QUERIES = [
    "kubernetes pod networking",
    "best practices for React server components",
    "how to optimize PostgreSQL queries for large tables",
    "what is retrieval augmented generation",
    "python async await concurrency patterns",
    "nginx reverse proxy load balancing",
    "git rebase vs merge workflow",
    "rust ownership and borrowing explained",
    "docker compose multi-stage builds",
    "elasticsearch full text search performance",
    "shopify liquid template customization",
    "machine learning feature engineering techniques",
    "aws lambda cold start optimization",
    "typescript generics and utility types",
    "redis caching strategies for web apps",
]
|
|
|
|
def load_model(base_name, adapter_dir, device, trust_remote=False):
    """Load a base causal LM, merge its PEFT adapter, and choose generation settings.

    Args:
        base_name: Model identifier passed to ``from_pretrained`` for both the
            tokenizer and the base model.
        adapter_dir: Directory holding the PEFT adapter (loaded with
            ``local_files_only=True``) and, optionally, a
            ``generation_config.json`` file.
        device: Value forwarded as ``device_map`` when loading the base model.
        trust_remote: Forwarded as ``trust_remote_code`` to the tokenizer and
            base-model loaders.

    Returns:
        A ``(model, tokenizer, gen_config)`` tuple, where ``model`` is the
        result of ``merge_and_unload()`` switched to eval mode.
    """
    tokenizer = AutoTokenizer.from_pretrained(
        base_name,
        trust_remote_code=trust_remote,
    )
    base_model = AutoModelForCausalLM.from_pretrained(
        base_name,
        dtype=torch.bfloat16,
        device_map=device,
        trust_remote_code=trust_remote,
    )

    # Attach the adapter, then merge it so generation runs on the merged model.
    merged = PeftModel.from_pretrained(
        base_model, adapter_dir, local_files_only=True
    ).merge_and_unload()
    merged.eval()

    # Prefer the generation settings stored next to the adapter; otherwise
    # fall back to the fixed sampling settings below.
    config_file = Path(adapter_dir) / "generation_config.json"
    if not config_file.exists():
        gen_config = GenerationConfig(
            temperature=0.1,
            top_k=50,
            top_p=0.1,
            repetition_penalty=1.05,
            do_sample=True,
            max_new_tokens=300,
        )
    else:
        gen_config = GenerationConfig.from_pretrained(adapter_dir)

    return merged, tokenizer, gen_config
|
|
|
|
def run_inference(model, tokenizer, gen_config, query, device):
    """Generate an expansion for one query and time the ``generate`` call.

    Args:
        model: Object exposing ``generate(**inputs, generation_config=...,
            max_new_tokens=...)``.
        tokenizer: Tokenizer providing ``apply_chat_template``, ``__call__``
            and ``decode``.
        gen_config: Generation configuration passed to ``model.generate``.
        query: Search query inserted into the user prompt.
        device: Device the tokenized inputs are moved to.

    Returns:
        ``(text, elapsed, new_tokens)``: the decoded continuation (prompt
        tokens stripped, special tokens skipped), the wall-clock seconds spent
        inside ``generate``, and the number of positions generated beyond the
        prompt.
    """
    prompt = tokenizer.apply_chat_template(
        [{"role": "user", "content": f"Expand this search query: {query}"}],
        add_generation_prompt=True,
        tokenize=False,
    )
    encoded = tokenizer(prompt, return_tensors="pt").to(device)
    prompt_len = encoded["input_ids"].shape[-1]

    # Only the generate() call is timed; tokenization and decoding are excluded.
    # max_new_tokens=300 is passed explicitly in addition to gen_config.
    t0 = time.perf_counter()
    with torch.no_grad():
        generated = model.generate(
            **encoded, generation_config=gen_config, max_new_tokens=300
        )
    elapsed = time.perf_counter() - t0

    new_tokens = generated.shape[-1] - prompt_len
    completion = tokenizer.decode(
        generated[0][prompt_len:], skip_special_tokens=True
    )
    return completion, elapsed, new_tokens
|
|
|
|
def score_output(output):
    """Score an expansion by its ``lex:``/``vec:``/``hyde:`` lines.

    Every line starting with ``lex:`` or ``vec:`` adds 1 point and every line
    starting with ``hyde:`` adds 2. The text of the last ``hyde:`` line then
    earns +2 when it is 80-200 characters long, or +1 when it is 50-250
    characters long, and loses 1 point per generic phrase it contains
    (matched case-insensitively).

    Args:
        output: Raw model output; it is split on newlines and each line is
            stripped before the prefix check.

    Returns:
        ``(score, details)`` where ``details`` holds ``has_lex``, ``has_vec``,
        ``has_hyde`` and ``hyde_len``.
    """
    # Points per recognised line prefix; hyde is worth more.
    prefix_points = {"lex:": 1, "vec:": 1, "hyde:": 2}
    seen = dict.fromkeys(prefix_points, False)
    hyde_text = ""
    score = 0

    for raw_line in output.strip().split("\n"):
        entry = raw_line.strip()
        for prefix, points in prefix_points.items():
            if not entry.startswith(prefix):
                continue
            seen[prefix] = True
            score += points
            if prefix == "hyde:":
                # A later hyde line replaces the text of an earlier one.
                hyde_text = entry[len(prefix):].strip()
            break

    # Length bonus: sweet spot is 80-200 chars, with a smaller bonus for
    # 50-250. An empty hyde text has length 0 and matches neither range.
    hyde_len = len(hyde_text)
    if 80 <= hyde_len <= 200:
        score += 2
    elif 50 <= hyde_len <= 250:
        score += 1

    # Penalty for generic/template hyde text: one point per phrase present.
    generic_phrases = (
        "comprehensive guide",
        "everything you need to know",
        "beginners and advanced users",
    )
    lowered = hyde_text.lower()
    score -= sum(1 for phrase in generic_phrases if phrase in lowered)

    details = {
        "has_lex": seen["lex:"],
        "has_vec": seen["vec:"],
        "has_hyde": seen["hyde:"],
        "hyde_len": hyde_len,
    }
    return score, details
|
|
|
|
def main():
    """Benchmark each finetuned model on ``QUERIES`` and report the results.

    For every configured model this loads it on ``cuda:0``, runs each query,
    scores the output, and prints a per-query line plus a per-model summary.
    After all models have run it prints a comparison and writes the full
    results to ``outputs/benchmark_results.json``.
    """
    device = "cuda:0"
    rule = "=" * 60
    results_path = "outputs/benchmark_results.json"
    query_count = len(QUERIES)

    models = {
        "LFM2.5-1.2B (finetuned)": {
            "base": "LiquidAI/LFM2.5-1.2B-Instruct",
            "adapter": "outputs/sft-lfm2",
            "trust_remote": True,
        },
        "Qwen3-1.7B (finetuned)": {
            "base": "Qwen/Qwen3-1.7B",
            "adapter": "outputs/sft",
            "trust_remote": False,
        },
    }

    results = {}

    for name, cfg in models.items():
        print(f"\n{rule}")
        print(f"Loading {name}...")
        model, tokenizer, gen_config = load_model(
            cfg["base"], cfg["adapter"], device, cfg["trust_remote"]
        )

        per_query = []
        time_sum = 0
        token_sum = 0
        score_sum = 0

        for query in QUERIES:
            output, elapsed, n_tokens = run_inference(
                model, tokenizer, gen_config, query, device
            )
            score, details = score_output(output)

            # Stored time is rounded; the running total uses the raw value.
            per_query.append(
                {
                    "query": query,
                    "output": output,
                    "time_s": round(elapsed, 3),
                    "tokens": n_tokens,
                    "score": score,
                    "details": details,
                }
            )
            time_sum += elapsed
            token_sum += n_tokens
            score_sum += score

            rate = n_tokens / elapsed if elapsed > 0 else 0
            print(f" [{score:2d}] {query[:40]:<40} {elapsed:.2f}s {n_tokens:3d}tok {rate:.0f}tok/s")

        mean_time = time_sum / query_count
        mean_score = score_sum / query_count
        # Throughput is total tokens over total time, not a mean of per-query rates.
        throughput = token_sum / time_sum if time_sum > 0 else 0

        results[name] = {
            "queries": per_query,
            "avg_time_s": round(mean_time, 3),
            "avg_score": round(mean_score, 2),
            "avg_tok_s": round(throughput, 1),
            "total_score": score_sum,
        }

        print(f"\n Summary: avg_score={mean_score:.2f} avg_time={mean_time:.2f}s avg_tok/s={throughput:.0f}")

        # Free GPU memory before the next model is loaded.
        del model
        torch.cuda.empty_cache()

    # Print comparison
    print(f"\n{rule}")
    print("COMPARISON")
    print(f"{rule}")
    # 8 points per query is the rough ceiling used for the displayed denominator.
    score_ceiling = query_count * 8
    for name, summary in results.items():
        print(f"\n{name}:")
        print(f" Total Score: {summary['total_score']} / {score_ceiling}")
        print(f" Avg Score: {summary['avg_score']}")
        print(f" Avg Time: {summary['avg_time_s']}s")
        print(f" Throughput: {summary['avg_tok_s']} tok/s")

    # Save full results
    with open(results_path, "w") as handle:
        json.dump(results, handle, indent=2)
    print(f"\nFull results saved to {results_path}")
|
|
|
|
# Run the benchmark only when this file is executed as a script.
if __name__ == "__main__":
    main()
|