Refactor evals into separate run and score scripts

New structure:
- evals/run.py: Generate model outputs to JSONL
- evals/score.py: Score outputs with detailed breakdown
- evals/queries.txt: Test queries (26 total)

Features:
- Supports both HF Hub and local model paths
- Named entity preservation scoring
- Chat template leakage detection
- Strict format validation (every line must be lex:/vec:/hyde:)
- Generic phrase detection

Usage:
  uv run evals/run.py --model tobil/qmd-query-expansion-0.6B-v4
  uv run evals/score.py evals/results_*.jsonl

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-25 00:40:33 -05:00 · 2026-01-25 00:40:33 -05:00 · f6a6716c44
commit f6a6716c44
parent 857a85ab58
8 changed files with 688 additions and 1794 deletions
--- a/finetune/README.md
+++ b/finetune/README.md
@ -80,11 +80,14 @@ See `PROMPT_FORMAT.md` for complete specification.
 finetune/
 ├── train.py              # SFT training (uses YAML config)
 ├── rl.py                 # GRPO/RL training (uses YAML config)
-├── evaluate_model.py     # Evaluate finetuned models
 ├── tui.py                # Interactive testing interface
 ├── configs/
 │   ├── sft_v4.yaml       # SFT training config
 │   └── grpo_v4.yaml      # GRPO training config
+├── evals/
+│   ├── run.py            # Generate model outputs to JSONL
+│   ├── score.py          # Score outputs from JSONL
+│   └── queries.txt       # Test queries
 ├── dataset/
 │   ├── prepare_data.py   # Prepare training data
 │   ├── clean_data.py     # Data quality improvements
@ -118,7 +121,11 @@ hf jobs uv run --flavor a10g-large --timeout 2h --secrets HF_TOKEN \
 ### 3. Evaluate

 ```bash
-uv run evaluate_model.py --model tobil/qmd-query-expansion-0.6B-v4
+# Generate outputs
+uv run evals/run.py --model tobil/qmd-query-expansion-0.6B-v4
+
+# Score them
+uv run evals/score.py evals/results_tobil_qmd-query-expansion-0_6B-v4.jsonl
 ```

 ### 4. Interactive Testing
--- a/finetune/evals/.gitignore
+++ b/finetune/evals/.gitignore
@ -0,0 +1,3 @@
+# Generated results (re-run evals locally)
+results_*.jsonl
+scores_*.json
--- a/finetune/evals/queries.txt
+++ b/finetune/evals/queries.txt
@ -0,0 +1,42 @@
+# Test queries for QMD query expansion evaluation
+# One query per line, comments start with #
+
+# Technical documentation
+how to configure authentication
+typescript async await
+docker compose networking
+git rebase vs merge
+react useEffect cleanup
+
+# Short/ambiguous queries
+auth
+config
+setup
+api
+
+# Named entities (critical for entity preservation testing)
+who is TDS motorsports
+React hooks tutorial
+Docker container networking
+Kubernetes pod deployment
+AWS Lambda functions
+
+# Personal notes / journals style
+meeting notes project kickoff
+ideas for new feature
+todo list app architecture
+
+# Research / learning
+what is dependency injection
+difference between sql and nosql
+kubernetes vs docker swarm
+
+# Error/debugging
+connection timeout error
+memory leak debugging
+cors error fix
+
+# Complex queries
+how to implement caching with redis in nodejs
+best practices for api rate limiting
+setting up ci cd pipeline with github actions
--- a/finetune/evals/run.py
+++ b/finetune/evals/run.py
@ -0,0 +1,195 @@
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "transformers>=4.45.0",
+#     "peft>=0.7.0",
+#     "torch",
+#     "huggingface_hub",
+#     "accelerate",
+# ]
+# ///
+"""
+Generate query expansions from a model and save to JSONL.
+
+Usage:
+    uv run evals/run.py --model tobil/qmd-query-expansion-0.6B-v4
+    uv run evals/run.py --model ./local-model --queries evals/queries.txt
+    uv run evals/run.py --model tobil/qmd-query-expansion-0.6B-v4 --output results.jsonl
+"""
+
+import argparse
+import json
+import sys
+from datetime import datetime
+from pathlib import Path
+
+import torch
+from peft import PeftModel
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+
+def load_queries(path: str) -> list[str]:
+    """Load queries from a text file, one per line.
+
+    Lines are stripped of surrounding whitespace; blank lines and lines
+    starting with ``#`` are skipped.
+
+    Args:
+        path: Path to the queries file.
+
+    Returns:
+        The remaining lines, in file order.
+    """
+    # Read as UTF-8 explicitly so the result does not depend on the
+    # platform's default locale encoding.
+    with open(path, encoding="utf-8") as f:
+        stripped = (line.strip() for line in f)
+        return [line for line in stripped if line and not line.startswith("#")]
+
+
+def load_model(model_path: str, base_model: str = None):
+    """
+    Load a model and its tokenizer (Hub or local; PEFT adapter or full model).
+
+    A path is treated as a PEFT adapter when an ``adapter_config.json`` can be
+    found for it, either in a local directory or in the Hub repo. Anything
+    else is loaded directly as a full (merged) causal LM.
+
+    Args:
+        model_path: HF Hub ID (tobil/model), local adapter dir, or merged model dir
+        base_model: Model the tokenizer is loaded from and, for adapters, the
+            model the adapter is applied to. When not provided, the adapter
+            config's ``base_model_name_or_path`` is used if available,
+            otherwise "Qwen/Qwen3-0.6B".
+
+    Returns:
+        ``(model, tokenizer)``; the model has been put in eval mode.
+    """
+    # Imported here because the file-level import only brings in PeftModel.
+    from peft import PeftConfig
+
+    model_path = str(model_path)
+
+    # Detect adapters from adapter_config.json itself rather than guessing
+    # from the shape of the path or from whether a base model was passed:
+    # local paths such as "./local-model" contain a slash just like Hub IDs,
+    # and callers may always pass a base model for the tokenizer.
+    # PeftConfig.from_pretrained looks in a local directory first, then on the
+    # Hub, and raises ValueError when no adapter config can be found.
+    try:
+        adapter_config = PeftConfig.from_pretrained(model_path)
+    except ValueError:
+        adapter_config = None
+    is_adapter = adapter_config is not None
+
+    # Default base model: prefer what the adapter was trained on.
+    if not base_model:
+        detected = adapter_config.base_model_name_or_path if is_adapter else None
+        base_model = detected or "Qwen/Qwen3-0.6B"
+
+    print(f"Loading tokenizer from {base_model}...", file=sys.stderr)
+    tokenizer = AutoTokenizer.from_pretrained(base_model)
+    if tokenizer.pad_token is None:
+        # generate() needs a pad token; fall back to EOS when none is defined.
+        tokenizer.pad_token = tokenizer.eos_token
+
+    if is_adapter:
+        print(f"Loading base model {base_model}...", file=sys.stderr)
+        base = AutoModelForCausalLM.from_pretrained(
+            base_model,
+            torch_dtype=torch.bfloat16,
+            device_map="auto",
+        )
+        print(f"Loading adapter from {model_path}...", file=sys.stderr)
+        model = PeftModel.from_pretrained(base, model_path)
+    else:
+        print(f"Loading model from {model_path}...", file=sys.stderr)
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            torch_dtype=torch.bfloat16,
+            device_map="auto",
+        )
+
+    model.eval()
+    return model, tokenizer
+
+
+def generate_expansion(model, tokenizer, query: str, max_new_tokens: int = 200) -> str:
+    """Generate query expansion using Qwen3 chat template with /no_think.
+
+    Args:
+        model: Causal LM to sample from (as returned by ``load_model``).
+        tokenizer: Tokenizer providing the chat template.
+        query: The search query to expand.
+        max_new_tokens: Upper bound on the number of generated tokens.
+
+    Returns:
+        The model's response text only (prompt excluded), stripped, with a
+        leading ``<think>...</think>`` block removed if present.
+    """
+    messages = [{"role": "user", "content": f"/no_think Expand this search query: {query}"}]
+    prompt = tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True
+    )
+
+    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+
+    with torch.no_grad():
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=max_new_tokens,
+            temperature=0.7,
+            do_sample=True,
+            pad_token_id=tokenizer.pad_token_id,
+            eos_token_id=tokenizer.eos_token_id,
+        )
+
+    # The returned sequence starts with the prompt tokens. Drop them by token
+    # count instead of searching the decoded text for "assistant\n": the
+    # decoded text has special tokens stripped, so slicing it by the length
+    # of the prompt string cuts at the wrong place, and a textual split also
+    # hides any template text the model itself produced.
+    prompt_len = inputs["input_ids"].shape[1]
+    expansion = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()
+
+    # Remove a leading <think>...</think> block
+    if expansion.startswith("<think>"):
+        close_tag = "</think>"
+        think_end = expansion.find(close_tag)
+        if think_end != -1:
+            expansion = expansion[think_end + len(close_tag):].strip()
+
+    return expansion
+
+
+def main():
+    """CLI entry point: expand every query with the model and write JSONL.
+
+    The output file starts with one metadata line (marked ``"_meta": true``)
+    followed by one ``{"query", "expansion"}`` object per query. Progress
+    goes to stderr; the output path is printed to stdout.
+    """
+    cli = argparse.ArgumentParser(description="Generate query expansions")
+    cli.add_argument("--model", required=True, help="Model path (Hub or local)")
+    cli.add_argument("--base-model", default="Qwen/Qwen3-0.6B", help="Base model for tokenizer")
+    cli.add_argument("--queries", default="evals/queries.txt", help="Queries file")
+    cli.add_argument("--output", help="Output JSONL file (default: evals/results_{model_name}.jsonl)")
+    cli.add_argument("--max-tokens", type=int, default=200, help="Max tokens to generate")
+    args = cli.parse_args()
+
+    # Without --output, derive a file name from the model ID
+    # ("/" and "." both become "_").
+    if args.output:
+        output_path = Path(args.output)
+    else:
+        safe_name = args.model.replace("/", "_").replace(".", "_")
+        output_path = Path(f"evals/results_{safe_name}.jsonl")
+
+    queries = load_queries(args.queries)
+    total = len(queries)
+    print(f"Loaded {total} queries from {args.queries}", file=sys.stderr)
+
+    model, tokenizer = load_model(args.model, args.base_model)
+
+    print("Generating expansions...", file=sys.stderr)
+    records = []
+    for position, query in enumerate(queries, 1):
+        print(f"[{position}/{total}] {query}", file=sys.stderr)
+        records.append({
+            "query": query,
+            "expansion": generate_expansion(model, tokenizer, query, args.max_tokens),
+        })
+
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    with open(output_path, "w") as f:
+        # First line describes the run; score.py recognises it by "_meta".
+        header = {
+            "_meta": True,
+            "model": args.model,
+            "base_model": args.base_model,
+            "timestamp": datetime.now().isoformat(),
+            "num_queries": total,
+        }
+        f.writelines(json.dumps(row) + "\n" for row in [header, *records])
+
+    print(f"Results saved to: {output_path}", file=sys.stderr)
+    print(str(output_path))  # Print path to stdout for piping
+
+
+if __name__ == "__main__":
+    main()
--- a/finetune/evals/score.py
+++ b/finetune/evals/score.py
@ -0,0 +1,439 @@
+# /// script
+# requires-python = ">=3.10"
+# dependencies = []
+# ///
+"""
+Score query expansion results from a JSONL file.
+
+Usage:
+    uv run evals/score.py evals/results_model.jsonl
+    uv run evals/score.py evals/results_model.jsonl --output scores.json
+    uv run evals/score.py evals/results_model.jsonl --verbose
+"""
+
+import argparse
+import json
+import re
+import sys
+from collections import Counter
+from pathlib import Path
+
+# ============== SCORING CONSTANTS ==============
+# Common function words that word_repetition_penalty() does not penalise
+# when they repeat.
+STOPWORDS = {'the', 'a', 'an', 'is', 'are', 'to', 'for', 'of', 'in', 'and', 'or', 'it', 'this', 'that', 'be', 'with', 'as', 'on', 'by'}
+# Question and filler words removed from a query by get_key_terms(); also
+# never treated as named entities by extract_named_entities().
+KEY_TERM_STOPWORDS = {'what', 'is', 'how', 'to', 'the', 'a', 'an', 'in', 'on', 'for', 'of',
+                      'and', 'or', 'with', 'my', 'your', 'do', 'does', 'can', 'i', 'me', 'we',
+                      'who', 'where', 'when', 'why', 'which', 'find', 'get', 'show', 'tell'}
+# Filler phrases used by lex_is_generic() to flag lex lines that carry
+# almost no content beyond the phrase itself.
+GENERIC_LEX_PHRASES = {
+    'find information about', 'search for', 'look up', 'get information',
+    'learn about', 'information on', 'details about', 'find out about',
+    'what is', 'how to', 'guide to', 'help with'
+}
+
+
+# ============== HELPER FUNCTIONS ==============
+def extract_named_entities(query: str) -> set:
+    """Heuristically pick out named entities from a query.
+
+    A whitespace-separated word (with surrounding punctuation stripped) is
+    counted as an entity when any of these holds:
+      - it is all upper-case and at least 2 characters (TDS, API, GPU);
+      - it is capitalized, is not the first word, and is not a key-term stopword;
+      - it contains one of ``. + - # @`` and is at least 2 characters (node.js, C++);
+      - it starts upper-case and has another upper-case letter (JavaScript);
+      - it directly follows an entity and is not a key-term stopword.
+
+    Returns:
+        The lowercased entity words.
+    """
+    found = set()
+    follows_entity = False
+
+    for index, token in enumerate(query.split()):
+        bare = token.strip('.,!?:;()[]"\'')
+        if not bare:
+            # Pure punctuation breaks a compound name.
+            follows_entity = False
+            continue
+
+        lowered = bare.lower()
+        is_stopword = lowered in KEY_TERM_STOPWORDS
+
+        acronym = bare.isupper() and len(bare) >= 2
+        capitalized = index > 0 and bare[0].isupper() and not is_stopword
+        technical = any(ch in bare for ch in '.+-#@') and len(bare) >= 2
+        camel_case = len(bare) > 1 and any(ch.isupper() for ch in bare[1:]) and bare[0].isupper()
+        continues_name = follows_entity and not is_stopword
+
+        follows_entity = bool(acronym or capitalized or technical or camel_case or continues_name)
+        if follows_entity:
+            found.add(lowered)
+
+    return found
+
+
+def get_key_terms(query: str) -> set:
+    """Return the lowercased query words that are not key-term stopwords."""
+    return {word for word in query.lower().split() if word not in KEY_TERM_STOPWORDS}
+
+
+def lex_preserves_key_terms(lex_line: str, query: str) -> bool:
+    """Return True when the lex line shares at least one key term with the query.
+
+    A query without any key terms is trivially preserved.
+    """
+    wanted = get_key_terms(query)
+    if wanted:
+        return not wanted.isdisjoint(lex_line.lower().split())
+    return True
+
+
+def lex_preserves_entities(lex_line: str, entities: set) -> bool:
+    """Return True when the line mentions at least one entity.
+
+    Matching is a case-insensitive substring test against the lowercased
+    line; an empty entity set always passes.
+    """
+    if entities:
+        haystack = lex_line.lower()
+        for entity in entities:
+            if entity in haystack:
+                return True
+        return False
+    return True
+
+
+def lex_is_generic(lex_line: str) -> bool:
+    """Return True when the lex line is little more than a generic filler phrase.
+
+    For each phrase that occurs in the line, or whose first word the line
+    starts with, the phrase's words are removed (first occurrence each); the
+    line is generic if fewer than 3 characters remain.
+    """
+    candidate = lex_line.lower().strip()
+    for phrase in GENERIC_LEX_PHRASES:
+        phrase_words = phrase.split()
+        if phrase not in candidate and not candidate.startswith(phrase_words[0]):
+            continue
+        leftover = candidate
+        for word in phrase_words:
+            leftover = leftover.replace(word, '', 1).strip()
+        if len(leftover) < 3:
+            return True
+    return False
+
+
+def parse_expansion(text: str) -> dict:
+    """Split an expansion into its lex/vec/hyde lines.
+
+    Returns:
+        A dict with keys "lex", "vec", "hyde" (line contents with the prefix
+        removed and stripped) and "invalid" (non-blank lines with no known
+        prefix, stripped).
+    """
+    buckets = {"lex": [], "vec": [], "hyde": [], "invalid": []}
+    for raw in text.strip().split("\n"):
+        entry = raw.strip()
+        if not entry:
+            continue
+        for prefix in ("lex:", "vec:", "hyde:"):
+            if entry.startswith(prefix):
+                buckets[prefix[:-1]].append(entry[len(prefix):].strip())
+                break
+        else:
+            buckets["invalid"].append(entry)
+    return buckets
+
+
+def edit_distance_simple(a: str, b: str) -> int:
+    """Count the lowercased words that appear in exactly one of the two strings."""
+    left = set(a.lower().split())
+    right = set(b.lower().split())
+    return len(left.symmetric_difference(right))
+
+
+def is_diverse(a: str, b: str, min_distance: int = 2) -> bool:
+    """Return True when two strings differ enough to count as distinct.
+
+    Strings that are equal, or where one contains the other (after
+    lowercasing and stripping), are never diverse. Otherwise at least
+    ``min_distance`` words must appear in only one of the two.
+    """
+    first = a.lower().strip()
+    second = b.lower().strip()
+    if first == second or first in second or second in first:
+        return False
+    # Same word-set distance as edit_distance_simple on the normalised strings.
+    differing_words = set(first.split()) ^ set(second.split())
+    return len(differing_words) >= min_distance
+
+
+def echoes_query(expansion: str, query: str) -> bool:
+    """Return True when the expansion merely repeats the query.
+
+    That is: it equals the query (case-insensitively, stripped), or contains
+    it while being fewer than 10 characters longer.
+    """
+    candidate = expansion.lower().strip()
+    original = query.lower().strip()
+    barely_longer = original in candidate and len(candidate) < len(original) + 10
+    return candidate == original or barely_longer
+
+
+def word_repetition_penalty(text: str) -> int:
+    """Return the penalty for over-used words in ``text``.
+
+    Every word longer than 2 characters that is not in STOPWORDS and occurs
+    3 or more times costs 2 points per occurrence beyond the second.
+    """
+    tally = Counter(re.findall(r'\b\w+\b', text.lower()))
+    return sum(
+        (occurrences - 2) * 2
+        for term, occurrences in tally.items()
+        if occurrences >= 3 and term not in STOPWORDS and len(term) > 2
+    )
+
+
+# ============== MAIN SCORING FUNCTION ==============
+def score_expansion(query: str, expansion: str) -> dict:
+    """Score an expansion. Returns detailed breakdown.
+
+    Args:
+        query: The original search query.
+        expansion: The model output, expected to consist only of
+            ``lex:``/``vec:``/``hyde:`` lines.
+
+    Returns:
+        A dict with the per-category scores ("format", "diversity", "hyde",
+        "quality", "entity"), "total", "max_possible", "percentage",
+        "rating", the list of "deductions", the "parsed" expansion and the
+        "entities_detected" in the query. Chat-template leakage or any line
+        without a valid prefix yields an all-zero "Failed" result.
+    """
+    text = expansion.strip()
+    deductions = []
+
+    # HARD FAIL: Chat template artifacts
+    if any(token in text for token in ['<|im_start|>', '<|im_end|>', '<think>', '</think>',
+                                        '\nassistant\n', '\nuser\n', '<|endoftext|>']):
+        return {
+            "format": 0, "diversity": 0, "hyde": 0, "quality": 0, "entity": 0,
+            "total": 0, "max_possible": 100, "percentage": 0, "rating": "Failed",
+            "deductions": ["CHAT TEMPLATE LEAKAGE"],
+            "parsed": {"lex": [], "vec": [], "hyde": [], "invalid": [text[:100]]},
+            "entities_detected": [],
+        }
+
+    # HARD FAIL: Every line must start with lex:, vec:, or hyde:
+    # (only the first offending line is reported)
+    for line in text.split("\n"):
+        line = line.strip()
+        if not line:
+            continue
+        if not line.startswith(("lex:", "vec:", "hyde:")):
+            return {
+                "format": 0, "diversity": 0, "hyde": 0, "quality": 0, "entity": 0,
+                "total": 0, "max_possible": 100, "percentage": 0, "rating": "Failed",
+                "deductions": [f"INVALID LINE: {line[:50]}"],
+                "parsed": parse_expansion(expansion),
+                "entities_detected": [],
+            }
+
+    parsed = parse_expansion(expansion)
+
+    # FORMAT (0-30)
+    # 10 for any lex line, 10 for any vec line, 10 for having no invalid lines.
+    format_score = 0
+    if parsed["lex"]:
+        format_score += 10
+    else:
+        deductions.append("missing lex:")
+    if parsed["vec"]:
+        format_score += 10
+    else:
+        deductions.append("missing vec:")
+    format_score += 10  # No invalid lines (guaranteed by hard fail)
+
+    # DIVERSITY (0-30)
+    # 10 for having both lex and vec, 5 for at least two expansions, and up
+    # to 5 each for lex diversity, vec diversity and not echoing the query.
+    diversity_score = 0
+    types_present = sum(1 for t in ["lex", "vec"] if parsed[t])
+    if types_present >= 2:
+        diversity_score += 10
+    else:
+        deductions.append("only one type")
+
+    total_expansions = len(parsed["lex"]) + len(parsed["vec"])
+    if total_expansions >= 2:
+        diversity_score += 5
+
+    # Each near-duplicate pair of lex lines costs 2, floored at 0.
+    lex_score = 5
+    for i, a in enumerate(parsed["lex"]):
+        for b in parsed["lex"][i+1:]:
+            if not is_diverse(a, b, 2):
+                lex_score -= 2
+                deductions.append(f"lex duplicate: {a[:20]}...")
+    diversity_score += max(0, lex_score)
+
+    # Same for vec lines, with a stricter minimum distance of 3.
+    vec_score = 5
+    for i, a in enumerate(parsed["vec"]):
+        for b in parsed["vec"][i+1:]:
+            if not is_diverse(a, b, 3):
+                vec_score -= 2
+                deductions.append(f"vec duplicate: {a[:20]}...")
+    diversity_score += max(0, vec_score)
+
+    # Each lex/vec line that just echoes the query costs 3, floored at 0.
+    echo_score = 5
+    for exp in parsed["lex"] + parsed["vec"]:
+        if echoes_query(exp, query):
+            echo_score -= 3
+            deductions.append(f"echoes query: {exp[:20]}...")
+    diversity_score += max(0, echo_score)
+
+    # HYDE (0-20)
+    # Only the first hyde line is scored; without one this category stays 0.
+    hyde_score = 0
+    if parsed["hyde"]:
+        hyde_text = parsed["hyde"][0]
+        hyde_score += 5
+        hyde_len = len(hyde_text)
+        if 50 <= hyde_len <= 200:
+            hyde_score += 5
+        elif hyde_len < 50:
+            hyde_score += 2
+            deductions.append(f"hyde too short ({hyde_len})")
+        else:
+            deductions.append(f"hyde too long ({hyde_len})")
+        # NOTE(review): parse_expansion splits on newlines, so hyde_text can
+        # never contain "\n" and these 5 points are always awarded.
+        if "\n" not in hyde_text:
+            hyde_score += 5
+        rep_penalty = word_repetition_penalty(hyde_text)
+        hyde_score += max(0, 5 - rep_penalty)
+
+    # QUALITY (0-20)
+    # Starts at 5; +5 when lex lines are on average no longer than vec lines,
+    # up to +5 for natural-looking vec lines, up to +5 for lex key terms.
+    quality_score = 5
+    if parsed["lex"] and parsed["vec"]:
+        avg_lex = sum(len(l) for l in parsed["lex"]) / len(parsed["lex"])
+        avg_vec = sum(len(v) for v in parsed["vec"]) / len(parsed["vec"])
+        if avg_lex <= avg_vec:
+            quality_score += 5
+        else:
+            deductions.append("lex longer than vec")
+    if parsed["vec"]:
+        # "Natural" here means multi-word and longer than 15 characters.
+        natural = sum(1 for v in parsed["vec"] if " " in v and len(v) > 15)
+        if natural == len(parsed["vec"]):
+            quality_score += 5
+        else:
+            quality_score += 2
+    if parsed["lex"]:
+        lex_with_terms = sum(1 for l in parsed["lex"] if lex_preserves_key_terms(l, query))
+        if lex_with_terms == len(parsed["lex"]):
+            quality_score += 5
+        elif lex_with_terms > 0:
+            quality_score += 2
+        else:
+            deductions.append("lex missing key terms")
+
+    # NAMED ENTITY PRESERVATION (0-20, can go negative)
+    # Scored only when the query has entities and there is at least one lex
+    # line; a query without entities gets a flat 10.
+    entity_score = 0
+    entities = extract_named_entities(query)
+    if entities and parsed["lex"]:
+        lex_with_entities = sum(1 for l in parsed["lex"] if lex_preserves_entities(l, entities))
+        if lex_with_entities == len(parsed["lex"]):
+            entity_score += 15
+        elif lex_with_entities > 0:
+            entity_score += 5
+        else:
+            entity_score -= 30
+            deductions.append(f"lex missing entities: {entities}")
+
+        generic_count = sum(1 for l in parsed["lex"] if lex_is_generic(l))
+        if generic_count > 0:
+            entity_score -= generic_count * 15
+            deductions.append(f"{generic_count} generic lex phrases")
+
+        if parsed["vec"]:
+            vec_with_entities = sum(1 for v in parsed["vec"] if lex_preserves_entities(v, entities))
+            if vec_with_entities > 0:
+                entity_score += 5
+    elif not entities:
+        entity_score = 10
+
+    # TOTAL
+    # The unclamped entity score is used here, so entity penalties can pull
+    # the total down; hyde points only count towards the maximum when a hyde
+    # line is present.
+    total = format_score + diversity_score + hyde_score + quality_score + entity_score
+    max_possible = 120 if parsed["hyde"] else 100
+    percentage = max(0.0, min(100.0, total / max_possible * 100))
+
+    # Rating
+    if percentage >= 80:
+        rating = "Excellent"
+    elif percentage >= 60:
+        rating = "Good"
+    elif percentage >= 40:
+        rating = "Acceptable"
+    elif percentage >= 20:
+        rating = "Poor"
+    else:
+        rating = "Failed"
+
+    # Reported entity and total values are floored at 0.
+    return {
+        "format": format_score,
+        "diversity": diversity_score,
+        "hyde": hyde_score,
+        "quality": quality_score,
+        "entity": max(0, entity_score),
+        "total": max(0, total),
+        "max_possible": max_possible,
+        "percentage": round(percentage, 1),
+        "rating": rating,
+        "deductions": deductions,
+        "parsed": parsed,
+        "entities_detected": list(entities) if entities else [],
+    }
+
+
+def print_result(query: str, expansion: str, scores: dict, verbose: bool = False):
+    """Print one scored result to stdout.
+
+    Shows the query, the raw expansion and the score breakdown; with
+    ``verbose`` also up to five deductions and the detected entities.
+    """
+    heavy_rule = "=" * 60
+    light_rule = "─" * 60
+
+    print(f"\n{heavy_rule}")
+    print(f"Query: {query}")
+    print(light_rule)
+    print(expansion)
+    print(light_rule)
+    print(f"Score: {scores['percentage']:.0f}% ({scores['rating']})")
+    breakdown = (
+        f"  Format: {scores['format']}/30  Diversity: {scores['diversity']}/30  "
+        f"Hyde: {scores['hyde']}/20  Quality: {scores['quality']}/20  Entity: {scores['entity']}/20"
+    )
+    print(breakdown)
+
+    if not verbose:
+        return
+    if scores["deductions"]:
+        print(f"  Deductions: {', '.join(scores['deductions'][:5])}")
+    if scores["entities_detected"]:
+        print(f"  Entities: {scores['entities_detected']}")
+
+
+def main():
+    """CLI entry point: score every result in a JSONL file produced by run.py.
+
+    Prints each scored result (unless --summary-only) and a summary to
+    stdout, and optionally writes all scores to a JSON file. Exits with an
+    error message when the input contains no results to score.
+    """
+    parser = argparse.ArgumentParser(description="Score query expansion results")
+    parser.add_argument("input", help="Input JSONL file from run.py")
+    parser.add_argument("--output", help="Output JSON file with scores")
+    parser.add_argument("--verbose", "-v", action="store_true", help="Show detailed output")
+    parser.add_argument("--summary-only", action="store_true", help="Only show summary")
+    args = parser.parse_args()
+
+    # Load results
+    results = []
+    metadata = None
+    with open(args.input, encoding="utf-8") as f:
+        for line in f:
+            # Tolerate blank lines (e.g. a trailing newline added by an editor).
+            if not line.strip():
+                continue
+            data = json.loads(line)
+            if data.get("_meta"):
+                metadata = data
+            else:
+                results.append(data)
+
+    print(f"Scoring {len(results)} results from {args.input}", file=sys.stderr)
+    if metadata:
+        print(f"Model: {metadata.get('model', 'unknown')}", file=sys.stderr)
+
+    # Without results the average below would divide by zero.
+    if not results:
+        sys.exit(f"No results to score in {args.input}")
+
+    # Score each result
+    scored_results = []
+    for result in results:
+        query = result["query"]
+        expansion = result["expansion"]
+        scores = score_expansion(query, expansion)
+
+        if not args.summary_only:
+            print_result(query, expansion, scores, args.verbose)
+
+        scored_results.append({
+            "query": query,
+            "expansion": expansion,
+            "scores": {k: v for k, v in scores.items() if k not in ["parsed", "deductions", "entities_detected"]},
+            "deductions": scores["deductions"],
+            "entities_detected": scores["entities_detected"],
+        })
+
+    # Summary
+    print(f"\n{'='*60}")
+    print("SUMMARY")
+    print(f"{'='*60}")
+
+    avg_score = sum(r["scores"]["percentage"] for r in scored_results) / len(scored_results)
+    ratings = Counter(r["scores"]["rating"] for r in scored_results)
+
+    print(f"  Total queries: {len(scored_results)}")
+    print(f"  Average score: {avg_score:.1f}%")
+    print("  Ratings:")
+    for rating in ["Excellent", "Good", "Acceptable", "Poor", "Failed"]:
+        count = ratings.get(rating, 0)
+        bar = "█" * count
+        print(f"    {rating:10s}: {count:2d} {bar}")
+
+    # Save output
+    if args.output:
+        output_data = {
+            "metadata": metadata,
+            "summary": {
+                "total": len(scored_results),
+                "average_score": round(avg_score, 1),
+                "ratings": dict(ratings),
+            },
+            "results": scored_results,
+        }
+        with open(args.output, "w", encoding="utf-8") as f:
+            json.dump(output_data, f, indent=2)
+        print(f"\nScores saved to: {args.output}")
+
+
+if __name__ == "__main__":
+    main()
--- a/finetune/evaluate_model.py
+++ b/finetune/evaluate_model.py
@ -1,580 +0,0 @@
-# /// script
-# requires-python = ">=3.10"
-# dependencies = [
-#     "transformers>=4.45.0",
-#     "peft>=0.7.0",
-#     "torch",
-#     "huggingface_hub",
-#     "accelerate",
-# ]
-# ///
-"""
-Evaluate QMD query expansion model quality.
-
-See SCORING.md for detailed scoring criteria.
-"""
-
-import json
-import re
-import torch
-from collections import Counter
-from transformers import AutoModelForCausalLM, AutoTokenizer
-from peft import PeftModel
-
-# Test queries covering different QMD use cases
-TEST_QUERIES = [
-    # Technical documentation
-    "how to configure authentication",
-    "typescript async await",
-    "docker compose networking",
-    "git rebase vs merge",
-    "react useEffect cleanup",
-    # Short/ambiguous queries
-    "auth",
-    "config",
-    "setup",
-    "api",
-    # Personal notes / journals style
-    "meeting notes project kickoff",
-    "ideas for new feature",
-    "todo list app architecture",
-    # Research / learning
-    "what is dependency injection",
-    "difference between sql and nosql",
-    "kubernetes vs docker swarm",
-    # Error/debugging
-    "connection timeout error",
-    "memory leak debugging",
-    "cors error fix",
-    # Complex queries
-    "how to implement caching with redis in nodejs",
-    "best practices for api rate limiting",
-    "setting up ci cd pipeline with github actions",
-]
-
-# Prompt is generated via tokenizer.apply_chat_template() - see generate_expansion()
-# Don't manually construct <|im_start|> tags
-
-STOPWORDS = {'the', 'a', 'an', 'is', 'are', 'to', 'for', 'of', 'in', 'and', 'or', 'it', 'this', 'that', 'be', 'with', 'as', 'on', 'by'}
-
-
-def parse_expansion(text: str) -> dict:
-    """Parse expansion into structured format."""
-    lines = text.strip().split("\n")
-    result = {"lex": [], "vec": [], "hyde": [], "invalid": []}
-
-    for line in lines:
-        line = line.strip()
-        if not line:
-            continue
-        if line.startswith("lex:"):
-            result["lex"].append(line[4:].strip())
-        elif line.startswith("vec:"):
-            result["vec"].append(line[4:].strip())
-        elif line.startswith("hyde:"):
-            result["hyde"].append(line[5:].strip())
-        else:
-            result["invalid"].append(line)
-
-    return result
-
-
-def edit_distance_simple(a: str, b: str) -> int:
-    """Simple word-level edit distance."""
-    words_a = set(a.lower().split())
-    words_b = set(b.lower().split())
-    return len(words_a ^ words_b)  # Symmetric difference
-
-
-def is_diverse(a: str, b: str, min_distance: int = 2) -> bool:
-    """Check if two strings are sufficiently different."""
-    a, b = a.lower().strip(), b.lower().strip()
-    if a == b:
-        return False
-    if a in b or b in a:
-        return False
-    return edit_distance_simple(a, b) >= min_distance
-
-
-def echoes_query(expansion: str, query: str) -> bool:
-    """Check if expansion is just echoing the query."""
-    exp = expansion.lower().strip()
-    q = query.lower().strip()
-    # Exact match or very close
-    if exp == q:
-        return True
-    # Query is contained in expansion with little else
-    if q in exp and len(exp) < len(q) + 10:
-        return True
-    return False
-
-
-def get_key_terms(query: str) -> set:
-    """Extract key terms from query (excluding stopwords)."""
-    stopwords = {'what', 'is', 'how', 'to', 'the', 'a', 'an', 'in', 'on', 'for', 'of',
-                 'and', 'or', 'with', 'my', 'your', 'do', 'does', 'can', 'i', 'me', 'we'}
-    words = set(query.lower().split())
-    return words - stopwords
-
-
-def lex_preserves_key_terms(lex_line: str, query: str) -> bool:
-    """Check if lex line contains at least one key term from query."""
-    key_terms = get_key_terms(query)
-    if not key_terms:  # Very short query
-        return True
-    lex_words = set(lex_line.lower().split())
-    return bool(key_terms & lex_words)
-
-
-def word_repetition_penalty(text: str) -> int:
-    """Count penalty for repeated words (excluding stopwords)."""
-    words = re.findall(r'\b\w+\b', text.lower())
-    counts = Counter(words)
-    penalty = 0
-    for word, count in counts.items():
-        if count >= 3 and word not in STOPWORDS and len(word) > 2:
-            penalty += (count - 2) * 2
-    return penalty
-
-
-def is_continuation(expansion: str) -> bool:
-    """
-    Detect if output is a continuation rather than proper expansion.
-
-    A continuation is when the model continues the query as prose
-    instead of outputting lex:/vec:/hyde: lines.
-    """
-    text = expansion.strip()
-    if not text:
-        return True
-
-    # Check first non-empty line
-    first_line = text.split("\n")[0].strip()
-
-    # Valid outputs must start with a prefix
-    valid_prefixes = ("lex:", "vec:", "hyde:")
-    if first_line.startswith(valid_prefixes):
-        return False
-
-    # If first line doesn't have a valid prefix, it's a continuation
-    # Exception: empty first line (check second)
-    lines = [l.strip() for l in text.split("\n") if l.strip()]
-    if lines and not lines[0].startswith(valid_prefixes):
-        return True
-
-    return False
-
-
-def score_expansion(query: str, expansion: str) -> dict:
-    """
-    Score an expansion based on SCORING.md criteria.
-
-    Returns dict with score breakdown and total (0-100).
-    """
-    # HARD FAIL: Continuation detection
-    if is_continuation(expansion):
-        return {
-            "format": 0,
-            "diversity": 0,
-            "hyde": 0,
-            "quality": 0,
-            "total": 0,
-            "max_possible": 80,
-            "percentage": 0,
-            "rating": "Failed",
-            "deductions": ["CONTINUATION DETECTED - output does not start with lex:/vec:/hyde:"],
-            "parsed": {"lex": [], "vec": [], "hyde": [], "invalid": [expansion[:100]]},
-            "is_continuation": True,
-        }
-
-    parsed = parse_expansion(expansion)
-    scores = {
-        "format": 0,
-        "diversity": 0,
-        "hyde": 0,
-        "quality": 0,
-        "deductions": [],
-    }
-
-    # === FORMAT (0-30) ===
-    format_score = 0
-
-    # Has at least one lex: line (+10)
-    if parsed["lex"]:
-        format_score += 10
-    else:
-        scores["deductions"].append("missing lex: (-10)")
-
-    # Has at least one vec: line (+10)
-    if parsed["vec"]:
-        format_score += 10
-    else:
-        scores["deductions"].append("missing vec: (-10)")
-
-    # All lines have valid prefix (+10, -5 per invalid)
-    if not parsed["invalid"]:
-        format_score += 10
-    else:
-        invalid_penalty = min(10, len(parsed["invalid"]) * 5)
-        format_score += (10 - invalid_penalty)
-        scores["deductions"].append(f"{len(parsed['invalid'])} invalid lines (-{invalid_penalty})")
-
-    scores["format"] = max(0, format_score)
-
-    # === DIVERSITY (0-30) ===
-    diversity_score = 0
-
-    # 2+ different types present (+10)
-    types_present = sum(1 for t in ["lex", "vec"] if parsed[t])
-    if types_present >= 2:
-        diversity_score += 10
-    else:
-        scores["deductions"].append("only one type present (-10)")
-
-    # 2+ total expansions (+5)
-    total_expansions = len(parsed["lex"]) + len(parsed["vec"])
-    if total_expansions >= 2:
-        diversity_score += 5
-    else:
-        scores["deductions"].append("fewer than 2 expansions (-5)")
-
-    # Multiple lex: lines are diverse (+5, -2 per duplicate pair)
-    lex_diverse_score = 5
-    for i, a in enumerate(parsed["lex"]):
-        for b in parsed["lex"][i+1:]:
-            if not is_diverse(a, b, min_distance=2):
-                lex_diverse_score -= 2
-                scores["deductions"].append(f"lex duplicates: '{a[:20]}...' ~ '{b[:20]}...'")
-    diversity_score += max(0, lex_diverse_score)
-
-    # Multiple vec: lines are diverse (+5, -2 per duplicate pair)
-    vec_diverse_score = 5
-    for i, a in enumerate(parsed["vec"]):
-        for b in parsed["vec"][i+1:]:
-            if not is_diverse(a, b, min_distance=3):
-                vec_diverse_score -= 2
-                scores["deductions"].append(f"vec duplicates: '{a[:20]}...' ~ '{b[:20]}...'")
-    diversity_score += max(0, vec_diverse_score)
-
-    # lex/vec not identical to original query (+5, -5 per echo)
-    echo_score = 5
-    for exp in parsed["lex"] + parsed["vec"]:
-        if echoes_query(exp, query):
-            echo_score -= 5
-            scores["deductions"].append(f"echoes query: '{exp[:30]}...'")
-    diversity_score += max(0, echo_score)
-
-    scores["diversity"] = max(0, diversity_score)
-
-    # === HYDE QUALITY (0-20, optional bonus) ===
-    hyde_score = 0
-
-    if parsed["hyde"]:
-        hyde_text = parsed["hyde"][0]  # Only first hyde counts
-
-        # Hyde present and well-formed (+5)
-        hyde_score += 5
-
-        # Hyde is concise: 50-200 chars (+5)
-        hyde_len = len(hyde_text)
-        if 50 <= hyde_len <= 200:
-            hyde_score += 5
-        elif hyde_len < 50:
-            hyde_score += 2
-            scores["deductions"].append(f"hyde too short ({hyde_len} chars)")
-        else:
-            scores["deductions"].append(f"hyde too long ({hyde_len} chars)")
-
-        # Hyde has no newlines (+5)
-        if "\n" not in hyde_text:
-            hyde_score += 5
-        else:
-            scores["deductions"].append("hyde contains newlines")
-
-        # Hyde has no excessive repetition (+5)
-        rep_penalty = word_repetition_penalty(hyde_text)
-        if rep_penalty == 0:
-            hyde_score += 5
-        else:
-            hyde_score += max(0, 5 - rep_penalty)
-            scores["deductions"].append(f"hyde repetition penalty (-{min(5, rep_penalty)})")
-
-    scores["hyde"] = hyde_score
-
-    # === QUALITY (0-20) ===
-    quality_score = 10  # Base relevance (assume relevant unless obvious garbage)
-
-    # Lex lines should be keyword-focused (shorter than vec on average)
-    if parsed["lex"] and parsed["vec"]:
-        avg_lex = sum(len(l) for l in parsed["lex"]) / len(parsed["lex"])
-        avg_vec = sum(len(v) for v in parsed["vec"]) / len(parsed["vec"])
-        if avg_lex <= avg_vec:
-            quality_score += 5
-        else:
-            scores["deductions"].append("lex longer than vec (should be keywords)")
-    else:
-        quality_score += 2  # Partial credit
-
-    # Vec lines should be natural language (contain spaces, longer)
-    if parsed["vec"]:
-        vec_natural = sum(1 for v in parsed["vec"] if " " in v and len(v) > 15)
-        if vec_natural == len(parsed["vec"]):
-            quality_score += 5
-        else:
-            quality_score += 2
-            scores["deductions"].append("some vec lines too short/keyword-like")
-
-    # Lex lines must preserve key terms from query (not be generic)
-    if parsed["lex"]:
-        lex_with_terms = sum(1 for l in parsed["lex"] if lex_preserves_key_terms(l, query))
-        if lex_with_terms == len(parsed["lex"]):
-            quality_score += 5
-        elif lex_with_terms > 0:
-            quality_score += 2
-        else:
-            scores["deductions"].append("lex lines too generic - missing key terms from query")
-
-    scores["quality"] = min(20, quality_score)  # Cap at 20
-
-    # === TOTAL ===
-    scores["total"] = scores["format"] + scores["diversity"] + scores["hyde"] + scores["quality"]
-    scores["max_possible"] = 100 if parsed["hyde"] else 80
-    scores["percentage"] = scores["total"] / scores["max_possible"] * 100
-
-    # Rating
-    pct = scores["percentage"]
-    if pct >= 80:
-        scores["rating"] = "Excellent"
-    elif pct >= 60:
-        scores["rating"] = "Good"
-    elif pct >= 40:
-        scores["rating"] = "Acceptable"
-    elif pct >= 20:
-        scores["rating"] = "Poor"
-    else:
-        scores["rating"] = "Failed"
-
-    scores["parsed"] = parsed
-    scores["is_continuation"] = False
-    return scores
-
-
-def load_model(model_name: str, base_model: str = "Qwen/Qwen3-0.6B"):
-    """Load the finetuned model."""
-    print(f"Loading tokenizer from {base_model}...")
-    tokenizer = AutoTokenizer.from_pretrained(base_model)
-    if tokenizer.pad_token is None:
-        tokenizer.pad_token = tokenizer.eos_token
-
-    print(f"Loading base model...")
-    base = AutoModelForCausalLM.from_pretrained(
-        base_model,
-        torch_dtype=torch.bfloat16,
-        device_map="auto",
-    )
-
-    print(f"Loading adapter from {model_name}...")
-    model = PeftModel.from_pretrained(base, model_name)
-    model.eval()
-
-    return model, tokenizer
-
-
-def generate_expansion(model, tokenizer, query: str, max_new_tokens: int = 200) -> str:
-    """Generate query expansion using proper Qwen3 chat template."""
-    # Use tokenizer's chat template with /no_think to disable thinking mode
-    messages = [{"role": "user", "content": f"/no_think Expand this search query: {query}"}]
-    prompt = tokenizer.apply_chat_template(
-        messages,
-        tokenize=False,
-        add_generation_prompt=True
-    )
-
-    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
-
-    with torch.no_grad():
-        outputs = model.generate(
-            **inputs,
-            max_new_tokens=max_new_tokens,
-            temperature=0.7,
-            do_sample=True,
-            pad_token_id=tokenizer.pad_token_id,
-            eos_token_id=tokenizer.eos_token_id,
-        )
-
-    # Decode and extract expansion
-    # skip_special_tokens=True strips <|im_start|> etc, leaving "user\n...\nassistant\n..."
-    full_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-    # Extract assistant response
-    if "\nassistant\n" in full_output:
-        expansion = full_output.split("\nassistant\n")[-1].strip()
-    elif "assistant\n" in full_output:
-        expansion = full_output.split("assistant\n")[-1].strip()
-    else:
-        # Fallback: strip prompt length
-        expansion = full_output[len(prompt):].strip()
-
-    # Remove any <think> tags that might remain
-    if expansion.startswith("<think>"):
-        # Find end of thinking block
-        think_end = expansion.find("</think>")
-        if think_end != -1:
-            expansion = expansion[think_end + 8:].strip()
-
-    return expansion
-
-
-def print_score_breakdown(scores: dict):
-    """Pretty print score breakdown."""
-    print(f"  Score: {scores['total']}/{scores['max_possible']} ({scores['percentage']:.0f}%) - {scores['rating']}")
-    print(f"    Format:    {scores['format']}/30")
-    print(f"    Diversity: {scores['diversity']}/30")
-    print(f"    Hyde:      {scores['hyde']}/20")
-    print(f"    Quality:   {scores['quality']}/20")
-    if scores["deductions"]:
-        print(f"  Deductions:")
-        for d in scores["deductions"][:5]:  # Show top 5
-            print(f"    - {d}")
-        if len(scores["deductions"]) > 5:
-            print(f"    ... and {len(scores['deductions']) - 5} more")
-
-
-def run_examples():
-    """Run good and bad examples to demonstrate scoring."""
-    print("=" * 70)
-    print("SCORING EXAMPLES")
-    print("=" * 70)
-
-    # Good example
-    good_expansion = """lex: react hooks tutorial
-lex: usestate useeffect
-vec: how to use react hooks in functional components
-vec: react hooks best practices guide
-hyde: React Hooks allow you to use state and lifecycle features in functional components without writing a class."""
-
-    print("\n[GOOD EXAMPLE]")
-    print(f"Query: react hooks")
-    print(f"Output:\n{good_expansion}")
-    scores = score_expansion("react hooks", good_expansion)
-    print_score_breakdown(scores)
-
-    # Bad example
-    bad_expansion = """auth is an important concept that relates to authentication.
-The answer should be in Chinese.
-The answer should be in Chinese."""
-
-    print("\n[BAD EXAMPLE]")
-    print(f"Query: auth")
-    print(f"Output:\n{bad_expansion}")
-    scores = score_expansion("auth", bad_expansion)
-    print_score_breakdown(scores)
-
-    # Medium example - repetitive hyde
-    medium_expansion = """lex: docker networking
-vec: docker networking
-hyde: Docker networking is an important concept. Docker networking is used for container communication. Docker networking configuration is essential."""
-
-    print("\n[MEDIUM EXAMPLE - Repetitive]")
-    print(f"Query: docker networking")
-    print(f"Output:\n{medium_expansion}")
-    scores = score_expansion("docker networking", medium_expansion)
-    print_score_breakdown(scores)
-
-    # Medium example - echoes query
-    echo_expansion = """lex: auth
-lex: authentication
-vec: auth
-vec: authentication configuration
-hyde: Authentication is the process of verifying identity."""
-
-    print("\n[MEDIUM EXAMPLE - Echoes Query]")
-    print(f"Query: auth")
-    print(f"Output:\n{echo_expansion}")
-    scores = score_expansion("auth", echo_expansion)
-    print_score_breakdown(scores)
-
-    print("\n" + "=" * 70)
-
-
-def main():
-    import argparse
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model", default="tobil/qmd-query-expansion-0.6B",
-                        help="Model to evaluate")
-    parser.add_argument("--base-model", default="Qwen/Qwen3-0.6B",
-                        help="Base model")
-    parser.add_argument("--output", default="evaluation_results.json",
-                        help="Output file for results")
-    parser.add_argument("--queries", type=str, help="Custom queries file (one per line)")
-    parser.add_argument("--examples", action="store_true", help="Run scoring examples only")
-    args = parser.parse_args()
-
-    # Run examples if requested
-    if args.examples:
-        run_examples()
-        return
-
-    # Load custom queries if provided
-    queries = TEST_QUERIES
-    if args.queries:
-        with open(args.queries) as f:
-            queries = [l.strip() for l in f if l.strip()]
-
-    # Load model
-    model, tokenizer = load_model(args.model, args.base_model)
-
-    # Run evaluation
-    results = []
-    print(f"\n{'='*70}")
-    print("EVALUATION RESULTS")
-    print(f"{'='*70}\n")
-
-    for i, query in enumerate(queries, 1):
-        print(f"[{i}/{len(queries)}] Query: {query}")
-        print("-" * 50)
-
-        expansion = generate_expansion(model, tokenizer, query)
-        scores = score_expansion(query, expansion)
-
-        print(expansion)
-        print()
-        print_score_breakdown(scores)
-        print()
-
-        results.append({
-            "query": query,
-            "expansion": expansion,
-            "scores": {k: v for k, v in scores.items() if k != "parsed"},
-            "parsed": scores["parsed"],
-        })
-
-    # Summary
-    print(f"\n{'='*70}")
-    print("SUMMARY")
-    print(f"{'='*70}")
-
-    avg_score = sum(r["scores"]["percentage"] for r in results) / len(results)
-    excellent = sum(1 for r in results if r["scores"]["rating"] == "Excellent")
-    good = sum(1 for r in results if r["scores"]["rating"] == "Good")
-    acceptable = sum(1 for r in results if r["scores"]["rating"] == "Acceptable")
-    poor = sum(1 for r in results if r["scores"]["rating"] == "Poor")
-    failed = sum(1 for r in results if r["scores"]["rating"] == "Failed")
-
-    print(f"  Total queries: {len(results)}")
-    print(f"  Average score: {avg_score:.1f}%")
-    print(f"  Ratings:")
-    print(f"    Excellent: {excellent}")
-    print(f"    Good:      {good}")
-    print(f"    Acceptable: {acceptable}")
-    print(f"    Poor:      {poor}")
-    print(f"    Failed:    {failed}")
-
-    # Save results
-    with open(args.output, "w") as f:
-        json.dump(results, f, indent=2)
-    print(f"\n  Results saved to: {args.output}")
-
-
-if __name__ == "__main__":
-    main()
--- a/finetune/evaluation_grpo_failed.json
+++ b/finetune/evaluation_grpo_failed.json
@ -1,548 +0,0 @@
-[
-  {
-    "query": "how to configure authentication",
-    "expansion": "Here is an expanded version of your search query for configuring authentication:\n\n**\"How to configure authentication settings in [specific platform or system]?\"**\n\nIf you have a specific platform or system in mind (such as web apps, mobile apps, APIs, or a database system), please clarify, and I can tailor the query further.",
-    "scores": {
-      "format": 0,
-      "diversity": 0,
-      "hyde": 0,
-      "quality": 0,
-      "total": 0,
-      "max_possible": 80,
-      "percentage": 0,
-      "rating": "Failed",
-      "deductions": [
-        "CONTINUATION DETECTED - output does not start with lex:/vec:/hyde:"
-      ],
-      "is_continuation": true
-    },
-    "parsed": {
-      "lex": [],
-      "vec": [],
-      "hyde": [],
-      "invalid": [
-        "Here is an expanded version of your search query for configuring authentication:\n\n**\"How to configur"
-      ]
-    }
-  },
-  {
-    "query": "typescript async await",
-    "expansion": "The query \"typescript async await\" can be expanded in several ways depending on the context in which it's used:\n\n1. **Programming Language Use**:  \n   - \"Typescript: Async/await with Await\"  \n   - \"Typescript Async/await Example\"  \n   - \"How to use Async/await in Typescript\"\n\n2. **Learning or Documentation**:  \n   - \"Typescript Async/await: Explained\"  \n   - \"Async/await in Typescript: Learn and Example\"\n\n3. **Technical Support or Guides**:  \n   - \"Typescript Async/await: Learn, Use, and Example\"  \n   - \"Async/await in Typescript: Guide and Example\"\n\n4. **Code Examples**:  \n   - \"How to use async/await in Typescript\"  \n   - \"Async/await in Typescript: Code Example\"\n\nIf you need further assistance, feel free to let me know!",
-    "scores": {
-      "format": 0,
-      "diversity": 0,
-      "hyde": 0,
-      "quality": 0,
-      "total": 0,
-      "max_possible": 80,
-      "percentage": 0,
-      "rating": "Failed",
-      "deductions": [
-        "CONTINUATION DETECTED - output does not start with lex:/vec:/hyde:"
-      ],
-      "is_continuation": true
-    },
-    "parsed": {
-      "lex": [],
-      "vec": [],
-      "hyde": [],
-      "invalid": [
-        "The query \"typescript async await\" can be expanded in several ways depending on the context in which"
-      ]
-    }
-  },
-  {
-    "query": "docker compose networking",
-    "expansion": "Here's an expanded version of the search query \"docker compose networking\":\n\n**\"docker compose networking query\"**\n\nThis query can be used to find information related to Docker Compose, networking configurations, or network settings in Docker.",
-    "scores": {
-      "format": 0,
-      "diversity": 0,
-      "hyde": 0,
-      "quality": 0,
-      "total": 0,
-      "max_possible": 80,
-      "percentage": 0,
-      "rating": "Failed",
-      "deductions": [
-        "CONTINUATION DETECTED - output does not start with lex:/vec:/hyde:"
-      ],
-      "is_continuation": true
-    },
-    "parsed": {
-      "lex": [],
-      "vec": [],
-      "hyde": [],
-      "invalid": [
-        "Here's an expanded version of the search query \"docker compose networking\":\n\n**\"docker compose netwo"
-      ]
-    }
-  },
-  {
-    "query": "git rebase vs merge",
-    "expansion": "Here's an expanded version of the search query \"git rebase vs merge\" with more context and explanation:\n\n**\"git rebase vs merge**  \nThis question is commonly asked when working with Git, and it's important to understand the difference between **rebase** and **merge** in order to make informed decisions about how to proceed with your Git workflow.  \n\n### What is **git rebase**?\n- **git rebase** is a command used to **fix up** and **merge** changes back into a common branch or commit history.  \n- It is typically used to apply changes to the current branch, and it often **reverts** or **corrects** changes that have been made to a different branch.  \n- It is useful when you want to **correct** or **update** your branch history to match the main branch or a specific commit.\n\n### What is **git merge**?\n- **git merge** is a command used to **",
-    "scores": {
-      "format": 0,
-      "diversity": 0,
-      "hyde": 0,
-      "quality": 0,
-      "total": 0,
-      "max_possible": 80,
-      "percentage": 0,
-      "rating": "Failed",
-      "deductions": [
-        "CONTINUATION DETECTED - output does not start with lex:/vec:/hyde:"
-      ],
-      "is_continuation": true
-    },
-    "parsed": {
-      "lex": [],
-      "vec": [],
-      "hyde": [],
-      "invalid": [
-        "Here's an expanded version of the search query \"git rebase vs merge\" with more context and explanati"
-      ]
-    }
-  },
-  {
-    "query": "react useEffect cleanup",
-    "expansion": "To expand the search query **\"react useEffect cleanup\"**, you can consider the following variations:\n\n### 1. **React Hook Clean Up**\n- **\"react useEffect cleanup code\"**\n- **\"react useEffect cleanup best practices\"**\n- **\"react useEffect cleanup function\"**\n\n### 2. **React Clean Up with useEffect**\n- **\"react cleanup useEffect best\"**\n- **\"react cleanup useEffect code\"**\n- **\"react cleanup useEffect example\"**\n\n### 3. **React Clean Up and useEffect**\n- **\"react cleanup useEffect code\"**\n- **\"react cleanup useEffect code example\"**\n- **\"react cleanup useEffect best practices\"**\n\n### 4. **React Cleanup with useEffect**\n- **\"react cleanup useEffect code\"**\n- **\"react cleanup useEffect example\"**\n\n### 5. **React Clean Up Function with useEffect**\n- **\"react cleanup useEffect function\"**\n- **\"react cleanup useEffect code\"**\n\nThese variations help",
-    "scores": {
-      "format": 0,
-      "diversity": 0,
-      "hyde": 0,
-      "quality": 0,
-      "total": 0,
-      "max_possible": 80,
-      "percentage": 0,
-      "rating": "Failed",
-      "deductions": [
-        "CONTINUATION DETECTED - output does not start with lex:/vec:/hyde:"
-      ],
-      "is_continuation": true
-    },
-    "parsed": {
-      "lex": [],
-      "vec": [],
-      "hyde": [],
-      "invalid": [
-        "To expand the search query **\"react useEffect cleanup\"**, you can consider the following variations:"
-      ]
-    }
-  },
-  {
-    "query": "auth",
-    "expansion": "The search query \"auth\" can be expanded to include various related terms and concepts. Here are some possible expansions:\n\n1. **Authentication** - This refers to the process of verifying a user or system's identity.  \n2. **User Authentication** - A specific type of authentication mechanism used to verify user access.  \n3. **Secure Authentication** - A method to ensure that only authorized users can access a system or service.  \n4. **Authentication and Authorization** - Combining the concepts of authentication and authorization.  \n5. **Role-Based Access Control** - A system that determines access rights based on user roles.  \n6. **User Verification** - The act of confirming a user's identity.  \n7. **Multi-Factor Authentication (MFA)** - An additional layer of security beyond a password.  \n8. **API Authentication** - Authentication required for accessing an API.  \n9. **OAuth** - A protocol for authorization that allows apps to access user",
-    "scores": {
-      "format": 0,
-      "diversity": 0,
-      "hyde": 0,
-      "quality": 0,
-      "total": 0,
-      "max_possible": 80,
-      "percentage": 0,
-      "rating": "Failed",
-      "deductions": [
-        "CONTINUATION DETECTED - output does not start with lex:/vec:/hyde:"
-      ],
-      "is_continuation": true
-    },
-    "parsed": {
-      "lex": [],
-      "vec": [],
-      "hyde": [],
-      "invalid": [
-        "The search query \"auth\" can be expanded to include various related terms and concepts. Here are some"
-      ]
-    }
-  },
-  {
-    "query": "config",
-    "expansion": "Expanding the search query \"config\" would involve modifying the search to include more context or details. Here are some possible expansions:\n\n1. **config details** - to find information about specific configuration settings  \n2. **config settings** - to find configuration parameters  \n3. **configurations** - to find configuration options or settings  \n4. **config file** - to find configuration files  \n5. **config management** - to find configuration management tools or practices  \n6. **config file structure** - to find configuration file structures  \n7. **config values** - to find configuration values  \n8. **config options** - to find configuration options  \n9. **config parameters** - to find configuration parameters  \n10. **config settings** - to find configuration settings  \n\nLet me know if you meant to search for a specific type of configuration!",
-    "scores": {
-      "format": 0,
-      "diversity": 0,
-      "hyde": 0,
-      "quality": 0,
-      "total": 0,
-      "max_possible": 80,
-      "percentage": 0,
-      "rating": "Failed",
-      "deductions": [
-        "CONTINUATION DETECTED - output does not start with lex:/vec:/hyde:"
-      ],
-      "is_continuation": true
-    },
-    "parsed": {
-      "lex": [],
-      "vec": [],
-      "hyde": [],
-      "invalid": [
-        "Expanding the search query \"config\" would involve modifying the search to include more context or de"
-      ]
-    }
-  },
-  {
-    "query": "setup",
-    "expansion": "Expanding the search query **\"setup\"** would involve adding more context or specific terms to provide a more accurate result. Here are some possibilities:\n\n1. **\"Setup a project\"**  \n2. **\"Set up a system\"**  \n3. **\"Configure a setup\"**  \n4. **\"Configuration of a setup\"**  \n5. **\"Install a setup\"**  \n6. **\"Create a setup\"**  \n7. **\"Install or configure a setup\"**  \n8. **\"Setup a configuration\"**  \n9. **\"Set up a system configuration\"**  \n10. **\"Configuration setup\"**  \n\nLet me know if you meant a specific task or context!",
-    "scores": {
-      "format": 0,
-      "diversity": 0,
-      "hyde": 0,
-      "quality": 0,
-      "total": 0,
-      "max_possible": 80,
-      "percentage": 0,
-      "rating": "Failed",
-      "deductions": [
-        "CONTINUATION DETECTED - output does not start with lex:/vec:/hyde:"
-      ],
-      "is_continuation": true
-    },
-    "parsed": {
-      "lex": [],
-      "vec": [],
-      "hyde": [],
-      "invalid": [
-        "Expanding the search query **\"setup\"** would involve adding more context or specific terms to provid"
-      ]
-    }
-  },
-  {
-    "query": "api",
-    "expansion": "Certainly! Here's an expanded version of your search query:\n\n**\"api | search api | api details | api documentation | api usage | api integration | api development | api best practices | api implementation | api testing | api troubleshooting | api documentation | api examples | api reference | api installation | api configuration | api security | api optimization | api maintenance | api architecture | api deployment | api testing tools | api monitoring | api performance | api scalability | api troubleshooting | api best practices | api documentation | api examples | api reference | api documentation | api examples | api reference | api documentation | api examples | api reference | api documentation | api examples | api reference | api documentation | api examples | api reference | api documentation | api examples | api reference | api documentation | api examples | api reference | api documentation | api examples | api reference | api documentation | api examples | api reference | api documentation | api examples | api reference | api documentation | api examples | api reference |",
-    "scores": {
-      "format": 0,
-      "diversity": 0,
-      "hyde": 0,
-      "quality": 0,
-      "total": 0,
-      "max_possible": 80,
-      "percentage": 0,
-      "rating": "Failed",
-      "deductions": [
-        "CONTINUATION DETECTED - output does not start with lex:/vec:/hyde:"
-      ],
-      "is_continuation": true
-    },
-    "parsed": {
-      "lex": [],
-      "vec": [],
-      "hyde": [],
-      "invalid": [
-        "Certainly! Here's an expanded version of your search query:\n\n**\"api | search api | api details | api"
-      ]
-    }
-  },
-  {
-    "query": "meeting notes project kickoff",
-    "expansion": "Expanding the search query **\"meeting notes project kickoff\"**, you can refine it to:\n\n- **\"meeting notes for project kickoff\"**  \n- **\"project kickoff meeting notes\"**  \n- **\"meeting notes project kickoff details\"**  \n- **\"meeting notes on project kickoff\"**  \n- **\"meeting notes for project launch\"**  \n\nThese variations provide different ways to phrase or search for meeting notes related to a project's kickoff event.",
-    "scores": {
-      "format": 0,
-      "diversity": 0,
-      "hyde": 0,
-      "quality": 0,
-      "total": 0,
-      "max_possible": 80,
-      "percentage": 0,
-      "rating": "Failed",
-      "deductions": [
-        "CONTINUATION DETECTED - output does not start with lex:/vec:/hyde:"
-      ],
-      "is_continuation": true
-    },
-    "parsed": {
-      "lex": [],
-      "vec": [],
-      "hyde": [],
-      "invalid": [
-        "Expanding the search query **\"meeting notes project kickoff\"**, you can refine it to:\n\n- **\"meeting "
-      ]
-    }
-  },
-  {
-    "query": "ideas for new feature",
-    "expansion": "Here are some **ideas** for new features you could consider:\n\n1. **User Authentication & Authorization**: Add new methods for users to log in (e.g., social media, email, phone) and manage access.\n\n2. **Real-time Data Integration**: Integrate with APIs or databases to fetch real-time data and update the app accordingly.\n\n3. **Mobile-First Design**: Focus on mobile-friendly UI and functionality, especially for a mobile app.\n\n4. **Customization & Personalization**: Allow users to customize their experience (e.g., themes, preferences).\n\n5. **Multilingual Support**: Offer features to support users from different countries or regions.\n\n6. **Gamification Elements**: Integrate game-like features (e.g., points, badges) to encourage user engagement.\n\n7. **AI & Machine Learning**: Incorporate AI to improve features (e.g., personalized recommendations, chatbots).\n\n8. **Cross-Platform Compatibility**: Ensure the app works seamlessly across different",
-    "scores": {
-      "format": 0,
-      "diversity": 0,
-      "hyde": 0,
-      "quality": 0,
-      "total": 0,
-      "max_possible": 80,
-      "percentage": 0,
-      "rating": "Failed",
-      "deductions": [
-        "CONTINUATION DETECTED - output does not start with lex:/vec:/hyde:"
-      ],
-      "is_continuation": true
-    },
-    "parsed": {
-      "lex": [],
-      "vec": [],
-      "hyde": [],
-      "invalid": [
-        "Here are some **ideas** for new features you could consider:\n\n1. **User Authentication & Authorizati"
-      ]
-    }
-  },
-  {
-    "query": "todo list app architecture",
-    "expansion": "Here's an expanded version of the search query:  \n\n**\"Todo list app architecture\"**  \n\nThis query seeks information on the structure, design, and implementation of a **todo list application**. It covers topics such as:\n\n- **Architecture Overview**  \n- **Key Components** (e.g., UI, backend, data storage)  \n- **Design Patterns** used in todo apps  \n- **Scalability and performance**  \n- **User Interface and User Experience (UIUX)**  \n- **Database Systems** (e.g., SQLite, MongoDB)  \n- **APIs and Integration**  \n- **Security considerations**  \n- **Development tools** and frameworks used (e.g., React, Flutter, Node.js)  \n\nLet me know if you'd like a breakdown of specific components or case studies!",
-    "scores": {
-      "format": 0,
-      "diversity": 0,
-      "hyde": 0,
-      "quality": 0,
-      "total": 0,
-      "max_possible": 80,
-      "percentage": 0,
-      "rating": "Failed",
-      "deductions": [
-        "CONTINUATION DETECTED - output does not start with lex:/vec:/hyde:"
-      ],
-      "is_continuation": true
-    },
-    "parsed": {
-      "lex": [],
-      "vec": [],
-      "hyde": [],
-      "invalid": [
-        "Here's an expanded version of the search query:  \n\n**\"Todo list app architecture\"**  \n\nThis query se"
-      ]
-    }
-  },
-  {
-    "query": "what is dependency injection",
-    "expansion": "Dependency injection (DI) is a design pattern used in software development to **independently manage the dependencies** of a class or module. The idea is that the program **involves no direct dependency between classes**, but instead, the implementation of these classes is **provided by a container**\u2014a system that automatically manages the dependencies.\n\n### Key Concepts:\n- **Inversion of Control (IoC)**: This is the core principle of DI. Instead of the programmer defining the dependencies, the container handles the dependency relationships.\n- **DI Containers**: These are the systems that manage the dependencies, such as Spring, Unity, or others.\n- **DI Inverses**: These are the classes that are \"injected\" into other classes, such as a `Person` class that receives an `Email` and `Name`.\n\n### Example:\n```java\nclass Person {\n    private String name;\n    private Email email;\n\n    public Person(String name, Email email) {",
-    "scores": {
-      "format": 0,
-      "diversity": 0,
-      "hyde": 0,
-      "quality": 0,
-      "total": 0,
-      "max_possible": 80,
-      "percentage": 0,
-      "rating": "Failed",
-      "deductions": [
-        "CONTINUATION DETECTED - output does not start with lex:/vec:/hyde:"
-      ],
-      "is_continuation": true
-    },
-    "parsed": {
-      "lex": [],
-      "vec": [],
-      "hyde": [],
-      "invalid": [
-        "Dependency injection (DI) is a design pattern used in software development to **independently manage"
-      ]
-    }
-  },
-  {
-    "query": "difference between sql and nosql",
-    "expansion": "The difference between SQL and NoSQL is a fundamental distinction in the world of databases. Here's a breakdown:\n\n### **1. SQL (Structured Query Language)**\n- **Purpose**: SQL is designed for structured data and is used to manage and manipulate relational databases. It is built on the relational model where data is stored in tables and organized in rows and columns.\n- **Key Features**:\n  - **Strong typing** and **data integrity**.\n  - **Support for complex queries** and transactions.\n  - **Standardized syntax** and conventions.\n- **Used in**: Relational databases like MySQL, PostgreSQL, and SQL Server.\n- **Best for**: Data that is structured, static, and requires complex queries.\n\n### **2. NoSQL**\n- **Purpose**: NoSQL is a family of non-relational databases that do not follow a fixed schema. They are designed to store and retrieve data in a flexible, scalable, and dynamic way.\n- **Key",
-    "scores": {
-      "format": 0,
-      "diversity": 0,
-      "hyde": 0,
-      "quality": 0,
-      "total": 0,
-      "max_possible": 80,
-      "percentage": 0,
-      "rating": "Failed",
-      "deductions": [
-        "CONTINUATION DETECTED - output does not start with lex:/vec:/hyde:"
-      ],
-      "is_continuation": true
-    },
-    "parsed": {
-      "lex": [],
-      "vec": [],
-      "hyde": [],
-      "invalid": [
-        "The difference between SQL and NoSQL is a fundamental distinction in the world of databases. Here's "
-      ]
-    }
-  },
-  {
-    "query": "kubernetes vs docker swarm",
-    "expansion": "Here is an expanded version of your search query:\n\n**\"Kubernetes vs Docker Swarm\"**\n\nThis query highlights the comparison between **Kubernetes** and **Docker Swarm**, focusing on their roles, features, and use cases in cloud computing. If you'd like to refine it further, feel free to provide more context!",
-    "scores": {
-      "format": 0,
-      "diversity": 0,
-      "hyde": 0,
-      "quality": 0,
-      "total": 0,
-      "max_possible": 80,
-      "percentage": 0,
-      "rating": "Failed",
-      "deductions": [
-        "CONTINUATION DETECTED - output does not start with lex:/vec:/hyde:"
-      ],
-      "is_continuation": true
-    },
-    "parsed": {
-      "lex": [],
-      "vec": [],
-      "hyde": [],
-      "invalid": [
-        "Here is an expanded version of your search query:\n\n**\"Kubernetes vs Docker Swarm\"**\n\nThis query high"
-      ]
-    }
-  },
-  {
-    "query": "connection timeout error",
-    "expansion": "Expanding the search query \"connection timeout error\" to provide more context and detail, here are some possible expansions:\n\n1. **\"Connection timeout error in web services\"**  \n   This focuses on the issue occurring in web-based applications or services.\n\n2. **\"How to resolve a connection timeout error in programming\"**  \n   This expands the query to include troubleshooting steps.\n\n3. **\"Connection timeout error symptoms and solutions\"**  \n   This highlights the symptoms and possible solutions related to the error.\n\n4. **\"Common causes and resolutions for connection timeout errors\"**  \n   This provides a comprehensive overview of the problem and how to address it.\n\n5. **\"Troubleshooting connection timeout error in different technologies\"**  \n   This expands to include different programming languages or platforms.\n\nLet me know if you'd like help with a specific area (e.g., web development, network, programming) or need further details!",
-    "scores": {
-      "format": 0,
-      "diversity": 0,
-      "hyde": 0,
-      "quality": 0,
-      "total": 0,
-      "max_possible": 80,
-      "percentage": 0,
-      "rating": "Failed",
-      "deductions": [
-        "CONTINUATION DETECTED - output does not start with lex:/vec:/hyde:"
-      ],
-      "is_continuation": true
-    },
-    "parsed": {
-      "lex": [],
-      "vec": [],
-      "hyde": [],
-      "invalid": [
-        "Expanding the search query \"connection timeout error\" to provide more context and detail, here are s"
-      ]
-    }
-  },
-  {
-    "query": "memory leak debugging",
-    "expansion": "\"Memory leak debugging\" is a common issue in software development, particularly in applications that use memory management. Here's an expanded version of the search query to include more context:\n\n**Expanded Search Query:**\n\n- Memory leak debugging techniques\n- How to debug memory leaks in software\n- Best practices for identifying and resolving memory leaks\n- Common causes of memory leaks in applications\n- Tools for memory leak debugging\n- Techniques to prevent memory leaks\n- Debugging memory leaks in different programming languages\n\nThis query can help you find resources, tutorials, or guides on effectively debugging memory leaks in your code.",
-    "scores": {
-      "format": 0,
-      "diversity": 0,
-      "hyde": 0,
-      "quality": 0,
-      "total": 0,
-      "max_possible": 80,
-      "percentage": 0,
-      "rating": "Failed",
-      "deductions": [
-        "CONTINUATION DETECTED - output does not start with lex:/vec:/hyde:"
-      ],
-      "is_continuation": true
-    },
-    "parsed": {
-      "lex": [],
-      "vec": [],
-      "hyde": [],
-      "invalid": [
-        "\"Memory leak debugging\" is a common issue in software development, particularly in applications that"
-      ]
-    }
-  },
-  {
-    "query": "cors error fix",
-    "expansion": "Here\u2019s an expanded version of the search query:\n\n**\"How to fix a CORS error in your website?\"**\n\nThis query allows you to ask about the general process of resolving a CORS (Cross-Origin Resource Sharing) error. If you have a specific browser or framework in mind, you can tailor the search further.",
-    "scores": {
-      "format": 0,
-      "diversity": 0,
-      "hyde": 0,
-      "quality": 0,
-      "total": 0,
-      "max_possible": 80,
-      "percentage": 0,
-      "rating": "Failed",
-      "deductions": [
-        "CONTINUATION DETECTED - output does not start with lex:/vec:/hyde:"
-      ],
-      "is_continuation": true
-    },
-    "parsed": {
-      "lex": [],
-      "vec": [],
-      "hyde": [],
-      "invalid": [
-        "Here\u2019s an expanded version of the search query:\n\n**\"How to fix a CORS error in your website?\"**\n\nThi"
-      ]
-    }
-  },
-  {
-    "query": "how to implement caching with redis in nodejs",
-    "expansion": "To implement caching with Redis in Node.js, you can use the Redis module from the Node.js standard library. Here's a step-by-step guide:\n\n### 1. **Install the Redis Module**\nFirst, install the `redis` package using npm:\n```bash\nnpm install redis --save\n```\n\n### 2. **Set Up Redis Connection**\nYou need to establish a connection to Redis. You can do this by creating a Redis client instance. Below is a simple example:\n\n```javascript\nconst { Redis } = require('redis');\n\n// Connect to Redis\nconst redis = Redis.createClient({\n  host: 'localhost',\n  port: 6379,\n  database: 0, // Default database\n});\n\n// Set a value in Redis\nredis.set('key1', 'value1', (err) => {\n  if (err) {\n    console.error('Error setting Redis value:', err);\n  } else {\n    console.log('Value",
-    "scores": {
-      "format": 0,
-      "diversity": 0,
-      "hyde": 0,
-      "quality": 0,
-      "total": 0,
-      "max_possible": 80,
-      "percentage": 0,
-      "rating": "Failed",
-      "deductions": [
-        "CONTINUATION DETECTED - output does not start with lex:/vec:/hyde:"
-      ],
-      "is_continuation": true
-    },
-    "parsed": {
-      "lex": [],
-      "vec": [],
-      "hyde": [],
-      "invalid": [
-        "To implement caching with Redis in Node.js, you can use the Redis module from the Node.js standard l"
-      ]
-    }
-  },
-  {
-    "query": "best practices for api rate limiting",
-    "expansion": "Here is an expanded search query for \"best practices for API rate limiting\":\n\n**\"Best practices for API rate limiting\"**\n\nThis query allows for a broader search, including results that explain the principles, implementation methods, and effective strategies for API rate limiting.",
-    "scores": {
-      "format": 0,
-      "diversity": 0,
-      "hyde": 0,
-      "quality": 0,
-      "total": 0,
-      "max_possible": 80,
-      "percentage": 0,
-      "rating": "Failed",
-      "deductions": [
-        "CONTINUATION DETECTED - output does not start with lex:/vec:/hyde:"
-      ],
-      "is_continuation": true
-    },
-    "parsed": {
-      "lex": [],
-      "vec": [],
-      "hyde": [],
-      "invalid": [
-        "Here is an expanded search query for \"best practices for API rate limiting\":\n\n**\"Best practices for "
-      ]
-    }
-  },
-  {
-    "query": "setting up ci cd pipeline with github actions",
-    "expansion": "Setting up a CI/CD pipeline with GitHub Actions is a common practice in modern software development. Here's an expanded version of the search query to help you better understand and find solutions:\n\n**\"How to set up a CI/CD pipeline with GitHub Actions in a scalable and efficient way?\"**\n\n### Key Features and Benefits:\n1. **Scalability**: GitHub Actions can handle complex pipelines across multiple environments.\n2. **Efficiency**: It automates repetitive tasks, reducing manual intervention.\n3. **Integration**: Works seamlessly with GitHub, GitLab, and other platforms.\n4. **Version Control**: Supports multiple Git repositories.\n5. **CI/CD Best Practices**: Includes best practices for setting up and managing pipelines.\n\n### Popular Options:\n- **GitHub Actions**: A lightweight and open-source tool for CI/CD pipelines.\n- **Jenkins (or GitLab CI)**: Commercial tools with more advanced features.\n- **Azure DevOps**: For enterprise-level CI/CD",
-    "scores": {
-      "format": 0,
-      "diversity": 0,
-      "hyde": 0,
-      "quality": 0,
-      "total": 0,
-      "max_possible": 80,
-      "percentage": 0,
-      "rating": "Failed",
-      "deductions": [
-        "CONTINUATION DETECTED - output does not start with lex:/vec:/hyde:"
-      ],
-      "is_continuation": true
-    },
-    "parsed": {
-      "lex": [],
-      "vec": [],
-      "hyde": [],
-      "invalid": [
-        "Setting up a CI/CD pipeline with GitHub Actions is a common practice in modern software development."
-      ]
-    }
-  }
-]
--- a/finetune/evaluation_sft.json
+++ b/finetune/evaluation_sft.json
@ -1,664 +0,0 @@
-[
-  {
-    "query": "how to configure authentication",
-    "expansion": "lex: authentication settings\nlex: user login\nvec: authentication settings\nvec: user login\nhyde: To configure authentication, start by reviewing the requirements and dependencies. Configure authentication settings in your project configuration.",
-    "scores": {
-      "format": 30,
-      "diversity": 30,
-      "hyde": 20,
-      "quality": 19,
-      "deductions": [
-        "some vec lines too short/keyword-like"
-      ],
-      "total": 99,
-      "max_possible": 100,
-      "percentage": 99.0,
-      "rating": "Excellent",
-      "is_continuation": false
-    },
-    "parsed": {
-      "lex": [
-        "authentication settings",
-        "user login"
-      ],
-      "vec": [
-        "authentication settings",
-        "user login"
-      ],
-      "hyde": [
-        "To configure authentication, start by reviewing the requirements and dependencies. Configure authentication settings in your project configuration."
-      ],
-      "invalid": []
-    }
-  },
-  {
-    "query": "typescript async await",
-    "expansion": "lex: asynchronous await\nlex: await typescript\nvec: asynchronous await\nvec: await typescript\nhyde: Typescript async await is an important concept that relates to asynchronous await. It provides functionality for various use cases in software development.",
-    "scores": {
-      "format": 30,
-      "diversity": 28,
-      "hyde": 20,
-      "quality": 20,
-      "deductions": [
-        "vec duplicates: 'asynchronous await...' ~ 'await typescript...'"
-      ],
-      "total": 98,
-      "max_possible": 100,
-      "percentage": 98.0,
-      "rating": "Excellent",
-      "is_continuation": false
-    },
-    "parsed": {
-      "lex": [
-        "asynchronous await",
-        "await typescript"
-      ],
-      "vec": [
-        "asynchronous await",
-        "await typescript"
-      ],
-      "hyde": [
-        "Typescript async await is an important concept that relates to asynchronous await. It provides functionality for various use cases in software development."
-      ],
-      "invalid": []
-    }
-  },
-  {
-    "query": "docker compose networking",
-    "expansion": "lex: docker-compose config\nlex: docker-compose settings\nvec: docker-compose config\nvec: docker-compose settings\nhyde: Docker compose networking is an important concept that relates to docker-compose settings. It provides functionality for various use cases in software development.",
-    "scores": {
-      "format": 30,
-      "diversity": 28,
-      "hyde": 20,
-      "quality": 20,
-      "deductions": [
-        "vec duplicates: 'docker-compose confi...' ~ 'docker-compose setti...'",
-        "lex lines too generic - missing key terms from query"
-      ],
-      "total": 98,
-      "max_possible": 100,
-      "percentage": 98.0,
-      "rating": "Excellent",
-      "is_continuation": false
-    },
-    "parsed": {
-      "lex": [
-        "docker-compose config",
-        "docker-compose settings"
-      ],
-      "vec": [
-        "docker-compose config",
-        "docker-compose settings"
-      ],
-      "hyde": [
-        "Docker compose networking is an important concept that relates to docker-compose settings. It provides functionality for various use cases in software development."
-      ],
-      "invalid": []
-    }
-  },
-  {
-    "query": "git rebase vs merge",
-    "expansion": "lex: difference between rebase\nlex: what's the\nvec: difference between rebase and merge\nvec: what's the difference between git rebase and merge\nhyde: The topic of git rebase vs merge covers what's the difference between rebase and merge.",
-    "scores": {
-      "format": 30,
-      "diversity": 30,
-      "hyde": 20,
-      "quality": 20,
-      "deductions": [],
-      "total": 100,
-      "max_possible": 100,
-      "percentage": 100.0,
-      "rating": "Excellent",
-      "is_continuation": false
-    },
-    "parsed": {
-      "lex": [
-        "difference between rebase",
-        "what's the"
-      ],
-      "vec": [
-        "difference between rebase and merge",
-        "what's the difference between git rebase and merge"
-      ],
-      "hyde": [
-        "The topic of git rebase vs merge covers what's the difference between rebase and merge."
-      ],
-      "invalid": []
-    }
-  },
-  {
-    "query": "react useEffect cleanup",
-    "expansion": "lex: react cleanup code\nlex: react cleanup example\nvec: code examples for react useEffect cleanup\nvec: practical useEffect cleanup implementation examples\nhyde: Here are some practical examples of useEffect cleanup in action. Each example demonstrates a common use case with working code.",
-    "scores": {
-      "format": 30,
-      "diversity": 30,
-      "hyde": 20,
-      "quality": 20,
-      "deductions": [],
-      "total": 100,
-      "max_possible": 100,
-      "percentage": 100.0,
-      "rating": "Excellent",
-      "is_continuation": false
-    },
-    "parsed": {
-      "lex": [
-        "react cleanup code",
-        "react cleanup example"
-      ],
-      "vec": [
-        "code examples for react useEffect cleanup",
-        "practical useEffect cleanup implementation examples"
-      ],
-      "hyde": [
-        "Here are some practical examples of useEffect cleanup in action. Each example demonstrates a common use case with working code."
-      ],
-      "invalid": []
-    }
-  },
-  {
-    "query": "auth",
-    "expansion": "lex: access\nlex: login\nvec: access\nvec: login\nhyde: The topic of auth covers sign in. Proper implementation follows established patterns and best practices.",
-    "scores": {
-      "format": 30,
-      "diversity": 28,
-      "hyde": 20,
-      "quality": 17,
-      "deductions": [
-        "vec duplicates: 'access...' ~ 'login...'",
-        "some vec lines too short/keyword-like",
-        "lex lines too generic - missing key terms from query"
-      ],
-      "total": 95,
-      "max_possible": 100,
-      "percentage": 95.0,
-      "rating": "Excellent",
-      "is_continuation": false
-    },
-    "parsed": {
-      "lex": [
-        "access",
-        "login"
-      ],
-      "vec": [
-        "access",
-        "login"
-      ],
-      "hyde": [
-        "The topic of auth covers sign in. Proper implementation follows established patterns and best practices."
-      ],
-      "invalid": []
-    }
-  },
-  {
-    "query": "config",
-    "expansion": "lex: config fix\nlex: config error\nvec: troubleshooting common config problems\nvec: how to fix config errors\nhyde: If you encounter config issues, check your configuration first. Common problems include missing dependencies and incorrect settings.",
-    "scores": {
-      "format": 30,
-      "diversity": 25,
-      "hyde": 20,
-      "quality": 20,
-      "deductions": [
-        "echoes query: 'config fix...'",
-        "echoes query: 'config error...'"
-      ],
-      "total": 95,
-      "max_possible": 100,
-      "percentage": 95.0,
-      "rating": "Excellent",
-      "is_continuation": false
-    },
-    "parsed": {
-      "lex": [
-        "config fix",
-        "config error"
-      ],
-      "vec": [
-        "troubleshooting common config problems",
-        "how to fix config errors"
-      ],
-      "hyde": [
-        "If you encounter config issues, check your configuration first. Common problems include missing dependencies and incorrect settings."
-      ],
-      "invalid": []
-    }
-  },
-  {
-    "query": "setup",
-    "expansion": "lex: install\nlex: setup guide\nvec: install setup tutorial\nvec: setup guide\nhyde: This guide covers setup configuration and usage. Follow the steps below to get started with your first implementation.",
-    "scores": {
-      "format": 30,
-      "diversity": 25,
-      "hyde": 20,
-      "quality": 19,
-      "deductions": [
-        "echoes query: 'setup guide...'",
-        "echoes query: 'setup guide...'",
-        "some vec lines too short/keyword-like"
-      ],
-      "total": 94,
-      "max_possible": 100,
-      "percentage": 94.0,
-      "rating": "Excellent",
-      "is_continuation": false
-    },
-    "parsed": {
-      "lex": [
-        "install",
-        "setup guide"
-      ],
-      "vec": [
-        "install setup tutorial",
-        "setup guide"
-      ],
-      "hyde": [
-        "This guide covers setup configuration and usage. Follow the steps below to get started with your first implementation."
-      ],
-      "invalid": []
-    }
-  },
-  {
-    "query": "api",
-    "expansion": "lex: api troubleshooting\nlex: api fixes\nvec: troubleshooting common api problems\nvec: how to fix api errors\nhyde: If you encounter api issues, check your configuration first. Common problems include missing dependencies and incorrect settings.",
-    "scores": {
-      "format": 30,
-      "diversity": 25,
-      "hyde": 20,
-      "quality": 20,
-      "deductions": [
-        "echoes query: 'api fixes...'"
-      ],
-      "total": 95,
-      "max_possible": 100,
-      "percentage": 95.0,
-      "rating": "Excellent",
-      "is_continuation": false
-    },
-    "parsed": {
-      "lex": [
-        "api troubleshooting",
-        "api fixes"
-      ],
-      "vec": [
-        "troubleshooting common api problems",
-        "how to fix api errors"
-      ],
-      "hyde": [
-        "If you encounter api issues, check your configuration first. Common problems include missing dependencies and incorrect settings."
-      ],
-      "invalid": []
-    }
-  },
-  {
-    "query": "meeting notes project kickoff",
-    "expansion": "lex: invite attendees to\nlex: agenda setup and\nvec: invite attendees to organize a meeting notes project\nvec: agenda setup and preparation for meeting notes\nhyde: Understanding meeting notes project kickoff is essential for modern development. Key aspects include agenda setup and preparation for.",
-    "scores": {
-      "format": 30,
-      "diversity": 30,
-      "hyde": 20,
-      "quality": 20,
-      "deductions": [
-        "lex lines too generic - missing key terms from query"
-      ],
-      "total": 100,
-      "max_possible": 100,
-      "percentage": 100.0,
-      "rating": "Excellent",
-      "is_continuation": false
-    },
-    "parsed": {
-      "lex": [
-        "invite attendees to",
-        "agenda setup and"
-      ],
-      "vec": [
-        "invite attendees to organize a meeting notes project",
-        "agenda setup and preparation for meeting notes"
-      ],
-      "hyde": [
-        "Understanding meeting notes project kickoff is essential for modern development. Key aspects include agenda setup and preparation for."
-      ],
-      "invalid": []
-    }
-  },
-  {
-    "query": "ideas for new feature",
-    "expansion": "lex: brainstorm new enhancements\nlex: how to develop\nvec: brainstorm new enhancements\nvec: how to develop a new feature idea\nhyde: Ideas for new feature is an important concept that relates to how to develop a new feature idea.",
-    "scores": {
-      "format": 30,
-      "diversity": 30,
-      "hyde": 20,
-      "quality": 20,
-      "deductions": [],
-      "total": 100,
-      "max_possible": 100,
-      "percentage": 100.0,
-      "rating": "Excellent",
-      "is_continuation": false
-    },
-    "parsed": {
-      "lex": [
-        "brainstorm new enhancements",
-        "how to develop"
-      ],
-      "vec": [
-        "brainstorm new enhancements",
-        "how to develop a new feature idea"
-      ],
-      "hyde": [
-        "Ideas for new feature is an important concept that relates to how to develop a new feature idea."
-      ],
-      "invalid": []
-    }
-  },
-  {
-    "query": "todo list app architecture",
-    "expansion": "lex: task manager design\nlex: list app structure\nvec: task manager design\nvec: list app structure\nhyde: Todo list app architecture is an important concept that relates to task manager design.",
-    "scores": {
-      "format": 30,
-      "diversity": 30,
-      "hyde": 20,
-      "quality": 20,
-      "deductions": [],
-      "total": 100,
-      "max_possible": 100,
-      "percentage": 100.0,
-      "rating": "Excellent",
-      "is_continuation": false
-    },
-    "parsed": {
-      "lex": [
-        "task manager design",
-        "list app structure"
-      ],
-      "vec": [
-        "task manager design",
-        "list app structure"
-      ],
-      "hyde": [
-        "Todo list app architecture is an important concept that relates to task manager design."
-      ],
-      "invalid": []
-    }
-  },
-  {
-    "query": "what is dependency injection",
-    "expansion": "lex: definition of dependency\nlex: how to use\nvec: definition of dependency injection and its benefits\nvec: how to use dependency injection in code\nhyde: Dependency injection is defined as examples of dependency injection in practice. This plays a crucial role in modern development practices.",
-    "scores": {
-      "format": 30,
-      "diversity": 30,
-      "hyde": 20,
-      "quality": 20,
-      "deductions": [],
-      "total": 100,
-      "max_possible": 100,
-      "percentage": 100.0,
-      "rating": "Excellent",
-      "is_continuation": false
-    },
-    "parsed": {
-      "lex": [
-        "definition of dependency",
-        "how to use"
-      ],
-      "vec": [
-        "definition of dependency injection and its benefits",
-        "how to use dependency injection in code"
-      ],
-      "hyde": [
-        "Dependency injection is defined as examples of dependency injection in practice. This plays a crucial role in modern development practices."
-      ],
-      "invalid": []
-    }
-  },
-  {
-    "query": "difference between sql and nosql",
-    "expansion": "lex: distinctions between sql\nlex: sql vs nosql\nvec: distinctions between sql and nosql\nvec: sql vs nosql\nhyde: The topic of difference between sql and nosql covers distinctions between sql and nosql. Proper implementation follows established patterns and best practices.",
-    "scores": {
-      "format": 30,
-      "diversity": 30,
-      "hyde": 20,
-      "quality": 20,
-      "deductions": [
-        "some vec lines too short/keyword-like"
-      ],
-      "total": 100,
-      "max_possible": 100,
-      "percentage": 100.0,
-      "rating": "Excellent",
-      "is_continuation": false
-    },
-    "parsed": {
-      "lex": [
-        "distinctions between sql",
-        "sql vs nosql"
-      ],
-      "vec": [
-        "distinctions between sql and nosql",
-        "sql vs nosql"
-      ],
-      "hyde": [
-        "The topic of difference between sql and nosql covers distinctions between sql and nosql. Proper implementation follows established patterns and best practices."
-      ],
-      "invalid": []
-    }
-  },
-  {
-    "query": "kubernetes vs docker swarm",
-    "expansion": "lex: docker vs\nlex: kubernetes and\nvec: docker vs kubernetes\nvec: kubernetes and docker comparison\nhyde: The topic of kubernetes vs docker swarm covers comparison between kubernetes and docker.",
-    "scores": {
-      "format": 30,
-      "diversity": 30,
-      "hyde": 20,
-      "quality": 20,
-      "deductions": [],
-      "total": 100,
-      "max_possible": 100,
-      "percentage": 100.0,
-      "rating": "Excellent",
-      "is_continuation": false
-    },
-    "parsed": {
-      "lex": [
-        "docker vs",
-        "kubernetes and"
-      ],
-      "vec": [
-        "docker vs kubernetes",
-        "kubernetes and docker comparison"
-      ],
-      "hyde": [
-        "The topic of kubernetes vs docker swarm covers comparison between kubernetes and docker."
-      ],
-      "invalid": []
-    }
-  },
-  {
-    "query": "connection timeout error",
-    "expansion": "lex: error handling for\nlex: connection timeout\nvec: error handling for connection timeouts\nvec: connection timeout handling in software\nhyde: If you encounter connection timeout error, verify that the issue is resolved by checking the configuration.",
-    "scores": {
-      "format": 30,
-      "diversity": 30,
-      "hyde": 20,
-      "quality": 20,
-      "deductions": [],
-      "total": 100,
-      "max_possible": 100,
-      "percentage": 100.0,
-      "rating": "Excellent",
-      "is_continuation": false
-    },
-    "parsed": {
-      "lex": [
-        "error handling for",
-        "connection timeout"
-      ],
-      "vec": [
-        "error handling for connection timeouts",
-        "connection timeout handling in software"
-      ],
-      "hyde": [
-        "If you encounter connection timeout error, verify that the issue is resolved by checking the configuration."
-      ],
-      "invalid": []
-    }
-  },
-  {
-    "query": "memory leak debugging",
-    "expansion": "lex: troubleshooting common memory\nlex: memory leak fix\nvec: troubleshooting common memory leaks\nvec: memory leak fix\nhyde: Memory leak debugging is an important concept that relates to memory leak fix. Proper implementation follows established patterns and best practices.",
-    "scores": {
-      "format": 30,
-      "diversity": 30,
-      "hyde": 20,
-      "quality": 20,
-      "deductions": [
-        "some vec lines too short/keyword-like"
-      ],
-      "total": 100,
-      "max_possible": 100,
-      "percentage": 100.0,
-      "rating": "Excellent",
-      "is_continuation": false
-    },
-    "parsed": {
-      "lex": [
-        "troubleshooting common memory",
-        "memory leak fix"
-      ],
-      "vec": [
-        "troubleshooting common memory leaks",
-        "memory leak fix"
-      ],
-      "hyde": [
-        "Memory leak debugging is an important concept that relates to memory leak fix. Proper implementation follows established patterns and best practices."
-      ],
-      "invalid": []
-    }
-  },
-  {
-    "query": "cors error fix",
-    "expansion": "lex: cors fix\nlex: cors error handling\nvec: how to fix cors errors\nvec: common cors troubleshooting problems\nhyde: If you encounter cors error fix issues, check your configuration first. Common problems include missing dependencies and incorrect settings.",
-    "scores": {
-      "format": 30,
-      "diversity": 30,
-      "hyde": 20,
-      "quality": 20,
-      "deductions": [],
-      "total": 100,
-      "max_possible": 100,
-      "percentage": 100.0,
-      "rating": "Excellent",
-      "is_continuation": false
-    },
-    "parsed": {
-      "lex": [
-        "cors fix",
-        "cors error handling"
-      ],
-      "vec": [
-        "how to fix cors errors",
-        "common cors troubleshooting problems"
-      ],
-      "hyde": [
-        "If you encounter cors error fix issues, check your configuration first. Common problems include missing dependencies and incorrect settings."
-      ],
-      "invalid": []
-    }
-  },
-  {
-    "query": "how to implement caching with redis in nodejs",
-    "expansion": "lex: steps for setting\nlex: guide to configuring\nvec: steps for setting up redis caching in nodejs\nvec: guide to configuring redis caching in nodejs\nhyde: To implement caching with redis in nodejs, first configure the redis settings in your project configuration.",
-    "scores": {
-      "format": 30,
-      "diversity": 30,
-      "hyde": 20,
-      "quality": 20,
-      "deductions": [
-        "lex lines too generic - missing key terms from query"
-      ],
-      "total": 100,
-      "max_possible": 100,
-      "percentage": 100.0,
-      "rating": "Excellent",
-      "is_continuation": false
-    },
-    "parsed": {
-      "lex": [
-        "steps for setting",
-        "guide to configuring"
-      ],
-      "vec": [
-        "steps for setting up redis caching in nodejs",
-        "guide to configuring redis caching in nodejs"
-      ],
-      "hyde": [
-        "To implement caching with redis in nodejs, first configure the redis settings in your project configuration."
-      ],
-      "invalid": []
-    }
-  },
-  {
-    "query": "best practices for api rate limiting",
-    "expansion": "lex: recommendations for implementing\nlex: guidelines for setting\nvec: recommendations for implementing rate limiting in api calls\nvec: guidelines for setting up api rate limiting\nhyde: The process of best practices for api rate limiting involves several steps. First, recommendations for implementing rate limiting in api calls.",
-    "scores": {
-      "format": 30,
-      "diversity": 30,
-      "hyde": 20,
-      "quality": 20,
-      "deductions": [
-        "lex lines too generic - missing key terms from query"
-      ],
-      "total": 100,
-      "max_possible": 100,
-      "percentage": 100.0,
-      "rating": "Excellent",
-      "is_continuation": false
-    },
-    "parsed": {
-      "lex": [
-        "recommendations for implementing",
-        "guidelines for setting"
-      ],
-      "vec": [
-        "recommendations for implementing rate limiting in api calls",
-        "guidelines for setting up api rate limiting"
-      ],
-      "hyde": [
-        "The process of best practices for api rate limiting involves several steps. First, recommendations for implementing rate limiting in api calls."
-      ],
-      "invalid": []
-    }
-  },
-  {
-    "query": "setting up ci cd pipeline with github actions",
-    "expansion": "lex: how to configure\nlex: guide to configuring\nvec: how to configure github actions for ci cd pipeline\nvec: guide to configuring a github actions ci cd pipeline\nhyde: Setting up ci cd pipeline with github actions requires several steps. First, setup the necessary dependencies.",
-    "scores": {
-      "format": 30,
-      "diversity": 30,
-      "hyde": 20,
-      "quality": 20,
-      "deductions": [
-        "lex lines too generic - missing key terms from query"
-      ],
-      "total": 100,
-      "max_possible": 100,
-      "percentage": 100.0,
-      "rating": "Excellent",
-      "is_continuation": false
-    },
-    "parsed": {
-      "lex": [
-        "how to configure",
-        "guide to configuring"
-      ],
-      "vec": [
-        "how to configure github actions for ci cd pipeline",
-        "guide to configuring a github actions ci cd pipeline"
-      ],
-      "hyde": [
-        "Setting up ci cd pipeline with github actions requires several steps. First, setup the necessary dependencies."
-      ],
-      "invalid": []
-    }
-  }
-]