Refactor finetune folder: train/rl scripts with YAML configs

Major changes:
- train.py: Generic SFT training script using YAML config
- rl.py: Generic GRPO training script using YAML config
- configs/: YAML configs per training run (sft_v4.yaml, grpo_v4.yaml)
- dataset/: Data preparation scripts moved here
- tui.py: Interactive model testing interface

Training results:
- SFT v4: 98.8% avg score (all Excellent)
- GRPO v4: 0% (failed - model drifted to verbose explanations)

Removed per-model scripts (train_0.6B.py, train_1.7B.py, etc)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Tobi Lutke 2026-01-24 20:26:46 -05:00
parent d32e13c172
commit 32706a720f
No known key found for this signature in database
21 changed files with 2987 additions and 939 deletions

View File

@ -56,10 +56,16 @@ hyde: Authentication can be configured by setting the AUTH_SECRET environment va
| Criterion | Points | Deduction |
|-----------|--------|-----------|
| Expansions are relevant to query | +10 | Subjective |
| Base relevance | +10 | Subjective |
| Lex lines preserve key terms from query | +5 | -5 if lex is generic (e.g., "features" without "shop.app") |
| Lex lines are keyword-focused (shorter) | +5 | -2 if lex is longer than vec |
| Vec lines are natural language (complete phrases) | +5 | -2 if vec is just keywords |
**Key Term Preservation Rule**: `lex:` lines MUST contain at least one significant word from the original query (excluding stopwords like "what", "is", "how", "the", etc.).
Bad: `what is shop.app` → `lex: features and benefits`
Good: `what is shop.app` → `lex: shop.app features`
## Score Calculation
```

View File

@ -0,0 +1,39 @@
# GRPO Training Config for QMD Query Expansion v4
# Uses SFT model as starting point with RL fine-tuning
model:
sft: "tobil/qmd-query-expansion-0.6B-v4" # Starting point
base: "Qwen/Qwen3-0.6B" # For tokenizer
output: "tobil/qmd-query-expansion-0.6B-v4-grpo"
dataset:
name: "tobil/qmd-query-expansion-train-v2"
prompt_field: "messages"
max_samples: 2000
training:
epochs: 1
batch_size: 2
gradient_accumulation_steps: 8
learning_rate: 1e-6 # Very low for RL stability
max_grad_norm: 0.5
grpo:
num_generations: 4
max_completion_length: 200
lora:
rank: 4 # Smaller for RL stability
alpha: 8
dropout: 0.05
target_modules:
- "q_proj"
- "v_proj"
tracking:
project: "qmd-query-expansion"
run_name: "grpo-v4-key-term-preservation"
# Note: GRPO v4 failed because the reward function did not enforce the output format strictly enough.
# The model drifted to verbose explanations instead of the lex:/vec:/hyde: format.
# Recommendation: add a hard penalty when the output does not start with a valid prefix.

View File

@ -0,0 +1,38 @@
# SFT Training Config for QMD Query Expansion v4
# Uses Qwen3-0.6B with LoRA, trained with key term preservation
model:
base: "Qwen/Qwen3-0.6B"
output: "tobil/qmd-query-expansion-0.6B-v4"
dataset:
name: "tobil/qmd-query-expansion-train-v2"
text_field: "text"
split: "train"
eval_split: 0.1
training:
epochs: 3
batch_size: 4
gradient_accumulation_steps: 4
learning_rate: 2e-4
max_length: 512
warmup_ratio: 0.03
lr_scheduler: "cosine"
lora:
rank: 16
alpha: 32
dropout: 0.0
target_modules:
- "q_proj"
- "k_proj"
- "v_proj"
- "o_proj"
- "gate_proj"
- "up_proj"
- "down_proj"
tracking:
project: "qmd-query-expansion"
run_name: "sft-v4-key-term-preservation"

View File

@ -1,7 +1,8 @@
{
"dataset_name": "qmd-query-expansion",
"train_samples": 5157,
"val_samples": 573,
"train_samples": 5562,
"val_samples": 618,
"short_query_pct": 26.5,
"columns": [
"prompt",
"completion",

View File

@ -0,0 +1,595 @@
#!/usr/bin/env python3
"""
Data Quality Reviewer for Query Expansion Training Dataset
This script identifies and flags/fixes semantic errors where technical terms
are misunderstood. For example:
- "gem find" expanded as "mineral hunt" instead of "ruby gem search"
- "yarn spin" expanded as "wool twist" instead of "yarn package manager"
The script uses contextual analysis to detect when technical terms
are likely being used in a programming context vs. their everyday meaning.
"""
import json
import re
from pathlib import Path
from dataclasses import dataclass, field
from typing import Optional
from collections import defaultdict
@dataclass
class TechnicalTerm:
    """Describe one ambiguous word and how to recognise its technical sense.

    Attributes:
        term: The ambiguous word itself (e.g., "liquid", "gem", "yarn").
        context_indicators: Words that suggest the technical context.
        wrong_expansions: Patterns that indicate the wrong (everyday)
            interpretation was used.
        correct_domain: The domain the term belongs to when it is technical.
        correct_lex: Correct ``lex`` expansions.
        correct_vec: Correct ``vec`` expansions.
    """

    term: str
    context_indicators: list[str]
    wrong_expansions: list[str]
    correct_domain: str
    correct_lex: list[str]
    correct_vec: list[str]
# Known technical terms that are commonly misunderstood.
#
# Each entry describes an everyday word that also names a technology:
#   - `term` is looked for in the example's input,
#   - `context_indicators` are looked for in the input to confirm tech usage,
#   - `wrong_expansions` are looked for in the example's output,
#   - `correct_domain` / `correct_lex` / `correct_vec` are used to build the
#     suggested replacement output.
KNOWN_TECHNICAL_TERMS = [
    TechnicalTerm(
        term="liquid",
        context_indicators=["shopify", "template", "filter", "tag", "theme", "jekyll"],
        wrong_expansions=["fluid", "water", "pour", "drink", "beverage", "h2o", "wet"],
        correct_domain="Shopify/Jekyll templating language",
        correct_lex=["shopify template syntax", "liquid template filter"],
        correct_vec=["shopify liquid templating language", "liquid template engine filters"],
    ),
    TechnicalTerm(
        term="gem",
        context_indicators=["ruby", "bundler", "install", "gemfile", "rails", "require"],
        wrong_expansions=["mineral", "crystal", "jewel", "stone", "diamond", "jewelry", "precious"],
        correct_domain="Ruby package manager",
        correct_lex=["ruby gem package", "gem install command"],
        correct_vec=["ruby gem package manager", "rubygems library installation"],
    ),
    TechnicalTerm(
        term="yarn",
        context_indicators=["npm", "package", "install", "node", "javascript", "react", "webpack"],
        wrong_expansions=["thread", "wool", "knit", "spin", "textile", "fabric", "sew", "twist"],
        correct_domain="JavaScript package manager",
        correct_lex=["yarn package manager", "yarn install dependencies"],
        correct_vec=["yarn javascript package manager", "yarn npm alternative"],
    ),
    TechnicalTerm(
        term="hook",
        context_indicators=["react", "use", "state", "effect", "component", "callback", "git"],
        wrong_expansions=["fish", "fishing", "bait", "catch", "hang", "pirate"],
        correct_domain="React hooks or Git hooks",
        correct_lex=["react hooks api", "usestate useeffect"],
        correct_vec=["react hooks state management", "react functional component hooks"],
    ),
    TechnicalTerm(
        term="container",
        context_indicators=["docker", "kubernetes", "k8s", "image", "orchestration", "pod"],
        wrong_expansions=["box", "storage", "shipping", "cargo", "tupperware", "jar", "vessel"],
        correct_domain="Docker/Kubernetes containers",
        correct_lex=["docker container", "container image"],
        correct_vec=["docker container virtualization", "container orchestration platform"],
    ),
    TechnicalTerm(
        term="branch",
        context_indicators=["git", "merge", "checkout", "commit", "main", "master", "repo"],
        wrong_expansions=["tree", "limb", "wood", "leaf", "twig", "forest"],
        correct_domain="Git version control",
        correct_lex=["git branch", "git checkout branch"],
        correct_vec=["git branch version control", "git branching workflow"],
    ),
    TechnicalTerm(
        term="decorator",
        context_indicators=["python", "@", "function", "wrapper", "class", "def"],
        wrong_expansions=["interior", "design", "paint", "furniture", "decor", "ornament"],
        correct_domain="Python decorators",
        correct_lex=["python decorator function", "@decorator syntax"],
        correct_vec=["python function decorators", "python decorator pattern"],
    ),
    TechnicalTerm(
        term="bean",
        context_indicators=["java", "spring", "injection", "dependency", "servlet", "ejb"],
        wrong_expansions=["coffee", "food", "vegetable", "legume", "plant", "soy"],
        correct_domain="Java Beans / Spring Beans",
        correct_lex=["java bean class", "spring bean injection"],
        correct_vec=["java enterprise beans", "spring dependency injection beans"],
    ),
    TechnicalTerm(
        term="shell",
        context_indicators=["bash", "script", "terminal", "command", "linux", "unix", "zsh"],
        wrong_expansions=["seashell", "ocean", "beach", "clam", "oyster", "egg"],
        correct_domain="Unix/Linux shell scripting",
        correct_lex=["bash shell script", "shell command"],
        correct_vec=["unix shell scripting", "bash command line shell"],
    ),
    TechnicalTerm(
        term="rust",
        context_indicators=["cargo", "crate", "ownership", "borrow", "lifetime", "unsafe"],
        wrong_expansions=["oxidation", "metal", "corrosion", "decay", "iron", "orange"],
        correct_domain="Rust programming language",
        correct_lex=["rust programming language", "rust cargo package"],
        correct_vec=["rust systems programming", "rust memory safety"],
    ),
    TechnicalTerm(
        term="go",
        context_indicators=["golang", "goroutine", "channel", "defer", "gofmt", "module"],
        wrong_expansions=["travel", "move", "walk", "game", "board game", "leave", "depart"],
        correct_domain="Go programming language",
        correct_lex=["golang programming", "go language syntax"],
        correct_vec=["go programming language", "golang concurrent programming"],
    ),
    TechnicalTerm(
        term="swift",
        context_indicators=["ios", "xcode", "apple", "uikit", "swiftui", "cocoa"],
        wrong_expansions=["fast", "quick", "bird", "speed", "rapid", "taylor"],
        correct_domain="Swift programming language",
        correct_lex=["swift ios development", "swift programming language"],
        correct_vec=["swift apple programming language", "swift ios app development"],
    ),
    TechnicalTerm(
        term="pod",
        context_indicators=["kubernetes", "k8s", "deployment", "service", "cluster", "node"],
        wrong_expansions=["pea", "seed", "plant", "vegetable", "legume", "whale"],
        correct_domain="Kubernetes pods",
        correct_lex=["kubernetes pod", "k8s pod deployment"],
        correct_vec=["kubernetes pod container group", "k8s pod orchestration"],
    ),
    # NOTE: with an empty `wrong_expansions` list this entry can never yield a
    # wrong-expansion match; it only contributes to the term statistics.
    TechnicalTerm(
        term="redis",
        context_indicators=["cache", "database", "key-value", "memory", "pub/sub", "queue"],
        wrong_expansions=[],  # "redis" doesn't have common wrong meanings
        correct_domain="Redis in-memory database",
        correct_lex=["redis cache", "redis database"],
        correct_vec=["redis in-memory data store", "redis caching solution"],
    ),
    TechnicalTerm(
        term="kafka",
        context_indicators=["message", "stream", "queue", "broker", "topic", "producer", "consumer"],
        wrong_expansions=["franz", "author", "writer", "novel", "metamorphosis", "literature"],
        correct_domain="Apache Kafka message queue",
        correct_lex=["apache kafka", "kafka message broker"],
        correct_vec=["apache kafka streaming platform", "kafka message queue"],
    ),
    TechnicalTerm(
        term="elastic",
        context_indicators=["elasticsearch", "search", "index", "kibana", "logstash", "query"],
        wrong_expansions=["stretch", "rubber", "flexible", "band", "bouncy"],
        correct_domain="Elasticsearch",
        correct_lex=["elasticsearch", "elastic search index"],
        correct_vec=["elasticsearch full-text search", "elastic stack"],
    ),
    TechnicalTerm(
        term="spark",
        context_indicators=["apache", "hadoop", "data", "rdd", "dataframe", "pyspark"],
        wrong_expansions=["fire", "ignite", "flame", "plug", "electricity"],
        correct_domain="Apache Spark",
        correct_lex=["apache spark", "spark data processing"],
        correct_vec=["apache spark big data processing", "spark cluster computing"],
    ),
    TechnicalTerm(
        term="flask",
        context_indicators=["python", "web", "route", "api", "jinja", "werkzeug"],
        wrong_expansions=["bottle", "container", "lab", "chemistry", "drink", "thermos"],
        correct_domain="Flask web framework",
        correct_lex=["flask python web framework", "flask api"],
        correct_vec=["flask python web development", "flask microframework"],
    ),
    TechnicalTerm(
        term="django",
        context_indicators=["python", "web", "orm", "model", "view", "template"],
        wrong_expansions=["jazz", "music", "reinhardt", "guitar", "movie", "western"],
        correct_domain="Django web framework",
        correct_lex=["django python framework", "django web development"],
        correct_vec=["django python web framework", "django orm models"],
    ),
    TechnicalTerm(
        term="rails",
        context_indicators=["ruby", "gem", "activerecord", "model", "controller", "migration"],
        wrong_expansions=["train", "track", "railroad", "railway", "metal"],
        correct_domain="Ruby on Rails",
        correct_lex=["ruby on rails", "rails web framework"],
        correct_vec=["ruby on rails framework", "rails mvc architecture"],
    ),
    TechnicalTerm(
        term="node",
        context_indicators=["javascript", "npm", "express", "async", "require", "module"],
        wrong_expansions=["lump", "knot", "bump", "growth", "junction"],
        correct_domain="Node.js",
        correct_lex=["node.js javascript", "nodejs runtime"],
        correct_vec=["node.js javascript runtime", "nodejs server-side javascript"],
    ),
    TechnicalTerm(
        term="maven",
        context_indicators=["java", "pom", "dependency", "build", "artifact", "repository"],
        wrong_expansions=["expert", "specialist", "connoisseur"],
        correct_domain="Apache Maven",
        correct_lex=["apache maven", "maven build tool"],
        correct_vec=["apache maven java build", "maven dependency management"],
    ),
    TechnicalTerm(
        term="gradle",
        context_indicators=["java", "kotlin", "android", "build", "groovy", "task"],
        wrong_expansions=["grade", "slope", "hill", "incline"],
        correct_domain="Gradle build tool",
        correct_lex=["gradle build tool", "gradle android"],
        correct_vec=["gradle java build automation", "gradle kotlin dsl"],
    ),
    TechnicalTerm(
        term="ant",
        context_indicators=["java", "build", "xml", "target", "task"],
        wrong_expansions=["insect", "bug", "colony", "hill", "picnic"],
        correct_domain="Apache Ant build tool",
        correct_lex=["apache ant", "ant build xml"],
        correct_vec=["apache ant java build", "ant build automation"],
    ),
]
@dataclass
class Issue:
    """One problem detected in a single dataset example.

    Attributes:
        line_number: Line of the example in the dataset file.
        input_text: The example's input query.
        output_text: The example's output (callers may pass a shortened
            preview).
        issue_type: Category label for the problem.
        technical_term: The ambiguous term that triggered the issue.
        wrong_expansion_found: The wrong-sense word found in the output.
        suggested_fix: Replacement output text, or ``None`` when no fix is
            proposed.
    """

    line_number: int
    input_text: str
    output_text: str
    issue_type: str
    technical_term: str
    wrong_expansion_found: str
    suggested_fix: Optional[str] = None
@dataclass
class AnalysisResult:
    """Accumulated outcome of scanning a dataset.

    Attributes:
        total_examples: Number of examples counted so far.
        issues_found: Every issue collected across all examples.
        examples_with_correct_tech_terms: ``(line, text)`` pairs; starts
            empty (NOTE(review): nothing visible in this file fills it).
        term_statistics: Per-term counter; missing keys start at zero.
    """

    total_examples: int = 0
    issues_found: list[Issue] = field(default_factory=list)
    examples_with_correct_tech_terms: list[tuple[int, str]] = field(
        default_factory=list
    )
    term_statistics: dict = field(
        default_factory=lambda: defaultdict(int)
    )
def check_for_wrong_expansion(output_text: str, term: TechnicalTerm) -> Optional[str]:
    """Return the first wrong-sense word of ``term`` present in the output.

    Matching is case-insensitive and on whole words (an optional plural
    "s" is tolerated). Plain substring matching produced false positives
    such as "fast" inside "breakfast" or "bug" inside "debug".

    Args:
        output_text: The expansion text to inspect.
        term: The technical term whose ``wrong_expansions`` are searched for.

    Returns:
        The matching entry from ``term.wrong_expansions`` (as written in the
        list), or ``None`` when none of them appears.
    """
    output_lower = output_text.lower()
    for wrong in term.wrong_expansions:
        # Lookarounds instead of \b so multi-word entries ("board game")
        # behave the same as single words.
        pattern = r"(?<!\w)" + re.escape(wrong.lower()) + r"s?(?!\w)"
        if re.search(pattern, output_lower):
            return wrong
    return None
def has_tech_context(input_text: str, term: TechnicalTerm) -> bool:
    """Tell whether the input mentions any of the term's context indicators.

    The comparison is case-insensitive and substring-based.
    """
    haystack = input_text.lower()
    return any(
        marker.lower() in haystack for marker in term.context_indicators
    )
def is_likely_tech_query(input_text: str) -> bool:
    """
    Guess whether a query is tech-related from generic keywords.

    Used for short queries such as "gem find" or "yarn spin", which are
    ambiguous on their own. Returns True when the lowercased input contains
    any of the whole-word keywords below.
    """
    lowered = input_text.lower()
    keyword_groups = (
        r'\b(install|config|setup|build|run|debug|test|deploy|compile)\b',
        r'\b(api|cli|sdk|lib|pkg|npm|pip|cargo)\b',
        r'\b(func|class|method|var|const|let|def)\b',
        r'\b(http|https|url|port|host|server|client)\b',
        r'\b(json|xml|yaml|csv|sql|html|css|js)\b',
    )
    return any(
        re.search(group, lowered) is not None for group in keyword_groups
    )
# Words that, when present in the input, mark an ambiguous term as being used
# in its everyday (non-technical) sense. Keyed by the lowercased term.
_NON_TECH_CONTEXTS = {
    "rust": ["car", "metal", "iron", "steel", "corrosion", "prevention", "remove", "body"],
    "gem": ["gemstone", "jewelry", "jewel", "diamond", "precious", "stone", "cut", "shop", "buy", "wear"],
    "yarn": ["knit", "crochet", "spin", "wool", "thread", "textile", "fabric", "sew", "weave"],
    "hook": ["fishing", "crochet", "hang", "coat", "wall", "ceiling"],
    "container": ["storage", "plastic", "food", "shipping", "cargo", "kitchen", "box"],
    "branch": ["tree", "bank", "library", "store", "office", "organization"],
    "decorator": ["interior", "home", "room", "house", "design", "party", "cake", "wedding"],
    "bean": ["coffee", "soy", "kidney", "black", "green", "garden", "cooking", "food", "plant", "grow"],
    "shell": ["sea", "beach", "egg", "nut", "turtle", "snail", "crab", "clam", "oyster"],
    "spark": ["plug", "fire", "ignite", "car", "engine", "electric", "romance"],
    "go": ["travel", "vacation", "trip", "walk", "run", "leave", "visit", "tour"],
    "swift": ["taylor", "concert", "music", "singer", "speed", "fast", "bird"],
    "pod": ["pea", "whale", "orca", "dolphin", "vegetable", "seed", "plant"],
    "ant": ["insect", "colony", "fire", "carpenter", "pest", "bug", "picnic"],
    "node": ["lymph", "medical", "body", "tree", "network point"],
    "rails": ["train", "railroad", "railway", "track", "transit", "fence"],
    "flask": ["lab", "chemistry", "drink", "hip", "thermos", "bottle", "water"],
    "django": ["jazz", "music", "reinhardt", "guitar", "movie", "western", "unchained"],
    "maven": ["expert", "connoisseur", "specialist", "guru"],
    "gradle": ["grade", "school", "slope"],
    "kafka": ["franz", "author", "novel", "metamorphosis", "literature", "writer", "book"],
    "elastic": ["band", "rubber", "stretch", "flexible", "waist", "fabric"],
}


def has_non_tech_context(input_text: str, term: TechnicalTerm) -> bool:
    """
    Check if the input clearly indicates a non-technical context.

    This helps avoid false positives for queries like "car rust" or
    "yarn spin". Context words are matched case-insensitively as whole words
    (an optional plural "s" is tolerated). Substring matching let "car"
    match "cargo" and "sea" match "search", which wrongly classified real
    technical queries as non-technical.

    Args:
        input_text: The query to inspect.
        term: The ambiguous term; only ``term.term`` is used, as the lookup
            key.

    Returns:
        True when a non-technical context word for this term appears in the
        input; False otherwise, including for terms with no entry.
    """
    input_lower = input_text.lower()
    for context_word in _NON_TECH_CONTEXTS.get(term.term.lower(), ()):
        pattern = r"(?<!\w)" + re.escape(context_word.lower()) + r"s?(?!\w)"
        if re.search(pattern, input_lower):
            return True
    return False
def _term_in_text(term_lower: str, text_lower: str) -> bool:
    """Return True when the term occurs in the text as a whole word (singular or simple plural)."""
    plural = "es" if term_lower.endswith(("s", "x", "z", "ch", "sh")) else "s"
    pattern = rf"(?<!\w){re.escape(term_lower)}(?:{plural})?(?!\w)"
    return re.search(pattern, text_lower) is not None


def analyze_example(line_num: int, input_text: str, output_text: str) -> list[Issue]:
    """Analyze a single example for potential issues.

    For every known technical term that appears in the input as a whole word
    (plural forms included), the output is checked for wrong-sense
    expansions. Whole-word matching replaces a substring test that treated
    "google" as containing "go" and "management" as containing "gem".
    Compound spellings such as "golang" or "nodejs" are therefore no longer
    treated as occurrences of "go" / "node".

    Args:
        line_num: Line number of the example, stored on each issue.
        input_text: The example's input query.
        output_text: The example's expansion output.

    Returns:
        A list of issues, possibly empty. An issue of type
        ``"wrong_tech_expansion"`` (with a suggested fix) is produced when
        the input looks technical; an issue of type ``"ambiguous_term"``
        (no fix) is produced for queries of at most two words in which the
        term is one of the words.
    """
    issues = []
    input_lower = input_text.lower()

    # Loop-invariant values, computed once per example.
    words = input_lower.split()
    word_count = len(words)
    preview = output_text[:200] + "..." if len(output_text) > 200 else output_text

    for term in KNOWN_TECHNICAL_TERMS:
        term_lower = term.term.lower()

        # Check if the input contains this technical term
        if not _term_in_text(term_lower, input_lower):
            continue

        # Check if output has wrong expansion
        wrong_expansion = check_for_wrong_expansion(output_text, term)
        if wrong_expansion is None:
            continue

        # Skip if the context clearly indicates non-technical usage
        if has_non_tech_context(input_text, term):
            continue

        # Determine if this is likely a technical context
        is_tech = has_tech_context(input_text, term) or is_likely_tech_query(input_text)

        if is_tech:
            # Definite tech issue: build a replacement output from the
            # term's reference expansions (falling back to the first entry
            # when only one is defined).
            lex_second = term.correct_lex[1] if len(term.correct_lex) > 1 else term.correct_lex[0]
            vec_second = term.correct_vec[1] if len(term.correct_vec) > 1 else term.correct_vec[0]
            suggested_output = "\n".join([
                f"lex: {term.correct_lex[0]}",
                f"lex: {lex_second}",
                f"vec: {term.correct_vec[0]}",
                f"vec: {vec_second}",
                f"hyde: {term.correct_domain} is a concept that provides functionality for software development.",
            ])
            issues.append(Issue(
                line_number=line_num,
                input_text=input_text,
                output_text=preview,
                issue_type="wrong_tech_expansion",
                technical_term=term.term,
                wrong_expansion_found=wrong_expansion,
                suggested_fix=suggested_output,
            ))
        elif word_count <= 2 and term_lower in words:
            # Very short query with the term as a primary word - truly
            # ambiguous (e.g., "gem find"), so flag it without a fix.
            issues.append(Issue(
                line_number=line_num,
                input_text=input_text,
                output_text=preview,
                issue_type="ambiguous_term",
                technical_term=term.term,
                wrong_expansion_found=wrong_expansion,
                suggested_fix=None,
            ))

    return issues
def analyze_dataset(filepath: Path) -> AnalysisResult:
    """Analyze the entire dataset for issues.

    The file is read as JSON Lines. Blank lines are ignored. Lines that are
    not valid JSON, are not JSON objects, or whose ``input`` / ``output``
    are not strings are reported with a warning and skipped; previously the
    latter two cases raised ``AttributeError`` and aborted the whole run.
    Missing or null ``input`` / ``output`` values are treated as empty.

    Args:
        filepath: Path to the JSONL dataset.

    Returns:
        An ``AnalysisResult`` holding the number of analyzed examples, all
        issues found, and per-term occurrence counts.
    """
    result = AnalysisResult()

    with open(filepath, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue

            # Keep the try body minimal: only parsing can raise here.
            try:
                example = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Warning: Could not parse line {line_num}: {e}")
                continue

            if not isinstance(example, dict):
                print(f"Warning: Skipping line {line_num}: expected a JSON object, got {type(example).__name__}")
                continue

            input_text = example.get('input') or ''
            output_text = example.get('output') or ''
            if not isinstance(input_text, str) or not isinstance(output_text, str):
                print(f"Warning: Skipping line {line_num}: 'input' and 'output' must be strings")
                continue

            result.total_examples += 1

            # Analyze for issues
            result.issues_found.extend(analyze_example(line_num, input_text, output_text))

            # Track term statistics
            input_lower = input_text.lower()
            for term in KNOWN_TECHNICAL_TERMS:
                if term.term.lower() in input_lower:
                    result.term_statistics[term.term] += 1

    return result
def fix_example(example: dict, issues: list[Issue]) -> Optional[dict]:
    """
    Build a corrected copy of an example from its issues.

    Only issues of type "wrong_tech_expansion" that carry a suggested fix
    are considered, and the first such issue wins. The input dict is left
    untouched. Returns None when no issue qualifies.
    """
    chosen = next(
        (
            candidate
            for candidate in issues
            if candidate.issue_type == "wrong_tech_expansion" and candidate.suggested_fix
        ),
        None,
    )
    if chosen is None:
        return None

    return {
        **example,
        'output': chosen.suggested_fix,
        '_fixed': True,
        '_original_output': example['output'],
        '_fix_reason': f"Technical term '{chosen.technical_term}' was incorrectly expanded as '{chosen.wrong_expansion_found}'",
    }
def generate_report(result: AnalysisResult) -> str:
    """Render the analysis result as a plain-text report.

    The report has a header with totals, the issues grouped by type (at most
    ten shown per type), and the per-term occurrence counts in descending
    order.
    """
    heavy_rule = "=" * 70
    light_rule = "-" * 70

    report = [
        heavy_rule,
        "QUERY EXPANSION DATASET QUALITY REPORT",
        heavy_rule,
        "",
        f"Total examples analyzed: {result.total_examples}",
        f"Issues found: {len(result.issues_found)}",
        "",
        light_rule,
        "ISSUES BY TYPE:",
        light_rule,
    ]

    # Group issues by type, keeping first-seen order of the types.
    grouped = {}
    for found in result.issues_found:
        grouped.setdefault(found.issue_type, []).append(found)

    for kind, members in grouped.items():
        report.append(f"\n{kind.upper()}: {len(members)} issues")
        report.append("-" * 40)

        # Show up to 10 examples per type
        for found in members[:10]:
            report.extend([
                f"\n Line {found.line_number}:",
                f" Input: {found.input_text}",
                f" Technical term: '{found.technical_term}'",
                f" Wrong expansion found: '{found.wrong_expansion_found}'",
            ])
            if found.suggested_fix:
                report.append(" Suggested fix available: Yes")

        overflow = len(members) - 10
        if overflow > 0:
            report.append(f"\n ... and {overflow} more")

    # Term statistics, most frequent first (ties keep insertion order).
    report += [
        "\n" + light_rule,
        "TECHNICAL TERM OCCURRENCES IN DATASET:",
        light_rule,
    ]
    ranked = sorted(result.term_statistics.items(), key=lambda pair: pair[1], reverse=True)
    report.extend(f" {name}: {hits} occurrences" for name, hits in ranked if hits > 0)

    report.append("\n" + heavy_rule)
    return "\n".join(report)
def save_cleaned_dataset(filepath: Path, output_path: Path, result: AnalysisResult):
    """Write a cleaned copy of the dataset and report what was changed.

    Each non-blank input line is re-emitted: with its output replaced when a
    fix is available, with ``_flagged`` / ``_flag_reason`` added when it has
    issues but no fix, or unchanged otherwise. Lines that are not valid JSON
    are copied through as-is.

    Returns a ``(fixed_count, flagged_count)`` tuple.
    """
    grouped = defaultdict(list)
    for found in result.issues_found:
        grouped[found.line_number].append(found)

    n_fixed = 0
    n_flagged = 0

    with open(filepath, 'r', encoding='utf-8') as src:
        with open(output_path, 'w', encoding='utf-8') as dst:
            for idx, raw in enumerate(src, 1):
                raw = raw.strip()
                if not raw:
                    continue

                try:
                    record = json.loads(raw)
                except json.JSONDecodeError:
                    # Keep problematic lines as-is
                    dst.write(raw + '\n')
                    continue

                line_issues = grouped.get(idx)
                if line_issues:
                    repaired = fix_example(record, line_issues)
                    if repaired:
                        record = repaired
                        n_fixed += 1
                    else:
                        # Flag but don't fix ambiguous cases
                        record['_flagged'] = True
                        record['_flag_reason'] = f"Ambiguous term '{line_issues[0].technical_term}' may need review"
                        n_flagged += 1

                dst.write(json.dumps(record) + '\n')

    return n_fixed, n_flagged
def main():
    """Run the quality review: analyze, report, and write the cleaned dataset.

    Reads ``data/qmd_expansion.jsonl`` next to this script, prints the report
    and saves it to ``data/quality_report.txt``, and writes the cleaned
    dataset to ``data/qmd_expansion_cleaned.jsonl``.

    Returns 1 when the input file is missing, otherwise 0.
    """
    data_dir = Path(__file__).parent / "data"
    input_path = data_dir / "qmd_expansion.jsonl"
    output_path = data_dir / "qmd_expansion_cleaned.jsonl"
    report_path = data_dir / "quality_report.txt"

    print(f"Analyzing dataset: {input_path}")
    print("-" * 50)

    if not input_path.exists():
        print(f"Error: Input file not found: {input_path}")
        return 1

    # Analyze, then show and persist the report.
    result = analyze_dataset(input_path)
    report = generate_report(result)
    print(report)

    with open(report_path, 'w', encoding='utf-8') as report_file:
        report_file.write(report)
    print(f"\nReport saved to: {report_path}")

    # Write the cleaned dataset.
    fixed_count, flagged_count = save_cleaned_dataset(input_path, output_path, result)
    untouched = result.total_examples - fixed_count - flagged_count
    print(f"\nCleaned dataset saved to: {output_path}")
    print(f" - Examples fixed: {fixed_count}")
    print(f" - Examples flagged for review: {flagged_count}")
    print(f" - Examples unchanged: {untouched}")

    # Summary statistics
    total_issues = len(result.issues_found)
    definite = sum(1 for found in result.issues_found if found.issue_type == "wrong_tech_expansion")
    ambiguous = sum(1 for found in result.issues_found if found.issue_type == "ambiguous_term")

    print("\n" + "=" * 50)
    print("SUMMARY")
    print("=" * 50)
    print(f"Total examples: {result.total_examples}")
    print(f"Total issues found: {total_issues}")
    print(f" - Definite tech term errors: {definite}")
    print(f" - Ambiguous terms needing review: {ambiguous}")

    if total_issues > 0:
        error_rate = total_issues / result.total_examples * 100
        print(f"\nError rate: {error_rate:.2f}%")

    return 0
if __name__ == "__main__":
    # Raise SystemExit directly: the bare exit() helper is installed by the
    # `site` module for interactive use and is not available when Python is
    # started without it (e.g. `python -S`).
    raise SystemExit(main())

View File

@ -0,0 +1,272 @@
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "transformers>=4.45.0",
# "jinja2",
# ]
# ///
"""Prepare QMD query expansion data for training.
See PROMPT_FORMAT.md for format specification.
"""
import argparse
import json
import random
from pathlib import Path
from transformers import AutoTokenizer
# Module-level cache so the tokenizer is loaded at most once per process.
_tokenizer = None


def get_tokenizer():
    """Return the shared "Qwen/Qwen3-0.6B" tokenizer, loading it on first use."""
    global _tokenizer
    if _tokenizer is not None:
        return _tokenizer
    _tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")
    return _tokenizer
# Short queries (single keywords plus a few two-word phrases) that need
# proper expansion examples. Synthetic training examples are generated from
# these via SHORT_TEMPLATES.
SHORT_QUERIES = [
    # Technical keywords
    "auth", "config", "setup", "api", "cache", "log", "test", "debug",
    "deploy", "build", "lint", "format", "migrate", "backup", "restore",
    "docker", "git", "npm", "pip", "brew", "curl", "ssh", "ssl", "tls",
    "cors", "csrf", "jwt", "oauth", "saml", "ldap", "rbac", "acl",
    "crud", "rest", "graphql", "grpc", "websocket", "sse", "http",
    "redis", "mongo", "postgres", "mysql", "sqlite", "elastic", "kafka",
    "nginx", "apache", "caddy", "traefik", "haproxy", "envoy",
    "react", "vue", "angular", "svelte", "solid", "htmx", "alpine",
    "node", "deno", "bun", "python", "rust", "golang", "java", "kotlin",
    "webpack", "vite", "esbuild", "rollup", "parcel", "turbopack",
    "jest", "vitest", "pytest", "mocha", "cypress", "playwright",
    # Common short phrases
    "env vars", "api keys", "error handling", "rate limiting",
    "file upload", "user auth", "db connection", "query params",
    "hot reload", "code split", "tree shake", "lazy load",
]
# Templates for generating short query expansions.
# Each template supplies three "lex" candidates, two "vec" candidates and one
# "hyde" passage; "{q}" is replaced with the query via str.format.
# IMPORTANT: All lex lines MUST include {q} to preserve key terms
SHORT_TEMPLATES = [
    # Configuration / setup
    {
        "lex": ["{q} configuration", "{q} settings", "{q} setup"],
        "vec": ["how to configure {q} in my project", "{q} setup and configuration tutorial"],
        "hyde": "To set up {q}, first install the required dependencies. Then configure the settings in your project configuration file.",
    },
    # Tutorials / getting started
    {
        "lex": ["{q} tutorial", "{q} guide", "{q} basics"],
        "vec": ["beginner guide to {q}", "how to get started with {q}"],
        "hyde": "This guide covers the basics of {q}. Follow the steps below to get started with your first implementation.",
    },
    # Best practices
    {
        "lex": ["{q} best practices", "{q} patterns", "{q} tips"],
        "vec": ["best practices for using {q}", "recommended patterns for {q}"],
        "hyde": "When working with {q}, follow these best practices: use consistent naming, handle errors properly, and document your code.",
    },
    # Troubleshooting
    {
        "lex": ["{q} troubleshooting", "{q} fix", "{q} errors"],
        "vec": ["how to fix {q} errors", "troubleshooting common {q} problems"],
        "hyde": "If you encounter {q} issues, check your configuration first. Common problems include missing dependencies and incorrect settings.",
    },
    # Examples / usage
    {
        "lex": ["{q} examples", "{q} code", "{q} usage"],
        "vec": ["code examples for {q}", "practical {q} implementation examples"],
        "hyde": "Here are some practical examples of {q} in action. Each example demonstrates a common use case with working code.",
    },
]
def truncate_hyde(hyde_text: str, max_len: int = 150) -> str:
    """Truncate hyde to max length, ending at sentence boundary.

    Text that already fits is returned unchanged. Otherwise the first
    ``max_len`` characters are cut back, in order of preference, to:

    1. the cut point itself, when it lands exactly on the end of a sentence;
    2. the last ". " sentence end in the second half of the window;
    3. the last word boundary in the second half of the window, with
       trailing punctuation removed before the closing period is added
       (avoids results ending in ",." or "..");
    4. a hard cut one character short of ``max_len``, plus a period.

    Args:
        hyde_text: The hyde passage to shorten.
        max_len: Maximum length of the result.

    Returns:
        A string of at most ``max_len`` characters.
    """
    if len(hyde_text) <= max_len:
        return hyde_text

    truncated = hyde_text[:max_len]

    # The window may end exactly on a sentence end; rfind(". ") cannot see
    # it because the following space lies just outside the window.
    if truncated.endswith(".") and hyde_text[max_len].isspace():
        return truncated

    last_period = truncated.rfind(". ")
    if last_period > max_len // 2:
        return truncated[:last_period + 1]

    last_space = truncated.rfind(" ")
    if last_space > max_len // 2:
        return truncated[:last_space].rstrip(" ,;:.") + "."

    return truncated[:max_len - 1] + "."
def clean_output(output: str) -> str:
    """Normalize an expansion output.

    Keeps only lines starting with "lex:", "vec:" or "hyde:" (after
    stripping whitespace); hyde text is passed through ``truncate_hyde``.
    Everything else, including blank lines, is dropped.
    """
    kept = []
    for raw in output.strip().split("\n"):
        entry = raw.strip()
        if entry.startswith(("lex:", "vec:")):
            kept.append(entry)
        elif entry.startswith("hyde:"):
            shortened = truncate_hyde(entry[5:].strip())
            kept.append(f"hyde: {shortened}")
    return "\n".join(kept)
def generate_short_example(query: str) -> dict:
    """Build one synthetic training example for a short query.

    A template is picked at random, then two of its lex lines and two of its
    vec lines are sampled; the hyde line is always included. Returns a dict
    with "input" (the query) and "output" (the rendered lines).
    """
    chosen = random.choice(SHORT_TEMPLATES)
    lex_picks = random.sample(chosen["lex"], 2)
    vec_picks = random.sample(chosen["vec"], 2)
    hyde_pattern = chosen["hyde"]

    rendered = [f"lex: {pattern.format(q=query)}" for pattern in lex_picks]
    rendered += [f"vec: {pattern.format(q=query)}" for pattern in vec_picks]
    rendered.append(f"hyde: {hyde_pattern.format(q=query)}")

    return {"input": query, "output": "\n".join(rendered)}
def format_for_training(input_text: str, output_text: str) -> dict:
    """Turn one input/output pair into an SFT record in Qwen chat format.

    Returns a dict with "text" (the chat-templated string) and "messages"
    (the underlying user/assistant message list).
    """
    # The /no_think prefix asks for direct output without thinking mode.
    messages = [
        {"role": "user", "content": f"/no_think Expand this search query: {input_text}"},
        {"role": "assistant", "content": output_text},
    ]

    # Let the tokenizer render the chat format with its special tokens.
    rendered = get_tokenizer().apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=False,
    )

    # Drop the empty "<think>\n\n</think>\n\n" block if the template emitted
    # one - thinking mode is not wanted in the training text.
    rendered = rendered.replace("<think>\n\n</think>\n\n", "")

    return {"text": rendered, "messages": messages}
def main():
    """Clean the raw expansion data, add synthetic short queries, and write training files.

    Command-line options:
        --input      Input JSONL file; each record needs ``input`` and ``output``.
        --output     Directory that receives the generated files.
        --split      Fraction of examples held out for validation.
        --add-short  Synthetic variations generated per missing short query.

    Files written to the output directory: ``train.jsonl``, ``val.jsonl``,
    ``train_chat.jsonl`` (messages only) and ``dataset_info.json``.

    Raises:
        SystemExit: with status 1 when the input file does not exist.
    """
    parser = argparse.ArgumentParser(description="Prepare data for training")
    parser.add_argument("--input", type=str, default="data/qmd_expansion.jsonl", help="Input JSONL file")
    parser.add_argument("--output", type=str, default="data/train", help="Output directory")
    parser.add_argument("--split", type=float, default=0.1, help="Validation split ratio")
    parser.add_argument("--add-short", type=int, default=3, help="Variations per short query to add")
    args = parser.parse_args()

    input_path = Path(args.input)
    output_dir = Path(args.output)
    output_dir.mkdir(parents=True, exist_ok=True)

    if not input_path.exists():
        print(f"Error: Input file not found: {input_path}")
        # Raise SystemExit directly: the exit() helper is injected by the
        # site module and is not guaranteed to exist in every interpreter.
        raise SystemExit(1)

    # Load and clean existing examples
    examples = []
    seen_queries = set()
    long_hyde_count = 0
    # NOTE(review): 160 is compared against the whole raw line (prefix
    # included); confirm it matches the limit truncate_hyde applies.
    long_hyde_threshold = 160
    with open(input_path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            ex = json.loads(line)

            # Clean output (truncate hyde, remove invalid lines)
            original_output = ex["output"]
            ex["output"] = clean_output(original_output)

            # Track how many raw hyde lines were over the length threshold
            long_hyde_count += sum(
                1
                for orig_line in original_output.split("\n")
                if orig_line.strip().startswith("hyde:") and len(orig_line) > long_hyde_threshold
            )

            # Keep only examples that still have both lex and vec content
            if "lex:" in ex["output"] and "vec:" in ex["output"]:
                examples.append(ex)
                seen_queries.add(ex["input"].lower())

    print(f"Loaded and cleaned {len(examples)} examples")
    print(f"Truncated {long_hyde_count} long hyde sections")

    # Count existing short queries
    short_existing = sum(1 for ex in examples if len(ex["input"].split()) <= 2)
    print(f"Existing short queries (1-2 words): {short_existing}")

    # Generate additional short query examples for queries not already present
    new_short = []
    for query in SHORT_QUERIES:
        if query.lower() not in seen_queries:
            new_short.extend(generate_short_example(query) for _ in range(args.add_short))
            seen_queries.add(query.lower())
    print(f"Generated {len(new_short)} new short query examples")

    # Combine and shuffle
    all_examples = examples + new_short
    random.shuffle(all_examples)

    # Format for training
    formatted = [format_for_training(ex["input"], ex["output"]) for ex in all_examples]

    # Split into train/val
    split_idx = int(len(formatted) * (1 - args.split))
    train_data = formatted[:split_idx]
    val_data = formatted[split_idx:]

    # Write train set
    train_path = output_dir / "train.jsonl"
    with open(train_path, "w", encoding="utf-8") as f:
        for item in train_data:
            f.write(json.dumps(item) + "\n")

    # Write validation set
    val_path = output_dir / "val.jsonl"
    with open(val_path, "w", encoding="utf-8") as f:
        for item in val_data:
            f.write(json.dumps(item) + "\n")

    # Write chat format (for TRL)
    chat_path = output_dir / "train_chat.jsonl"
    with open(chat_path, "w", encoding="utf-8") as f:
        for item in train_data:
            f.write(json.dumps({"messages": item["messages"]}) + "\n")

    # Stats
    short_final = sum(1 for ex in all_examples if len(ex["input"].split()) <= 2)
    # Guard the percentage: an empty input with --add-short 0 yields no examples.
    short_pct = 100 * short_final / len(all_examples) if all_examples else 0.0
    print("\n=== Summary ===")
    print(f"Total examples: {len(all_examples)}")
    print(f"Short queries: {short_final} ({short_pct:.1f}%)")
    print(f"Train: {len(train_data)}, Val: {len(val_data)}")
    print(f"Output: {output_dir}")

    # Dataset info
    dataset_info = {
        "dataset_name": "qmd-query-expansion",
        "train_samples": len(train_data),
        "val_samples": len(val_data),
        "short_query_pct": round(short_pct, 1),
        # Only the fields format_for_training actually produces are listed.
        "columns": ["text", "messages"],
    }
    with open(output_dir / "dataset_info.json", "w", encoding="utf-8") as f:
        json.dump(dataset_info, f, indent=2)


if __name__ == "__main__":
    main()

View File

@ -1,169 +0,0 @@
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "transformers>=4.45.0",
# "torch",
# "huggingface_hub",
# "accelerate",
# ]
# ///
"""
Evaluate base model (untrained) for comparison.
"""
import json
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# Test queries covering different QMD use cases.
# main() evaluates only the first --num-queries entries, so order matters.
TEST_QUERIES = [
# Technical topic / how-to queries
"how to configure authentication",
"typescript async await",
"docker compose networking",
"git rebase vs merge",
"react useEffect cleanup",
# Single-word queries
"auth",
"config",
"setup",
"api",
# Notes and planning queries
"meeting notes project kickoff",
"ideas for new feature",
"todo list app architecture",
# Conceptual and comparison questions
"what is dependency injection",
"difference between sql and nosql",
"kubernetes vs docker swarm",
# Error and debugging queries
"connection timeout error",
"memory leak debugging",
"cors error fix",
# Longer multi-term queries
"how to implement caching with redis in nodejs",
"best practices for api rate limiting",
"setting up ci cd pipeline with github actions",
]
# Plain-text prompt (no chat template); {query} is filled in with str.format
# by generate_expansion().
PROMPT_TEMPLATE = """Expand this search query:
{query}"""
def load_model(model_name: str):
    """Load the tokenizer and causal LM for ``model_name`` with no adapter applied.

    The tokenizer's pad token falls back to its EOS token when unset. The
    model is loaded in bfloat16 with ``device_map="auto"`` and put into eval
    mode.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    print(f"Loading tokenizer and model from {model_name}...")

    tok = AutoTokenizer.from_pretrained(model_name)
    if tok.pad_token is None:
        # Reuse EOS as padding when the tokenizer defines no pad token.
        tok.pad_token = tok.eos_token

    load_options = {"torch_dtype": torch.bfloat16, "device_map": "auto"}
    lm = AutoModelForCausalLM.from_pretrained(model_name, **load_options)
    lm.eval()

    return lm, tok
def generate_expansion(model, tokenizer, query: str, max_new_tokens: int = 200) -> str:
    """Sample a query expansion from the model for ``query``.

    Args:
        model: Causal LM exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        query: Search query inserted into ``PROMPT_TEMPLATE``.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The generated continuation only (prompt excluded), stripped of
        surrounding whitespace.
    """
    prompt = PROMPT_TEMPLATE.format(query=query)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_token_count = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # The returned sequence starts with the prompt tokens. Drop them by token
    # count instead of slicing the decoded string by len(prompt): decoding is
    # not guaranteed to reproduce the prompt text character for character.
    generated_tokens = outputs[0][prompt_token_count:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
def evaluate_expansion(query: str, expansion: str) -> dict:
    """Compute basic format metrics for a generated expansion.

    Blank lines are ignored, so an empty expansion reports zero lines.

    Args:
        query: The original query (currently unused; kept for the interface).
        expansion: Raw model output to inspect.

    Returns:
        A dict with:
            has_lex / has_vec / has_hyde: whether any line starts with that prefix.
            valid_lines: number of lines starting with one of the three prefixes.
            total_lines: number of non-blank lines.
            unique_contents: number of distinct lowercased texts found after
                the first ":" of each line that contains one.
            format_score: fraction (0 to 1) of the three prefixes present.
    """
    prefixes = ("lex:", "vec:", "hyde:")
    # Work on trimmed, non-blank lines only so blank separators do not inflate
    # the line count.
    lines = [entry.strip() for entry in expansion.split("\n") if entry.strip()]

    has_lex = any(entry.startswith("lex:") for entry in lines)
    has_vec = any(entry.startswith("vec:") for entry in lines)
    has_hyde = any(entry.startswith("hyde:") for entry in lines)
    valid_lines = sum(1 for entry in lines if entry.startswith(prefixes))

    # Text after the first colon, normalized for duplicate detection.
    contents = {
        entry.split(":", 1)[1].strip().lower()
        for entry in lines
        if ":" in entry
    }

    return {
        "has_lex": has_lex,
        "has_vec": has_vec,
        "has_hyde": has_hyde,
        "valid_lines": valid_lines,
        "total_lines": len(lines),
        "unique_contents": len(contents),
        "format_score": (has_lex + has_vec + has_hyde) / 3,
    }
def main():
    """Run the untrained base model over the test queries and report format metrics.

    Command-line options:
        --model        Base model to evaluate.
        --output       JSON file that receives the per-query results.
        --num-queries  How many entries from the start of TEST_QUERIES to run.

    Prints each expansion (truncated to 500 characters) with its format
    check, then a summary, and writes all results to the output file.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default="Qwen/Qwen3-0.6B",
                        help="Base model to evaluate")
    parser.add_argument("--output", default="evaluation_baseline.json",
                        help="Output file for results")
    parser.add_argument("--num-queries", type=int, default=5,
                        help="Number of queries to test (for quick baseline)")
    args = parser.parse_args()

    model, tokenizer = load_model(args.model)
    queries = TEST_QUERIES[:args.num_queries]
    results = []

    print(f"\n{'='*70}")
    print("BASELINE EVALUATION RESULTS")
    print(f"{'='*70}\n")

    for i, query in enumerate(queries, 1):
        print(f"[{i}/{len(queries)}] Query: {query}")
        print("-" * 50)
        expansion = generate_expansion(model, tokenizer, query)
        metrics = evaluate_expansion(query, expansion)

        # Keep console output readable for very long generations.
        preview = expansion[:500] + "..." if len(expansion) > 500 else expansion
        print(preview)

        # Both branches of the original marker were empty strings, so the
        # indicator never showed anything; print an explicit pass/fail mark.
        marker = "✓" if metrics["format_score"] == 1.0 else "✗"
        print(f"\n Format: {marker} "
              f"(lex:{metrics['has_lex']}, vec:{metrics['has_vec']}, hyde:{metrics['has_hyde']})")
        print()

        results.append({
            "query": query,
            "expansion": expansion,
            "metrics": metrics,
        })

    print(f"\n{'='*70}")
    print("SUMMARY")
    print(f"{'='*70}")
    print(f" Total queries: {len(results)}")
    if results:
        avg_format = sum(r["metrics"]["format_score"] for r in results) / len(results)
        full_format = sum(1 for r in results if r["metrics"]["format_score"] == 1.0)
        print(f" Average format score: {avg_format:.2%}")
        print(f" Full format compliance: {full_format}/{len(results)} ({full_format/len(results):.0%})")
    else:
        # --num-queries 0 leaves nothing to average; avoid dividing by zero.
        print(" No queries were evaluated.")

    with open(args.output, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    print(f"\n Results saved to: {args.output}")


if __name__ == "__main__":
    main()

View File

@ -52,9 +52,8 @@ TEST_QUERIES = [
"setting up ci cd pipeline with github actions",
]
PROMPT_TEMPLATE = """Expand this search query:
{query}"""
# Prompt is generated via tokenizer.apply_chat_template() - see generate_expansion()
# Don't manually construct <|im_start|> tags
# Common English function words.
# NOTE(review): presumably the set the scoring helpers skip when they talk
# about "excluding stopwords"; its uses are outside this view, so confirm.
STOPWORDS = {'the', 'a', 'an', 'is', 'are', 'to', 'for', 'of', 'in', 'and', 'or', 'it', 'this', 'that', 'be', 'with', 'as', 'on', 'by'}
@ -110,6 +109,23 @@ def echoes_query(expansion: str, query: str) -> bool:
return False
def get_key_terms(query: str) -> set:
"""Extract key terms from query (excluding stopwords)."""
stopwords = {'what', 'is', 'how', 'to', 'the', 'a', 'an', 'in', 'on', 'for', 'of',
'and', 'or', 'with', 'my', 'your', 'do', 'does', 'can', 'i', 'me', 'we'}
words = set(query.lower().split())
return words - stopwords
def lex_preserves_key_terms(lex_line: str, query: str) -> bool:
    """Check whether a lex line contains at least one key term from the query.

    Whole whitespace-separated words are compared first. If none match, both
    sides are compared again on their alphanumeric pieces, so a lex line such
    as "docker-compose config" still counts as preserving "docker" from the
    query "docker compose networking".

    Args:
        lex_line: Text of a ``lex:`` line, without the prefix.
        query: The original search query.

    Returns:
        True when a key term is preserved, or when the query has no key
        terms at all; False otherwise.
    """
    key_terms = get_key_terms(query)
    if not key_terms:  # Query consists only of stopwords
        return True

    if key_terms & set(lex_line.lower().split()):
        return True

    # Fallback: ignore hyphens and other punctuation on both sides. The
    # split query goes back through get_key_terms so stopwords stay excluded;
    # one-character fragments (e.g. the "s" from "what's") are dropped.
    split_query = " ".join(re.findall(r"\w+", query.lower()))
    sub_terms = {term for term in get_key_terms(split_query) if len(term) > 1}
    lex_tokens = set(re.findall(r"\w+", lex_line.lower()))
    return bool(sub_terms & lex_tokens)
def word_repetition_penalty(text: str) -> int:
"""Count penalty for repeated words (excluding stopwords)."""
words = re.findall(r'\b\w+\b', text.lower())
@ -121,12 +137,56 @@ def word_repetition_penalty(text: str) -> int:
return penalty
def is_continuation(expansion: str) -> bool:
    """Detect output that continues the query as prose instead of expanding it.

    A proper expansion begins with a ``lex:``, ``vec:`` or ``hyde:`` line.
    Anything else, including empty or whitespace-only output, is treated as
    a continuation.

    Args:
        expansion: Raw model output.

    Returns:
        True when the first non-empty line lacks a valid prefix (or there is
        no content at all), False otherwise.
    """
    valid_prefixes = ("lex:", "vec:", "hyde:")

    text = expansion.strip()
    if not text:
        return True

    # After stripping, the first line is necessarily the first non-empty
    # line, so one prefix check decides the outcome.
    first_line = text.split("\n", 1)[0].strip()
    return not first_line.startswith(valid_prefixes)
def score_expansion(query: str, expansion: str) -> dict:
"""
Score an expansion based on SCORING.md criteria.
Returns dict with score breakdown and total (0-100).
"""
# HARD FAIL: Continuation detection
if is_continuation(expansion):
return {
"format": 0,
"diversity": 0,
"hyde": 0,
"quality": 0,
"total": 0,
"max_possible": 80,
"percentage": 0,
"rating": "Failed",
"deductions": ["CONTINUATION DETECTED - output does not start with lex:/vec:/hyde:"],
"parsed": {"lex": [], "vec": [], "hyde": [], "invalid": [expansion[:100]]},
"is_continuation": True,
}
parsed = parse_expansion(expansion)
scores = {
"format": 0,
@ -264,7 +324,17 @@ def score_expansion(query: str, expansion: str) -> dict:
quality_score += 2
scores["deductions"].append("some vec lines too short/keyword-like")
scores["quality"] = quality_score
# Lex lines must preserve key terms from query (not be generic)
if parsed["lex"]:
lex_with_terms = sum(1 for l in parsed["lex"] if lex_preserves_key_terms(l, query))
if lex_with_terms == len(parsed["lex"]):
quality_score += 5
elif lex_with_terms > 0:
quality_score += 2
else:
scores["deductions"].append("lex lines too generic - missing key terms from query")
scores["quality"] = min(20, quality_score) # Cap at 20
# === TOTAL ===
scores["total"] = scores["format"] + scores["diversity"] + scores["hyde"] + scores["quality"]
@ -285,6 +355,7 @@ def score_expansion(query: str, expansion: str) -> dict:
scores["rating"] = "Failed"
scores["parsed"] = parsed
scores["is_continuation"] = False
return scores
@ -310,8 +381,14 @@ def load_model(model_name: str, base_model: str = "Qwen/Qwen3-0.6B"):
def generate_expansion(model, tokenizer, query: str, max_new_tokens: int = 200) -> str:
"""Generate query expansion."""
prompt = PROMPT_TEMPLATE.format(query=query)
"""Generate query expansion using proper Qwen3 chat template."""
# Use tokenizer's chat template with /no_think to disable thinking mode
messages = [{"role": "user", "content": f"/no_think Expand this search query: {query}"}]
prompt = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
@ -325,8 +402,25 @@ def generate_expansion(model, tokenizer, query: str, max_new_tokens: int = 200)
eos_token_id=tokenizer.eos_token_id,
)
# Decode and extract expansion
# skip_special_tokens=True strips <|im_start|> etc, leaving "user\n...\nassistant\n..."
full_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
expansion = full_output[len(prompt):].strip()
# Extract assistant response
if "\nassistant\n" in full_output:
expansion = full_output.split("\nassistant\n")[-1].strip()
elif "assistant\n" in full_output:
expansion = full_output.split("assistant\n")[-1].strip()
else:
# Fallback: strip prompt length
expansion = full_output[len(prompt):].strip()
# Remove any <think> tags that might remain
if expansion.startswith("<think>"):
# Find end of thinking block
think_end = expansion.find("</think>")
if think_end != -1:
expansion = expansion[think_end + 8:].strip()
return expansion

664
finetune/evaluation_v4.json Normal file
View File

@ -0,0 +1,664 @@
[
{
"query": "how to configure authentication",
"expansion": "lex: authentication settings\nlex: user login\nvec: authentication settings\nvec: user login\nhyde: To configure authentication, start by reviewing the requirements and dependencies. Configure authentication settings in your project configuration.",
"scores": {
"format": 30,
"diversity": 30,
"hyde": 20,
"quality": 19,
"deductions": [
"some vec lines too short/keyword-like"
],
"total": 99,
"max_possible": 100,
"percentage": 99.0,
"rating": "Excellent",
"is_continuation": false
},
"parsed": {
"lex": [
"authentication settings",
"user login"
],
"vec": [
"authentication settings",
"user login"
],
"hyde": [
"To configure authentication, start by reviewing the requirements and dependencies. Configure authentication settings in your project configuration."
],
"invalid": []
}
},
{
"query": "typescript async await",
"expansion": "lex: asynchronous await\nlex: await typescript\nvec: asynchronous await\nvec: await typescript\nhyde: Typescript async await is an important concept that relates to asynchronous await. It provides functionality for various use cases in software development.",
"scores": {
"format": 30,
"diversity": 28,
"hyde": 20,
"quality": 20,
"deductions": [
"vec duplicates: 'asynchronous await...' ~ 'await typescript...'"
],
"total": 98,
"max_possible": 100,
"percentage": 98.0,
"rating": "Excellent",
"is_continuation": false
},
"parsed": {
"lex": [
"asynchronous await",
"await typescript"
],
"vec": [
"asynchronous await",
"await typescript"
],
"hyde": [
"Typescript async await is an important concept that relates to asynchronous await. It provides functionality for various use cases in software development."
],
"invalid": []
}
},
{
"query": "docker compose networking",
"expansion": "lex: docker-compose config\nlex: docker-compose settings\nvec: docker-compose config\nvec: docker-compose settings\nhyde: Docker compose networking is an important concept that relates to docker-compose settings. It provides functionality for various use cases in software development.",
"scores": {
"format": 30,
"diversity": 28,
"hyde": 20,
"quality": 20,
"deductions": [
"vec duplicates: 'docker-compose confi...' ~ 'docker-compose setti...'",
"lex lines too generic - missing key terms from query"
],
"total": 98,
"max_possible": 100,
"percentage": 98.0,
"rating": "Excellent",
"is_continuation": false
},
"parsed": {
"lex": [
"docker-compose config",
"docker-compose settings"
],
"vec": [
"docker-compose config",
"docker-compose settings"
],
"hyde": [
"Docker compose networking is an important concept that relates to docker-compose settings. It provides functionality for various use cases in software development."
],
"invalid": []
}
},
{
"query": "git rebase vs merge",
"expansion": "lex: difference between rebase\nlex: what's the\nvec: difference between rebase and merge\nvec: what's the difference between git rebase and merge\nhyde: The topic of git rebase vs merge covers what's the difference between rebase and merge.",
"scores": {
"format": 30,
"diversity": 30,
"hyde": 20,
"quality": 20,
"deductions": [],
"total": 100,
"max_possible": 100,
"percentage": 100.0,
"rating": "Excellent",
"is_continuation": false
},
"parsed": {
"lex": [
"difference between rebase",
"what's the"
],
"vec": [
"difference between rebase and merge",
"what's the difference between git rebase and merge"
],
"hyde": [
"The topic of git rebase vs merge covers what's the difference between rebase and merge."
],
"invalid": []
}
},
{
"query": "react useEffect cleanup",
"expansion": "lex: react cleanup code\nlex: react cleanup example\nvec: code examples for react useEffect cleanup\nvec: practical useEffect cleanup implementation examples\nhyde: Here are some practical examples of useEffect cleanup in action. Each example demonstrates a common use case with working code.",
"scores": {
"format": 30,
"diversity": 30,
"hyde": 20,
"quality": 20,
"deductions": [],
"total": 100,
"max_possible": 100,
"percentage": 100.0,
"rating": "Excellent",
"is_continuation": false
},
"parsed": {
"lex": [
"react cleanup code",
"react cleanup example"
],
"vec": [
"code examples for react useEffect cleanup",
"practical useEffect cleanup implementation examples"
],
"hyde": [
"Here are some practical examples of useEffect cleanup in action. Each example demonstrates a common use case with working code."
],
"invalid": []
}
},
{
"query": "auth",
"expansion": "lex: access\nlex: login\nvec: access\nvec: login\nhyde: The topic of auth covers sign in. Proper implementation follows established patterns and best practices.",
"scores": {
"format": 30,
"diversity": 28,
"hyde": 20,
"quality": 17,
"deductions": [
"vec duplicates: 'access...' ~ 'login...'",
"some vec lines too short/keyword-like",
"lex lines too generic - missing key terms from query"
],
"total": 95,
"max_possible": 100,
"percentage": 95.0,
"rating": "Excellent",
"is_continuation": false
},
"parsed": {
"lex": [
"access",
"login"
],
"vec": [
"access",
"login"
],
"hyde": [
"The topic of auth covers sign in. Proper implementation follows established patterns and best practices."
],
"invalid": []
}
},
{
"query": "config",
"expansion": "lex: config fix\nlex: config error\nvec: troubleshooting common config problems\nvec: how to fix config errors\nhyde: If you encounter config issues, check your configuration first. Common problems include missing dependencies and incorrect settings.",
"scores": {
"format": 30,
"diversity": 25,
"hyde": 20,
"quality": 20,
"deductions": [
"echoes query: 'config fix...'",
"echoes query: 'config error...'"
],
"total": 95,
"max_possible": 100,
"percentage": 95.0,
"rating": "Excellent",
"is_continuation": false
},
"parsed": {
"lex": [
"config fix",
"config error"
],
"vec": [
"troubleshooting common config problems",
"how to fix config errors"
],
"hyde": [
"If you encounter config issues, check your configuration first. Common problems include missing dependencies and incorrect settings."
],
"invalid": []
}
},
{
"query": "setup",
"expansion": "lex: install\nlex: setup guide\nvec: install setup tutorial\nvec: setup guide\nhyde: This guide covers setup configuration and usage. Follow the steps below to get started with your first implementation.",
"scores": {
"format": 30,
"diversity": 25,
"hyde": 20,
"quality": 19,
"deductions": [
"echoes query: 'setup guide...'",
"echoes query: 'setup guide...'",
"some vec lines too short/keyword-like"
],
"total": 94,
"max_possible": 100,
"percentage": 94.0,
"rating": "Excellent",
"is_continuation": false
},
"parsed": {
"lex": [
"install",
"setup guide"
],
"vec": [
"install setup tutorial",
"setup guide"
],
"hyde": [
"This guide covers setup configuration and usage. Follow the steps below to get started with your first implementation."
],
"invalid": []
}
},
{
"query": "api",
"expansion": "lex: api troubleshooting\nlex: api fixes\nvec: troubleshooting common api problems\nvec: how to fix api errors\nhyde: If you encounter api issues, check your configuration first. Common problems include missing dependencies and incorrect settings.",
"scores": {
"format": 30,
"diversity": 25,
"hyde": 20,
"quality": 20,
"deductions": [
"echoes query: 'api fixes...'"
],
"total": 95,
"max_possible": 100,
"percentage": 95.0,
"rating": "Excellent",
"is_continuation": false
},
"parsed": {
"lex": [
"api troubleshooting",
"api fixes"
],
"vec": [
"troubleshooting common api problems",
"how to fix api errors"
],
"hyde": [
"If you encounter api issues, check your configuration first. Common problems include missing dependencies and incorrect settings."
],
"invalid": []
}
},
{
"query": "meeting notes project kickoff",
"expansion": "lex: invite attendees to\nlex: agenda setup and\nvec: invite attendees to organize a meeting notes project\nvec: agenda setup and preparation for meeting notes\nhyde: Understanding meeting notes project kickoff is essential for modern development. Key aspects include agenda setup and preparation for.",
"scores": {
"format": 30,
"diversity": 30,
"hyde": 20,
"quality": 20,
"deductions": [
"lex lines too generic - missing key terms from query"
],
"total": 100,
"max_possible": 100,
"percentage": 100.0,
"rating": "Excellent",
"is_continuation": false
},
"parsed": {
"lex": [
"invite attendees to",
"agenda setup and"
],
"vec": [
"invite attendees to organize a meeting notes project",
"agenda setup and preparation for meeting notes"
],
"hyde": [
"Understanding meeting notes project kickoff is essential for modern development. Key aspects include agenda setup and preparation for."
],
"invalid": []
}
},
{
"query": "ideas for new feature",
"expansion": "lex: brainstorm new enhancements\nlex: how to develop\nvec: brainstorm new enhancements\nvec: how to develop a new feature idea\nhyde: Ideas for new feature is an important concept that relates to how to develop a new feature idea.",
"scores": {
"format": 30,
"diversity": 30,
"hyde": 20,
"quality": 20,
"deductions": [],
"total": 100,
"max_possible": 100,
"percentage": 100.0,
"rating": "Excellent",
"is_continuation": false
},
"parsed": {
"lex": [
"brainstorm new enhancements",
"how to develop"
],
"vec": [
"brainstorm new enhancements",
"how to develop a new feature idea"
],
"hyde": [
"Ideas for new feature is an important concept that relates to how to develop a new feature idea."
],
"invalid": []
}
},
{
"query": "todo list app architecture",
"expansion": "lex: task manager design\nlex: list app structure\nvec: task manager design\nvec: list app structure\nhyde: Todo list app architecture is an important concept that relates to task manager design.",
"scores": {
"format": 30,
"diversity": 30,
"hyde": 20,
"quality": 20,
"deductions": [],
"total": 100,
"max_possible": 100,
"percentage": 100.0,
"rating": "Excellent",
"is_continuation": false
},
"parsed": {
"lex": [
"task manager design",
"list app structure"
],
"vec": [
"task manager design",
"list app structure"
],
"hyde": [
"Todo list app architecture is an important concept that relates to task manager design."
],
"invalid": []
}
},
{
"query": "what is dependency injection",
"expansion": "lex: definition of dependency\nlex: how to use\nvec: definition of dependency injection and its benefits\nvec: how to use dependency injection in code\nhyde: Dependency injection is defined as examples of dependency injection in practice. This plays a crucial role in modern development practices.",
"scores": {
"format": 30,
"diversity": 30,
"hyde": 20,
"quality": 20,
"deductions": [],
"total": 100,
"max_possible": 100,
"percentage": 100.0,
"rating": "Excellent",
"is_continuation": false
},
"parsed": {
"lex": [
"definition of dependency",
"how to use"
],
"vec": [
"definition of dependency injection and its benefits",
"how to use dependency injection in code"
],
"hyde": [
"Dependency injection is defined as examples of dependency injection in practice. This plays a crucial role in modern development practices."
],
"invalid": []
}
},
{
"query": "difference between sql and nosql",
"expansion": "lex: distinctions between sql\nlex: sql vs nosql\nvec: distinctions between sql and nosql\nvec: sql vs nosql\nhyde: The topic of difference between sql and nosql covers distinctions between sql and nosql. Proper implementation follows established patterns and best practices.",
"scores": {
"format": 30,
"diversity": 30,
"hyde": 20,
"quality": 20,
"deductions": [
"some vec lines too short/keyword-like"
],
"total": 100,
"max_possible": 100,
"percentage": 100.0,
"rating": "Excellent",
"is_continuation": false
},
"parsed": {
"lex": [
"distinctions between sql",
"sql vs nosql"
],
"vec": [
"distinctions between sql and nosql",
"sql vs nosql"
],
"hyde": [
"The topic of difference between sql and nosql covers distinctions between sql and nosql. Proper implementation follows established patterns and best practices."
],
"invalid": []
}
},
{
"query": "kubernetes vs docker swarm",
"expansion": "lex: docker vs\nlex: kubernetes and\nvec: docker vs kubernetes\nvec: kubernetes and docker comparison\nhyde: The topic of kubernetes vs docker swarm covers comparison between kubernetes and docker.",
"scores": {
"format": 30,
"diversity": 30,
"hyde": 20,
"quality": 20,
"deductions": [],
"total": 100,
"max_possible": 100,
"percentage": 100.0,
"rating": "Excellent",
"is_continuation": false
},
"parsed": {
"lex": [
"docker vs",
"kubernetes and"
],
"vec": [
"docker vs kubernetes",
"kubernetes and docker comparison"
],
"hyde": [
"The topic of kubernetes vs docker swarm covers comparison between kubernetes and docker."
],
"invalid": []
}
},
{
"query": "connection timeout error",
"expansion": "lex: error handling for\nlex: connection timeout\nvec: error handling for connection timeouts\nvec: connection timeout handling in software\nhyde: If you encounter connection timeout error, verify that the issue is resolved by checking the configuration.",
"scores": {
"format": 30,
"diversity": 30,
"hyde": 20,
"quality": 20,
"deductions": [],
"total": 100,
"max_possible": 100,
"percentage": 100.0,
"rating": "Excellent",
"is_continuation": false
},
"parsed": {
"lex": [
"error handling for",
"connection timeout"
],
"vec": [
"error handling for connection timeouts",
"connection timeout handling in software"
],
"hyde": [
"If you encounter connection timeout error, verify that the issue is resolved by checking the configuration."
],
"invalid": []
}
},
{
"query": "memory leak debugging",
"expansion": "lex: troubleshooting common memory\nlex: memory leak fix\nvec: troubleshooting common memory leaks\nvec: memory leak fix\nhyde: Memory leak debugging is an important concept that relates to memory leak fix. Proper implementation follows established patterns and best practices.",
"scores": {
"format": 30,
"diversity": 30,
"hyde": 20,
"quality": 20,
"deductions": [
"some vec lines too short/keyword-like"
],
"total": 100,
"max_possible": 100,
"percentage": 100.0,
"rating": "Excellent",
"is_continuation": false
},
"parsed": {
"lex": [
"troubleshooting common memory",
"memory leak fix"
],
"vec": [
"troubleshooting common memory leaks",
"memory leak fix"
],
"hyde": [
"Memory leak debugging is an important concept that relates to memory leak fix. Proper implementation follows established patterns and best practices."
],
"invalid": []
}
},
{
"query": "cors error fix",
"expansion": "lex: cors fix\nlex: cors error handling\nvec: how to fix cors errors\nvec: common cors troubleshooting problems\nhyde: If you encounter cors error fix issues, check your configuration first. Common problems include missing dependencies and incorrect settings.",
"scores": {
"format": 30,
"diversity": 30,
"hyde": 20,
"quality": 20,
"deductions": [],
"total": 100,
"max_possible": 100,
"percentage": 100.0,
"rating": "Excellent",
"is_continuation": false
},
"parsed": {
"lex": [
"cors fix",
"cors error handling"
],
"vec": [
"how to fix cors errors",
"common cors troubleshooting problems"
],
"hyde": [
"If you encounter cors error fix issues, check your configuration first. Common problems include missing dependencies and incorrect settings."
],
"invalid": []
}
},
{
"query": "how to implement caching with redis in nodejs",
"expansion": "lex: steps for setting\nlex: guide to configuring\nvec: steps for setting up redis caching in nodejs\nvec: guide to configuring redis caching in nodejs\nhyde: To implement caching with redis in nodejs, first configure the redis settings in your project configuration.",
"scores": {
"format": 30,
"diversity": 30,
"hyde": 20,
"quality": 20,
"deductions": [
"lex lines too generic - missing key terms from query"
],
"total": 100,
"max_possible": 100,
"percentage": 100.0,
"rating": "Excellent",
"is_continuation": false
},
"parsed": {
"lex": [
"steps for setting",
"guide to configuring"
],
"vec": [
"steps for setting up redis caching in nodejs",
"guide to configuring redis caching in nodejs"
],
"hyde": [
"To implement caching with redis in nodejs, first configure the redis settings in your project configuration."
],
"invalid": []
}
},
{
"query": "best practices for api rate limiting",
"expansion": "lex: recommendations for implementing\nlex: guidelines for setting\nvec: recommendations for implementing rate limiting in api calls\nvec: guidelines for setting up api rate limiting\nhyde: The process of best practices for api rate limiting involves several steps. First, recommendations for implementing rate limiting in api calls.",
"scores": {
"format": 30,
"diversity": 30,
"hyde": 20,
"quality": 20,
"deductions": [
"lex lines too generic - missing key terms from query"
],
"total": 100,
"max_possible": 100,
"percentage": 100.0,
"rating": "Excellent",
"is_continuation": false
},
"parsed": {
"lex": [
"recommendations for implementing",
"guidelines for setting"
],
"vec": [
"recommendations for implementing rate limiting in api calls",
"guidelines for setting up api rate limiting"
],
"hyde": [
"The process of best practices for api rate limiting involves several steps. First, recommendations for implementing rate limiting in api calls."
],
"invalid": []
}
},
{
"query": "setting up ci cd pipeline with github actions",
"expansion": "lex: how to configure\nlex: guide to configuring\nvec: how to configure github actions for ci cd pipeline\nvec: guide to configuring a github actions ci cd pipeline\nhyde: Setting up ci cd pipeline with github actions requires several steps. First, setup the necessary dependencies.",
"scores": {
"format": 30,
"diversity": 30,
"hyde": 20,
"quality": 20,
"deductions": [
"lex lines too generic - missing key terms from query"
],
"total": 100,
"max_possible": 100,
"percentage": 100.0,
"rating": "Excellent",
"is_continuation": false
},
"parsed": {
"lex": [
"how to configure",
"guide to configuring"
],
"vec": [
"how to configure github actions for ci cd pipeline",
"guide to configuring a github actions ci cd pipeline"
],
"hyde": [
"Setting up ci cd pipeline with github actions requires several steps. First, setup the necessary dependencies."
],
"invalid": []
}
}
]

View File

@ -0,0 +1,548 @@
[
{
"query": "how to configure authentication",
"expansion": "Here is an expanded version of your search query for configuring authentication:\n\n**\"How to configure authentication settings in [specific platform or system]?\"**\n\nIf you have a specific platform or system in mind (such as web apps, mobile apps, APIs, or a database system), please clarify, and I can tailor the query further.",
"scores": {
"format": 0,
"diversity": 0,
"hyde": 0,
"quality": 0,
"total": 0,
"max_possible": 80,
"percentage": 0,
"rating": "Failed",
"deductions": [
"CONTINUATION DETECTED - output does not start with lex:/vec:/hyde:"
],
"is_continuation": true
},
"parsed": {
"lex": [],
"vec": [],
"hyde": [],
"invalid": [
"Here is an expanded version of your search query for configuring authentication:\n\n**\"How to configur"
]
}
},
{
"query": "typescript async await",
"expansion": "The query \"typescript async await\" can be expanded in several ways depending on the context in which it's used:\n\n1. **Programming Language Use**: \n - \"Typescript: Async/await with Await\" \n - \"Typescript Async/await Example\" \n - \"How to use Async/await in Typescript\"\n\n2. **Learning or Documentation**: \n - \"Typescript Async/await: Explained\" \n - \"Async/await in Typescript: Learn and Example\"\n\n3. **Technical Support or Guides**: \n - \"Typescript Async/await: Learn, Use, and Example\" \n - \"Async/await in Typescript: Guide and Example\"\n\n4. **Code Examples**: \n - \"How to use async/await in Typescript\" \n - \"Async/await in Typescript: Code Example\"\n\nIf you need further assistance, feel free to let me know!",
"scores": {
"format": 0,
"diversity": 0,
"hyde": 0,
"quality": 0,
"total": 0,
"max_possible": 80,
"percentage": 0,
"rating": "Failed",
"deductions": [
"CONTINUATION DETECTED - output does not start with lex:/vec:/hyde:"
],
"is_continuation": true
},
"parsed": {
"lex": [],
"vec": [],
"hyde": [],
"invalid": [
"The query \"typescript async await\" can be expanded in several ways depending on the context in which"
]
}
},
{
"query": "docker compose networking",
"expansion": "Here's an expanded version of the search query \"docker compose networking\":\n\n**\"docker compose networking query\"**\n\nThis query can be used to find information related to Docker Compose, networking configurations, or network settings in Docker.",
"scores": {
"format": 0,
"diversity": 0,
"hyde": 0,
"quality": 0,
"total": 0,
"max_possible": 80,
"percentage": 0,
"rating": "Failed",
"deductions": [
"CONTINUATION DETECTED - output does not start with lex:/vec:/hyde:"
],
"is_continuation": true
},
"parsed": {
"lex": [],
"vec": [],
"hyde": [],
"invalid": [
"Here's an expanded version of the search query \"docker compose networking\":\n\n**\"docker compose netwo"
]
}
},
{
"query": "git rebase vs merge",
"expansion": "Here's an expanded version of the search query \"git rebase vs merge\" with more context and explanation:\n\n**\"git rebase vs merge** \nThis question is commonly asked when working with Git, and it's important to understand the difference between **rebase** and **merge** in order to make informed decisions about how to proceed with your Git workflow. \n\n### What is **git rebase**?\n- **git rebase** is a command used to **fix up** and **merge** changes back into a common branch or commit history. \n- It is typically used to apply changes to the current branch, and it often **reverts** or **corrects** changes that have been made to a different branch. \n- It is useful when you want to **correct** or **update** your branch history to match the main branch or a specific commit.\n\n### What is **git merge**?\n- **git merge** is a command used to **",
"scores": {
"format": 0,
"diversity": 0,
"hyde": 0,
"quality": 0,
"total": 0,
"max_possible": 80,
"percentage": 0,
"rating": "Failed",
"deductions": [
"CONTINUATION DETECTED - output does not start with lex:/vec:/hyde:"
],
"is_continuation": true
},
"parsed": {
"lex": [],
"vec": [],
"hyde": [],
"invalid": [
"Here's an expanded version of the search query \"git rebase vs merge\" with more context and explanati"
]
}
},
{
"query": "react useEffect cleanup",
"expansion": "To expand the search query **\"react useEffect cleanup\"**, you can consider the following variations:\n\n### 1. **React Hook Clean Up**\n- **\"react useEffect cleanup code\"**\n- **\"react useEffect cleanup best practices\"**\n- **\"react useEffect cleanup function\"**\n\n### 2. **React Clean Up with useEffect**\n- **\"react cleanup useEffect best\"**\n- **\"react cleanup useEffect code\"**\n- **\"react cleanup useEffect example\"**\n\n### 3. **React Clean Up and useEffect**\n- **\"react cleanup useEffect code\"**\n- **\"react cleanup useEffect code example\"**\n- **\"react cleanup useEffect best practices\"**\n\n### 4. **React Cleanup with useEffect**\n- **\"react cleanup useEffect code\"**\n- **\"react cleanup useEffect example\"**\n\n### 5. **React Clean Up Function with useEffect**\n- **\"react cleanup useEffect function\"**\n- **\"react cleanup useEffect code\"**\n\nThese variations help",
"scores": {
"format": 0,
"diversity": 0,
"hyde": 0,
"quality": 0,
"total": 0,
"max_possible": 80,
"percentage": 0,
"rating": "Failed",
"deductions": [
"CONTINUATION DETECTED - output does not start with lex:/vec:/hyde:"
],
"is_continuation": true
},
"parsed": {
"lex": [],
"vec": [],
"hyde": [],
"invalid": [
"To expand the search query **\"react useEffect cleanup\"**, you can consider the following variations:"
]
}
},
{
"query": "auth",
"expansion": "The search query \"auth\" can be expanded to include various related terms and concepts. Here are some possible expansions:\n\n1. **Authentication** - This refers to the process of verifying a user or system's identity. \n2. **User Authentication** - A specific type of authentication mechanism used to verify user access. \n3. **Secure Authentication** - A method to ensure that only authorized users can access a system or service. \n4. **Authentication and Authorization** - Combining the concepts of authentication and authorization. \n5. **Role-Based Access Control** - A system that determines access rights based on user roles. \n6. **User Verification** - The act of confirming a user's identity. \n7. **Multi-Factor Authentication (MFA)** - An additional layer of security beyond a password. \n8. **API Authentication** - Authentication required for accessing an API. \n9. **OAuth** - A protocol for authorization that allows apps to access user",
"scores": {
"format": 0,
"diversity": 0,
"hyde": 0,
"quality": 0,
"total": 0,
"max_possible": 80,
"percentage": 0,
"rating": "Failed",
"deductions": [
"CONTINUATION DETECTED - output does not start with lex:/vec:/hyde:"
],
"is_continuation": true
},
"parsed": {
"lex": [],
"vec": [],
"hyde": [],
"invalid": [
"The search query \"auth\" can be expanded to include various related terms and concepts. Here are some"
]
}
},
{
"query": "config",
"expansion": "Expanding the search query \"config\" would involve modifying the search to include more context or details. Here are some possible expansions:\n\n1. **config details** - to find information about specific configuration settings \n2. **config settings** - to find configuration parameters \n3. **configurations** - to find configuration options or settings \n4. **config file** - to find configuration files \n5. **config management** - to find configuration management tools or practices \n6. **config file structure** - to find configuration file structures \n7. **config values** - to find configuration values \n8. **config options** - to find configuration options \n9. **config parameters** - to find configuration parameters \n10. **config settings** - to find configuration settings \n\nLet me know if you meant to search for a specific type of configuration!",
"scores": {
"format": 0,
"diversity": 0,
"hyde": 0,
"quality": 0,
"total": 0,
"max_possible": 80,
"percentage": 0,
"rating": "Failed",
"deductions": [
"CONTINUATION DETECTED - output does not start with lex:/vec:/hyde:"
],
"is_continuation": true
},
"parsed": {
"lex": [],
"vec": [],
"hyde": [],
"invalid": [
"Expanding the search query \"config\" would involve modifying the search to include more context or de"
]
}
},
{
"query": "setup",
"expansion": "Expanding the search query **\"setup\"** would involve adding more context or specific terms to provide a more accurate result. Here are some possibilities:\n\n1. **\"Setup a project\"** \n2. **\"Set up a system\"** \n3. **\"Configure a setup\"** \n4. **\"Configuration of a setup\"** \n5. **\"Install a setup\"** \n6. **\"Create a setup\"** \n7. **\"Install or configure a setup\"** \n8. **\"Setup a configuration\"** \n9. **\"Set up a system configuration\"** \n10. **\"Configuration setup\"** \n\nLet me know if you meant a specific task or context!",
"scores": {
"format": 0,
"diversity": 0,
"hyde": 0,
"quality": 0,
"total": 0,
"max_possible": 80,
"percentage": 0,
"rating": "Failed",
"deductions": [
"CONTINUATION DETECTED - output does not start with lex:/vec:/hyde:"
],
"is_continuation": true
},
"parsed": {
"lex": [],
"vec": [],
"hyde": [],
"invalid": [
"Expanding the search query **\"setup\"** would involve adding more context or specific terms to provid"
]
}
},
{
"query": "api",
"expansion": "Certainly! Here's an expanded version of your search query:\n\n**\"api | search api | api details | api documentation | api usage | api integration | api development | api best practices | api implementation | api testing | api troubleshooting | api documentation | api examples | api reference | api installation | api configuration | api security | api optimization | api maintenance | api architecture | api deployment | api testing tools | api monitoring | api performance | api scalability | api troubleshooting | api best practices | api documentation | api examples | api reference | api documentation | api examples | api reference | api documentation | api examples | api reference | api documentation | api examples | api reference | api documentation | api examples | api reference | api documentation | api examples | api reference | api documentation | api examples | api reference | api documentation | api examples | api reference | api documentation | api examples | api reference | api documentation | api examples | api reference | api documentation | api examples | api reference |",
"scores": {
"format": 0,
"diversity": 0,
"hyde": 0,
"quality": 0,
"total": 0,
"max_possible": 80,
"percentage": 0,
"rating": "Failed",
"deductions": [
"CONTINUATION DETECTED - output does not start with lex:/vec:/hyde:"
],
"is_continuation": true
},
"parsed": {
"lex": [],
"vec": [],
"hyde": [],
"invalid": [
"Certainly! Here's an expanded version of your search query:\n\n**\"api | search api | api details | api"
]
}
},
{
"query": "meeting notes project kickoff",
"expansion": "Expanding the search query **\"meeting notes project kickoff\"**, you can refine it to:\n\n- **\"meeting notes for project kickoff\"** \n- **\"project kickoff meeting notes\"** \n- **\"meeting notes project kickoff details\"** \n- **\"meeting notes on project kickoff\"** \n- **\"meeting notes for project launch\"** \n\nThese variations provide different ways to phrase or search for meeting notes related to a project's kickoff event.",
"scores": {
"format": 0,
"diversity": 0,
"hyde": 0,
"quality": 0,
"total": 0,
"max_possible": 80,
"percentage": 0,
"rating": "Failed",
"deductions": [
"CONTINUATION DETECTED - output does not start with lex:/vec:/hyde:"
],
"is_continuation": true
},
"parsed": {
"lex": [],
"vec": [],
"hyde": [],
"invalid": [
"Expanding the search query **\"meeting notes project kickoff\"**, you can refine it to:\n\n- **\"meeting "
]
}
},
{
"query": "ideas for new feature",
"expansion": "Here are some **ideas** for new features you could consider:\n\n1. **User Authentication & Authorization**: Add new methods for users to log in (e.g., social media, email, phone) and manage access.\n\n2. **Real-time Data Integration**: Integrate with APIs or databases to fetch real-time data and update the app accordingly.\n\n3. **Mobile-First Design**: Focus on mobile-friendly UI and functionality, especially for a mobile app.\n\n4. **Customization & Personalization**: Allow users to customize their experience (e.g., themes, preferences).\n\n5. **Multilingual Support**: Offer features to support users from different countries or regions.\n\n6. **Gamification Elements**: Integrate game-like features (e.g., points, badges) to encourage user engagement.\n\n7. **AI & Machine Learning**: Incorporate AI to improve features (e.g., personalized recommendations, chatbots).\n\n8. **Cross-Platform Compatibility**: Ensure the app works seamlessly across different",
"scores": {
"format": 0,
"diversity": 0,
"hyde": 0,
"quality": 0,
"total": 0,
"max_possible": 80,
"percentage": 0,
"rating": "Failed",
"deductions": [
"CONTINUATION DETECTED - output does not start with lex:/vec:/hyde:"
],
"is_continuation": true
},
"parsed": {
"lex": [],
"vec": [],
"hyde": [],
"invalid": [
"Here are some **ideas** for new features you could consider:\n\n1. **User Authentication & Authorizati"
]
}
},
{
"query": "todo list app architecture",
"expansion": "Here's an expanded version of the search query: \n\n**\"Todo list app architecture\"** \n\nThis query seeks information on the structure, design, and implementation of a **todo list application**. It covers topics such as:\n\n- **Architecture Overview** \n- **Key Components** (e.g., UI, backend, data storage) \n- **Design Patterns** used in todo apps \n- **Scalability and performance** \n- **User Interface and User Experience (UIUX)** \n- **Database Systems** (e.g., SQLite, MongoDB) \n- **APIs and Integration** \n- **Security considerations** \n- **Development tools** and frameworks used (e.g., React, Flutter, Node.js) \n\nLet me know if you'd like a breakdown of specific components or case studies!",
"scores": {
"format": 0,
"diversity": 0,
"hyde": 0,
"quality": 0,
"total": 0,
"max_possible": 80,
"percentage": 0,
"rating": "Failed",
"deductions": [
"CONTINUATION DETECTED - output does not start with lex:/vec:/hyde:"
],
"is_continuation": true
},
"parsed": {
"lex": [],
"vec": [],
"hyde": [],
"invalid": [
"Here's an expanded version of the search query: \n\n**\"Todo list app architecture\"** \n\nThis query se"
]
}
},
{
"query": "what is dependency injection",
"expansion": "Dependency injection (DI) is a design pattern used in software development to **independently manage the dependencies** of a class or module. The idea is that the program **involves no direct dependency between classes**, but instead, the implementation of these classes is **provided by a container**\u2014a system that automatically manages the dependencies.\n\n### Key Concepts:\n- **Inversion of Control (IoC)**: This is the core principle of DI. Instead of the programmer defining the dependencies, the container handles the dependency relationships.\n- **DI Containers**: These are the systems that manage the dependencies, such as Spring, Unity, or others.\n- **DI Inverses**: These are the classes that are \"injected\" into other classes, such as a `Person` class that receives an `Email` and `Name`.\n\n### Example:\n```java\nclass Person {\n private String name;\n private Email email;\n\n public Person(String name, Email email) {",
"scores": {
"format": 0,
"diversity": 0,
"hyde": 0,
"quality": 0,
"total": 0,
"max_possible": 80,
"percentage": 0,
"rating": "Failed",
"deductions": [
"CONTINUATION DETECTED - output does not start with lex:/vec:/hyde:"
],
"is_continuation": true
},
"parsed": {
"lex": [],
"vec": [],
"hyde": [],
"invalid": [
"Dependency injection (DI) is a design pattern used in software development to **independently manage"
]
}
},
{
"query": "difference between sql and nosql",
"expansion": "The difference between SQL and NoSQL is a fundamental distinction in the world of databases. Here's a breakdown:\n\n### **1. SQL (Structured Query Language)**\n- **Purpose**: SQL is designed for structured data and is used to manage and manipulate relational databases. It is built on the relational model where data is stored in tables and organized in rows and columns.\n- **Key Features**:\n - **Strong typing** and **data integrity**.\n - **Support for complex queries** and transactions.\n - **Standardized syntax** and conventions.\n- **Used in**: Relational databases like MySQL, PostgreSQL, and SQL Server.\n- **Best for**: Data that is structured, static, and requires complex queries.\n\n### **2. NoSQL**\n- **Purpose**: NoSQL is a family of non-relational databases that do not follow a fixed schema. They are designed to store and retrieve data in a flexible, scalable, and dynamic way.\n- **Key",
"scores": {
"format": 0,
"diversity": 0,
"hyde": 0,
"quality": 0,
"total": 0,
"max_possible": 80,
"percentage": 0,
"rating": "Failed",
"deductions": [
"CONTINUATION DETECTED - output does not start with lex:/vec:/hyde:"
],
"is_continuation": true
},
"parsed": {
"lex": [],
"vec": [],
"hyde": [],
"invalid": [
"The difference between SQL and NoSQL is a fundamental distinction in the world of databases. Here's "
]
}
},
{
"query": "kubernetes vs docker swarm",
"expansion": "Here is an expanded version of your search query:\n\n**\"Kubernetes vs Docker Swarm\"**\n\nThis query highlights the comparison between **Kubernetes** and **Docker Swarm**, focusing on their roles, features, and use cases in cloud computing. If you'd like to refine it further, feel free to provide more context!",
"scores": {
"format": 0,
"diversity": 0,
"hyde": 0,
"quality": 0,
"total": 0,
"max_possible": 80,
"percentage": 0,
"rating": "Failed",
"deductions": [
"CONTINUATION DETECTED - output does not start with lex:/vec:/hyde:"
],
"is_continuation": true
},
"parsed": {
"lex": [],
"vec": [],
"hyde": [],
"invalid": [
"Here is an expanded version of your search query:\n\n**\"Kubernetes vs Docker Swarm\"**\n\nThis query high"
]
}
},
{
"query": "connection timeout error",
"expansion": "Expanding the search query \"connection timeout error\" to provide more context and detail, here are some possible expansions:\n\n1. **\"Connection timeout error in web services\"** \n This focuses on the issue occurring in web-based applications or services.\n\n2. **\"How to resolve a connection timeout error in programming\"** \n This expands the query to include troubleshooting steps.\n\n3. **\"Connection timeout error symptoms and solutions\"** \n This highlights the symptoms and possible solutions related to the error.\n\n4. **\"Common causes and resolutions for connection timeout errors\"** \n This provides a comprehensive overview of the problem and how to address it.\n\n5. **\"Troubleshooting connection timeout error in different technologies\"** \n This expands to include different programming languages or platforms.\n\nLet me know if you'd like help with a specific area (e.g., web development, network, programming) or need further details!",
"scores": {
"format": 0,
"diversity": 0,
"hyde": 0,
"quality": 0,
"total": 0,
"max_possible": 80,
"percentage": 0,
"rating": "Failed",
"deductions": [
"CONTINUATION DETECTED - output does not start with lex:/vec:/hyde:"
],
"is_continuation": true
},
"parsed": {
"lex": [],
"vec": [],
"hyde": [],
"invalid": [
"Expanding the search query \"connection timeout error\" to provide more context and detail, here are s"
]
}
},
{
"query": "memory leak debugging",
"expansion": "\"Memory leak debugging\" is a common issue in software development, particularly in applications that use memory management. Here's an expanded version of the search query to include more context:\n\n**Expanded Search Query:**\n\n- Memory leak debugging techniques\n- How to debug memory leaks in software\n- Best practices for identifying and resolving memory leaks\n- Common causes of memory leaks in applications\n- Tools for memory leak debugging\n- Techniques to prevent memory leaks\n- Debugging memory leaks in different programming languages\n\nThis query can help you find resources, tutorials, or guides on effectively debugging memory leaks in your code.",
"scores": {
"format": 0,
"diversity": 0,
"hyde": 0,
"quality": 0,
"total": 0,
"max_possible": 80,
"percentage": 0,
"rating": "Failed",
"deductions": [
"CONTINUATION DETECTED - output does not start with lex:/vec:/hyde:"
],
"is_continuation": true
},
"parsed": {
"lex": [],
"vec": [],
"hyde": [],
"invalid": [
"\"Memory leak debugging\" is a common issue in software development, particularly in applications that"
]
}
},
{
"query": "cors error fix",
"expansion": "Here\u2019s an expanded version of the search query:\n\n**\"How to fix a CORS error in your website?\"**\n\nThis query allows you to ask about the general process of resolving a CORS (Cross-Origin Resource Sharing) error. If you have a specific browser or framework in mind, you can tailor the search further.",
"scores": {
"format": 0,
"diversity": 0,
"hyde": 0,
"quality": 0,
"total": 0,
"max_possible": 80,
"percentage": 0,
"rating": "Failed",
"deductions": [
"CONTINUATION DETECTED - output does not start with lex:/vec:/hyde:"
],
"is_continuation": true
},
"parsed": {
"lex": [],
"vec": [],
"hyde": [],
"invalid": [
"Here\u2019s an expanded version of the search query:\n\n**\"How to fix a CORS error in your website?\"**\n\nThi"
]
}
},
{
"query": "how to implement caching with redis in nodejs",
"expansion": "To implement caching with Redis in Node.js, you can use the Redis module from the Node.js standard library. Here's a step-by-step guide:\n\n### 1. **Install the Redis Module**\nFirst, install the `redis` package using npm:\n```bash\nnpm install redis --save\n```\n\n### 2. **Set Up Redis Connection**\nYou need to establish a connection to Redis. You can do this by creating a Redis client instance. Below is a simple example:\n\n```javascript\nconst { Redis } = require('redis');\n\n// Connect to Redis\nconst redis = Redis.createClient({\n host: 'localhost',\n port: 6379,\n database: 0, // Default database\n});\n\n// Set a value in Redis\nredis.set('key1', 'value1', (err) => {\n if (err) {\n console.error('Error setting Redis value:', err);\n } else {\n console.log('Value",
"scores": {
"format": 0,
"diversity": 0,
"hyde": 0,
"quality": 0,
"total": 0,
"max_possible": 80,
"percentage": 0,
"rating": "Failed",
"deductions": [
"CONTINUATION DETECTED - output does not start with lex:/vec:/hyde:"
],
"is_continuation": true
},
"parsed": {
"lex": [],
"vec": [],
"hyde": [],
"invalid": [
"To implement caching with Redis in Node.js, you can use the Redis module from the Node.js standard l"
]
}
},
{
"query": "best practices for api rate limiting",
"expansion": "Here is an expanded search query for \"best practices for API rate limiting\":\n\n**\"Best practices for API rate limiting\"**\n\nThis query allows for a broader search, including results that explain the principles, implementation methods, and effective strategies for API rate limiting.",
"scores": {
"format": 0,
"diversity": 0,
"hyde": 0,
"quality": 0,
"total": 0,
"max_possible": 80,
"percentage": 0,
"rating": "Failed",
"deductions": [
"CONTINUATION DETECTED - output does not start with lex:/vec:/hyde:"
],
"is_continuation": true
},
"parsed": {
"lex": [],
"vec": [],
"hyde": [],
"invalid": [
"Here is an expanded search query for \"best practices for API rate limiting\":\n\n**\"Best practices for "
]
}
},
{
"query": "setting up ci cd pipeline with github actions",
"expansion": "Setting up a CI/CD pipeline with GitHub Actions is a common practice in modern software development. Here's an expanded version of the search query to help you better understand and find solutions:\n\n**\"How to set up a CI/CD pipeline with GitHub Actions in a scalable and efficient way?\"**\n\n### Key Features and Benefits:\n1. **Scalability**: GitHub Actions can handle complex pipelines across multiple environments.\n2. **Efficiency**: It automates repetitive tasks, reducing manual intervention.\n3. **Integration**: Works seamlessly with GitHub, GitLab, and other platforms.\n4. **Version Control**: Supports multiple Git repositories.\n5. **CI/CD Best Practices**: Includes best practices for setting up and managing pipelines.\n\n### Popular Options:\n- **GitHub Actions**: A lightweight and open-source tool for CI/CD pipelines.\n- **Jenkins (or GitLab CI)**: Commercial tools with more advanced features.\n- **Azure DevOps**: For enterprise-level CI/CD",
"scores": {
"format": 0,
"diversity": 0,
"hyde": 0,
"quality": 0,
"total": 0,
"max_possible": 80,
"percentage": 0,
"rating": "Failed",
"deductions": [
"CONTINUATION DETECTED - output does not start with lex:/vec:/hyde:"
],
"is_continuation": true
},
"parsed": {
"lex": [],
"vec": [],
"hyde": [],
"invalid": [
"Setting up a CI/CD pipeline with GitHub Actions is a common practice in modern software development."
]
}
}
]

View File

@ -1,81 +0,0 @@
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "unsloth",
# "transformers>=4.45.0",
# "torch",
# ]
# ///
"""
Export finetuned model to GGUF format for use with node-llama-cpp.
Usage:
python export_gguf.py --model models/qmd-expansion --quantization Q8_0
python export_gguf.py --model models/qmd-expansion --quantization Q4_K_M
"""
import argparse
from pathlib import Path
def main():
    """Export a finetuned model directory to a quantized GGUF file.

    Reads ``--model``, ``--output``, ``--quantization`` and ``--push-to-hub``
    from the command line, loads the model through Unsloth, writes the GGUF
    export and, when requested, pushes it to the HuggingFace Hub.

    Raises:
        SystemExit: with status 1 when the ``--model`` path does not exist.
    """
    parser = argparse.ArgumentParser(description="Export model to GGUF")
    parser.add_argument("--model", type=str, required=True, help="Path to finetuned model")
    parser.add_argument("--output", type=str, help="Output GGUF file path")
    parser.add_argument("--quantization", type=str, default="Q8_0",
                        choices=["Q4_K_M", "Q5_K_M", "Q6_K", "Q8_0", "F16"],
                        help="Quantization method")
    parser.add_argument("--push-to-hub", type=str, help="Push GGUF to HuggingFace Hub repo")
    args = parser.parse_args()

    # Validate the path before importing unsloth so a bad path fails
    # without needing the training stack to load.
    model_path = Path(args.model)
    if not model_path.exists():
        print(f"Error: Model not found at {model_path}")
        raise SystemExit(1)

    from unsloth import FastLanguageModel

    # Default output path lives next to the model weights.
    output_path = args.output or str(model_path / f"qmd-expansion-{args.quantization}.gguf")

    # NOTE(review): this turns e.g. "Q4_K_M" into "q4-k-m"; Unsloth's
    # documented method names use underscores ("q4_k_m") - confirm the
    # hyphenated spelling is accepted.
    quantization_method = args.quantization.lower().replace("_", "-")

    print(f"Loading model from {model_path}")

    # Load the finetuned model
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=str(model_path),
        max_seq_length=512,
        dtype=None,
        load_in_4bit=True,
    )

    print(f"Exporting to GGUF with {args.quantization} quantization...")

    # Export to GGUF. Only a trailing ".gguf" is dropped (Unsloth adds it,
    # per the original comment); a ".gguf" elsewhere in the path, such as in
    # a directory name, must be left intact.
    model.save_pretrained_gguf(
        output_path.removesuffix(".gguf"),
        tokenizer,
        quantization_method=quantization_method,
    )

    print(f"Exported to {output_path}")

    # Push to hub if requested
    if args.push_to_hub:
        print(f"Pushing GGUF to HuggingFace Hub: {args.push_to_hub}")
        model.push_to_hub_gguf(
            args.push_to_hub,
            tokenizer,
            quantization_method=quantization_method,
        )

    print("Export complete!")
    print("\nTo use in QMD, update src/llm.ts:")
    print(f'  const DEFAULT_GENERATE_MODEL = "{output_path}";')


if __name__ == "__main__":
    main()

View File

@ -1,103 +0,0 @@
#!/usr/bin/env python3
"""Prepare QMD query expansion data for training."""
import argparse
import json
from pathlib import Path
# Prompt template matching QMD's llm.ts format (simplified for training)
PROMPT_TEMPLATE = """You are a search query optimization expert. Transform the query into retrieval-optimized outputs.
Query: {query}
Output format:
lex: {{keyword variation}}
vec: {{semantic reformulation}}
hyde: {{hypothetical document passage}}
Output:"""
def format_for_training(input_text: str, output_text: str) -> dict:
    """Build one training record for a query/expansion pair.

    The returned dict carries the same example in three layouts: separate
    ``prompt`` / ``completion`` fields, a single concatenated ``text`` field,
    and a two-turn chat ``messages`` list.
    """
    rendered_prompt = PROMPT_TEMPLATE.format(query=input_text)

    # Chat layout: the user turn uses its own short instruction, not the
    # full prompt template.
    user_turn = {"role": "user", "content": f"Expand this search query:\n\n{input_text}"}
    assistant_turn = {"role": "assistant", "content": output_text}

    # Keys are inserted in a fixed order so serialized records keep the
    # same field order.
    record = {}
    record["prompt"] = rendered_prompt
    record["completion"] = output_text
    record["text"] = f"{rendered_prompt}\n{output_text}"
    record["messages"] = [user_turn, assistant_turn]
    return record
def _write_jsonl(path, rows):
    """Write each item of *rows* to *path* as one JSON document per line."""
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(row) + "\n" for row in rows)


def main():
    """Convert a JSONL file of input/output pairs into train/val files.

    Reads ``--input``, formats every example with ``format_for_training``,
    splits the result by the ``--split`` validation ratio (in file order, no
    shuffling) and writes ``train.jsonl``, ``val.jsonl``,
    ``train_chat.jsonl`` and ``dataset_info.json`` into ``--output``.

    Raises:
        SystemExit: status 1 when the input file is missing; status 2 (via
            argparse) when ``--split`` is outside ``[0, 1]``.
    """
    parser = argparse.ArgumentParser(description="Prepare data for training")
    parser.add_argument("--input", type=str, default="data/qmd_expansion.jsonl", help="Input JSONL file")
    parser.add_argument("--output", type=str, default="data/train", help="Output directory")
    parser.add_argument("--split", type=float, default=0.1, help="Validation split ratio")
    args = parser.parse_args()

    # A ratio outside [0, 1] would yield a negative or oversized slice index
    # and silently produce a wrong train/val partition.
    if not 0.0 <= args.split <= 1.0:
        parser.error("--split must be between 0 and 1")

    input_path = Path(args.input)
    output_dir = Path(args.output)

    # Check the input first so a failed run leaves no empty output directory.
    if not input_path.exists():
        print(f"Error: Input file not found: {input_path}")
        raise SystemExit(1)

    output_dir.mkdir(parents=True, exist_ok=True)

    # Load examples; explicit UTF-8 keeps parsing independent of the locale.
    with open(input_path, encoding="utf-8") as f:
        examples = [json.loads(line) for line in f if line.strip()]

    print(f"Loaded {len(examples)} examples from {input_path}")

    # Format for training
    formatted = [format_for_training(ex["input"], ex["output"]) for ex in examples]

    # Split into train/val
    split_idx = int(len(formatted) * (1 - args.split))
    train_data = formatted[:split_idx]
    val_data = formatted[split_idx:]

    train_path = output_dir / "train.jsonl"
    _write_jsonl(train_path, train_data)

    val_path = output_dir / "val.jsonl"
    _write_jsonl(val_path, val_data)

    # Chat format (for TRL/Unsloth): only the "messages" field is kept.
    chat_path = output_dir / "train_chat.jsonl"
    _write_jsonl(chat_path, ({"messages": item["messages"]} for item in train_data))

    print(f"Written {len(train_data)} train examples to {train_path}")
    print(f"Written {len(val_data)} validation examples to {val_path}")
    print(f"Written chat format to {chat_path}")

    # Also save as HuggingFace datasets format info
    dataset_info = {
        "dataset_name": "qmd-query-expansion",
        "train_samples": len(train_data),
        "val_samples": len(val_data),
        "columns": ["prompt", "completion", "text", "messages"],
    }
    with open(output_dir / "dataset_info.json", "w", encoding="utf-8") as f:
        json.dump(dataset_info, f, indent=2)


if __name__ == "__main__":
    main()

View File

@ -9,23 +9,24 @@
# "trackio",
# "datasets",
# "bitsandbytes",
# "pyyaml",
# ]
# ///
"""
GRPO (Group Relative Policy Optimization) training for QMD query expansion.
Uses the comprehensive scoring system from SCORING.md:
- Format (30%): Must have lex: and vec: prefixes
- Diversity (30%): No echoing query, diverse expansions
- Hyde (20%): Concise, no newlines, no repetition
- Quality (20%): lex=keywords, vec=natural language
Uses the scoring system from SCORING.md as the reward function.
Usage:
uv run train_grpo.py --sft-model tobil/qmd-query-expansion-0.6B
uv run rl.py --config configs/grpo_v4.yaml
uv run rl.py --config configs/grpo_v4.yaml --dry-run
"""
import os
import re
import argparse
import yaml
import torch
import trackio
from collections import Counter
@ -36,16 +37,26 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import GRPOTrainer, GRPOConfig
STOPWORDS = {'the', 'a', 'an', 'is', 'are', 'to', 'for', 'of', 'in', 'and', 'or', 'it', 'this', 'that', 'be', 'with', 'as', 'on', 'by'}
# Words ignored when deciding which query terms a lex: line must preserve.
KEY_TERM_STOPWORDS = {'what', 'is', 'how', 'to', 'the', 'a', 'an', 'in', 'on', 'for', 'of',
                      'and', 'or', 'with', 'my', 'your', 'do', 'does', 'can', 'i', 'me', 'we'}

# Punctuation trimmed from the edges of each token. Interior characters
# (the dot in "shop.app") and symbols such as "+" or "#" are kept.
_EDGE_PUNCTUATION = "?!.,;:\"'()[]{}"


def _content_words(text: str) -> set:
    """Lowercase *text*, split on whitespace and trim edge punctuation.

    Tokens that consist only of punctuation are dropped.
    """
    trimmed = (word.strip(_EDGE_PUNCTUATION) for word in text.lower().split())
    return {word for word in trimmed if word}


def get_key_terms(query: str) -> set:
    """Return the significant words of *query*.

    Words are lowercased, stripped of surrounding punctuation (so
    "authentication?" counts as "authentication") and filtered against
    ``KEY_TERM_STOPWORDS``.
    """
    return _content_words(query) - KEY_TERM_STOPWORDS


def lex_preserves_key_terms(lex_line: str, query: str) -> bool:
    """Tell whether *lex_line* contains at least one key term of *query*.

    Returns True when the query has no key terms at all (nothing to
    preserve). Both sides are tokenized the same way, so edge punctuation
    on either side does not prevent a match.
    """
    key_terms = get_key_terms(query)
    if not key_terms:
        return True
    return not key_terms.isdisjoint(_content_words(lex_line))
# ============================================================================
# Scoring Functions (from SCORING.md)
# ============================================================================
def parse_expansion(text: str) -> dict:
"""Parse expansion into structured format."""
lines = text.strip().split("\n")
result = {"lex": [], "vec": [], "hyde": [], "invalid": []}
for line in lines:
line = line.strip()
if not line:
@ -58,19 +69,16 @@ def parse_expansion(text: str) -> dict:
result["hyde"].append(line[5:].strip())
else:
result["invalid"].append(line)
return result
def edit_distance_simple(a: str, b: str) -> int:
    """Count the distinct lowercased words found in exactly one of *a* and *b*.

    Despite the name this is a set comparison, not a true edit distance:
    word order and repeated words do not affect the result.
    """
    vocab_a, vocab_b = (set(text.lower().split()) for text in (a, b))
    return len(vocab_a.symmetric_difference(vocab_b))
def is_diverse(a: str, b: str, min_distance: int = 2) -> bool:
"""Check if two strings are sufficiently different."""
a, b = a.lower().strip(), b.lower().strip()
if a == b:
return False
@ -80,7 +88,6 @@ def is_diverse(a: str, b: str, min_distance: int = 2) -> bool:
def echoes_query(expansion: str, query: str) -> bool:
"""Check if expansion is just echoing the query."""
exp = expansion.lower().strip()
q = query.lower().strip()
if exp == q:
@ -91,7 +98,6 @@ def echoes_query(expansion: str, query: str) -> bool:
def word_repetition_penalty(text: str) -> int:
"""Count penalty for repeated words (excluding stopwords)."""
words = re.findall(r'\b\w+\b', text.lower())
counts = Counter(words)
penalty = 0
@ -102,13 +108,17 @@ def word_repetition_penalty(text: str) -> int:
def score_expansion(query: str, expansion: str) -> float:
"""
Score an expansion based on SCORING.md criteria.
Returns normalized score 0.0-1.0 for RL reward.
"""
"""Score expansion. Returns 0.0-1.0 for RL reward."""
text = expansion.strip()
# HARD FAIL: Must start with valid prefix (prevents verbose explanations)
first_line = text.split("\n")[0].strip() if text else ""
if not first_line.startswith(("lex:", "vec:", "hyde:")):
return 0.0 # Zero reward for wrong format
parsed = parse_expansion(expansion)
# === FORMAT (0-30) ===
# FORMAT (0-30)
format_score = 0
if parsed["lex"]:
format_score += 10
@ -119,20 +129,15 @@ def score_expansion(query: str, expansion: str) -> float:
else:
format_score += max(0, 10 - len(parsed["invalid"]) * 5)
# === DIVERSITY (0-30) ===
# DIVERSITY (0-30)
diversity_score = 0
# 2+ different types
types_present = sum(1 for t in ["lex", "vec"] if parsed[t])
if types_present >= 2:
diversity_score += 10
# 2+ total expansions
total_expansions = len(parsed["lex"]) + len(parsed["vec"])
if total_expansions >= 2:
diversity_score += 5
# Lex diversity
lex_score = 5
for i, a in enumerate(parsed["lex"]):
for b in parsed["lex"][i+1:]:
@ -140,7 +145,6 @@ def score_expansion(query: str, expansion: str) -> float:
lex_score -= 2
diversity_score += max(0, lex_score)
# Vec diversity
vec_score = 5
for i, a in enumerate(parsed["vec"]):
for b in parsed["vec"][i+1:]:
@ -148,205 +152,164 @@ def score_expansion(query: str, expansion: str) -> float:
vec_score -= 2
diversity_score += max(0, vec_score)
# Don't echo query
echo_score = 5
for exp in parsed["lex"] + parsed["vec"]:
if echoes_query(exp, query):
echo_score -= 3 # Heavier penalty for echoing
echo_score -= 3
diversity_score += max(0, echo_score)
# === HYDE (0-20) ===
# HYDE (0-20)
hyde_score = 0
if parsed["hyde"]:
hyde_text = parsed["hyde"][0]
hyde_score += 5 # Present
# Length check (50-200 chars ideal)
hyde_score += 5
hyde_len = len(hyde_text)
if 50 <= hyde_len <= 200:
hyde_score += 5
elif hyde_len < 50:
hyde_score += 2
# No newlines
if "\n" not in hyde_text:
hyde_score += 5
# No repetition
rep_penalty = word_repetition_penalty(hyde_text)
hyde_score += max(0, 5 - rep_penalty)
# === QUALITY (0-20) ===
quality_score = 10 # Base
# Lex should be shorter than vec
# QUALITY (0-20)
quality_score = 5
if parsed["lex"] and parsed["vec"]:
avg_lex = sum(len(l) for l in parsed["lex"]) / len(parsed["lex"])
avg_vec = sum(len(v) for v in parsed["vec"]) / len(parsed["vec"])
if avg_lex <= avg_vec:
quality_score += 5
# Vec should be natural language
if parsed["vec"]:
natural = sum(1 for v in parsed["vec"] if " " in v and len(v) > 15)
if natural == len(parsed["vec"]):
quality_score += 5
else:
quality_score += 2
if parsed["lex"]:
lex_with_terms = sum(1 for l in parsed["lex"] if lex_preserves_key_terms(l, query))
if lex_with_terms == len(parsed["lex"]):
quality_score += 5
elif lex_with_terms > 0:
quality_score += 2
# === TOTAL ===
total = format_score + diversity_score + hyde_score + quality_score
max_possible = 100 if parsed["hyde"] else 80
# Normalize to 0-1
return total / max_possible
def extract_query_from_prompt(prompt: str) -> str:
    """Return the search query embedded in a training prompt.

    Everything after the last "Expand this search query:" marker is returned
    with surrounding whitespace removed. A prompt without the marker is
    returned stripped as a whole.
    """
    marker = "Expand this search query:"
    _, found, tail = prompt.rpartition(marker)
    query = tail if found else prompt
    return query.strip()
class QMDRewardFunction:
    """Callable reward that scores every completion with score_expansion()."""

    # Set on the class because instances have no __name__ of their own.
    # NOTE(review): presumably the trainer reads this for logging - confirm.
    __name__ = "qmd_scoring_reward"

    def __call__(self, completions: list[str], prompts: list[str] = None, **kwargs) -> list[float]:
        """Return one reward per completion, in the order given.

        The query for completion ``i`` is extracted from ``prompts[i]``. When
        ``prompts`` is missing, empty, or shorter than ``completions``, the
        affected completions are scored against an empty query. Extra keyword
        arguments are accepted and ignored.
        """
        def query_for(index: int) -> str:
            if prompts and index < len(prompts):
                return extract_query_from_prompt(prompts[index])
            return ""

        return [
            score_expansion(query_for(index), text)
            for index, text in enumerate(completions)
        ]
# ============================================================================
# Main Training
# ============================================================================
def main():
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--sft-model", default="tobil/qmd-query-expansion-0.6B",
help="SFT model to use as starting point")
parser.add_argument("--base-model", default="Qwen/Qwen3-0.6B",
help="Base model (for loading tokenizer)")
parser.add_argument("--output", default="tobil/qmd-query-expansion-0.6B-grpo-v2",
help="Output model name on Hub")
parser.add_argument("--epochs", type=int, default=1)
parser.add_argument("--lr", type=float, default=1e-6,
help="Learning rate (lower for stability)")
parser.add_argument("--config", type=str, required=True, help="Path to config YAML")
parser.add_argument("--dry-run", action="store_true")
args = parser.parse_args()
with open(args.config) as f:
cfg = yaml.safe_load(f)
if args.dry_run:
print("GRPO Training Config:")
print(f" SFT Model: {args.sft_model}")
print(f" Base Model: {args.base_model}")
print(f" Output: {args.output}")
print(f" Epochs: {args.epochs}")
print(f" LR: {args.lr}")
print("GRPO Training Configuration:")
print(yaml.dump(cfg, default_flow_style=False))
print("\nTesting reward function...")
test_good = "lex: auth setup\nlex: authentication config\nvec: how to configure authentication\nhyde: Configure auth by setting AUTH_SECRET."
test_bad = "auth is important for security"
print(f" Good output score: {score_expansion('auth', test_good):.2f}")
print(f" Bad output score: {score_expansion('auth', test_bad):.2f}")
return
# Login to HuggingFace Hub
# Login
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
print("Logging in to HuggingFace Hub...")
login(token=hf_token)
else:
print("Warning: HF_TOKEN not set, will try cached login")
# Load dataset (just prompts needed for GRPO)
# Load dataset
print("Loading dataset...")
dataset = load_dataset("tobil/qmd-query-expansion-train", split="train")
dataset = load_dataset(cfg["dataset"]["name"], split="train")
# Extract just the queries as prompts
def extract_prompt(example):
return {"prompt": example["messages"][0]["content"]}
return {"prompt": example[cfg["dataset"]["prompt_field"]][0]["content"]}
dataset = dataset.map(extract_prompt, remove_columns=dataset.column_names)
dataset = dataset.shuffle(seed=42).select(range(min(2000, len(dataset))))
max_samples = cfg["dataset"].get("max_samples", len(dataset))
dataset = dataset.shuffle(seed=42).select(range(min(max_samples, len(dataset))))
print(f"Using {len(dataset)} prompts for GRPO")
# Load tokenizer
print(f"Loading tokenizer from {args.base_model}...")
tokenizer = AutoTokenizer.from_pretrained(args.base_model)
# Load tokenizer and model
print(f"Loading tokenizer from {cfg['model']['base']}...")
tokenizer = AutoTokenizer.from_pretrained(cfg["model"]["base"])
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
# Load SFT model with LoRA adapter
print(f"Loading SFT model from {args.sft_model}...")
print(f"Loading SFT model from {cfg['model']['sft']}...")
base_model = AutoModelForCausalLM.from_pretrained(
args.base_model,
cfg["model"]["base"],
torch_dtype=torch.bfloat16,
device_map="auto",
)
model = PeftModel.from_pretrained(base_model, args.sft_model)
model = PeftModel.from_pretrained(base_model, cfg["model"]["sft"])
model = model.merge_and_unload()
print("Model loaded and LoRA merged.")
# Add new LoRA adapter for GRPO training (smaller rank for stability)
# Add LoRA for GRPO
grpo_lora_config = LoraConfig(
r=4, # Smaller rank for more stable RL
lora_alpha=8,
lora_dropout=0.05,
r=cfg["lora"]["rank"],
lora_alpha=cfg["lora"]["alpha"],
lora_dropout=cfg["lora"]["dropout"],
bias="none",
task_type="CAUSAL_LM",
target_modules=["q_proj", "v_proj"], # Fewer modules for stability
target_modules=cfg["lora"]["target_modules"],
)
model = get_peft_model(model, grpo_lora_config)
model.print_trainable_parameters()
print("Added new LoRA adapter for GRPO.")
# Initialize reward function
# Reward function
reward_fn = QMDRewardFunction()
# Test reward function
print("\nTesting reward function...")
test_good = "lex: auth setup\nlex: authentication config\nvec: how to configure authentication\nhyde: Configure auth by setting AUTH_SECRET."
test_bad = "auth is important for security"
print(f" Good output score: {score_expansion('auth', test_good):.2f}")
print(f" Bad output score: {score_expansion('auth', test_bad):.2f}")
# GRPO config with conservative settings
# GRPO config
config = GRPOConfig(
output_dir="qmd-expansion-grpo-v2",
output_dir=cfg["model"]["output"].split("/")[-1],
push_to_hub=True,
hub_model_id=args.output,
hub_model_id=cfg["model"]["output"],
# GRPO specific - conservative
num_generations=4,
max_completion_length=200, # Shorter to avoid rambling
num_generations=cfg["grpo"]["num_generations"],
max_completion_length=cfg["grpo"]["max_completion_length"],
# Training - very conservative
num_train_epochs=args.epochs,
per_device_train_batch_size=2,
gradient_accumulation_steps=8,
learning_rate=args.lr,
max_grad_norm=0.5, # Clip gradients more aggressively
num_train_epochs=cfg["training"]["epochs"],
per_device_train_batch_size=cfg["training"]["batch_size"],
gradient_accumulation_steps=cfg["training"]["gradient_accumulation_steps"],
learning_rate=cfg["training"]["learning_rate"],
max_grad_norm=cfg["training"]["max_grad_norm"],
# Logging
logging_steps=10,
save_strategy="epoch",
# Monitoring
report_to="trackio",
project="qmd-query-expansion-grpo-v2",
run_name="grpo-scoring-v2",
project=cfg["tracking"]["project"],
run_name=cfg["tracking"]["run_name"],
)
# Create trainer
# Train
print("Initializing GRPO trainer...")
trainer = GRPOTrainer(
model=model,
@ -356,16 +319,14 @@ def main():
reward_funcs=[reward_fn],
)
# Train
print("Starting GRPO training...")
trainer.train()
# Save
print("Pushing to Hub...")
trainer.push_to_hub()
trackio.finish()
print(f"Done! Model at: https://huggingface.co/{args.output}")
print(f"Done! Model at: https://huggingface.co/{cfg['model']['output']}")
if __name__ == "__main__":

117
finetune/train.py Normal file
View File

@ -0,0 +1,117 @@
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "trl>=0.12.0",
# "peft>=0.7.0",
# "transformers>=4.45.0",
# "accelerate>=0.24.0",
# "datasets>=2.14.0",
# "trackio",
# "pyyaml",
# ]
# ///
"""
SFT Training for QMD Query Expansion.
Usage:
uv run train.py --config configs/sft_v4.yaml
uv run train.py --config configs/sft_v4.yaml --dry-run
"""
import argparse
import yaml
import trackio
from datasets import load_dataset
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig
def main():
    """Run one SFT training job described by a YAML config file.

    Command line:
        --config PATH   YAML file with ``dataset``, ``model``, ``training``,
                        ``lora`` and ``tracking`` sections (required).
        --dry-run       Print the parsed config and exit without training.

    Config keys read:
        dataset.name / split / eval_split
        model.base / output
        training.epochs / batch_size / gradient_accumulation_steps /
            learning_rate / max_length / warmup_ratio / lr_scheduler
        lora.rank / alpha / dropout / target_modules
        tracking.project / run_name

    Exits with a usage error when the config file does not contain a YAML
    mapping (for example when it is empty).
    """
    parser = argparse.ArgumentParser(description="Train QMD query expansion model")
    parser.add_argument("--config", type=str, required=True, help="Path to config YAML")
    parser.add_argument("--dry-run", action="store_true", help="Print config and exit")
    args = parser.parse_args()

    # Load config. The encoding is pinned so the file parses the same on every
    # platform instead of depending on the locale's default.
    with open(args.config, encoding="utf-8") as f:
        cfg = yaml.safe_load(f)

    # safe_load returns None for an empty file and a scalar/list for other
    # shapes; fail here with a clear message rather than on the first lookup.
    if not isinstance(cfg, dict):
        parser.error(f"{args.config} must contain a YAML mapping of settings")

    if args.dry_run:
        print("SFT Training Configuration:")
        print(yaml.dump(cfg, default_flow_style=False))
        return

    print(f"Loading dataset: {cfg['dataset']['name']}...")
    dataset = load_dataset(cfg["dataset"]["name"], split=cfg["dataset"]["split"])
    print(f"Dataset loaded: {len(dataset)} examples")

    # Create train/eval split (fixed seed keeps the split reproducible)
    print("Creating train/eval split...")
    split = dataset.train_test_split(test_size=cfg["dataset"]["eval_split"], seed=42)
    train_dataset = split["train"]
    eval_dataset = split["test"]
    print(f" Train: {len(train_dataset)} examples")
    print(f" Eval: {len(eval_dataset)} examples")

    # Training configuration. The local output directory is the last path
    # component of the Hub repo id.
    config = SFTConfig(
        output_dir=cfg["model"]["output"].split("/")[-1],
        push_to_hub=True,
        hub_model_id=cfg["model"]["output"],
        hub_strategy="every_save",
        num_train_epochs=cfg["training"]["epochs"],
        per_device_train_batch_size=cfg["training"]["batch_size"],
        gradient_accumulation_steps=cfg["training"]["gradient_accumulation_steps"],
        learning_rate=cfg["training"]["learning_rate"],
        max_length=cfg["training"]["max_length"],
        logging_steps=10,
        save_strategy="steps",
        save_steps=200,
        save_total_limit=2,
        eval_strategy="steps",
        eval_steps=200,
        warmup_ratio=cfg["training"]["warmup_ratio"],
        lr_scheduler_type=cfg["training"]["lr_scheduler"],
        report_to="trackio",
        project=cfg["tracking"]["project"],
        run_name=cfg["tracking"]["run_name"],
    )

    # LoRA configuration
    peft_config = LoraConfig(
        r=cfg["lora"]["rank"],
        lora_alpha=cfg["lora"]["alpha"],
        lora_dropout=cfg["lora"]["dropout"],
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=cfg["lora"]["target_modules"],
    )

    # Initialize and train
    print("Initializing trainer...")
    trainer = SFTTrainer(
        model=cfg["model"]["base"],
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        args=config,
        peft_config=peft_config,
    )

    print("Starting training...")
    trainer.train()

    print("Pushing to Hub...")
    trainer.push_to_hub()

    trackio.finish()
    print(f"Complete! Model at: https://huggingface.co/{cfg['model']['output']}")
if __name__ == "__main__":
main()

View File

@ -1,92 +0,0 @@
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "trl>=0.12.0",
# "peft>=0.7.0",
# "transformers>=4.45.0",
# "accelerate>=0.24.0",
# "trackio",
# "datasets",
# "bitsandbytes",
# ]
# ///
import trackio
from datasets import load_dataset
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig
# --- Data: pull the training set from the Hub and hold out 10% for eval ---
print("Loading dataset...")
dataset = load_dataset("tobil/qmd-query-expansion-train", split="train")
print(f"Loaded {len(dataset)} examples")

splits = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset, eval_dataset = splits["train"], splits["test"]
print(f"Train: {len(train_dataset)}, Eval: {len(eval_dataset)}")

# --- Trainer settings, grouped by concern and merged into one SFTConfig ---
hub_settings = dict(
    output_dir="qmd-query-expansion-0.6B",
    push_to_hub=True,
    hub_model_id="tobil/qmd-query-expansion-0.6B",
    hub_strategy="every_save",
)
optimisation_settings = dict(
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    max_length=512,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    bf16=True,
)
# Checkpoints and evaluation both happen every 200 steps.
checkpoint_settings = dict(
    logging_steps=25,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=2,
    eval_strategy="steps",
    eval_steps=200,
)
tracking_settings = dict(
    report_to="trackio",
    project="qmd-query-expansion",
    run_name="qwen3-0.6B-lora",
)
training_args = SFTConfig(
    **hub_settings,
    **optimisation_settings,
    **checkpoint_settings,
    **tracking_settings,
)

# --- LoRA adapter definition ---
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)

# --- Train, publish, and close the tracking run ---
print("Initializing trainer with Qwen/Qwen3-0.6B...")
trainer = SFTTrainer(
    model="Qwen/Qwen3-0.6B",
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    args=training_args,
    peft_config=lora_config,
)

print("Starting training...")
trainer.train()

print("Pushing to Hub...")
trainer.push_to_hub()

trackio.finish()
print("Done! Model at: https://huggingface.co/tobil/qmd-query-expansion-0.6B")

View File

@ -1,93 +0,0 @@
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "trl>=0.12.0",
# "peft>=0.7.0",
# "transformers>=4.45.0",
# "accelerate>=0.24.0",
# "trackio",
# "datasets",
# "bitsandbytes",
# ]
# ///
import trackio
from datasets import load_dataset
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig
# --- Data: pull the training set from the Hub and hold out 10% for eval ---
print("Loading dataset...")
dataset = load_dataset("tobil/qmd-query-expansion-train", split="train")
print(f"Loaded {len(dataset)} examples")

splits = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset, eval_dataset = splits["train"], splits["test"]
print(f"Train: {len(train_dataset)}, Eval: {len(eval_dataset)}")

# --- Trainer settings, grouped by concern and merged into one SFTConfig ---
hub_settings = dict(
    output_dir="qmd-query-expansion-1.7B",
    push_to_hub=True,
    hub_model_id="tobil/qmd-query-expansion-1.7B",
    hub_strategy="every_save",
)
# Per-device batch of 2 with 8 accumulation steps, plus gradient
# checkpointing to save memory on the larger model.
optimisation_settings = dict(
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    max_length=512,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    bf16=True,
    gradient_checkpointing=True,
)
# Checkpoints and evaluation both happen every 200 steps.
checkpoint_settings = dict(
    logging_steps=25,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=2,
    eval_strategy="steps",
    eval_steps=200,
)
tracking_settings = dict(
    report_to="trackio",
    project="qmd-query-expansion",
    run_name="qwen3-1.7B-lora",
)
training_args = SFTConfig(
    **hub_settings,
    **optimisation_settings,
    **checkpoint_settings,
    **tracking_settings,
)

# --- LoRA adapter definition ---
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)

# --- Train, publish, and close the tracking run ---
print("Initializing trainer with Qwen/Qwen3-1.7B...")
trainer = SFTTrainer(
    model="Qwen/Qwen3-1.7B",
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    args=training_args,
    peft_config=lora_config,
)

print("Starting training...")
trainer.train()

print("Pushing to Hub...")
trainer.push_to_hub()

trackio.finish()
print("Done! Model at: https://huggingface.co/tobil/qmd-query-expansion-1.7B")

View File

@ -1,102 +0,0 @@
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "trl>=0.12.0",
# "peft>=0.7.0",
# "transformers>=4.45.0",
# "accelerate>=0.24.0",
# "trackio",
# "datasets",
# "bitsandbytes",
# ]
# ///
"""
Improved Qwen3-1.7B training with best practices for larger models:
- Lower learning rate (1e-4 instead of 2e-4)
- Higher LoRA rank (32 instead of 16)
- More epochs (5 instead of 3)
- Weight decay for regularization
"""
import trackio
from datasets import load_dataset
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig
# --- Data: pull the training set from the Hub and hold out 10% for eval ---
print("Loading dataset...")
dataset = load_dataset("tobil/qmd-query-expansion-train", split="train")
print(f"Loaded {len(dataset)} examples")

splits = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset, eval_dataset = splits["train"], splits["test"]
print(f"Train: {len(train_dataset)}, Eval: {len(eval_dataset)}")

# --- Trainer settings, grouped by concern and merged into one SFTConfig ---
hub_settings = dict(
    output_dir="qmd-query-expansion-1.7B-v2",
    push_to_hub=True,
    hub_model_id="tobil/qmd-query-expansion-1.7B-v2",
    hub_strategy="every_save",
)
# Compared with the first 1.7B run: learning rate lowered from 2e-4 to 1e-4,
# five epochs instead of three, and weight decay added for regularisation.
optimisation_settings = dict(
    num_train_epochs=5,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=1e-4,
    weight_decay=0.01,
    max_length=512,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    bf16=True,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
)
# Checkpoints and evaluation both happen every 200 steps; three are kept.
checkpoint_settings = dict(
    logging_steps=25,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=3,
    eval_strategy="steps",
    eval_steps=200,
)
tracking_settings = dict(
    report_to="trackio",
    project="qmd-query-expansion",
    run_name="qwen3-1.7B-lora-v2",
)
training_args = SFTConfig(
    **hub_settings,
    **optimisation_settings,
    **checkpoint_settings,
    **tracking_settings,
)

# --- LoRA adapter definition: rank raised from 16 to 32, alpha kept at 2x rank ---
lora_config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)

# --- Train, publish, and close the tracking run ---
print("Initializing trainer with Qwen/Qwen3-1.7B...")
trainer = SFTTrainer(
    model="Qwen/Qwen3-1.7B",
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    args=training_args,
    peft_config=lora_config,
)

print("Starting training...")
trainer.train()

print("Pushing to Hub...")
trainer.push_to_hub()

trackio.finish()
print("Done! Model at: https://huggingface.co/tobil/qmd-query-expansion-1.7B-v2")

View File

@ -1,164 +0,0 @@
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "unsloth",
# "transformers>=4.45.0",
# "datasets",
# "trl>=0.12.0",
# "torch",
# "huggingface_hub",
# ]
# ///
"""
Train QMD query expansion model using LoRA on HuggingFace Jobs.
This script is designed to run on HuggingFace Jobs infrastructure.
Uses Unsloth for efficient LoRA finetuning.
Usage:
# Local test
python train_hf_job.py --model Qwen/Qwen3-0.6B --data data/train --dry-run
# HuggingFace Jobs (via huggingface-skills)
# See hugging-face-model-trainer skill for deployment
"""
import argparse
import os
from pathlib import Path
def main():
    """Fine-tune a base model with LoRA via Unsloth, driven by CLI flags.

    With ``--dry-run`` the resolved settings are printed and the function
    returns before any heavy dependency is imported. Otherwise the model is
    loaded in 4-bit, wrapped with LoRA adapters, trained on the JSONL data
    found in ``--data`` (``train_chat.jsonl`` preferred over ``train.jsonl``),
    saved to ``--output`` and optionally pushed to the Hub repo given by
    ``--push-to-hub``.
    """
    cli = argparse.ArgumentParser(description="Train QMD query expansion model")
    cli.add_argument("--model", type=str, default="Qwen/Qwen3-0.6B", help="Base model")
    cli.add_argument("--data", type=str, default="data/train", help="Training data directory")
    cli.add_argument("--output", type=str, default="models/qmd-expansion", help="Output directory")
    cli.add_argument("--epochs", type=int, default=3, help="Number of epochs")
    cli.add_argument("--batch-size", type=int, default=4, help="Batch size")
    cli.add_argument("--lr", type=float, default=2e-4, help="Learning rate")
    cli.add_argument("--lora-rank", type=int, default=16, help="LoRA rank")
    cli.add_argument("--max-seq-length", type=int, default=512, help="Max sequence length")
    cli.add_argument("--dry-run", action="store_true", help="Print config and exit")
    cli.add_argument("--push-to-hub", type=str, help="Push to HuggingFace Hub repo")
    args = cli.parse_args()

    # LoRA alpha is always twice the rank.
    lora_alpha = args.lora_rank * 2

    if args.dry_run:
        summary = {
            "model": args.model,
            "data": args.data,
            "output": args.output,
            "epochs": args.epochs,
            "batch_size": args.batch_size,
            "learning_rate": args.lr,
            "lora_rank": args.lora_rank,
            "lora_alpha": lora_alpha,
            "max_seq_length": args.max_seq_length,
        }
        print("Training configuration:")
        for setting, value in summary.items():
            print(f" {setting}: {value}")
        return

    # Heavy dependencies are imported late so --dry-run works without them.
    from unsloth import FastLanguageModel
    from datasets import load_dataset
    from trl import SFTTrainer, SFTConfig
    import torch

    print(f"Loading base model: {args.model}")

    # 4-bit base model (QLoRA); dtype=None lets Unsloth pick the dtype.
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=args.model,
        max_seq_length=args.max_seq_length,
        dtype=None,
        load_in_4bit=True,
    )

    model = FastLanguageModel.get_peft_model(
        model,
        r=args.lora_rank,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj"],
        lora_alpha=lora_alpha,
        lora_dropout=0,
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=42,
    )

    # Prefer the chat-formatted file when it exists.
    data_dir = Path(args.data)
    chat_file = data_dir / "train_chat.jsonl"
    if chat_file.exists():
        dataset = load_dataset("json", data_files=str(chat_file))["train"]
        print(f"Loaded {len(dataset)} training examples (chat format)")
    else:
        dataset = load_dataset("json", data_files=str(data_dir / "train.jsonl"))["train"]
        print(f"Loaded {len(dataset)} training examples")

    def format_chat(example):
        """Render a record to one "text" string, via the chat template when it has messages."""
        messages = example.get("messages", [])
        if not messages:
            return {"text": example.get("text", "")}
        rendered = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=False
        )
        return {"text": rendered}

    dataset = dataset.map(format_chat)

    output_dir = Path(args.output)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Use bf16 where the GPU supports it, fp16 otherwise.
    bf16_ok = torch.cuda.is_bf16_supported()
    training_args = SFTConfig(
        output_dir=str(output_dir),
        num_train_epochs=args.epochs,
        per_device_train_batch_size=args.batch_size,
        gradient_accumulation_steps=4,
        learning_rate=args.lr,
        weight_decay=0.01,
        warmup_ratio=0.03,
        lr_scheduler_type="cosine",
        logging_steps=10,
        save_strategy="epoch",
        bf16=bf16_ok,
        fp16=not bf16_ok,
        optim="adamw_8bit",
        seed=42,
        max_seq_length=args.max_seq_length,
        dataset_text_field="text",
        packing=False,
    )

    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=dataset,
        args=training_args,
    )

    print("Starting training...")
    trainer.train()

    print(f"Saving model to {output_dir}")
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    if args.push_to_hub:
        print(f"Pushing to HuggingFace Hub: {args.push_to_hub}")
        model.push_to_hub(args.push_to_hub)
        tokenizer.push_to_hub(args.push_to_hub)

    print("Training complete!")
if __name__ == "__main__":
main()

517
finetune/tui.py Executable file
View File

@ -0,0 +1,517 @@
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.10"
# dependencies = [
# "rich>=13.0.0",
# "transformers>=4.45.0",
# "peft>=0.7.0",
# "torch",
# "prompt_toolkit>=3.0.0",
# ]
# ///
"""
QMD Query Expansion Model Tester
A cyberpunk-styled TUI for testing finetuned query expansion models.
"""
from collections import deque
from dataclasses import dataclass
from typing import Optional
import torch
from peft import PeftModel
from prompt_toolkit import prompt
from prompt_toolkit.history import InMemoryHistory
from prompt_toolkit.styles import Style
from rich.columns import Columns
from rich.console import Console
from rich.panel import Panel
from rich.progress import Progress, SpinnerColumn, TextColumn
from rich.table import Table
from rich.text import Text
from transformers import AutoModelForCausalLM, AutoTokenizer
# ═══════════════════════════════════════════════════════════════════════════════
# CONFIGURATION
# ═══════════════════════════════════════════════════════════════════════════════
MODELS = {
"1": ("SFT (Hub)", "tobil/qmd-query-expansion-0.6B-v4", "v3"),
"2": ("GRPO (Hub)", "tobil/qmd-query-expansion-0.6B-v4-grpo", "v3"),
"3": ("SFT (local)", "./qmd-query-expansion-0.6B-v4", "v3"),
"4": ("GRPO (local)", "./qmd-query-expansion-0.6B-v4-grpo", "v3"),
}
BASE_MODEL = "Qwen/Qwen3-0.6B"
# v1 used simple format (before proper chat template)
PROMPT_TEMPLATE_V1 = """Expand this search query:
{query}"""
# v3+ uses tokenizer.apply_chat_template() - see generate_expansion()
# Cyberpunk color palette
CYAN = "#00ffff"
MAGENTA = "#ff00ff"
PURPLE = "#bd93f9"
DIM = "#6272a4"
BG = "#1a0a2e"
GREEN = "#50fa7b"
YELLOW = "#f1fa8c"
RED = "#ff5555"
console = Console()
# ═══════════════════════════════════════════════════════════════════════════════
# SCORING (from evaluate_model.py)
# ═══════════════════════════════════════════════════════════════════════════════
STOPWORDS = {'the', 'a', 'an', 'is', 'are', 'to', 'for', 'of', 'in', 'and', 'or',
'it', 'this', 'that', 'be', 'with', 'as', 'on', 'by'}
def parse_expansion(text: str) -> dict:
    """Bucket the lines of a model expansion by their prefix.

    Returns a dict with the keys "lex", "vec", "hyde" and "invalid". Lines
    starting with "lex:", "vec:" or "hyde:" are stored without the prefix and
    with surrounding whitespace removed; every other non-blank line is stored
    (stripped) under "invalid". Prefix matching is case-sensitive.
    """
    buckets = {"lex": [], "vec": [], "hyde": [], "invalid": []}
    for raw_line in text.strip().split("\n"):
        entry = raw_line.strip()
        if not entry:
            continue
        for kind in ("lex", "vec", "hyde"):
            prefix = kind + ":"
            if entry.startswith(prefix):
                buckets[kind].append(entry[len(prefix):].strip())
                break
        else:
            # No known prefix matched.
            buckets["invalid"].append(entry)
    return buckets
def score_expansion(query: str, expansion: str) -> dict:
    """Score an expansion on format, diversity, hyde and quality.

    Returns a dict with the per-category scores ("format", "diversity",
    "hyde", "quality"), the "total", the "max_possible" total (100 when a
    hyde line is present, otherwise 80), the "percentage", a textual
    "rating" and the "parsed" expansion. Empty text, or text without any
    lex:/vec:/hyde: line, scores zero with the rating "Failed"; the latter
    case is additionally flagged with "is_continuation".

    Every result carries "max_possible", including the failure results, so
    callers can rely on one schema.
    """
    def failed(parsed: dict, **extra) -> dict:
        # One shape for every zero-score result.
        return {"total": 0, "max_possible": 80, "percentage": 0, "rating": "Failed",
                "format": 0, "diversity": 0, "hyde": 0, "quality": 0,
                **extra, "parsed": parsed}

    text = expansion.strip()
    if not text:
        return failed({"lex": [], "vec": [], "hyde": [], "invalid": []})

    parsed = parse_expansion(expansion)

    # Check for continuation - but only if NO valid lines were found
    has_valid = parsed["lex"] or parsed["vec"] or parsed["hyde"]
    if not has_valid:
        return failed(parsed, is_continuation=True)

    # Format score (0-30): lex present, vec present, few invalid lines.
    format_score = 0
    if parsed["lex"]:
        format_score += 10
    if parsed["vec"]:
        format_score += 10
    if not parsed["invalid"]:
        format_score += 10
    else:
        format_score += max(0, 10 - len(parsed["invalid"]) * 5)

    # Diversity score (0-30)
    diversity_score = 0
    types_present = sum(1 for t in ["lex", "vec"] if parsed[t])
    if types_present >= 2:
        diversity_score += 10
    total_exp = len(parsed["lex"]) + len(parsed["vec"])
    if total_exp >= 2:
        diversity_score += 5
    diversity_score += 10  # Base diversity points (pairwise checks are not performed here)
    diversity_score += 5   # Non-echo points (echo check is not performed here)

    # Hyde score (0-20); only the first hyde line is scored.
    hyde_score = 0
    if parsed["hyde"]:
        hyde_text = parsed["hyde"][0]
        hyde_score += 5  # Present
        hyde_len = len(hyde_text)
        if 50 <= hyde_len <= 200:
            hyde_score += 5
        elif hyde_len < 50:
            hyde_score += 2
        if "\n" not in hyde_text:
            hyde_score += 5
        hyde_score += 5  # No repetition (simplified)

    # Quality score (0-20)
    quality_score = 5  # Base relevance (reduced to make room for key term check)

    # Lex must preserve key terms from query. A query made only of stopwords
    # has no key terms to preserve, so every lex line passes vacuously.
    stopwords = {'what', 'is', 'how', 'to', 'the', 'a', 'an', 'in', 'on', 'for', 'of', 'and', 'or', 'with', 'my'}
    key_terms = set(query.lower().split()) - stopwords
    if parsed["lex"]:
        if key_terms:
            lex_with_terms = sum(1 for l in parsed["lex"] if key_terms & set(l.lower().split()))
        else:
            lex_with_terms = len(parsed["lex"])
        if lex_with_terms == len(parsed["lex"]):
            quality_score += 5
        elif lex_with_terms > 0:
            quality_score += 2

    # Lex lines should on average be no longer than vec lines.
    if parsed["lex"] and parsed["vec"]:
        avg_lex = sum(len(l) for l in parsed["lex"]) / len(parsed["lex"])
        avg_vec = sum(len(v) for v in parsed["vec"]) / len(parsed["vec"])
        if avg_lex <= avg_vec:
            quality_score += 5

    # Vec lines should read like phrases: contain a space and exceed 15 chars.
    if parsed["vec"] and all(" " in v and len(v) > 15 for v in parsed["vec"]):
        quality_score += 5

    total = format_score + diversity_score + hyde_score + min(20, quality_score)
    max_possible = 100 if parsed["hyde"] else 80
    percentage = total / max_possible * 100

    if percentage >= 80:
        rating = "Excellent"
    elif percentage >= 60:
        rating = "Good"
    elif percentage >= 40:
        rating = "Acceptable"
    elif percentage >= 20:
        rating = "Poor"
    else:
        rating = "Failed"

    return {
        "format": format_score,
        "diversity": diversity_score,
        "hyde": hyde_score,
        "quality": quality_score,
        "total": total,
        "max_possible": max_possible,
        "percentage": percentage,
        "rating": rating,
        "parsed": parsed,
    }
# ═══════════════════════════════════════════════════════════════════════════════
# UI COMPONENTS
# ═══════════════════════════════════════════════════════════════════════════════
BANNER = """
[bold cyan][/]
[bold cyan][/] [bold magenta][/] [bold white]Q M D Q U E R Y E X P A N D E R[/] [bold magenta][/] [bold cyan][/]
[bold cyan][/]
"""
def show_banner():
    """Print the module-level BANNER markup to the shared console."""
    console.print(BANNER)
def show_model_menu(current: Optional[str] = None) -> str:
    """Display the model selection menu and return the user's raw choice.

    Every entry of MODELS is listed with its key, name, version and path;
    the entry whose path equals *current* gets a green marker. The returned
    string is the stripped user input and is not validated here.
    """
    # NOTE(review): the original rule glyph was lost (the literal was empty,
    # so the rule rendered as nothing); a light horizontal line is used here.
    # Confirm against the intended design.
    rule = "─" * 50

    console.print()
    console.print(f"[bold {CYAN}]◆ SELECT MODEL[/]")
    console.print(f"[{DIM}]{rule}[/]")

    for key, (name, path, version) in MODELS.items():
        marker = "[bold green]●[/]" if path == current else f"[{DIM}]○[/]"
        console.print(f" {marker} [{CYAN}]{key}[/] │ {name} [{DIM}]({version})[/]")
        console.print(f" [{DIM}]{path}[/]")

    console.print(f"[{DIM}]{rule}[/]")

    # Derive the advertised range from MODELS so the prompt cannot drift from
    # the menu (it used to say "1-3" while four models were listed).
    keys = list(MODELS)
    return prompt(
        f" Enter choice ({keys[0]}-{keys[-1]}): ",
        style=Style.from_dict({'': CYAN}),
    ).strip()
def render_expansion(expansion: str, scores: dict) -> Panel:
    """Build the highlighted panel that shows a parsed expansion.

    The parsed lines come from ``scores["parsed"]`` when that key exists and
    from parsing *expansion* otherwise. Lines are listed grouped by kind:
    lex, vec, hyde, then invalid lines marked with an "[invalid]" tag.
    """
    fallback = parse_expansion(expansion)
    parsed = scores.get("parsed", fallback)

    # (bucket, label, label style, value style) in display order.
    sections = (
        ("lex", "lex: ", f"bold {CYAN}", "white"),
        ("vec", "vec: ", f"bold {MAGENTA}", "white"),
        ("hyde", "hyde: ", f"bold {PURPLE}", f"italic {DIM}"),
    )

    body = Text()
    for bucket, label, label_style, value_style in sections:
        for value in parsed[bucket]:
            body.append(label, style=label_style)
            body.append(f"{value}\n", style=value_style)

    for bad_line in parsed["invalid"]:
        body.append(f"[invalid] {bad_line}\n", style=f"dim {RED}")

    return Panel(
        body,
        title=f"[bold {CYAN}]◈ EXPANSION[/]",
        border_style=CYAN,
        padding=(0, 1),
    )
def render_scores(scores: dict) -> Panel:
    """Render the score breakdown as a compact table inside a panel.

    Args:
        scores: Score dictionary. Reads ``"format"``, ``"diversity"``,
            ``"hyde"``, ``"quality"``, ``"total"`` and ``"rating"``;
            ``"max_possible"`` is optional and falls back to 80.

    Returns:
        A ``Panel`` with one row per metric (value, maximum and a ten-cell
        bar) plus a total row coloured by rating.
    """
    rating = scores["rating"]
    rating_color = {
        "Excellent": GREEN,
        "Good": CYAN,
        "Acceptable": YELLOW,
        "Poor": RED,
        "Failed": RED,
    }.get(rating, DIM)

    table = Table(show_header=False, box=None, padding=(0, 1))
    table.add_column("metric", style=DIM)
    table.add_column("score", justify="right")
    table.add_column("bar", width=12)

    def score_bar(val: int, max_val: int) -> str:
        """Return a ten-cell markup bar showing ``val`` out of ``max_val``."""
        # Clamp so an out-of-range score (negative or above the maximum)
        # cannot produce a bar shorter or longer than ten cells.
        if max_val <= 0:
            filled = 0
        else:
            filled = max(0, min(10, int(val / max_val * 10)))
        # The cell glyphs were empty strings, which rendered an invisible bar.
        # NOTE(review): confirm these are the intended glyphs.
        return f"[{CYAN}]{'█' * filled}[/][{DIM}]{'░' * (10 - filled)}[/]"

    # (row label, key in `scores`, maximum for that metric)
    metrics = (
        ("Format", "format", 30),
        ("Diversity", "diversity", 30),
        ("Hyde", "hyde", 20),
        ("Quality", "quality", 20),
    )
    for label, key, cap in metrics:
        value = scores[key]
        table.add_row(label, f"[white]{value}[/]/{cap}", score_bar(value, cap))

    table.add_row("", "", "")  # spacer row before the total
    table.add_row(
        "[bold]TOTAL[/]",
        f"[bold white]{scores['total']}[/]/{scores.get('max_possible', 80)}",
        f"[bold {rating_color}]{rating}[/]",
    )

    return Panel(
        table,
        title=f"[bold {MAGENTA}]◈ SCORES[/]",
        border_style=MAGENTA,
        padding=(0, 1),
    )
def render_history(history: deque) -> Panel:
    """Render recent query history as a panel.

    Args:
        history: Iterable of ``(query, rating)`` pairs, shown in iteration
            order and numbered from 1.

    Returns:
        A ``Panel`` with one line per entry (query truncated to 40 characters
        and a rating-coloured dot), or a placeholder line when empty.
    """
    rating_colors = {
        "Excellent": GREEN,
        "Good": CYAN,
        "Acceptable": YELLOW,
        "Poor": RED,
        "Failed": RED,
    }

    content = Text()
    # Text.append() does not interpret console markup, so styles are passed
    # explicitly; embedding "[style]...[/]" tags would print them verbatim.
    # This also keeps brackets in the user's query from being read as markup.
    for index, (query, rating) in enumerate(history, start=1):
        content.append(" ")
        content.append(f"{index}.", style=DIM)
        content.append(f" {query[:40]}")
        if len(query) > 40:
            content.append("...", style=DIM)
        content.append(" ")
        content.append("●", style=rating_colors.get(rating, DIM))
        content.append("\n")

    if not history:
        content.append(" ")
        content.append("No queries yet", style=DIM)

    return Panel(
        content,
        title=f"[bold {PURPLE}]◈ HISTORY[/]",
        border_style=PURPLE,
        padding=(0, 1),
    )
# ═══════════════════════════════════════════════════════════════════════════════
# MODEL LOADING
# ═══════════════════════════════════════════════════════════════════════════════
# Imported next to its only use so this definition is self-contained; move it
# into the file's top-level import block if preferred.
from typing import Any


@dataclass
class LoadedModel:
    """A loaded model together with its tokenizer and registry metadata."""

    # `any` is the builtin function, not a type; `Any` is the intended
    # "unconstrained" annotation for these third-party objects.
    model: Any  # adapter-wrapped model; load_model() stores PeftModel.from_pretrained(...) here
    tokenizer: Any  # tokenizer used to build prompts and decode output
    name: str  # display name shown in the UI
    path: str  # adapter path/identifier; also used to mark the active menu entry
    version: str  # "v1" or "v3" - determines prompt template
def load_model(model_path: str, model_name: str, version: str) -> LoadedModel:
    """Load tokenizer, base model and adapter while showing a transient spinner.

    Args:
        model_path: Adapter location handed to ``PeftModel.from_pretrained``.
        model_name: Display name stored on the result.
        version: Prompt-template version stored on the result.

    Returns:
        A ``LoadedModel`` whose model has been switched to eval mode.
    """
    spinner = Progress(
        SpinnerColumn(spinner_name="dots", style=CYAN),
        TextColumn(f"[{CYAN}]Loading {{task.description}}...[/]"),
        console=console,
        transient=True,
    )
    with spinner:
        # A single task whose description is updated as each stage begins.
        stage = spinner.add_task("tokenizer", total=None)
        tok = AutoTokenizer.from_pretrained(BASE_MODEL)
        if tok.pad_token is None:
            # Fall back to EOS when the tokenizer defines no pad token.
            tok.pad_token = tok.eos_token

        spinner.update(stage, description="base model")
        backbone = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL,
            torch_dtype=torch.bfloat16,
            device_map="auto",
        )

        spinner.update(stage, description="adapter")
        adapted = PeftModel.from_pretrained(backbone, model_path)
        adapted.eval()

    return LoadedModel(
        model=adapted,
        tokenizer=tok,
        name=model_name,
        path=model_path,
        version=version,
    )
DEBUG = False  # Set to True for debug output


def _debug_dump(label: str, value: str) -> None:
    """Print ``repr(value)`` under a labelled rule when ``DEBUG`` is on."""
    if not DEBUG:
        return
    console.print(f"[{DIM}]─── DEBUG: {label} ───[/]")
    # Wrapped in Text so brackets in the dumped value are never read as markup.
    console.print(Text(repr(value), style=DIM))


def generate_expansion(loaded: LoadedModel, query: str) -> str:
    """Generate a query expansion with the loaded model.

    Args:
        loaded: Model bundle; ``loaded.version`` selects the prompt format
            (``"v3"`` uses the tokenizer's chat template, anything else the
            v1 template).
        query: The user's search query.

    Returns:
        The model's reply with surrounding whitespace removed and a leading
        ``<think>...</think>`` block stripped when one is present.
    """
    if loaded.version == "v3":
        # Use tokenizer's chat template with /no_think to disable thinking mode
        messages = [{"role": "user", "content": f"/no_think Expand this search query: {query}"}]
        prompt_text = loaded.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
    else:
        # v1 used simple format
        prompt_text = PROMPT_TEMPLATE_V1.format(query=query)

    _debug_dump("Prompt", prompt_text)

    inputs = loaded.tokenizer(prompt_text, return_tensors="pt").to(loaded.model.device)
    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = loaded.model.generate(
            **inputs,
            max_new_tokens=200,
            temperature=0.7,
            do_sample=True,
            pad_token_id=loaded.tokenizer.pad_token_id,
            eos_token_id=loaded.tokenizer.eos_token_id,
        )

    if DEBUG:
        full_output = loaded.tokenizer.decode(outputs[0], skip_special_tokens=True)
        _debug_dump("Full output", full_output[:500])

    # The returned sequence starts with the prompt tokens, so decode only what
    # follows them. Splitting the decoded text on "assistant\n" or slicing by
    # len(prompt_text) is fragile: stripping special tokens changes the decoded
    # length, and the marker text can also occur in the query or the reply.
    new_tokens = outputs[0][prompt_len:]
    expansion = loaded.tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    # Remove a leading <think> block that might remain
    close_tag = "</think>"
    if expansion.startswith("<think>"):
        think_end = expansion.find(close_tag)
        if think_end != -1:
            expansion = expansion[think_end + len(close_tag):].strip()

    _debug_dump("Expansion", expansion[:300])
    return expansion
# ═══════════════════════════════════════════════════════════════════════════════
# MAIN LOOP
# ═══════════════════════════════════════════════════════════════════════════════
def main():
    """Run the interactive session: pick a model, then expand queries in a loop.

    Commands handled inside the loop: ``quit`` exits, ``model`` reopens the
    model menu, ``history`` shows recent queries. Any other non-empty input is
    expanded, scored and displayed. Ctrl-C or EOF at the prompt also exits.
    """
    # Local import: escape() keeps user-supplied text and exception messages
    # from being interpreted as Rich markup when interpolated into markup.
    from rich.markup import escape

    console.clear()
    show_banner()

    # Model selection
    choice = show_model_menu()
    if choice not in MODELS:
        console.print(f"[{RED}]Invalid choice. Exiting.[/]")
        return
    model_name, model_path, model_version = MODELS[choice]
    console.print()
    try:
        loaded = load_model(model_path, model_name, model_version)
    except Exception as e:
        console.print(f"[{RED}]Failed to load model: {escape(str(e))}[/]")
        return
    console.print(f"[{GREEN}]✓ Model loaded: {model_name}[/]")
    console.print()

    # Query history (most recent first, capped at five entries)
    history: deque = deque(maxlen=5)
    input_history = InMemoryHistory()

    # Main loop
    console.print(
        f"[{DIM}]Enter queries to expand. Type 'quit' to exit, 'model' to switch models, "
        f"'history' to list recent queries.[/]"
    )
    console.print()

    while True:
        try:
            # prompt_toolkit does not understand Rich markup, so the message is
            # plain text; the style mapping below supplies the colour.
            query = prompt(
                "> ",
                history=input_history,
                style=Style.from_dict({'': 'ansicyan'}),
            ).strip()
        except (KeyboardInterrupt, EOFError):
            break

        if not query:
            continue

        command = query.lower()
        if command == 'quit':
            break

        if command == 'model':
            console.clear()
            show_banner()
            choice = show_model_menu(loaded.path)
            if choice in MODELS:
                new_name, new_path, new_version = MODELS[choice]
                if new_path != loaded.path:
                    console.print()
                    # A failed switch must not end the session: report it and
                    # keep using the model that is already loaded.
                    try:
                        loaded = load_model(new_path, new_name, new_version)
                    except Exception as e:
                        console.print(f"[{RED}]Failed to load model: {escape(str(e))}[/]")
                        console.print(f"[{DIM}]Keeping current model: {loaded.name}[/]")
                    else:
                        console.print(f"[{GREEN}]✓ Switched to: {new_name}[/]")
            else:
                console.print(f"[{RED}]Invalid choice. Keeping current model.[/]")
            console.print()
            continue

        if command == 'history':
            console.print(render_history(history))
            continue

        # Generate expansion
        with Progress(
            SpinnerColumn(spinner_name="dots", style=MAGENTA),
            TextColumn(f"[{MAGENTA}]Expanding...[/]"),
            console=console,
            transient=True,
        ) as progress:
            progress.add_task("gen", total=None)
            expansion = generate_expansion(loaded, query)

        # Score it
        scores = score_expansion(query, expansion)

        # Add to history
        history.appendleft((query, scores["rating"]))

        # Display results
        console.print()
        console.print(f"[{DIM}]Query: [/][bold white]{escape(query)}[/]")
        console.print()

        # Side-by-side layout
        console.print(Columns([
            render_expansion(expansion, scores),
            render_scores(scores),
        ], equal=True, expand=True))
        console.print()

    console.print(f"\n[{CYAN}]◆ Goodbye![/]\n")
# Start the interactive session only when run as a script, not when imported.
if __name__ == "__main__":
    main()