qmd/finetune/data/convert_to_structured.py
Tobi Lütke 48f0917269
feat(finetune): hyde-first ordering, relative paths, structured format
Dataset improvements:
- Reorder output to put hyde first for better retrieval priming
- Convert absolute paths to relative paths in scripts
- Add convert_to_structured.py for structured data format
- Add qmd_expansion_v3_structured.jsonl with type/query objects
- Update schema.py with reorder_hyde_first() helper
- Verify data now validates hyde-first ordering

Training data regenerated with new ordering (100% validation success).
2026-02-17 06:31:35 -05:00

68 lines
1.9 KiB
Python

#!/usr/bin/env python3
"""
Convert QMD expansion JSONL to structured format with type/query objects.
Also applies hyde-first ordering.
"""
import json
from pathlib import Path
def reorder_hyde_first(output_items):
    """Return a new list with hyde items first, then lex, then vec.

    Each item is expected to be a sequence whose first element is its type
    tag (e.g. ``["hyde", "..."]``). The relative order of items sharing a
    type is preserved. Items with an unrecognised type are kept and placed
    after the known types instead of being silently dropped, so reordering
    never loses data.

    Args:
        output_items: Iterable of ``[type, content]`` style items.

    Returns:
        A new list with the same items in hyde/lex/vec/(other) order; the
        input is not modified.
    """
    priority = {"hyde": 0, "lex": 1, "vec": 2}
    unknown_rank = len(priority)
    # sorted() is stable, so items with equal rank keep their input order.
    return sorted(
        output_items,
        key=lambda item: priority.get(item[0], unknown_rank),
    )
def convert_to_structured(entry):
    """Turn one flat expansion record into the structured representation.

    Reads ``entry["query"]`` and the optional ``entry["output"]`` list of
    ``(type, content)`` pairs, reorders the pairs with
    ``reorder_hyde_first`` and returns a dict holding the original query
    plus a ``searches`` list of ``{"type": ..., "query": ...}`` objects.
    """
    original_query = entry["query"]
    ordered_pairs = reorder_hyde_first(entry.get("output", []))
    # Each pair must unpack into exactly (type, content).
    structured_searches = [
        {"type": kind, "query": text} for kind, text in ordered_pairs
    ]
    return {"query": original_query, "searches": structured_searches}
def main():
    """Convert ``qmd_expansion_v3.jsonl`` to the structured JSONL format.

    Reads the input file located next to this script, converts every
    non-blank line with ``convert_to_structured`` and writes one JSON object
    per line to ``qmd_expansion_v3_structured.jsonl`` in the same directory.
    Afterwards the first written entry is read back and printed as a sample.
    """
    script_dir = Path(__file__).parent
    input_file = script_dir / "qmd_expansion_v3.jsonl"
    output_file = script_dir / "qmd_expansion_v3_structured.jsonl"

    print(f"Converting {input_file} to structured format...")

    count = 0
    with open(input_file, 'r', encoding='utf-8') as f_in, \
            open(output_file, 'w', encoding='utf-8') as f_out:
        for line in f_in:
            if not line.strip():
                # Blank lines carry no record; skip them.
                continue
            entry = json.loads(line)
            structured = convert_to_structured(entry)
            f_out.write(json.dumps(structured, ensure_ascii=False) + '\n')
            count += 1

    print(f"Converted {count} entries to {output_file}")

    # Show sample. The file was written as UTF-8 with ensure_ascii=False, so
    # it must be read back as UTF-8 too rather than with the locale default.
    with open(output_file, 'r', encoding='utf-8') as f:
        first_line = f.readline()

    if first_line.strip():
        print("\nSample entry:")
        print(json.dumps(json.loads(first_line), indent=2))
    else:
        # Nothing was written (empty input); avoid json.loads("") crashing.
        print("\nNo entries were converted; no sample to show.")
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()