qmd/finetune/data/verify_data.py
Tobi Lütke 48f0917269
feat(finetune): hyde-first ordering, relative paths, structured format
Dataset improvements:
- Reorder output to put hyde first for better retrieval priming
- Convert absolute paths to relative paths in scripts
- Add convert_to_structured.py for structured data format
- Add qmd_expansion_v3_structured.jsonl with type/query objects
- Update schema.py with reorder_hyde_first() helper
- Verify data now validates hyde-first ordering

Training data regenerated with new ordering (100% validation success).
2026-02-17 06:31:35 -05:00

129 lines
4.5 KiB
Python

#!/usr/bin/env python3
"""
Verify the quality and correctness of the converted ChatML data.
"""
import json
import re
from pathlib import Path
def verify_chatml_format(text):
    """Check one training example against the expected ChatML layout.

    Args:
        text: The full ChatML string of a single example.

    Returns:
        A list of human-readable issue descriptions, in a fixed order.
        An empty list means every check passed.
    """
    issues = []

    # The example must open with the start-of-text token.
    if not text.startswith("<|startoftext|>"):
        issues.append("Missing <|startoftext|> at beginning")

    # A user turn delimited by im_start/im_end must be present.
    user_pattern = r"<\|im_start\|>user\n.*?<\|im_end\|>"
    if not re.search(user_pattern, text, re.DOTALL):
        issues.append("Missing or malformed user section")

    # Search for the assistant turn once; the captured body is reused below
    # for the per-entry checks.
    assistant_match = re.search(
        r"<\|im_start\|>assistant\n(.*?)<\|im_end\|>", text, re.DOTALL
    )
    if assistant_match is None:
        issues.append("Missing or malformed assistant section")

    # The instruction prompt must appear somewhere in the example.
    if "Expand this search query:" not in text:
        issues.append("Missing 'Expand this search query:' prompt")

    if assistant_match is not None:
        # str.split always yields at least one element, so entry_lines[0]
        # is safe even when the assistant body is empty.
        entry_lines = [
            line.strip()
            for line in assistant_match.group(1).strip().split("\n")
        ]

        # Match each required type as a line prefix. A plain substring test
        # would accept "lex:" inside words such as "complex:" in another
        # entry's text and hide a genuinely missing entry.
        for prefix in ("lex:", "vec:", "hyde:"):
            if not any(line.startswith(prefix) for line in entry_lines):
                issues.append(f"Missing {prefix} entries")

        # Validate hyde-first ordering.
        if not entry_lines[0].startswith("hyde:"):
            issues.append("Hyde not first (expected hyde-first ordering)")

    return issues
def _print_length_stats(title, lengths):
    """Print min/max/average for a list of lengths; print nothing if empty."""
    if not lengths:
        return
    print(f"\n{title}:")
    print(f" Min: {min(lengths)} chars")
    print(f" Max: {max(lengths)} chars")
    print(f" Avg: {sum(lengths) / len(lengths):.1f} chars")


def analyze_file(filepath):
    """Analyze a JSONL file of ChatML examples and print a quality report.

    Each non-blank line is parsed as JSON and its ``"text"`` field is checked
    with ``verify_chatml_format``. The report printed to stdout contains the
    entry count, the number of entries with issues, the success rate, a
    per-issue breakdown, and length statistics for queries and assistant
    responses.

    Args:
        filepath: Path of the JSONL file to read (opened as UTF-8).

    Returns:
        None. All results are printed.
    """
    print(f"\nAnalyzing {filepath}...")

    total_entries = 0
    total_issues = 0
    issue_counts = {}
    query_lengths = []
    assistant_lengths = []

    # Compile once instead of on every line of the file.
    query_re = re.compile(
        r"Expand this search query: (.*?)<\|im_end\|>", re.DOTALL
    )
    assistant_re = re.compile(
        r"<\|im_start\|>assistant\n(.*?)<\|im_end\|>", re.DOTALL
    )

    with open(filepath, 'r', encoding='utf-8') as f:
        for line_num, raw_line in enumerate(f, 1):
            line = raw_line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) are not records and
                # must not be reported as JSON errors.
                continue

            try:
                entry = json.loads(line)
            except json.JSONDecodeError as e:
                # Unparseable lines are reported but not counted as entries.
                print(f"JSON decode error on line {line_num}: {e}")
                continue

            total_entries += 1

            text = entry.get("text") if isinstance(entry, dict) else None
            if not isinstance(text, str):
                # A record without usable text is a failed entry; count it
                # so it cannot inflate the success rate.
                issue = "Missing or non-string 'text' field"
                print(f"Error processing line {line_num}: {issue}")
                total_issues += 1
                issue_counts[issue] = issue_counts.get(issue, 0) + 1
                continue

            issues = verify_chatml_format(text)
            if issues:
                total_issues += 1
                for issue in issues:
                    issue_counts[issue] = issue_counts.get(issue, 0) + 1

            # Extract query and assistant response for length analysis.
            user_match = query_re.search(text)
            assistant_match = assistant_re.search(text)
            if user_match:
                query_lengths.append(len(user_match.group(1).strip()))
            if assistant_match:
                assistant_lengths.append(len(assistant_match.group(1)))

    print(f"Total entries: {total_entries}")
    print(f"Entries with issues: {total_issues}")
    if total_entries:
        success_rate = (total_entries - total_issues) / total_entries * 100
        print(f"Success rate: {success_rate:.1f}%")
    else:
        # Avoid dividing by zero on an empty (or entirely unparseable) file.
        print("Success rate: N/A (no entries)")

    if issue_counts:
        print("\nIssue breakdown:")
        for issue, count in sorted(issue_counts.items()):
            print(f" {issue}: {count}")

    _print_length_stats("Query length stats", query_lengths)
    _print_length_stats("Assistant response length stats", assistant_lengths)
def main():
    """Analyze the train and validation splits stored next to this script."""
    # Resolve the dataset directory from this file's location rather than
    # the current working directory.
    data_dir = Path(__file__).parent / "train-lfm2"

    # Train split first, then validation.
    for split_file in ("train.jsonl", "val.jsonl"):
        analyze_file(data_dir / split_file)

    banner = "=" * 50
    print("\n" + banner)
    print("DATA PREPARATION VERIFICATION COMPLETE")
    print(banner)


if __name__ == "__main__":
    main()