Replace ad-hoc JSON parsing with a strict Pydantic model (TrainingExample with typed OutputPair). All data loading goes through load_examples(), which fails loudly on invalid data.
- Convert v3_structured.jsonl from "searches" to "output" format
- Rewrite all consumer scripts (prepare, validate, score, analyze) to load through the Pydantic schema
- Prepared train/val files are ephemeral build artifacts
- Restore LFM2 and GEPA experiments under experiments/
- Add pydantic>=2.0 to dependencies
89 lines
2.3 KiB
Python
89 lines
2.3 KiB
Python
#!/usr/bin/env python3
|
|
# /// script
|
|
# requires-python = ">=3.10"
|
|
# dependencies = ["pydantic>=2.0"]
|
|
# ///
|
|
"""Validate JSONL files against the strict QMD training schema."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
from dataset.schema import TrainingExample
|
|
|
|
|
|
def validate_file(path: Path) -> tuple[int, int]:
    """Check every non-blank line of *path* against ``TrainingExample``.

    Each line is stripped of surrounding whitespace; lines that are empty
    afterwards are skipped and not counted. A counted line is an error when
    it is not valid JSON or when ``TrainingExample.model_validate`` raises.
    One diagnostic per failing line is printed as ``<path>:<line number>: ...``.

    Returns:
        A ``(checked, failed)`` pair: the number of non-blank lines seen and
        how many of them failed.
    """
    checked = 0
    failed = 0
    with path.open("r", encoding="utf-8") as handle:
        for lineno, raw in enumerate(handle, start=1):
            text = raw.strip()
            if not text:
                continue
            checked += 1

            # Collect at most one problem description per line, then report
            # it in a single place below.
            problem = None
            try:
                record = json.loads(text)
            except json.JSONDecodeError as exc:
                problem = f"invalid JSON ({exc})"
            else:
                try:
                    TrainingExample.model_validate(record)
                except Exception as exc:
                    # Any failure raised by the schema counts as a bad line.
                    problem = f"{exc}"

            if problem is not None:
                print(f"{path}:{lineno}: {problem}")
                failed += 1

    return checked, failed
|
|
|
|
|
|
def main(argv: list[str] | None = None) -> int:
    """Validate the requested JSONL files and return a process exit code.

    Positional arguments are file paths or glob patterns, resolved against
    the third parent directory of this script (not the current working
    directory). Each matched file is checked with ``validate_file`` and a
    per-file summary line is printed.

    Args:
        argv: Command-line arguments without the program name. ``None``
            (the default) lets argparse read ``sys.argv[1:]``.

    Returns:
        0 when every checked line is valid and every explicitly named file
        was found; 1 when any line fails, a named path is not a regular
        file, or no files matched at all.
    """
    parser = argparse.ArgumentParser(description="Validate QMD JSONL schema")
    parser.add_argument(
        "paths",
        nargs="*",
        default=["finetune/data/*.jsonl"],
        help="JSONL files or glob patterns (default: finetune/data/*.jsonl)",
    )
    args = parser.parse_args(argv)

    repo_root = Path(__file__).parent.parent.parent

    # A set de-duplicates files matched by more than one pattern so they are
    # not validated (and counted) twice.
    files: set[Path] = set()
    missing: list[str] = []
    for pattern in args.paths:
        candidate = repo_root / pattern
        if candidate.is_file():
            # A literal path wins, even if its name contains glob characters.
            files.add(candidate)
        elif any(ch in pattern for ch in "*?["):
            # A glob that matches nothing is not an error by itself; only
            # regular files are kept so directories never reach open().
            files.update(p for p in repo_root.glob(pattern) if p.is_file())
        else:
            # An explicitly named path that is absent (or is a directory)
            # must not be skipped silently, or the run could "pass" without
            # checking what the caller asked for.
            missing.append(pattern)

    for name in missing:
        print(f"{name}: not found or not a regular file")

    if not files:
        print("No files found to validate.")
        return 1

    total_lines = 0
    total_errors = 0
    for path in sorted(files):
        lines, errors = validate_file(path)
        total_lines += lines
        total_errors += errors
        status = "OK" if errors == 0 else f"{errors} error(s)"
        print(f"{path}: {lines} lines, {status}")

    if total_errors:
        print(
            f"\nValidation failed: {total_errors} error(s) across {total_lines} lines"
        )
        return 1

    if missing:
        print(f"\nValidation failed: {len(missing)} requested file(s) not found")
        return 1

    print(f"\nValidation passed: {total_lines} lines checked")
    return 0
|
|
|
|
|
|
# Script entry point: run the validator and use its return value as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())
|