Dataset improvements:
- Reorder output to put hyde first for better retrieval priming
- Convert absolute paths to relative paths in scripts
- Add convert_to_structured.py for structured data format
- Add qmd_expansion_v3_structured.jsonl with type/query objects
- Update schema.py with reorder_hyde_first() helper
- Verify data now validates hyde-first ordering

Training data regenerated with new ordering (100% validation success).
129 lines
4.5 KiB
Python
129 lines
4.5 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Verify the quality and correctness of the converted ChatML data.
|
|
"""
|
|
import json
|
|
import re
|
|
from pathlib import Path
|
|
|
|
|
|
def verify_chatml_format(text):
    """Verify that the text follows the expected ChatML format.

    Checks, in order: the leading ``<|startoftext|>`` token, a user section,
    an assistant section, the "Expand this search query:" prompt, and -- when
    an assistant section exists -- that it contains ``lex:``, ``vec:`` and
    ``hyde:`` entries with a ``hyde:`` entry on the first line.

    Args:
        text: The full ChatML-formatted training example as a string.

    Returns:
        A list of human-readable issue descriptions; empty when the text
        passes every check.
    """
    issues = []

    # Check start token
    if not text.startswith("<|startoftext|>"):
        issues.append("Missing <|startoftext|> at beginning")

    # Check user section
    user_pattern = r"<\|im_start\|>user\n.*?<\|im_end\|>"
    if not re.search(user_pattern, text, re.DOTALL):
        issues.append("Missing or malformed user section")

    # Check assistant section. The capturing group lets the same match serve
    # both the presence check and the content checks below.
    assistant_pattern = r"<\|im_start\|>assistant\n(.*?)<\|im_end\|>"
    assistant_match = re.search(assistant_pattern, text, re.DOTALL)
    if not assistant_match:
        issues.append("Missing or malformed assistant section")

    # Check for proper query format
    if "Expand this search query:" not in text:
        issues.append("Missing 'Expand this search query:' prompt")

    if assistant_match:
        # str.split always returns at least one element, so entry_lines[0]
        # is safe even for an empty assistant section.
        entry_lines = [
            entry.strip()
            for entry in assistant_match.group(1).strip().split("\n")
        ]

        # Check for required output types. Match on line prefixes rather than
        # substrings so that e.g. "complex:" inside a hyde passage is not
        # mistaken for a "lex:" entry.
        for prefix in ("lex:", "vec:", "hyde:"):
            if not any(entry.startswith(prefix) for entry in entry_lines):
                issues.append(f"Missing {prefix} entries")

        # Validate hyde-first ordering
        if not entry_lines[0].startswith("hyde:"):
            issues.append("Hyde not first (expected hyde-first ordering)")

    return issues
|
|
|
|
def analyze_file(filepath):
    """Analyze a JSONL file of ChatML entries and print a quality report.

    Every non-blank line is parsed as JSON and its ``"text"`` field is
    validated with ``verify_chatml_format``. The report printed to stdout
    contains the entry count, the number of entries with issues, the success
    rate, a per-issue breakdown, and length statistics for the queries and
    assistant responses.

    Args:
        filepath: Path of the JSONL file to read (opened as UTF-8).

    Returns:
        None. All results are written to stdout.

    Raises:
        OSError: If the file cannot be opened.
    """
    print(f"\nAnalyzing {filepath}...")

    missing_text_issue = "Missing or non-string 'text' field"

    # Compiled once here instead of being looked up for every line.
    query_re = re.compile(
        r"Expand this search query: (.*?)<\|im_end\|>", re.DOTALL
    )
    assistant_re = re.compile(
        r"<\|im_start\|>assistant\n(.*?)<\|im_end\|>", re.DOTALL
    )

    total_entries = 0
    total_issues = 0
    issue_counts = {}
    query_lengths = []
    assistant_lengths = []

    with open(filepath, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                # Blank lines (e.g. trailing newlines) are not entries and
                # should not be reported as JSON errors.
                continue

            try:
                entry = json.loads(line)
            except json.JSONDecodeError as e:
                # Unparseable lines are reported but not counted as entries.
                print(f"JSON decode error on line {line_num}: {e}")
                continue

            total_entries += 1

            text = entry.get("text") if isinstance(entry, dict) else None
            if not isinstance(text, str):
                # An entry without usable text is a failed entry; counting it
                # as a success would inflate the reported success rate.
                print(f"Error processing line {line_num}: {missing_text_issue}")
                total_issues += 1
                issue_counts[missing_text_issue] = (
                    issue_counts.get(missing_text_issue, 0) + 1
                )
                continue

            issues = verify_chatml_format(text)
            if issues:
                total_issues += 1
                for issue in issues:
                    issue_counts[issue] = issue_counts.get(issue, 0) + 1

            # Extract query and assistant response for length analysis
            user_match = query_re.search(text)
            assistant_match = assistant_re.search(text)

            if user_match:
                query_lengths.append(len(user_match.group(1).strip()))
            if assistant_match:
                assistant_lengths.append(len(assistant_match.group(1)))

    print(f"Total entries: {total_entries}")
    print(f"Entries with issues: {total_issues}")
    if total_entries:
        success_rate = (total_entries - total_issues) / total_entries * 100
        print(f"Success rate: {success_rate:.1f}%")
    else:
        # Avoid ZeroDivisionError on an empty or fully unparseable file.
        print("Success rate: n/a (no entries)")

    if issue_counts:
        print("\nIssue breakdown:")
        for issue, count in sorted(issue_counts.items()):
            print(f" {issue}: {count}")

    if query_lengths:
        print("\nQuery length stats:")
        print(f" Min: {min(query_lengths)} chars")
        print(f" Max: {max(query_lengths)} chars")
        print(f" Avg: {sum(query_lengths) / len(query_lengths):.1f} chars")

    if assistant_lengths:
        print("\nAssistant response length stats:")
        print(f" Min: {min(assistant_lengths)} chars")
        print(f" Max: {max(assistant_lengths)} chars")
        print(f" Avg: {sum(assistant_lengths) / len(assistant_lengths):.1f} chars")
|
|
|
|
def main():
    """Analyze the train and validation splits, then print a closing banner."""
    # The data is expected in a "train-lfm2" directory next to this script,
    # so the result does not depend on the current working directory.
    dataset_root = Path(__file__).parent / "train-lfm2"

    # Analyze both train and validation sets
    for split_file in ("train.jsonl", "val.jsonl"):
        analyze_file(dataset_root / split_file)

    banner = "=" * 50
    print("\n" + banner)
    print("DATA PREPARATION VERIFICATION COMPLETE")
    print(banner)
|
|
|
|
# Run the verification only when executed as a script, not when imported.
if __name__ == "__main__":
    main()