Add evaluation harness with synthetic test documents
- 6 public-style documents covering diverse topics
- 18 test queries: 6 easy, 6 medium, 6 hard
  - Easy: exact keyword matches
  - Medium: semantic/conceptual queries
  - Hard: partial recall, indirect references
- Measures Hit@1, Hit@3, Hit@5 by difficulty
- Tests both search (BM25) and query (hybrid) modes

Run: bun test/eval-harness.ts

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
a0bceef36a
commit
7828566333
@ -10,7 +10,7 @@
|
||||
{"id":"qmd-6s5","title":"Export current database to index.yml","description":"Write a script to export current collections and path_contexts from SQLite to ~/.config/qmd/index.yml format. Include all collection metadata and contexts.","status":"closed","priority":1,"issue_type":"task","created_at":"2025-12-13T09:54:52.707844-05:00","updated_at":"2025-12-13T09:57:36.650437-05:00","closed_at":"2025-12-13T09:57:36.650437-05:00","dependencies":[{"issue_id":"qmd-6s5","depends_on_id":"qmd-3z9","type":"blocks","created_at":"2025-12-13T09:55:07.606834-05:00","created_by":"daemon"}]}
|
||||
{"id":"qmd-7ss","title":"remove all the symlinks and stuff in the git repo, clean up the root directory","description":"","status":"closed","priority":4,"issue_type":"task","created_at":"2025-12-12T16:40:00.744982-05:00","updated_at":"2025-12-12T17:11:18.034215-05:00","closed_at":"2025-12-12T17:11:18.034215-05:00"}
|
||||
{"id":"qmd-8eu","title":"Update documents table schema for collection names","description":"Change documents.collection_id (integer FK) to documents.collection (text). Update all queries and indices. Keep backwards compatibility during transition.","design":"Schema change:\n- Add `collection TEXT` column\n- Migrate data: UPDATE documents SET collection = (SELECT name FROM collections WHERE id = collection_id)\n- Drop collection_id column\n- Update FTS5 trigger\n- Update all queries in store.ts","status":"closed","priority":1,"issue_type":"task","created_at":"2025-12-13T09:54:52.830305-05:00","updated_at":"2025-12-13T10:08:24.88716-05:00","closed_at":"2025-12-13T10:08:24.88716-05:00","dependencies":[{"issue_id":"qmd-8eu","depends_on_id":"qmd-6s5","type":"blocks","created_at":"2025-12-13T09:55:07.662048-05:00","created_by":"daemon"}]}
|
||||
{"id":"qmd-9ij","title":"Conditional query expansion based on BM25 signal strength","description":"Query expansion helps recall but injects false positives. Skip expansion if original BM25 top-5 has strong signals (exact term hits, high proximity). Only expand when recall is weak.","status":"in_progress","priority":3,"issue_type":"feature","created_at":"2025-12-20T17:18:41.806447-05:00","updated_at":"2025-12-21T12:04:20.130497-05:00"}
|
||||
{"id":"qmd-9ij","title":"Conditional query expansion based on BM25 signal strength","description":"Query expansion helps recall but injects false positives. Skip expansion if original BM25 top-5 has strong signals (exact term hits, high proximity). Only expand when recall is weak.","status":"closed","priority":3,"issue_type":"feature","created_at":"2025-12-20T17:18:41.806447-05:00","updated_at":"2025-12-21T12:05:40.85997-05:00","closed_at":"2025-12-21T12:05:40.85997-05:00"}
|
||||
{"id":"qmd-9ua","title":"Update all qmd commands for YAML-based collections","description":"Update qmd.ts commands: collection add/list/remove/rename, status, update, ls. All should use collections.ts instead of store.ts collection functions.","status":"closed","priority":1,"issue_type":"task","created_at":"2025-12-13T09:54:53.14644-05:00","updated_at":"2025-12-13T10:17:39.67707-05:00","closed_at":"2025-12-13T10:17:39.67707-05:00","dependencies":[{"issue_id":"qmd-9ua","depends_on_id":"qmd-u84","type":"blocks","created_at":"2025-12-13T09:55:07.893268-05:00","created_by":"daemon"},{"issue_id":"qmd-9ua","depends_on_id":"qmd-oxy","type":"blocks","created_at":"2025-12-13T09:55:07.942221-05:00","created_by":"daemon"}]}
|
||||
{"id":"qmd-afe","title":"implement qmd collection rename, which changes the global path prefix for the collection","description":"","status":"closed","priority":2,"issue_type":"task","created_at":"2025-12-12T15:55:54.779325-05:00","updated_at":"2025-12-12T16:29:24.153196-05:00","closed_at":"2025-12-12T16:29:24.153196-05:00"}
|
||||
{"id":"qmd-ama","title":"Refactor database system","description":"All documents should be stored as content addressable hash, e.g. hash, doc, created_at,\n┃ updated_at. documents should be a file system layer on top e.g. collection, path, hash,\n┃ created_at, updated_at. (collection,path)\n┃\n┃\n\n┃ All documents should be stored as content addressable hash, e.g. hash, doc, created_at,\n┃ updated_at. documents should be a file system layer on top e.g. collection_id, path, hash,\n┃ created_at, updated_at. (collection,path) is unique. There is also collection which stores PWD\n┃ + glob pattern, name (\\w+). Every document is treated as path qmd://collection.name/","notes":"## Completed\n- ✅ Implemented content-addressable storage (content table with hash→doc mapping)\n- ✅ Refactored documents table as file system layer (collection_id, path, hash)\n- ✅ Added collection names (e.g., \"pages\", \"journals\", \"archive\")\n- ✅ Implemented virtual paths (qmd://collection-name/path/to/file.md)\n- ✅ Added hierarchical context support (collection-scoped)\n- ✅ Successfully migrated existing database\n- ✅ Updated search functions to work with new schema\n- ✅ Updated indexing logic to use content-addressable storage\n- ✅ Orphaned content hash cleanup\n\n## Still TODO\n- Fix migration SQL to properly extract basename (currently needs manual fix)\n- Implement `qmd collection add . --name \u003cname\u003e --mask '**/*.md'`\n- Implement `qmd ls [path]` for exploring virtual file tree","status":"closed","priority":2,"issue_type":"task","created_at":"2025-12-10T10:57:35.497489-05:00","updated_at":"2025-12-12T15:39:48.879143-05:00","closed_at":"2025-12-12T15:39:48.879143-05:00"}
|
||||
@ -19,7 +19,7 @@
|
||||
{"id":"qmd-bx1","title":"Fix migration SQL for proper basename extraction","description":"The migration currently generates collection names incorrectly (uses full path instead of basename). Need to fix the SQL in migrateToContentAddressable to properly extract the directory basename.","status":"closed","priority":1,"issue_type":"bug","created_at":"2025-12-12T15:29:53.757723-05:00","updated_at":"2025-12-12T15:50:29.349134-05:00","closed_at":"2025-12-12T15:50:29.349134-05:00","dependencies":[{"issue_id":"qmd-bx1","depends_on_id":"qmd-ama","type":"discovered-from","created_at":"2025-12-12T15:29:53.758524-05:00","created_by":"daemon"}]}
|
||||
{"id":"qmd-c0m","title":"Comprehensive CLI review and consistency pass","description":"Review entire CLI command structure:\n- Consistent naming (add vs create, remove vs delete)\n- Consistent flag usage (--name, --mask, etc)\n- Update help text for all commands\n- Ensure virtual paths work everywhere\n- Test all commands end-to-end","status":"closed","priority":1,"issue_type":"task","created_at":"2025-12-12T15:29:38.083564-05:00","updated_at":"2025-12-12T16:06:51.544695-05:00","closed_at":"2025-12-12T16:06:51.544695-05:00"}
|
||||
{"id":"qmd-clr","title":"fix embed","description":"","status":"closed","priority":2,"issue_type":"task","created_at":"2025-12-12T16:14:55.292114-05:00","updated_at":"2025-12-12T16:31:27.661829-05:00","closed_at":"2025-12-12T16:31:27.661829-05:00"}
|
||||
{"id":"qmd-d00","title":"Add offline evaluation harness for tuning","description":"Create a small benchmark with ~100 labeled queries from real searches. Would enable tuning: expansion on/off threshold, candidate count (30 vs 100), blending weights, reranker threshold.","notes":"Test samples must be: 1) entirely synthetic, OR 2) public documents (e.g., public podcasts, public memos). No private/personal content in eval set.","status":"open","priority":3,"issue_type":"feature","created_at":"2025-12-20T17:18:42.007265-05:00","updated_at":"2025-12-21T12:04:11.951081-05:00"}
|
||||
{"id":"qmd-d00","title":"Add offline evaluation harness for tuning","description":"Create a small benchmark with ~100 labeled queries from real searches. Would enable tuning: expansion on/off threshold, candidate count (30 vs 100), blending weights, reranker threshold.","notes":"Test samples must be: 1) entirely synthetic, OR 2) public documents (e.g., public podcasts, public memos). No private/personal content in eval set.","status":"in_progress","priority":3,"issue_type":"feature","created_at":"2025-12-20T17:18:42.007265-05:00","updated_at":"2025-12-21T12:07:22.319147-05:00"}
|
||||
{"id":"qmd-deh","title":"Refactor database introduce qmd collection *","description":"","status":"closed","priority":2,"issue_type":"task","created_at":"2025-12-10T10:56:04.516137-05:00","updated_at":"2025-12-12T16:12:12.349428-05:00","closed_at":"2025-12-12T16:12:12.349428-05:00"}
|
||||
{"id":"qmd-df5","title":"Rerank multiple chunks per document with score aggregation","description":"Currently we only rerank 1 chunk per doc (selected by keyword heuristic). Should rerank top 2-3 chunks per document, then aggregate scores (max, softmax, or top-2 average). This improves ranking for long documents where the keyword-matched chunk isn't always the most relevant.","status":"closed","priority":2,"issue_type":"feature","created_at":"2025-12-20T17:18:41.592575-05:00","updated_at":"2025-12-21T12:04:11.777309-05:00","closed_at":"2025-12-21T12:04:11.777309-05:00"}
|
||||
{"id":"qmd-dmi","title":"Implement 'qmd collection' commands","description":"Add explicit collection management:\n- qmd collection add . --name \u003cname\u003e --mask '**/*.md'\n- qmd collection list\n- qmd collection remove \u003cname\u003e\n\nThis gives users control over collection names and patterns.","status":"closed","priority":1,"issue_type":"feature","created_at":"2025-12-12T15:29:53.810666-05:00","updated_at":"2025-12-12T16:02:08.079158-05:00","closed_at":"2025-12-12T16:02:08.079158-05:00","dependencies":[{"issue_id":"qmd-dmi","depends_on_id":"qmd-ama","type":"discovered-from","created_at":"2025-12-12T15:29:53.811294-05:00","created_by":"daemon"}]}
|
||||
|
||||
223
test/eval-harness.ts
Normal file
223
test/eval-harness.ts
Normal file
@ -0,0 +1,223 @@
|
||||
/**
|
||||
* Evaluation Harness for QMD Search
|
||||
*
|
||||
* Tests search quality with synthetic queries against known documents.
|
||||
* Run: bun test/eval-harness.ts
|
||||
*/
|
||||
|
||||
import { execSync } from "child_process";
|
||||
|
||||
// Labelled evaluation set: each case pairs a query with the document that
// should be retrieved for it, tagged by how hard the lookup is meant to be.
// Cases are listed as [query, expectedDoc, difficulty, description] rows and
// expanded into objects below; expectedDoc is a partial match on the filename.
const evalQueries: {
  query: string;
  expectedDoc: string; // Partial match on filename
  difficulty: "easy" | "medium" | "hard";
  description: string;
}[] = (
  [
    // EASY: Exact keyword matches
    ["API versioning", "api-design", "easy", "Direct keyword match"],
    ["Series A fundraising", "fundraising", "easy", "Direct keyword match"],
    ["CAP theorem", "distributed-systems", "easy", "Direct keyword match"],
    ["overfitting machine learning", "machine-learning", "easy", "Direct keyword match"],
    ["remote work VPN", "remote-work", "easy", "Direct keyword match"],
    ["Project Phoenix retrospective", "product-launch", "easy", "Direct keyword match"],

    // MEDIUM: Semantic/conceptual queries
    ["how to structure REST endpoints", "api-design", "medium", "Conceptual - no exact match"],
    ["raising money for startup", "fundraising", "medium", "Conceptual - synonyms"],
    ["consistency vs availability tradeoffs", "distributed-systems", "medium", "Conceptual understanding"],
    ["how to prevent models from memorizing data", "machine-learning", "medium", "Conceptual - overfitting"],
    ["working from home guidelines", "remote-work", "medium", "Synonym match"],
    ["what went wrong with the launch", "product-launch", "medium", "Conceptual query"],

    // HARD: Vague, partial memory, indirect
    ["nouns not verbs", "api-design", "hard", "Partial phrase recall"],
    ["Sequoia investor pitch", "fundraising", "hard", "Indirect reference"],
    ["Raft algorithm leader election", "distributed-systems", "hard", "Specific detail in long doc"],
    ["F1 score precision recall", "machine-learning", "hard", "Technical detail"],
    ["quarterly team gathering travel", "remote-work", "hard", "Specific policy detail"],
    ["beta program 47 bugs", "product-launch", "hard", "Specific number recall"],
  ] as const
).map(([query, expectedDoc, difficulty, description]) => ({
  query,
  expectedDoc,
  difficulty,
  description,
}));
|
||||
|
||||
/**
 * One row of the JSON array that the qmd CLI is expected to print when
 * invoked with `--json`.
 */
interface SearchResult {
  // Identifies the matched document; evaluate() lowercases it and checks
  // whether it contains the expected document name.
  file: string;
  // Not read anywhere in this harness — presumably the relevance score
  // reported by qmd; confirm against the CLI output.
  score: number;
  // Not read anywhere in this harness — presumably the document title;
  // confirm against the CLI output.
  title: string;
}
|
||||
|
||||
/**
 * Wraps `value` in single quotes so a POSIX shell passes it to the child
 * process as one literal argument. Inside single quotes nothing is expanded
 * (`$`, backticks and backslashes stay literal); an embedded single quote is
 * emitted as `'\''` (close quote, escaped quote, reopen quote).
 */
function quoteForShell(value: string): string {
  return `'${value.replace(/'/g, `'\\''`)}'`;
}

/**
 * Invokes `bun src/qmd.ts <subcommand> <query> --json -n 5` and returns the
 * parsed result rows.
 *
 * Failures never throw: a failed/timed-out command or unparseable output is
 * reported on stderr and an empty list is returned, so one broken invocation
 * cannot abort the whole evaluation run but is still visible in the log
 * instead of silently counting as a ranking miss.
 *
 * @param subcommand qmd subcommand to run ("search" or "query").
 * @param query      Raw query text; it is shell-quoted before interpolation.
 * @param timeoutMs  Maximum time to wait for the command, in milliseconds.
 * @returns The parsed result array, or `[]` on failure or empty output.
 */
function runQmd(
  subcommand: "search" | "query",
  query: string,
  timeoutMs: number
): SearchResult[] {
  let output: string;
  try {
    output = execSync(
      `bun src/qmd.ts ${subcommand} ${quoteForShell(query)} --json -n 5 2>/dev/null`,
      { encoding: "utf-8", timeout: timeoutMs }
    );
  } catch (e) {
    const reason = e instanceof Error ? e.message : String(e);
    console.error(`qmd ${subcommand} failed for "${query}": ${reason}`);
    return [];
  }

  // Empty output is treated as "no results" rather than a parse failure.
  if (output.trim() === "") return [];

  try {
    const parsed: unknown = JSON.parse(output);
    if (Array.isArray(parsed)) return parsed as SearchResult[];
    console.error(`qmd ${subcommand} returned non-array JSON for "${query}"`);
  } catch (e) {
    const reason = e instanceof Error ? e.message : String(e);
    console.error(`qmd ${subcommand} returned invalid JSON for "${query}": ${reason}`);
  }
  return [];
}

/**
 * Runs the `search` subcommand of qmd for `query` (30 s timeout).
 *
 * @param query Query text to search for.
 * @returns Up to five result rows, or `[]` if the command failed.
 */
function runSearch(query: string): SearchResult[] {
  return runQmd("search", query, 30000);
}

/**
 * Runs the `query` subcommand of qmd for `query` (60 s timeout).
 *
 * @param query Query text to search for.
 * @returns Up to five result rows, or `[]` if the command failed.
 */
function runQuery(query: string): SearchResult[] {
  return runQmd("query", query, 60000);
}
|
||||
|
||||
/**
 * Runs every case in `evalQueries` through one retrieval mode and prints a
 * per-query line, a per-difficulty summary and an overall summary.
 *
 * A case counts as a hit at rank k when the first result whose `file`
 * contains the case's `expectedDoc` (case-insensitively) is at position <= k.
 *
 * @param mode "search" uses runSearch, "query" uses runQuery.
 */
function evaluate(mode: "search" | "query") {
  const runFn = mode === "search" ? runSearch : runQuery;
  const results = {
    easy: { total: 0, hit1: 0, hit3: 0, hit5: 0 },
    medium: { total: 0, hit1: 0, hit3: 0, hit5: 0 },
    hard: { total: 0, hit1: 0, hit3: 0, hit5: 0 },
  };

  // Formats hits/total as a whole-number percentage; an empty bucket yields
  // "0" instead of "NaN" from a division by zero.
  const pct = (hits: number, total: number): string =>
    total > 0 ? ((hits / total) * 100).toFixed(0) : "0";

  console.log(`\n=== Evaluating ${mode.toUpperCase()} mode ===\n`);

  for (const { query, expectedDoc, difficulty, description } of evalQueries) {
    const searchResults = runFn(query);

    // Lowercase both sides so the comparison is genuinely case-insensitive,
    // and skip rows without a string `file` rather than throwing on them.
    const needle = expectedDoc.toLowerCase();
    const hitIndex = searchResults.findIndex(
      (r) => r != null && typeof r.file === "string" && r.file.toLowerCase().includes(needle)
    );
    // 1-based rank of the first matching result, or -1 when nothing matched.
    const firstHit = hitIndex === -1 ? -1 : hitIndex + 1;

    const bucket = results[difficulty];
    bucket.total++;
    if (firstHit === 1) bucket.hit1++;
    if (firstHit >= 1 && firstHit <= 3) bucket.hit3++;
    if (firstHit >= 1 && firstHit <= 5) bucket.hit5++;

    const status = firstHit === 1 ? "✓" : firstHit > 0 ? `@${firstHit}` : "✗";
    console.log(`[${difficulty.padEnd(6)}] ${status.padEnd(3)} "${query}" → ${description}`);
  }

  console.log("\n--- Summary ---");
  for (const [diff, r] of Object.entries(results)) {
    const hit1Pct = pct(r.hit1, r.total);
    const hit3Pct = pct(r.hit3, r.total);
    const hit5Pct = pct(r.hit5, r.total);
    console.log(`${diff.padEnd(8)}: Hit@1=${hit1Pct}% Hit@3=${hit3Pct}% Hit@5=${hit5Pct}% (n=${r.total})`);
  }

  const total = evalQueries.length;
  const buckets = Object.values(results);
  const totalHit1 = buckets.reduce((a, r) => a + r.hit1, 0);
  const totalHit3 = buckets.reduce((a, r) => a + r.hit3, 0);
  // Hit@5 is tallied per difficulty, so report it in the overall line too.
  const totalHit5 = buckets.reduce((a, r) => a + r.hit5, 0);
  console.log(
    `\nOverall: Hit@1=${pct(totalHit1, total)}% Hit@3=${pct(totalHit3, total)}% Hit@5=${pct(totalHit5, total)}%`
  );
}
|
||||
|
||||
// Main: print the banner, verify the eval-docs collection is indexed, then
// run the evaluation once per retrieval mode.
const bannerLines = [
  "QMD Evaluation Harness",
  "=".repeat(50),
  `Testing ${evalQueries.length} queries across 6 documents`,
];
for (const line of bannerLines) {
  console.log(line);
}

// Preflight: ask qmd for its status and look for the eval-docs collection.
// If the status command itself fails we only warn and carry on.
let statusOutput: string | null = null;
try {
  statusOutput = execSync("bun src/qmd.ts status --json 2>/dev/null", { encoding: "utf-8" });
} catch {
  console.log("\n⚠️ Could not check status. Make sure qmd is working.");
}

// A successful status call that does not mention eval-docs is fatal: print
// the setup commands and stop before running any queries.
if (statusOutput !== null && !statusOutput.includes("eval-docs")) {
  console.log("\n⚠️ eval-docs collection not found. Run:");
  console.log(" qmd collection add test/eval-docs --name eval-docs");
  console.log(" qmd embed");
  process.exit(1);
}

// Run evaluations: "search" first, then "query".
for (const mode of ["search", "query"] as const) {
  evaluate(mode);
}
|
||||
Loading…
Reference in New Issue
Block a user