qmd/test/eval-harness.ts
Tobi Lutke c68904fe08
refactor: move CLI and MCP to subdirectories, MCP consumes SDK
Move frontends into src/cli/ and src/mcp/ to separate them from the
core library. The MCP server is fully rewritten to import only from
the SDK (src/index.ts) — zero direct store.ts/collections.ts/llm.ts
access.

- src/qmd.ts → src/cli/qmd.ts
- src/formatter.ts → src/cli/formatter.ts
- src/mcp.ts → src/mcp/server.ts (rewritten to use QMDStore SDK)
- New src/maintenance.ts: Maintenance class for CLI housekeeping
- SDK gains: getDocumentBody(), getDefaultCollectionNames(),
  extractSnippet/addLineNumbers/DEFAULT_MULTI_GET_MAX_BYTES exports,
  getDefaultDbPath re-export, InternalStore type export
- package.json bin/scripts updated for new paths
- All 692 tests pass
2026-03-10 11:39:55 -04:00

224 lines
6.2 KiB
TypeScript
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* Evaluation Harness for QMD Search
*
* Tests search quality with synthetic queries against known documents.
* Run: bun test/eval-harness.ts
*/
import { execFileSync, execSync } from "child_process";
/** Difficulty tier a query is filed under; results are tallied per tier. */
type EvalDifficulty = "easy" | "medium" | "hard";

/** One synthetic query together with the document it is expected to surface. */
type EvalQuery = {
  /** Text passed to the CLI as the search query. */
  query: string;
  /** Partial match on filename of the document that should be returned. */
  expectedDoc: string;
  /** Tier used to group this query in the summary. */
  difficulty: EvalDifficulty;
  /** Short human-readable note on what the query exercises. */
  description: string;
};

// Test queries with expected documents and difficulty
const evalQueries: EvalQuery[] = [
  // EASY: Exact keyword matches
  { query: "API versioning", expectedDoc: "api-design", difficulty: "easy", description: "Direct keyword match" },
  { query: "Series A fundraising", expectedDoc: "fundraising", difficulty: "easy", description: "Direct keyword match" },
  { query: "CAP theorem", expectedDoc: "distributed-systems", difficulty: "easy", description: "Direct keyword match" },
  { query: "overfitting machine learning", expectedDoc: "machine-learning", difficulty: "easy", description: "Direct keyword match" },
  { query: "remote work VPN", expectedDoc: "remote-work", difficulty: "easy", description: "Direct keyword match" },
  { query: "Project Phoenix retrospective", expectedDoc: "product-launch", difficulty: "easy", description: "Direct keyword match" },

  // MEDIUM: Semantic/conceptual queries
  { query: "how to structure REST endpoints", expectedDoc: "api-design", difficulty: "medium", description: "Conceptual - no exact match" },
  { query: "raising money for startup", expectedDoc: "fundraising", difficulty: "medium", description: "Conceptual - synonyms" },
  { query: "consistency vs availability tradeoffs", expectedDoc: "distributed-systems", difficulty: "medium", description: "Conceptual understanding" },
  { query: "how to prevent models from memorizing data", expectedDoc: "machine-learning", difficulty: "medium", description: "Conceptual - overfitting" },
  { query: "working from home guidelines", expectedDoc: "remote-work", difficulty: "medium", description: "Synonym match" },
  { query: "what went wrong with the launch", expectedDoc: "product-launch", difficulty: "medium", description: "Conceptual query" },

  // HARD: Vague, partial memory, indirect
  { query: "nouns not verbs", expectedDoc: "api-design", difficulty: "hard", description: "Partial phrase recall" },
  { query: "Sequoia investor pitch", expectedDoc: "fundraising", difficulty: "hard", description: "Indirect reference" },
  { query: "Raft algorithm leader election", expectedDoc: "distributed-systems", difficulty: "hard", description: "Specific detail in long doc" },
  { query: "F1 score precision recall", expectedDoc: "machine-learning", difficulty: "hard", description: "Technical detail" },
  { query: "quarterly team gathering travel", expectedDoc: "remote-work", difficulty: "hard", description: "Specific policy detail" },
  { query: "beta program 47 bugs", expectedDoc: "product-launch", difficulty: "hard", description: "Specific number recall" },
];
/**
 * Shape the harness assumes for each element of the JSON array printed by
 * the qmd CLI when invoked with `--json`. The parsed output is not validated
 * against this interface at runtime.
 */
interface SearchResult {
/** Path of the matched document; lower-cased and substring-matched against a query's `expectedDoc`. */
file: string;
/** Result score. Not read anywhere in this harness — NOTE(review): presumably the ranking score; confirm against the CLI output. */
score: number;
/** Document title. Not read anywhere in this harness. */
title: string;
}
/**
 * Builds the argv passed to `bun` for one CLI invocation.
 *
 * The query is handed over as a single argv element, so no shell ever sees it
 * and quotes, `$`, backticks, etc. need no escaping.
 */
function buildQmdArgs(subcommand: "search" | "query", query: string): string[] {
  return ["src/cli/qmd.ts", subcommand, query, "--json", "-n", "5"];
}

/**
 * Parses the CLI's `--json` output into a result list.
 *
 * Whitespace-only output is treated as "no results". Only the `file` field is
 * checked, because it is the only field the harness reads; `score` and `title`
 * are taken on trust.
 *
 * @throws SyntaxError if `raw` is not valid JSON.
 * @throws Error if the JSON is not an array, or an entry lacks a string `file`.
 */
function parseSearchResults(raw: string): SearchResult[] {
  if (raw.trim() === "") return [];

  const parsed: unknown = JSON.parse(raw);
  if (!Array.isArray(parsed)) {
    throw new Error("expected a JSON array of results");
  }

  // Reject the whole payload rather than dropping entries: removing a
  // malformed entry would shift the ranks of everything after it.
  const malformedAt = parsed.findIndex(
    (entry: unknown) =>
      typeof entry !== "object" ||
      entry === null ||
      typeof (entry as { file?: unknown }).file !== "string"
  );
  if (malformedAt !== -1) {
    throw new Error(`result #${malformedAt + 1} has no string "file" field`);
  }
  return parsed;
}

/**
 * Runs `bun src/cli/qmd.ts <subcommand> <query> --json -n 5` and returns the
 * parsed results.
 *
 * Best-effort: any failure (spawn error, non-zero exit, timeout, unparseable
 * output) is reported on stderr and yields an empty list, so one bad query
 * never aborts the whole evaluation run.
 */
function runQmdJson(subcommand: "search" | "query", query: string, timeoutMs: number): SearchResult[] {
  try {
    const output = execFileSync("bun", buildQmdArgs(subcommand, query), {
      encoding: "utf-8",
      timeout: timeoutMs,
      // Discard the child's stderr (replaces the former `2>/dev/null`).
      stdio: ["pipe", "pipe", "ignore"],
    });
    return parseSearchResults(output);
  } catch (e: unknown) {
    const reason = (e instanceof Error ? e.message : String(e)).split("\n")[0];
    console.error(`qmd ${subcommand} failed for "${query}": ${reason}`);
    return [];
  }
}

/**
 * Runs the CLI's `search` subcommand for `query` (top 5 results, 30 s timeout).
 *
 * @returns The parsed results, or `[]` if the command failed or produced
 *   unusable output.
 */
function runSearch(query: string): SearchResult[] {
  return runQmdJson("search", query, 30000);
}

/**
 * Runs the CLI's `query` subcommand for `query` (top 5 results, 60 s timeout).
 *
 * @returns The parsed results, or `[]` if the command failed or produced
 *   unusable output.
 */
function runQuery(query: string): SearchResult[] {
  return runQmdJson("query", query, 60000);
}
/** Hit counters accumulated for one difficulty tier. */
type HitTally = { total: number; hit1: number; hit3: number; hit5: number };

/**
 * Returns the 1-based rank of the first result whose file path contains
 * `expectedDoc`, or -1 when none does. Both sides are lower-cased, so the
 * match is case-insensitive regardless of how `expectedDoc` is written.
 */
function firstHitRank(results: readonly { file: string }[], expectedDoc: string): number {
  const needle = expectedDoc.toLowerCase();
  const index = results.findIndex((r) => r.file.toLowerCase().includes(needle));
  return index === -1 ? -1 : index + 1;
}

/**
 * Formats `hits / total` as a whole-number percentage string (without the
 * `%` sign). An empty bucket yields "0" instead of "NaN".
 */
function hitRatePct(hits: number, total: number): string {
  return total === 0 ? "0" : ((hits / total) * 100).toFixed(0);
}

/**
 * Runs every entry of `evalQueries` through the chosen CLI mode and prints a
 * per-query status line, a per-difficulty Hit@1/3/5 summary, and an overall
 * Hit@1/Hit@3 line.
 *
 * @param mode Which runner to use: `runSearch` for "search", `runQuery` for "query".
 */
function evaluate(mode: "search" | "query") {
  const runFn = mode === "search" ? runSearch : runQuery;
  const results: Record<"easy" | "medium" | "hard", HitTally> = {
    easy: { total: 0, hit1: 0, hit3: 0, hit5: 0 },
    medium: { total: 0, hit1: 0, hit3: 0, hit5: 0 },
    hard: { total: 0, hit1: 0, hit3: 0, hit5: 0 },
  };

  console.log(`\n=== Evaluating ${mode.toUpperCase()} mode ===\n`);

  for (const { query, expectedDoc, difficulty, description } of evalQueries) {
    const firstHit = firstHitRank(runFn(query), expectedDoc);

    const tally = results[difficulty];
    tally.total++;
    if (firstHit === 1) tally.hit1++;
    if (firstHit >= 1 && firstHit <= 3) tally.hit3++;
    if (firstHit >= 1 && firstHit <= 5) tally.hit5++;

    // "✓" = ranked first, "@N" = found at rank N, "✗" = not in the results.
    const status = firstHit === 1 ? "✓" : firstHit > 0 ? `@${firstHit}` : "✗";
    console.log(`[${difficulty.padEnd(6)}] ${status.padEnd(3)} "${query}" → ${description}`);
  }

  console.log("\n--- Summary ---");
  for (const [diff, r] of Object.entries(results)) {
    const hit1Pct = hitRatePct(r.hit1, r.total);
    const hit3Pct = hitRatePct(r.hit3, r.total);
    const hit5Pct = hitRatePct(r.hit5, r.total);
    console.log(`${diff.padEnd(8)}: Hit@1=${hit1Pct}% Hit@3=${hit3Pct}% Hit@5=${hit5Pct}% (n=${r.total})`);
  }

  const total = evalQueries.length;
  const tallies = Object.values(results);
  const totalHit1 = tallies.reduce((a, r) => a + r.hit1, 0);
  const totalHit3 = tallies.reduce((a, r) => a + r.hit3, 0);
  console.log(`\nOverall: Hit@1=${hitRatePct(totalHit1, total)}% Hit@3=${hitRatePct(totalHit3, total)}%`);
}
// Main: print the banner, confirm the eval collection is indexed, run both modes.
const bannerLines = [
  "QMD Evaluation Harness",
  "=".repeat(50),
  `Testing ${evalQueries.length} queries across 6 documents`,
];
for (const line of bannerLines) {
  console.log(line);
}

// Check if eval-docs collection exists. A failing status command only
// produces a warning; the evaluation still runs afterwards.
try {
  const statusJson = execSync("bun src/cli/qmd.ts status --json 2>/dev/null", { encoding: "utf-8" });
  const collectionIndexed = statusJson.includes("eval-docs");
  if (!collectionIndexed) {
    const setupHelp = [
      "\n⚠ eval-docs collection not found. Run:",
      " qmd collection add test/eval-docs --name eval-docs",
      " qmd embed",
    ];
    setupHelp.forEach((line) => console.log(line));
    process.exit(1);
  }
} catch {
  console.log("\n⚠ Could not check status. Make sure qmd is working.");
}

// Run evaluations, keyword search first and then the query mode.
for (const mode of ["search", "query"] as const) {
  evaluate(mode);
}