Move frontends into src/cli/ and src/mcp/ to separate them from the core library. The MCP server is fully rewritten to import only from the SDK (src/index.ts) — zero direct store.ts/collections.ts/llm.ts access.

- src/qmd.ts → src/cli/qmd.ts
- src/formatter.ts → src/cli/formatter.ts
- src/mcp.ts → src/mcp/server.ts (rewritten to use QMDStore SDK)
- New src/maintenance.ts: Maintenance class for CLI housekeeping
- SDK gains: getDocumentBody(), getDefaultCollectionNames(), extractSnippet/addLineNumbers/DEFAULT_MULTI_GET_MAX_BYTES exports, getDefaultDbPath re-export, InternalStore type export
- package.json bin/scripts updated for new paths
- All 692 tests pass
224 lines
6.2 KiB
TypeScript
224 lines
6.2 KiB
TypeScript
/**
 * Evaluation Harness for QMD Search
 *
 * Tests search quality with synthetic queries against known documents.
 * Run: bun test/eval-harness.ts
 */

import { execFileSync, execSync } from "child_process";

// Test queries with expected documents and difficulty
/**
 * Synthetic evaluation set: each entry pairs a query with the document that
 * should be retrieved for it.
 *
 * - `query`       text handed to the CLI.
 * - `expectedDoc` substring looked for in each result's file path. The
 *                 harness lowercases the result path before comparing but
 *                 does not lowercase this value, so keep it lowercase.
 * - `difficulty`  bucket used when summarising hit rates.
 * - `description` short note printed next to the per-query outcome.
 */
const evalQueries: {
  query: string;
  expectedDoc: string; // Partial match on filename
  difficulty: "easy" | "medium" | "hard";
  description: string;
}[] = [
  // EASY: Exact keyword matches
  {
    query: "API versioning",
    expectedDoc: "api-design",
    difficulty: "easy",
    description: "Direct keyword match"
  },
  {
    query: "Series A fundraising",
    expectedDoc: "fundraising",
    difficulty: "easy",
    description: "Direct keyword match"
  },
  {
    query: "CAP theorem",
    expectedDoc: "distributed-systems",
    difficulty: "easy",
    description: "Direct keyword match"
  },
  {
    query: "overfitting machine learning",
    expectedDoc: "machine-learning",
    difficulty: "easy",
    description: "Direct keyword match"
  },
  {
    query: "remote work VPN",
    expectedDoc: "remote-work",
    difficulty: "easy",
    description: "Direct keyword match"
  },
  {
    query: "Project Phoenix retrospective",
    expectedDoc: "product-launch",
    difficulty: "easy",
    description: "Direct keyword match"
  },

  // MEDIUM: Semantic/conceptual queries
  {
    query: "how to structure REST endpoints",
    expectedDoc: "api-design",
    difficulty: "medium",
    description: "Conceptual - no exact match"
  },
  {
    query: "raising money for startup",
    expectedDoc: "fundraising",
    difficulty: "medium",
    description: "Conceptual - synonyms"
  },
  {
    query: "consistency vs availability tradeoffs",
    expectedDoc: "distributed-systems",
    difficulty: "medium",
    description: "Conceptual understanding"
  },
  {
    query: "how to prevent models from memorizing data",
    expectedDoc: "machine-learning",
    difficulty: "medium",
    description: "Conceptual - overfitting"
  },
  {
    query: "working from home guidelines",
    expectedDoc: "remote-work",
    difficulty: "medium",
    description: "Synonym match"
  },
  {
    query: "what went wrong with the launch",
    expectedDoc: "product-launch",
    difficulty: "medium",
    description: "Conceptual query"
  },

  // HARD: Vague, partial memory, indirect
  {
    query: "nouns not verbs",
    expectedDoc: "api-design",
    difficulty: "hard",
    description: "Partial phrase recall"
  },
  {
    query: "Sequoia investor pitch",
    expectedDoc: "fundraising",
    difficulty: "hard",
    description: "Indirect reference"
  },
  {
    query: "Raft algorithm leader election",
    expectedDoc: "distributed-systems",
    difficulty: "hard",
    description: "Specific detail in long doc"
  },
  {
    query: "F1 score precision recall",
    expectedDoc: "machine-learning",
    difficulty: "hard",
    description: "Technical detail"
  },
  {
    query: "quarterly team gathering travel",
    expectedDoc: "remote-work",
    difficulty: "hard",
    description: "Specific policy detail"
  },
  {
    query: "beta program 47 bugs",
    expectedDoc: "product-launch",
    difficulty: "hard",
    description: "Specific number recall"
  },
];

/**
 * One entry of the JSON array parsed from the CLI's `--json` output.
 * NOTE(review): the CLI may emit more fields than these three; only `file`
 * is read by this harness — confirm the schema against the CLI.
 */
interface SearchResult {
  /** Path of the matched document; compared case-insensitively with `expectedDoc`. */
  file: string;
  /** Relevance score reported by the CLI (not read by this harness). */
  score: number;
  /** Document title reported by the CLI (not read by this harness). */
  title: string;
}

/**
 * Runs `qmd search` for one query and returns up to 5 results.
 *
 * The query is passed as its own argv entry rather than spliced into a shell
 * command line, so quotes, `$`, backticks and backslashes reach the CLI
 * verbatim instead of being interpreted by a shell.
 *
 * @param query - Search text to evaluate.
 * @returns The parsed result array in the order the CLI printed it, or an
 *   empty array when the CLI cannot be started, exits non-zero, exceeds the
 *   30 s timeout, or prints something that is not a JSON array.
 */
function runSearch(query: string): SearchResult[] {
  try {
    const output = execFileSync(
      "bun",
      ["src/cli/qmd.ts", "search", query, "--json", "-n", "5"],
      // stderr is discarded so CLI diagnostics do not clutter the report.
      { encoding: "utf-8", timeout: 30000, stdio: ["pipe", "pipe", "ignore"] }
    );
    const parsed: unknown = JSON.parse(output);
    // Anything other than an array would break the ranking code downstream.
    return Array.isArray(parsed) ? (parsed as SearchResult[]) : [];
  } catch {
    // Best effort: a failed run is scored as "no results" for this query.
    return [];
  }
}

/**
 * Runs `qmd query` for one query and returns up to 5 results.
 *
 * The query is passed as its own argv entry rather than spliced into a shell
 * command line, so quotes, `$`, backticks and backslashes reach the CLI
 * verbatim instead of being interpreted by a shell.
 *
 * @param query - Query text to evaluate.
 * @returns The parsed result array in the order the CLI printed it, or an
 *   empty array when the CLI cannot be started, exits non-zero, exceeds the
 *   60 s timeout, or prints something that is not a JSON array.
 */
function runQuery(query: string): SearchResult[] {
  try {
    const output = execFileSync(
      "bun",
      ["src/cli/qmd.ts", "query", query, "--json", "-n", "5"],
      // stderr is discarded so CLI diagnostics do not clutter the report.
      { encoding: "utf-8", timeout: 60000, stdio: ["pipe", "pipe", "ignore"] }
    );
    const parsed: unknown = JSON.parse(output);
    // Anything other than an array would break the ranking code downstream.
    return Array.isArray(parsed) ? (parsed as SearchResult[]) : [];
  } catch {
    // Best effort: a failed run is scored as "no results" for this query.
    return [];
  }
}

/**
 * Runs every entry of `evalQueries` through one CLI mode and prints a
 * per-query line followed by Hit@1/Hit@3/Hit@5 rates per difficulty and an
 * overall Hit@1/Hit@3 line.
 *
 * A query counts as a hit at rank k when the first result whose lowercased
 * `file` contains `expectedDoc` appears at position k or better.
 *
 * @param mode - `"search"` uses `runSearch`; `"query"` uses `runQuery`.
 */
function evaluate(mode: "search" | "query"): void {
  const runFn = mode === "search" ? runSearch : runQuery;
  const results = {
    easy: { total: 0, hit1: 0, hit3: 0, hit5: 0 },
    medium: { total: 0, hit1: 0, hit3: 0, hit5: 0 },
    hard: { total: 0, hit1: 0, hit3: 0, hit5: 0 },
  };

  // Whole-number percentage; an empty bucket reports 0 rather than NaN.
  const pct = (hits: number, total: number): string =>
    total === 0 ? "0" : ((hits / total) * 100).toFixed(0);

  console.log(`\n=== Evaluating ${mode.toUpperCase()} mode ===\n`);

  for (const { query, expectedDoc, difficulty, description } of evalQueries) {
    const searchResults = runFn(query);

    // Results come from unvalidated JSON, so skip entries without a string path.
    const hitIndex = searchResults.findIndex(
      r => typeof r.file === "string" && r.file.toLowerCase().includes(expectedDoc)
    );
    // 1-based rank of the first matching result, or -1 when nothing matched.
    const firstHit = hitIndex === -1 ? -1 : hitIndex + 1;

    const bucket = results[difficulty];
    bucket.total++;
    if (firstHit === 1) bucket.hit1++;
    if (firstHit >= 1 && firstHit <= 3) bucket.hit3++;
    if (firstHit >= 1 && firstHit <= 5) bucket.hit5++;

    const status = firstHit === 1 ? "✓" : firstHit > 0 ? `@${firstHit}` : "✗";
    console.log(`[${difficulty.padEnd(6)}] ${status.padEnd(3)} "${query}" → ${description}`);
  }

  console.log("\n--- Summary ---");
  for (const [diff, r] of Object.entries(results)) {
    const hit1Pct = pct(r.hit1, r.total);
    const hit3Pct = pct(r.hit3, r.total);
    const hit5Pct = pct(r.hit5, r.total);
    console.log(`${diff.padEnd(8)}: Hit@1=${hit1Pct}% Hit@3=${hit3Pct}% Hit@5=${hit5Pct}% (n=${r.total})`);
  }

  const total = evalQueries.length;
  const totalHit1 = Object.values(results).reduce((a, r) => a + r.hit1, 0);
  const totalHit3 = Object.values(results).reduce((a, r) => a + r.hit3, 0);
  console.log(`\nOverall: Hit@1=${pct(totalHit1, total)}% Hit@3=${pct(totalHit3, total)}%`);
}

// Main
// Script entry point: print a banner, verify that the eval-docs collection is
// indexed, then evaluate both CLI modes in turn.
console.log("QMD Evaluation Harness");
console.log("=".repeat(50));
// Document count is derived from the query table so the banner cannot drift.
const evalDocCount = new Set(evalQueries.map(q => q.expectedDoc)).size;
console.log(`Testing ${evalQueries.length} queries across ${evalDocCount} documents`);

// Check if eval-docs collection exists
try {
  const status = execSync("bun src/cli/qmd.ts status --json 2>/dev/null", { encoding: "utf-8" });
  if (!status.includes("eval-docs")) {
    console.log("\n⚠️ eval-docs collection not found. Run:");
    console.log(" qmd collection add test/eval-docs --name eval-docs");
    console.log(" qmd embed");
    process.exit(1);
  }
} catch {
  // A failed status call is only reported; the evaluation below still runs.
  console.log("\n⚠️ Could not check status. Make sure qmd is working.");
}

// Run evaluations
evaluate("search");
evaluate("query");