Resolve conflicts: combine AST chunking args (filepath, chunkStrategy) with abort signal parameter from #458. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
210 lines
6.7 KiB
TypeScript
210 lines
6.7 KiB
TypeScript
/**
 * Deep Research Evaluation for QMD
 *
 * Tests end-to-end retrieval quality: query → expansion → reranking → results
 *
 * These are HARD queries with NO exact keyword matches - they require
 * semantic understanding via query expansion and reranking to succeed.
 *
 * Run: bun test/eval-deep-research.ts
 */
|
||
|
||
import { execFileSync, execSync } from "child_process";
import { readFileSync, existsSync } from "fs";
import { join, dirname } from "path";
import { fileURLToPath } from "url";
|
||
|
||
// ES modules do not provide `__dirname`; derive this file's directory from `import.meta.url`.
const __dirname = dirname(fileURLToPath(import.meta.url));
|
||
|
||
/** One evaluation case, parsed from a single line of `eval-deep-research.jsonl`. */
interface EvalQuery {
  /** Search text handed to the retrieval function under test. */
  query: string;
  /** Text expected to appear, case-insensitively, in the file path of the correct result. */
  expected_doc: string;
  /** Difficulty label carried by the data file; not read by the code visible in this file. */
  difficulty: string;
  intent: string; // Domain context hint for future intent-aware retrieval
  /** Free-form notes; printed next to `intent` when the expected document is not found. */
  notes: string;
}
|
||
|
||
/** A single hit, as parsed from the JSON printed by the `qmd` CLI. */
interface SearchResult {
  /** File path of the matched document; this is what gets compared with the expected document. */
  file: string;
  /** Score attached to the hit (presumably relevance); ranking here uses array order, not this value. */
  score: number;
  /** Document title, when the CLI supplies one. */
  title?: string;
}
|
||
|
||
/**
 * Loads the evaluation queries from `eval-deep-research.jsonl`, located next to this file.
 *
 * The file is read as JSON Lines: one JSON object per line, with blank lines skipped.
 *
 * @returns The parsed queries, in file order.
 * @throws If the file cannot be read, if a line is not valid JSON, or if a record lacks a
 *   string `query` or `expected_doc`. Parse and validation errors name the file and line.
 */
function loadQueries(): EvalQuery[] {
  const path = join(__dirname, "eval-deep-research.jsonl");
  const lines = readFileSync(path, "utf-8").split("\n");
  const queries: EvalQuery[] = [];

  for (const [index, line] of lines.entries()) {
    if (!line.trim()) continue;

    // 1-based line number of the raw file, so the message points at the exact record.
    const where = `${path}:${index + 1}`;

    let record: unknown;
    try {
      record = JSON.parse(line);
    } catch (error: unknown) {
      const reason = error instanceof Error ? error.message : String(error);
      throw new Error(`${where}: invalid JSON (${reason})`);
    }

    // Only the two fields the evaluation cannot run without are checked; the remaining
    // fields are used for display only.
    const candidate = record as Partial<EvalQuery> | null;
    if (typeof candidate?.query !== "string" || typeof candidate?.expected_doc !== "string") {
      throw new Error(`${where}: record must have string "query" and "expected_doc" fields`);
    }

    queries.push(record as EvalQuery);
  }

  return queries;
}
|
||
|
||
/**
 * Runs the keyword baseline: `qmd search` against the `eval-docs` collection.
 *
 * The CLI is spawned directly, without a shell, so the query reaches it verbatim even when
 * it contains quotes, `$`, backticks or backslashes.
 *
 * @param query - Search text for one evaluation case.
 * @returns The results in the order the CLI printed them (requested with `-n 5`), or `[]`
 *   when the command fails, exceeds the 30 s timeout, or prints something other than a
 *   JSON array.
 */
function runBM25Search(query: string): SearchResult[] {
  try {
    const output = execFileSync(
      "bun",
      ["src/qmd.ts", "search", query, "-c", "eval-docs", "--json", "-n", "5"],
      // stderr is discarded so CLI diagnostics do not clutter the evaluation report.
      { encoding: "utf-8", timeout: 30000, stdio: ["pipe", "pipe", "ignore"] }
    );
    const parsed: unknown = JSON.parse(output);
    return Array.isArray(parsed) ? (parsed as SearchResult[]) : [];
  } catch {
    // Deliberate best-effort: any failure is scored as "expected document not found".
    return [];
  }
}
|
||
|
||
/**
 * Runs the full pipeline under evaluation: `qmd query` against the `eval-docs` collection.
 *
 * The CLI is spawned directly, without a shell, so the query reaches it verbatim even when
 * it contains quotes, `$`, backticks or backslashes.
 *
 * @param query - Search text for one evaluation case.
 * @returns The results in the order the CLI printed them (requested with `-n 5`), or `[]`
 *   when the command fails, exceeds the 120 s timeout, or prints something other than a
 *   JSON array.
 */
function runDeepResearch(query: string): SearchResult[] {
  try {
    const output = execFileSync(
      "bun",
      ["src/qmd.ts", "query", query, "-c", "eval-docs", "--json", "-n", "5"],
      // stderr is discarded so CLI diagnostics do not clutter the evaluation report.
      { encoding: "utf-8", timeout: 120000, stdio: ["pipe", "pipe", "ignore"] }
    );
    const parsed: unknown = JSON.parse(output);
    return Array.isArray(parsed) ? (parsed as SearchResult[]) : [];
  } catch {
    // Deliberate best-effort: any failure is scored as "expected document not found".
    return [];
  }
}
|
||
|
||
/**
 * Reports whether a result's file path refers to the expected document.
 *
 * The check is a case-insensitive substring test, so `expectedDoc` may be any distinctive
 * fragment of the path.
 *
 * @param filepath - File path taken from a search result.
 * @param expectedDoc - Fragment identifying the expected document.
 * @returns `true` when `filepath` contains `expectedDoc`, ignoring case. An empty
 *   `expectedDoc` never matches; without this guard it would match every path and a
 *   malformed evaluation row would be scored as a hit.
 */
function matchesExpected(filepath: string, expectedDoc: string): boolean {
  if (expectedDoc === "") return false;
  return filepath.toLowerCase().includes(expectedDoc.toLowerCase());
}
|
||
|
||
/**
 * Finds the position of the expected document in a result list.
 *
 * @param results - Search results, best first.
 * @param expectedDoc - Fragment identifying the expected document (see `matchesExpected`).
 * @returns The 1-based rank of the first matching result, or -1 when none matches.
 */
function findRank(results: SearchResult[], expectedDoc: string): number {
  const index = results.findIndex((result) => matchesExpected(result.file, expectedDoc));
  return index === -1 ? -1 : index + 1;
}
|
||
|
||
/** Aggregate hit counts and per-query outcomes for one retrieval method. */
interface MethodResults {
  /** Number of queries whose expected document was ranked first. */
  hit1: number;
  /** Number of queries whose expected document was ranked in the top three. */
  hit3: number;
  /** Number of queries whose expected document was ranked in the top five. */
  hit5: number;
  /** Number of queries evaluated. */
  total: number;
  /** One entry per query, in evaluation order; `rank` is 1-based, or -1 when not found. */
  details: { query: string; rank: number; expected: string; intent?: string }[];
}
|
||
|
||
/**
 * Runs every query through one retrieval method, prints a per-query report, and tallies
 * how often the expected document lands in the top 1, 3 and 5 results.
 *
 * @param queries - Evaluation cases to run, in order.
 * @param searchFn - Retrieval method under test; returns results best first.
 * @param label - Heading printed above this method's report.
 * @returns Hit counts plus a per-query record of the rank achieved (-1 when not found).
 */
function evaluate(
  queries: EvalQuery[],
  searchFn: (q: string) => SearchResult[],
  label: string
): MethodResults {
  const results: MethodResults = {
    hit1: 0,
    hit3: 0,
    hit5: 0,
    total: queries.length,
    details: [],
  };

  console.log(`\n${"=".repeat(60)}`);
  console.log(` ${label}`);
  console.log(`${"=".repeat(60)}\n`);

  for (const { query, expected_doc, intent, notes } of queries) {
    const rank = findRank(searchFn(query), expected_doc);
    results.details.push({ query, rank, expected: expected_doc, intent });

    if (rank === 1) results.hit1++;
    if (rank >= 1 && rank <= 3) results.hit3++;
    if (rank >= 1 && rank <= 5) results.hit5++;

    // "✓" for a top hit, "@N" for any lower rank, "✗" when the document was not returned.
    const status = rank === 1 ? "✓" : rank > 0 ? `@${rank}` : "✗";
    console.log(` ${status.padEnd(4)} "${query.slice(0, 45).padEnd(45)}" → ${expected_doc}`);

    // Misses get the extra context needed to diagnose them.
    if (rank === -1) {
      console.log(` intent: ${intent} | ${notes}`);
    }
  }

  // An empty query set reports 0% rather than NaN%.
  const percent = (hits: number): string =>
    (results.total === 0 ? 0 : (hits / results.total) * 100).toFixed(0);

  console.log(`\n ${"─".repeat(50)}`);
  console.log(` Hit@1: ${percent(results.hit1)}% (${results.hit1}/${results.total})`);
  console.log(` Hit@3: ${percent(results.hit3)}% (${results.hit3}/${results.total})`);
  console.log(` Hit@5: ${percent(results.hit5)}% (${results.hit5}/${results.total})`);

  return results;
}
|
||
|
||
/**
 * Entry point: evaluates the BM25 baseline and the deep-research pipeline on the same
 * query set, prints a side-by-side comparison, and lists queries the pipeline recovered.
 *
 * Exits the process with status 1 when the `eval-docs` collection is missing from the
 * `qmd status` output, when no queries were loaded, or when deep-research Hit@3 is
 * below 60%.
 */
async function main(): Promise<void> {
  // Minimum deep-research Hit@3 percentage for the run to count as passing.
  const minHit3Pct = 60;

  console.log("QMD Deep Research Evaluation");
  console.log("=".repeat(60));
  console.log("Testing hard queries that require semantic understanding.");
  console.log("These have NO exact keyword matches in documents.");

  // Check if eval-docs collection exists
  try {
    const status = execSync("bun src/qmd.ts status --json 2>/dev/null", {
      encoding: "utf-8",
    });
    if (!status.includes("eval-docs")) {
      console.log("\n⚠️ eval-docs collection not found. Run:");
      console.log(" qmd collection add test/eval-docs --name eval-docs");
      console.log(" qmd embed");
      process.exit(1);
    }
  } catch {
    // A failed status check is only a warning; the evaluation is still attempted.
    console.log("\n⚠️ Could not check status. Make sure qmd is working.");
  }

  const queries = loadQueries();
  console.log(`\nLoaded ${queries.length} hard queries.`);

  // Without this guard every percentage below is NaN, and `NaN < 60` is false, so an
  // empty query file would be reported as a passing run.
  if (queries.length === 0) {
    console.log("\n❌ No evaluation queries loaded; nothing to evaluate.");
    process.exit(1);
  }

  // Run BM25 baseline (expected to fail on most)
  const bm25Results = evaluate(queries, runBM25Search, "BM25 BASELINE (keyword search)");

  // Run deep research (expected to succeed via expansion + reranking)
  const deepResults = evaluate(queries, runDeepResearch, "DEEP RESEARCH (expansion + reranking)");

  // Whole-number percentage, right-aligned to three characters for the table below.
  const cell = (hits: number, total: number): string =>
    ((hits / total) * 100).toFixed(0).padStart(3);

  // Comparison
  console.log(`\n${"=".repeat(60)}`);
  console.log(" COMPARISON");
  console.log(`${"=".repeat(60)}`);
  console.log(`\n Method Hit@1 Hit@3 Hit@5`);
  console.log(` ${"─".repeat(45)}`);
  console.log(
    ` BM25 (baseline) ${cell(bm25Results.hit1, bm25Results.total)}% ${cell(bm25Results.hit3, bm25Results.total)}% ${cell(bm25Results.hit5, bm25Results.total)}%`
  );
  console.log(
    ` Deep Research ${cell(deepResults.hit1, deepResults.total)}% ${cell(deepResults.hit3, deepResults.total)}% ${cell(deepResults.hit5, deepResults.total)}%`
  );

  const improvement = deepResults.hit3 - bm25Results.hit3;
  // A negative number already carries its own sign; only non-negative values get "+".
  const sign = improvement >= 0 ? "+" : "";
  console.log(
    `\n Improvement (Hit@3): ${sign}${improvement} queries (${((improvement / bm25Results.total) * 100).toFixed(0)}%)`
  );

  // Show queries where deep research recovered failures.
  // The first occurrence wins for duplicated query text, matching a linear `find`.
  const bm25RankByQuery = new Map<string, number>();
  for (const { query, rank } of bm25Results.details) {
    if (!bm25RankByQuery.has(query)) bm25RankByQuery.set(query, rank);
  }
  const recovered = deepResults.details.filter(
    (d) => d.rank >= 1 && d.rank <= 3 && bm25RankByQuery.get(d.query) === -1
  );

  if (recovered.length > 0) {
    console.log(`\n Recovered by expansion + reranking (${recovered.length}):`);
    for (const { query, rank, expected } of recovered.slice(0, 5)) {
      console.log(` @${rank} "${query.slice(0, 40)}..." → ${expected}`);
    }
    if (recovered.length > 5) {
      console.log(` ... and ${recovered.length - 5} more`);
    }
  }

  // Exit with error if deep research performs poorly
  const deepHit3Pct = (deepResults.hit3 / deepResults.total) * 100;
  if (deepHit3Pct < minHit3Pct) {
    console.log(`\n❌ Deep research Hit@3 < 60% (${deepHit3Pct.toFixed(0)}%)`);
    process.exit(1);
  } else {
    console.log(`\n✓ Deep research Hit@3 >= 60% (${deepHit3Pct.toFixed(0)}%)`);
  }
}
|
||
|
||
// Report unexpected failures and exit non-zero instead of leaving a floating promise.
main().catch((error: unknown) => {
  console.error(error);
  process.exit(1);
});
|