fix: return absolute line numbers from qmd_query
The MCP `query` tool, HTTP `/query` endpoint, and CLI `qmd query` all returned chunk-local line numbers in their snippet output, so the line could not be passed back to `qmd_get` as `fromLine` without an out-of-band lookup.

Pass the full document body plus `bestChunkPos` to `extractSnippet` instead of the chunk text alone, so it can compute absolute line offsets while still scoping the keyword scan to the reranker-chosen chunk window (preserves #149).

Also restores the documented behavior of `qmd query --full`, which was emitting the best chunk (~3.6KB max) instead of the full document.

`extractSnippet` now also handles the case where it is given a `chunkPos` but the chunk window contains no positive matches. For `chunkPos = 0` it falls back to a full-body scan: the upstream chunk selector leaves `bestIdx=0` as its initialization default whenever scoring fails to find a winner (e.g. `queryTerms` filtered to empty by the length>2 guard, or semantic-only matches with no lex overlap), so an unconditional chunk-scoped scan would land on chunk 0 instead of where the actual match lives. For `chunkPos > 0` it anchors the snippet on the start of the chunk instead of defaulting to line 1, because the reranker actively picked that chunk.

- src/mcp/server.ts: `SearchResultItem` gains `line: number`; both the MCP and HTTP `/query` handlers populate it
- src/cli/qmd.ts: `OutputRow.body` now sources from `r.body`
- src/store.ts: `extractSnippet` falls back to a full-body scan when the chunk-scoped pass finds no positive match
- test/mcp.test.ts: new fixture asserts absolute line 301 for a marker placed past the first chunk boundary
- test/store.test.ts: regression tests for the `bestScore <= 0` fallback
This commit is contained in:
parent
746beedb48
commit
1f522cffe2
11
CHANGELOG.md
11
CHANGELOG.md
@ -4,6 +4,17 @@
|
||||
|
||||
### Fixes
|
||||
|
||||
- Snippet line numbers: `qmd_query` (MCP), HTTP `/query`, and `qmd query`
|
||||
(CLI JSON output and snippet headers) now return absolute source-file
|
||||
line numbers instead of chunk-local ones, so the `line` field can be
|
||||
passed back to `qmd_get` as `fromLine` without a separate lookup.
|
||||
Snippet selection remains scoped to the best matching chunk
|
||||
(preserves #149).
|
||||
- CLI: `qmd query --full` now emits the full document body in all output
|
||||
formats (json, csv, md, xml), restoring the documented behavior of the
|
||||
flag. Previously it returned only the best matching chunk (~3.6KB max
|
||||
per result). Output payload for `--full` queries is now proportional
|
||||
to total document size.
|
||||
- Embedding: `qmd embed -c <collection>` now scopes pending-doc selection
|
||||
to the requested collection instead of embedding global pending work.
|
||||
Scoped `--force` clears only collection-owned vectors, preserves shared
|
||||
|
||||
@ -1886,6 +1886,7 @@ type OutputRow = {
|
||||
score: number;
|
||||
context?: string | null;
|
||||
chunkPos?: number;
|
||||
chunkLen?: number;
|
||||
hash?: string;
|
||||
docid?: string;
|
||||
explain?: HybridQueryExplain;
|
||||
@ -1968,9 +1969,9 @@ function outputResults(results: OutputRow[], query: string, opts: OutputOptions)
|
||||
// JSON output for LLM consumption
|
||||
const output = filtered.map(row => {
|
||||
const docid = row.docid || (row.hash ? row.hash.slice(0, 6) : undefined);
|
||||
const snippetInfo = extractSnippet(row.body, query, 300, row.chunkPos, row.chunkLen, opts.intent);
|
||||
let body = opts.full ? row.body : undefined;
|
||||
const snippetInfo = !opts.full ? extractSnippet(row.body, query, 300, row.chunkPos, undefined, opts.intent) : undefined;
|
||||
let snippet = snippetInfo?.snippet;
|
||||
let snippet = !opts.full ? snippetInfo.snippet : undefined;
|
||||
if (opts.lineNumbers) {
|
||||
if (body) body = addLineNumbers(body);
|
||||
if (snippet) snippet = addLineNumbers(snippet);
|
||||
@ -1979,7 +1980,7 @@ function outputResults(results: OutputRow[], query: string, opts: OutputOptions)
|
||||
...(docid && { docid: `#${docid}` }),
|
||||
score: Math.round(row.score * 100) / 100,
|
||||
file: toQmdPath(row.displayPath),
|
||||
...(snippetInfo && { line: snippetInfo.line }),
|
||||
line: snippetInfo.line,
|
||||
title: row.title,
|
||||
...(row.context && { context: row.context }),
|
||||
...(body && { body }),
|
||||
@ -2002,7 +2003,7 @@ function outputResults(results: OutputRow[], query: string, opts: OutputOptions)
|
||||
for (let i = 0; i < filtered.length; i++) {
|
||||
const row = filtered[i];
|
||||
if (!row) continue;
|
||||
const { line, snippet } = extractSnippet(row.body, query, 500, row.chunkPos, undefined, opts.intent);
|
||||
const { line, snippet } = extractSnippet(row.body, query, 500, row.chunkPos, row.chunkLen, opts.intent);
|
||||
const docid = row.docid || (row.hash ? row.hash.slice(0, 6) : undefined);
|
||||
|
||||
// Line 1: filepath with docid
|
||||
@ -2066,8 +2067,9 @@ function outputResults(results: OutputRow[], query: string, opts: OutputOptions)
|
||||
console.log();
|
||||
|
||||
// Snippet with highlighting (diff-style header included)
|
||||
let displaySnippet = opts.lineNumbers ? addLineNumbers(snippet, line) : snippet;
|
||||
const highlighted = highlightTerms(displaySnippet, query);
|
||||
const content = opts.full ? row.body : snippet;
|
||||
const displayContent = opts.lineNumbers ? addLineNumbers(content, opts.full ? 1 : line) : content;
|
||||
const highlighted = highlightTerms(displayContent, query);
|
||||
console.log(highlighted);
|
||||
|
||||
// Double empty line between results
|
||||
@ -2079,7 +2081,7 @@ function outputResults(results: OutputRow[], query: string, opts: OutputOptions)
|
||||
if (!row) continue;
|
||||
const heading = row.title || row.displayPath;
|
||||
const docid = row.docid || (row.hash ? row.hash.slice(0, 6) : undefined);
|
||||
let content = opts.full ? row.body : extractSnippet(row.body, query, 500, row.chunkPos, undefined, opts.intent).snippet;
|
||||
let content = opts.full ? row.body : extractSnippet(row.body, query, 500, row.chunkPos, row.chunkLen, opts.intent).snippet;
|
||||
if (opts.lineNumbers) {
|
||||
content = addLineNumbers(content);
|
||||
}
|
||||
@ -2092,7 +2094,7 @@ function outputResults(results: OutputRow[], query: string, opts: OutputOptions)
|
||||
const titleAttr = row.title ? ` title="${row.title.replace(/"/g, '&quot;')}"` : "";
|
||||
const contextAttr = row.context ? ` context="${row.context.replace(/"/g, '&quot;')}"` : "";
|
||||
const docid = row.docid || (row.hash ? row.hash.slice(0, 6) : "");
|
||||
let content = opts.full ? row.body : extractSnippet(row.body, query, 500, row.chunkPos, undefined, opts.intent).snippet;
|
||||
let content = opts.full ? row.body : extractSnippet(row.body, query, 500, row.chunkPos, row.chunkLen, opts.intent).snippet;
|
||||
if (opts.lineNumbers) {
|
||||
content = addLineNumbers(content);
|
||||
}
|
||||
@ -2102,10 +2104,10 @@ function outputResults(results: OutputRow[], query: string, opts: OutputOptions)
|
||||
// CSV format
|
||||
console.log("docid,score,file,title,context,line,snippet");
|
||||
for (const row of filtered) {
|
||||
const { line, snippet } = extractSnippet(row.body, query, 500, row.chunkPos, undefined, opts.intent);
|
||||
const { line, snippet } = extractSnippet(row.body, query, 500, row.chunkPos, row.chunkLen, opts.intent);
|
||||
let content = opts.full ? row.body : snippet;
|
||||
if (opts.lineNumbers) {
|
||||
content = addLineNumbers(content, line);
|
||||
content = addLineNumbers(content, opts.full ? 1 : line);
|
||||
}
|
||||
const docid = row.docid || (row.hash ? row.hash.slice(0, 6) : "");
|
||||
const snippetText = content || "";
|
||||
@ -2461,13 +2463,13 @@ async function querySearch(query: string, opts: OutputOptions, _embedModel: stri
|
||||
? (structuredQueries.find(s => s.type === 'lex')?.query || structuredQueries.find(s => s.type === 'vec')?.query || query)
|
||||
: query;
|
||||
|
||||
// Map to CLI output format — use bestChunk for snippet display
|
||||
outputResults(results.map(r => ({
|
||||
file: r.file,
|
||||
displayPath: r.displayPath,
|
||||
title: r.title,
|
||||
body: r.bestChunk,
|
||||
body: r.body,
|
||||
chunkPos: r.bestChunkPos,
|
||||
chunkLen: r.bestChunk.length,
|
||||
score: r.score,
|
||||
context: r.context,
|
||||
docid: r.docid,
|
||||
|
||||
@ -42,6 +42,7 @@ type SearchResultItem = {
|
||||
title: string;
|
||||
score: number;
|
||||
context: string | null;
|
||||
line: number; // Absolute line in source markdown
|
||||
snippet: string;
|
||||
};
|
||||
|
||||
@ -242,6 +243,8 @@ async function createMcpServer(store: QMDStore): Promise<McpServer> {
|
||||
title: "Query",
|
||||
description: `Search the knowledge base using a query document — one or more typed sub-queries combined for best recall.
|
||||
|
||||
Each result includes a \`line\` field with the absolute 1-indexed line of the best match in the source markdown. To read more context around a hit, call \`get(file, fromLine = line - 20, maxLines = 80, lineNumbers = true)\`.
|
||||
|
||||
## Query Types
|
||||
|
||||
**lex** — BM25 keyword search. Fast, exact, no LLM needed.
|
||||
@ -341,13 +344,14 @@ Intent-aware lex (C++ performance, not sports):
|
||||
|| searches[0]?.query || "";
|
||||
|
||||
const filtered: SearchResultItem[] = results.map(r => {
|
||||
const { line, snippet } = extractSnippet(r.bestChunk, primaryQuery, 300, undefined, undefined, intent);
|
||||
const { line, snippet } = extractSnippet(r.body, primaryQuery, 300, r.bestChunkPos, r.bestChunk.length, intent);
|
||||
return {
|
||||
docid: `#${r.docid}`,
|
||||
file: r.displayPath,
|
||||
title: r.title,
|
||||
score: Math.round(r.score * 100) / 100,
|
||||
context: r.context,
|
||||
line,
|
||||
snippet: addLineNumbers(snippet, line),
|
||||
};
|
||||
});
|
||||
@ -702,13 +706,14 @@ export async function startMcpHttpServer(
|
||||
|| params.searches[0]?.query || "";
|
||||
|
||||
const formatted = results.map(r => {
|
||||
const { line, snippet } = extractSnippet(r.bestChunk, primaryQuery, 300);
|
||||
const { line, snippet } = extractSnippet(r.body, primaryQuery, 300, r.bestChunkPos, r.bestChunk.length, params.intent);
|
||||
return {
|
||||
docid: `#${r.docid}`,
|
||||
file: r.displayPath,
|
||||
title: r.title,
|
||||
score: Math.round(r.score * 100) / 100,
|
||||
context: r.context,
|
||||
line,
|
||||
snippet: addLineNumbers(snippet, line),
|
||||
};
|
||||
});
|
||||
|
||||
19
src/store.ts
19
src/store.ts
@ -4023,7 +4023,7 @@ export function extractSnippet(body: string, query: string, maxLen = 500, chunkP
|
||||
let searchBody = body;
|
||||
let lineOffset = 0;
|
||||
|
||||
if (chunkPos && chunkPos > 0) {
|
||||
if (chunkPos !== undefined && chunkPos >= 0) {
|
||||
// Search within the chunk region, with some padding for context
|
||||
// Use provided chunkLen or fall back to max chunk size (covers variable-length chunks)
|
||||
const searchLen = chunkLen || CHUNK_SIZE_CHARS;
|
||||
@ -4055,6 +4055,23 @@ export function extractSnippet(body: string, query: string, maxLen = 500, chunkP
|
||||
}
|
||||
}
|
||||
|
||||
if (chunkPos !== undefined && chunkPos >= 0 && bestScore <= 0) {
|
||||
if (chunkPos === 0) {
|
||||
// chunkPos=0 may be the chunk selector's initialization default for queries
|
||||
// where lexical chunk scoring found no winner (e.g. tokens filtered to empty
|
||||
// by the length>2 guard). Retry with full body so the real match isn't missed.
|
||||
return extractSnippet(body, query, maxLen, undefined, undefined, intent);
|
||||
}
|
||||
// For chunkPos > 0 the reranker actively picked this chunk. Tokens failing to
|
||||
// match literally is most likely a tokenizer limitation (quoted phrases, FTS5
|
||||
// syntax, HYDE passages, semantic hits), so anchor on the chunk start rather
|
||||
// than disregarding the reranker's pick.
|
||||
const contextStart = Math.max(0, chunkPos - 100);
|
||||
bestLine = chunkPos > contextStart
|
||||
? searchBody.slice(0, chunkPos - contextStart).split('\n').length - 1
|
||||
: 0;
|
||||
}
|
||||
|
||||
const start = Math.max(0, bestLine - 1);
|
||||
const end = Math.min(lines.length, bestLine + 3);
|
||||
const snippetLines = lines.slice(start, end);
|
||||
|
||||
@ -507,6 +507,16 @@ describe("CLI Search Command", () => {
|
||||
// Error message goes to stderr
|
||||
expect(stderr).toContain("Usage:");
|
||||
});
|
||||
|
||||
test("--json --full includes line field for round-tripping to qmd get", async () => {
|
||||
const { stdout, exitCode } = await runQmd(["search", "meeting", "--json", "--full", "-n", "1"]);
|
||||
expect(exitCode).toBe(0);
|
||||
const results = JSON.parse(stdout);
|
||||
expect(results.length).toBeGreaterThan(0);
|
||||
expect(results[0].line).toBeTypeOf("number");
|
||||
expect(results[0].line).toBeGreaterThan(0);
|
||||
expect(results[0].body).toBeTypeOf("string");
|
||||
});
|
||||
});
|
||||
|
||||
describe("CLI Get Command", () => {
|
||||
|
||||
@ -913,6 +913,22 @@ describe.skipIf(!!process.env.CI)("MCP HTTP Transport", () => {
|
||||
initTestDatabase(db);
|
||||
seedTestData(db);
|
||||
|
||||
// 300 pad lines (37 chars each = 11100 chars) puts the marker past the
|
||||
// first chunk boundary at CHUNK_SIZE_CHARS = 3600.
|
||||
{
|
||||
const padLine = "Pad line for chunk boundary coverage\n";
|
||||
const absLineFixtureBody =
|
||||
padLine.repeat(300) +
|
||||
"UNIQUE_KEYWORD_XYZ marker\n" +
|
||||
padLine.repeat(20);
|
||||
const fixtureHash = "hash-abslines";
|
||||
const now = new Date().toISOString();
|
||||
db.prepare(`INSERT OR IGNORE INTO content (hash, doc, created_at) VALUES (?, ?, ?)`)
|
||||
.run(fixtureHash, absLineFixtureBody, now);
|
||||
db.prepare(`INSERT INTO documents (collection, path, title, hash, created_at, modified_at, active) VALUES ('docs', ?, ?, ?, ?, ?, 1)`)
|
||||
.run("absolute-line-fixture.md", "Absolute Line Fixture", fixtureHash, now, now);
|
||||
}
|
||||
|
||||
// Sync config into SQLite
|
||||
const httpTestConfig: CollectionConfig = {
|
||||
collections: {
|
||||
@ -1074,4 +1090,29 @@ describe.skipIf(!!process.env.CI)("MCP HTTP Transport", () => {
|
||||
expect(json.result).toBeDefined();
|
||||
expect(json.result.content.length).toBeGreaterThan(0);
|
||||
});
|
||||
|
||||
test("POST /mcp tools/call query returns absolute source-file line numbers, not chunk-local", async () => {
|
||||
await mcpRequest({
|
||||
jsonrpc: "2.0", id: 1, method: "initialize",
|
||||
params: { protocolVersion: "2025-03-26", capabilities: {}, clientInfo: { name: "test", version: "1.0" } },
|
||||
});
|
||||
|
||||
const { status, json } = await mcpRequest({
|
||||
jsonrpc: "2.0", id: 5, method: "tools/call",
|
||||
params: {
|
||||
name: "query",
|
||||
arguments: {
|
||||
searches: [{ type: "lex", query: "UNIQUE_KEYWORD_XYZ" }],
|
||||
rerank: false,
|
||||
},
|
||||
},
|
||||
});
|
||||
expect(status).toBe(200);
|
||||
const results = json.result.structuredContent.results;
|
||||
expect(results.length).toBeGreaterThan(0);
|
||||
const hit = results.find((r: any) => r.file === "docs/absolute-line-fixture.md");
|
||||
expect(hit).toBeDefined();
|
||||
expect(hit.line).toBe(301);
|
||||
expect(hit.snippet).toMatch(/^\d+: @@ -3\d\d,/);
|
||||
});
|
||||
});
|
||||
|
||||
@ -2001,6 +2001,33 @@ describe("Snippet Extraction", () => {
|
||||
expect(line).toBe(51); // "Target keyword" is line 51
|
||||
expect(linesBefore).toBeGreaterThan(40); // Many lines before
|
||||
});
|
||||
|
||||
test("extractSnippet anchors on chunkPos when lexical scoring finds no match", () => {
|
||||
// The snippet tokenizer does not strip FTS5 syntax, so a quoted-phrase query
|
||||
// tokenises into terms with embedded quotes that never appear in body text.
|
||||
// bestScore stays at 0 even though the reranker correctly identified a chunk;
|
||||
// the fallback should anchor on chunkPos rather than defaulting to line 1.
|
||||
const padLine = "Lorem ipsum dolor sit amet\n";
|
||||
const padding = padLine.repeat(100);
|
||||
const body = padding + "chunk content here\nmore chunk content\n" + padding;
|
||||
const chunkPos = padding.length;
|
||||
|
||||
const { line } = extractSnippet(body, '"unrelated quoted phrase"', 200, chunkPos);
|
||||
|
||||
expect(line).toBeGreaterThan(50);
|
||||
expect(line).toBeLessThan(110);
|
||||
});
|
||||
|
||||
test("extractSnippet with chunkPos=0 falls back to full-body scan when chunk has no match", () => {
|
||||
// chunkPos=0 may be the chunk selector's bestIdx=0 default rather than a real
|
||||
// first-chunk hit, so the fallback must consider matches outside chunk 0.
|
||||
const padding = "Lorem ipsum dolor sit amet\n".repeat(200);
|
||||
const body = padding + "TARGET_KEYWORD line content\ntail line\n";
|
||||
|
||||
const { line } = extractSnippet(body, "TARGET_KEYWORD", 200, 0);
|
||||
|
||||
expect(line).toBe(201);
|
||||
});
|
||||
});
|
||||
|
||||
// =============================================================================
|
||||
|
||||
Loading…
Reference in New Issue
Block a user