Merge remote-tracking branch 'origin/main' into workoff/t_0d576ae5-dev-review

# Conflicts: # CHANGELOG.md
2026-05-16 17:24:47 +00:00 · 2026-05-16 17:24:47 +00:00 · e4505607f9
commit e4505607f9
parent dd5d82d523 bad20f5565
7 changed files with 153 additions and 16 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,17 @@
 ### Fixes

 - GPU: add `QMD_FORCE_CPU=1` / `--no-gpu` to bypass CUDA/Vulkan/Metal probing entirely, and route native llama.cpp stdout noise to stderr so JSON output stays parseable during search/query commands.
+- Snippet line numbers: `qmd_query` (MCP), HTTP `/query`, and `qmd query`
+  (CLI JSON output and snippet headers) now return absolute source-file
+  line numbers instead of chunk-local ones, so the `line` field can be
+  passed back to `qmd_get` as `fromLine` without a separate lookup.
+  Snippet selection remains scoped to the best matching chunk
+  (preserves #149).
+- CLI: `qmd query --full` now emits the full document body in all output
+  formats (json, csv, md, xml), restoring the documented behavior of the
+  flag. Previously it returned only the best matching chunk (~3.6KB max
+  per result). Output payload for `--full` queries is now proportional
+  to total document size.
 - Embedding: `qmd embed -c <collection>` now scopes pending-doc selection
  to the requested collection instead of embedding global pending work.
  Scoped `--force` clears only collection-owned vectors, preserves shared
--- a/src/cli/qmd.ts
+++ b/src/cli/qmd.ts
@@ -844,6 +844,7 @@ function getDocument(filename: string, fromLine?: number, maxLines?: number, lin
      inputPath = inputPath.slice(0, -colonMatch[0].length);
    }
  }
+  if (fromLine !== undefined) fromLine = Math.max(1, fromLine);

  const parsedIndexPath = isVirtualPath(inputPath) ? parseVirtualPath(inputPath) : null;
  if (parsedIndexPath?.indexName) {
@@ -1925,6 +1926,7 @@ type OutputRow = {
  score: number;
  context?: string | null;
  chunkPos?: number;
+  chunkLen?: number;
  hash?: string;
  docid?: string;
  explain?: HybridQueryExplain;
@@ -2007,9 +2009,9 @@ function outputResults(results: OutputRow[], query: string, opts: OutputOptions)
    // JSON output for LLM consumption
    const output = filtered.map(row => {
      const docid = row.docid || (row.hash ? row.hash.slice(0, 6) : undefined);
+      const snippetInfo = extractSnippet(row.body, query, 300, row.chunkPos, row.chunkLen, opts.intent);
      let body = opts.full ? row.body : undefined;
-      const snippetInfo = !opts.full ? extractSnippet(row.body, query, 300, row.chunkPos, undefined, opts.intent) : undefined;
-      let snippet = snippetInfo?.snippet;
+      let snippet = !opts.full ? snippetInfo.snippet : undefined;
      if (opts.lineNumbers) {
        if (body) body = addLineNumbers(body);
        if (snippet) snippet = addLineNumbers(snippet);
@@ -2018,7 +2020,7 @@ function outputResults(results: OutputRow[], query: string, opts: OutputOptions)
        ...(docid && { docid: `#${docid}` }),
        score: Math.round(row.score * 100) / 100,
        file: toQmdPath(row.displayPath),
-        ...(snippetInfo && { line: snippetInfo.line }),
+        line: snippetInfo.line,
        title: row.title,
        ...(row.context && { context: row.context }),
        ...(body && { body }),
@@ -2041,7 +2043,7 @@ function outputResults(results: OutputRow[], query: string, opts: OutputOptions)
    for (let i = 0; i < filtered.length; i++) {
      const row = filtered[i];
      if (!row) continue;
-      const { line, snippet } = extractSnippet(row.body, query, 500, row.chunkPos, undefined, opts.intent);
+      const { line, snippet } = extractSnippet(row.body, query, 500, row.chunkPos, row.chunkLen, opts.intent);
      const docid = row.docid || (row.hash ? row.hash.slice(0, 6) : undefined);

      // Line 1: filepath with docid
@@ -2105,8 +2107,9 @@ function outputResults(results: OutputRow[], query: string, opts: OutputOptions)
      console.log();

      // Snippet with highlighting (diff-style header included)
-      let displaySnippet = opts.lineNumbers ? addLineNumbers(snippet, line) : snippet;
-      const highlighted = highlightTerms(displaySnippet, query);
+      const content = opts.full ? row.body : snippet;
+      const displayContent = opts.lineNumbers ? addLineNumbers(content, opts.full ? 1 : line) : content;
+      const highlighted = highlightTerms(displayContent, query);
      console.log(highlighted);

      // Double empty line between results
@@ -2118,7 +2121,7 @@ function outputResults(results: OutputRow[], query: string, opts: OutputOptions)
      if (!row) continue;
      const heading = row.title || row.displayPath;
      const docid = row.docid || (row.hash ? row.hash.slice(0, 6) : undefined);
-      let content = opts.full ? row.body : extractSnippet(row.body, query, 500, row.chunkPos, undefined, opts.intent).snippet;
+      let content = opts.full ? row.body : extractSnippet(row.body, query, 500, row.chunkPos, row.chunkLen, opts.intent).snippet;
      if (opts.lineNumbers) {
        content = addLineNumbers(content);
      }
@@ -2131,7 +2134,7 @@ function outputResults(results: OutputRow[], query: string, opts: OutputOptions)
      const titleAttr = row.title ? ` title="${row.title.replace(/"/g, '&quot;')}"` : "";
      const contextAttr = row.context ? ` context="${row.context.replace(/"/g, '&quot;')}"` : "";
      const docid = row.docid || (row.hash ? row.hash.slice(0, 6) : "");
-      let content = opts.full ? row.body : extractSnippet(row.body, query, 500, row.chunkPos, undefined, opts.intent).snippet;
+      let content = opts.full ? row.body : extractSnippet(row.body, query, 500, row.chunkPos, row.chunkLen, opts.intent).snippet;
      if (opts.lineNumbers) {
        content = addLineNumbers(content);
      }
@@ -2141,10 +2144,10 @@ function outputResults(results: OutputRow[], query: string, opts: OutputOptions)
    // CSV format
    console.log("docid,score,file,title,context,line,snippet");
    for (const row of filtered) {
-      const { line, snippet } = extractSnippet(row.body, query, 500, row.chunkPos, undefined, opts.intent);
+      const { line, snippet } = extractSnippet(row.body, query, 500, row.chunkPos, row.chunkLen, opts.intent);
      let content = opts.full ? row.body : snippet;
      if (opts.lineNumbers) {
-        content = addLineNumbers(content, line);
+        content = addLineNumbers(content, opts.full ? 1 : line);
      }
      const docid = row.docid || (row.hash ? row.hash.slice(0, 6) : "");
      const snippetText = content || "";
@@ -2500,13 +2503,13 @@ async function querySearch(query: string, opts: OutputOptions, _embedModel: stri
      ? (structuredQueries.find(s => s.type === 'lex')?.query || structuredQueries.find(s => s.type === 'vec')?.query || query)
      : query;

-    // Map to CLI output format — use bestChunk for snippet display
    outputResults(results.map(r => ({
      file: r.file,
      displayPath: r.displayPath,
      title: r.title,
-      body: r.bestChunk,
+      body: r.body,
      chunkPos: r.bestChunkPos,
+      chunkLen: r.bestChunk.length,
      score: r.score,
      context: r.context,
      docid: r.docid,
--- a/src/mcp/server.ts
+++ b/src/mcp/server.ts
@@ -42,6 +42,7 @@ type SearchResultItem = {
  title: string;
  score: number;
  context: string | null;
+  line: number;   // Absolute 1-indexed line of the best match in the source markdown
  snippet: string;
 };

@@ -239,6 +240,8 @@ async function createMcpServer(store: QMDStore): Promise<McpServer> {
      title: "Query",
      description: `Search the knowledge base using a query document — one or more typed sub-queries combined for best recall.

+Each result includes a \`line\` field with the absolute 1-indexed line of the best match in the source markdown. To read more context around a hit, call \`get(file, fromLine = max(1, line - 20), maxLines = 80, lineNumbers = true)\`.
+
 ## Query Types

 **lex** — BM25 keyword search. Fast, exact, no LLM needed.
@@ -339,13 +342,14 @@ Intent-aware lex (C++ performance, not sports):
        || searches[0]?.query || "";

      const filtered: SearchResultItem[] = results.map(r => {
-        const { line, snippet } = extractSnippet(r.bestChunk, primaryQuery, 300, undefined, undefined, intent);
+        const { line, snippet } = extractSnippet(r.body, primaryQuery, 300, r.bestChunkPos, r.bestChunk.length, intent);
        return {
          docid: `#${r.docid}`,
          file: r.displayPath,
          title: r.title,
          score: Math.round(r.score * 100) / 100,
          context: r.context,
+          line,
          snippet: addLineNumbers(snippet, line),
        };
      });
@@ -383,6 +387,7 @@ Intent-aware lex (C++ performance, not sports):
        parsedFromLine = parseInt(colonMatch[1], 10);
        lookup = lookup.slice(0, -colonMatch[0].length);
      }
+      if (parsedFromLine !== undefined) parsedFromLine = Math.max(1, parsedFromLine);

      const result = await store.get(lookup, { includeBody: false });

@@ -701,13 +706,14 @@ export async function startMcpHttpServer(
          || params.searches[0]?.query || "";

        const formatted = results.map(r => {
-          const { line, snippet } = extractSnippet(r.bestChunk, primaryQuery, 300);
+          const { line, snippet } = extractSnippet(r.body, primaryQuery, 300, r.bestChunkPos, r.bestChunk.length, params.intent);
          return {
            docid: `#${r.docid}`,
            file: r.displayPath,
            title: r.title,
            score: Math.round(r.score * 100) / 100,
            context: r.context,
+            line,
            snippet: addLineNumbers(snippet, line),
          };
        });
--- a/src/store.ts
+++ b/src/store.ts
@@ -3800,7 +3800,7 @@ export function getDocumentBody(db: Database, doc: DocumentResult | { filepath:
  let body = row.body;
  if (fromLine !== undefined || maxLines !== undefined) {
    const lines = body.split('\n');
-    const start = (fromLine || 1) - 1;
+    const start = Math.max(0, (fromLine || 1) - 1);
    const end = maxLines !== undefined ? start + maxLines : lines.length;
    body = lines.slice(start, end).join('\n');
  }
@@ -4023,7 +4023,7 @@ export function extractSnippet(body: string, query: string, maxLen = 500, chunkP
  let searchBody = body;
  let lineOffset = 0;

-  if (chunkPos && chunkPos > 0) {
+  if (chunkPos !== undefined && chunkPos >= 0) {
    // Search within the chunk region, with some padding for context
    // Use provided chunkLen or fall back to max chunk size (covers variable-length chunks)
    const searchLen = chunkLen || CHUNK_SIZE_CHARS;
@@ -4055,6 +4055,23 @@ export function extractSnippet(body: string, query: string, maxLen = 500, chunkP
    }
  }

+  if (chunkPos !== undefined && chunkPos >= 0 && bestScore <= 0) {
+    if (chunkPos === 0) {
+      // chunkPos=0 may be the chunk selector's initialization default for queries
+      // where lexical chunk scoring found no winner (e.g. tokens filtered to empty
+      // by the length>2 guard). Retry with full body so the real match isn't missed.
+      return extractSnippet(body, query, maxLen, undefined, undefined, intent);
+    }
+    // For chunkPos > 0 the reranker actively picked this chunk. Tokens failing to
+    // match literally is most likely a tokenizer limitation (quoted phrases, FTS5
+    // syntax, HYDE passages, semantic hits), so anchor on the chunk start rather
+    // than disregarding the reranker's pick.
+    const contextStart = Math.max(0, chunkPos - 100);
+    bestLine = chunkPos > contextStart
+      ? searchBody.slice(0, chunkPos - contextStart).split('\n').length - 1
+      : 0;
+  }
+
  const start = Math.max(0, bestLine - 1);
  const end = Math.min(lines.length, bestLine + 3);
  const snippetLines = lines.slice(start, end);
--- a/test/cli.test.ts
+++ b/test/cli.test.ts
@@ -508,6 +508,16 @@ describe("CLI Search Command", () => {
    // Error message goes to stderr
    expect(stderr).toContain("Usage:");
  });
+
+  test("--json --full includes line field for round-tripping to qmd get", async () => {
+    const { stdout, exitCode } = await runQmd(["search", "meeting", "--json", "--full", "-n", "1"]);
+    expect(exitCode).toBe(0);
+    const results = JSON.parse(stdout);
+    expect(results.length).toBeGreaterThan(0);
+    expect(results[0].line).toBeTypeOf("number");
+    expect(results[0].line).toBeGreaterThan(0);
+    expect(results[0].body).toBeTypeOf("string");
+  });
 });

 describe("CLI Get Command", () => {
@@ -533,6 +543,13 @@ describe("CLI Get Command", () => {
    // Should indicate file not found
    expect(exitCode).toBe(1);
  });
+
+  test("clamps negative --from to top of file (no silent tail content)", async () => {
+    const baseline = await runQmd(["get", "README.md"]);
+    const negative = await runQmd(["get", "README.md", "--from", "-19"]);
+    expect(negative.exitCode).toBe(0);
+    expect(negative.stdout).toBe(baseline.stdout);
+  });
 });

 describe("CLI Multi-Get Command", () => {
--- a/test/mcp.test.ts
+++ b/test/mcp.test.ts
@@ -913,6 +913,22 @@ describe.skipIf(!!process.env.CI)("MCP HTTP Transport", () => {
    initTestDatabase(db);
    seedTestData(db);

+    // 300 pad lines (37 chars each = 11100 chars) put the marker past the
+    // first chunk boundary at CHUNK_SIZE_CHARS = 3600.
+    {
+      const padLine = "Pad line for chunk boundary coverage\n";
+      const absLineFixtureBody =
+        padLine.repeat(300) +
+        "UNIQUE_KEYWORD_XYZ marker\n" +
+        padLine.repeat(20);
+      const fixtureHash = "hash-abslines";
+      const now = new Date().toISOString();
+      db.prepare(`INSERT OR IGNORE INTO content (hash, doc, created_at) VALUES (?, ?, ?)`)
+        .run(fixtureHash, absLineFixtureBody, now);
+      db.prepare(`INSERT INTO documents (collection, path, title, hash, created_at, modified_at, active) VALUES ('docs', ?, ?, ?, ?, ?, 1)`)
+        .run("absolute-line-fixture.md", "Absolute Line Fixture", fixtureHash, now, now);
+    }
+
    // Sync config into SQLite
    const httpTestConfig: CollectionConfig = {
      collections: {
@@ -1074,4 +1090,29 @@ describe.skipIf(!!process.env.CI)("MCP HTTP Transport", () => {
    expect(json.result).toBeDefined();
    expect(json.result.content.length).toBeGreaterThan(0);
  });
+
+  test("POST /mcp tools/call query returns absolute source-file line numbers, not chunk-local", async () => {
+    await mcpRequest({
+      jsonrpc: "2.0", id: 1, method: "initialize",
+      params: { protocolVersion: "2025-03-26", capabilities: {}, clientInfo: { name: "test", version: "1.0" } },
+    });
+
+    const { status, json } = await mcpRequest({
+      jsonrpc: "2.0", id: 5, method: "tools/call",
+      params: {
+        name: "query",
+        arguments: {
+          searches: [{ type: "lex", query: "UNIQUE_KEYWORD_XYZ" }],
+          rerank: false,
+        },
+      },
+    });
+    expect(status).toBe(200);
+    const results = json.result.structuredContent.results;
+    expect(results.length).toBeGreaterThan(0);
+    const hit = results.find((r: any) => r.file === "docs/absolute-line-fixture.md");
+    expect(hit).toBeDefined();
+    expect(hit.line).toBe(301);
+    expect(hit.snippet).toMatch(/^\d+: @@ -3\d\d,/);
+  });
 });
--- a/test/store.test.ts
+++ b/test/store.test.ts
@@ -1713,6 +1713,21 @@ describe("Document Retrieval", () => {
      expect(body).toBeNull();
      await cleanupTestDb(store);
    });
+
+    test("getDocumentBody clamps negative fromLine to top of document", async () => {
+      const store = await createTestStore();
+      const collectionName = await createTestCollection({ pwd: "/path" });
+      await insertTestDocument(store.db, collectionName, {
+        name: "mydoc",
+        displayPath: "mydoc.md",
+        body: "Line 1\nLine 2\nLine 3\nLine 4\nLine 5",
+      });
+
+      const body = store.getDocumentBody({ filepath: "/path/mydoc.md" }, -19, 80);
+      expect(body).toBe("Line 1\nLine 2\nLine 3\nLine 4\nLine 5");
+
+      await cleanupTestDb(store);
+    });
  });

  describe("findDocuments (multi-get)", () => {
@@ -2001,6 +2016,33 @@ describe("Snippet Extraction", () => {
    expect(line).toBe(51); // "Target keyword" is line 51
    expect(linesBefore).toBeGreaterThan(40); // Many lines before
  });
+
+  test("extractSnippet anchors on chunkPos when lexical scoring finds no match", () => {
+    // The snippet tokenizer does not strip FTS5 syntax, so a quoted-phrase query
+    // tokenises into terms with embedded quotes that never appear in body text.
+    // bestScore stays at 0 even though the reranker correctly identified a chunk;
+    // the fallback should anchor on chunkPos rather than defaulting to line 1.
+    const padLine = "Lorem ipsum dolor sit amet\n";
+    const padding = padLine.repeat(100);
+    const body = padding + "chunk content here\nmore chunk content\n" + padding;
+    const chunkPos = padding.length;
+
+    const { line } = extractSnippet(body, '"unrelated quoted phrase"', 200, chunkPos);
+
+    expect(line).toBeGreaterThan(50);
+    expect(line).toBeLessThan(110);
+  });
+
+  test("extractSnippet with chunkPos=0 falls back to full-body scan when chunk has no match", () => {
+    // chunkPos=0 may be the chunk selector's bestIdx=0 default rather than a real
+    // first-chunk hit, so the fallback must consider matches outside chunk 0.
+    const padding = "Lorem ipsum dolor sit amet\n".repeat(200);
+    const body = padding + "TARGET_KEYWORD line content\ntail line\n";
+
+    const { line } = extractSnippet(body, "TARGET_KEYWORD", 200, 0);
+
+    expect(line).toBe(201);
+  });
 });

 // =============================================================================