From 1f522cffe2c39042a626d827dbad63b325befba4 Mon Sep 17 00:00:00 2001
From: Riley Shott <riley.shott@shopify.com>
Date: Wed, 13 May 2026 16:15:58 -0700
Subject: [PATCH 1/7] fix: return absolute line numbers from qmd_query

The MCP `query` tool, HTTP `/query` endpoint, and CLI `qmd query`
all returned chunk-local line numbers in their snippet output, so
the line could not be passed back to `qmd_get` as `fromLine`
without an out-of-band lookup. Pass the full document body plus
`bestChunkPos` to `extractSnippet` instead of the chunk text alone
so it can compute absolute line offsets while still scoping the
keyword scan to the reranker-chosen chunk window (preserves #149).

Also restores documented behavior of `qmd query --full`, which was
emitting the best chunk (~3.6KB max) instead of the full document.

extractSnippet now also handles a chunk window that contains no
positive matches. When chunkPos is 0 it retries with a full-body
scan: the upstream chunk selector leaves bestIdx=0 as its
initialization default whenever scoring fails to find a winner
(e.g. queryTerms filtered to empty by the length>2 guard, or
semantic-only matches with no lex overlap), so an unconditional
chunk-scoped scan would land on chunk 0 instead of where the actual
match lives. When chunkPos is greater than 0 the reranker actively
picked that chunk, so the snippet anchors on the chunk start rather
than discarding the reranker's pick.

- src/mcp/server.ts: SearchResultItem gains `line: number`; both MCP
  and HTTP `/query` handlers populate it
- src/cli/qmd.ts: OutputRow.body now sources from r.body
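- src/store.ts: extractSnippet now scopes the scan for chunkPos=0 too;
  when the chunk-scoped pass finds no positive match it retries with a
  full-body scan (chunkPos=0) or anchors on the chunk start (chunkPos>0)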
- test/mcp.test.ts: new fixture asserts absolute line 301 for a
  marker placed past the first chunk boundary
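- test/store.test.ts: regression tests for both bestScore<=0 fallbacks
  (chunk-start anchor for chunkPos>0, full-body retry for chunkPos=0)
- test/cli.test.ts: `qmd search --json --full` output includes both
  the `line` field and the `body`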
---
 CHANGELOG.md       | 11 +++++++++++
 src/cli/qmd.ts     | 26 ++++++++++++++------------
 src/mcp/server.ts  |  9 +++++++--
 src/store.ts       | 19 ++++++++++++++++++-
 test/cli.test.ts   | 10 ++++++++++
 test/mcp.test.ts   | 41 +++++++++++++++++++++++++++++++++++++++++
 test/store.test.ts | 27 +++++++++++++++++++++++++++
 7 files changed, 128 insertions(+), 15 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index b2757c8..ac69601 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,17 @@
 
 ### Fixes
 
+- Snippet line numbers: `qmd_query` (MCP), HTTP `/query`, and `qmd query`
+  (CLI JSON output and snippet headers) now return absolute source-file
+  line numbers instead of chunk-local ones, so the `line` field can be
+  passed back to `qmd_get` as `fromLine` without a separate lookup.
+  Snippet selection remains scoped to the best matching chunk
+  (preserves #149).
+- CLI: `qmd query --full` now emits the full document body in all output
+  formats (json, csv, md, xml), restoring the documented behavior of the
+  flag. Previously it returned only the best matching chunk (~3.6KB max
+  per result). Output payload for `--full` queries is now proportional
+  to total document size.
 - Embedding: `qmd embed -c <collection>` now scopes pending-doc selection
   to the requested collection instead of embedding global pending work.
   Scoped `--force` clears only collection-owned vectors, preserves shared
diff --git a/src/cli/qmd.ts b/src/cli/qmd.ts
index f576cde..cdbc241 100755
--- a/src/cli/qmd.ts
+++ b/src/cli/qmd.ts
@@ -1886,6 +1886,7 @@ type OutputRow = {
   score: number;
   context?: string | null;
   chunkPos?: number;
+  chunkLen?: number;
   hash?: string;
   docid?: string;
   explain?: HybridQueryExplain;
@@ -1968,9 +1969,9 @@ function outputResults(results: OutputRow[], query: string, opts: OutputOptions)
     // JSON output for LLM consumption
     const output = filtered.map(row => {
       const docid = row.docid || (row.hash ? row.hash.slice(0, 6) : undefined);
+      const snippetInfo = extractSnippet(row.body, query, 300, row.chunkPos, row.chunkLen, opts.intent);
       let body = opts.full ? row.body : undefined;
-      const snippetInfo = !opts.full ? extractSnippet(row.body, query, 300, row.chunkPos, undefined, opts.intent) : undefined;
-      let snippet = snippetInfo?.snippet;
+      let snippet = !opts.full ? snippetInfo.snippet : undefined;
       if (opts.lineNumbers) {
         if (body) body = addLineNumbers(body);
         if (snippet) snippet = addLineNumbers(snippet);
@@ -1979,7 +1980,7 @@ function outputResults(results: OutputRow[], query: string, opts: OutputOptions)
         ...(docid && { docid: `#${docid}` }),
         score: Math.round(row.score * 100) / 100,
         file: toQmdPath(row.displayPath),
-        ...(snippetInfo && { line: snippetInfo.line }),
+        line: snippetInfo.line,
         title: row.title,
         ...(row.context && { context: row.context }),
         ...(body && { body }),
@@ -2002,7 +2003,7 @@ function outputResults(results: OutputRow[], query: string, opts: OutputOptions)
     for (let i = 0; i < filtered.length; i++) {
       const row = filtered[i];
       if (!row) continue;
-      const { line, snippet } = extractSnippet(row.body, query, 500, row.chunkPos, undefined, opts.intent);
+      const { line, snippet } = extractSnippet(row.body, query, 500, row.chunkPos, row.chunkLen, opts.intent);
       const docid = row.docid || (row.hash ? row.hash.slice(0, 6) : undefined);
 
       // Line 1: filepath with docid
@@ -2066,8 +2067,9 @@ function outputResults(results: OutputRow[], query: string, opts: OutputOptions)
       console.log();
 
       // Snippet with highlighting (diff-style header included)
-      let displaySnippet = opts.lineNumbers ? addLineNumbers(snippet, line) : snippet;
-      const highlighted = highlightTerms(displaySnippet, query);
+      const content = opts.full ? row.body : snippet;
+      const displayContent = opts.lineNumbers ? addLineNumbers(content, opts.full ? 1 : line) : content;
+      const highlighted = highlightTerms(displayContent, query);
       console.log(highlighted);
 
       // Double empty line between results
@@ -2079,7 +2081,7 @@ function outputResults(results: OutputRow[], query: string, opts: OutputOptions)
       if (!row) continue;
       const heading = row.title || row.displayPath;
       const docid = row.docid || (row.hash ? row.hash.slice(0, 6) : undefined);
-      let content = opts.full ? row.body : extractSnippet(row.body, query, 500, row.chunkPos, undefined, opts.intent).snippet;
+      let content = opts.full ? row.body : extractSnippet(row.body, query, 500, row.chunkPos, row.chunkLen, opts.intent).snippet;
       if (opts.lineNumbers) {
         content = addLineNumbers(content);
       }
@@ -2092,7 +2094,7 @@ function outputResults(results: OutputRow[], query: string, opts: OutputOptions)
       const titleAttr = row.title ? ` title="${row.title.replace(/"/g, '&quot;')}"` : "";
       const contextAttr = row.context ? ` context="${row.context.replace(/"/g, '&quot;')}"` : "";
       const docid = row.docid || (row.hash ? row.hash.slice(0, 6) : "");
-      let content = opts.full ? row.body : extractSnippet(row.body, query, 500, row.chunkPos, undefined, opts.intent).snippet;
+      let content = opts.full ? row.body : extractSnippet(row.body, query, 500, row.chunkPos, row.chunkLen, opts.intent).snippet;
       if (opts.lineNumbers) {
         content = addLineNumbers(content);
       }
@@ -2102,10 +2104,10 @@ function outputResults(results: OutputRow[], query: string, opts: OutputOptions)
     // CSV format
     console.log("docid,score,file,title,context,line,snippet");
     for (const row of filtered) {
-      const { line, snippet } = extractSnippet(row.body, query, 500, row.chunkPos, undefined, opts.intent);
+      const { line, snippet } = extractSnippet(row.body, query, 500, row.chunkPos, row.chunkLen, opts.intent);
       let content = opts.full ? row.body : snippet;
       if (opts.lineNumbers) {
-        content = addLineNumbers(content, line);
+        content = addLineNumbers(content, opts.full ? 1 : line);
       }
       const docid = row.docid || (row.hash ? row.hash.slice(0, 6) : "");
       const snippetText = content || "";
@@ -2461,13 +2463,13 @@ async function querySearch(query: string, opts: OutputOptions, _embedModel: stri
       ? (structuredQueries.find(s => s.type === 'lex')?.query || structuredQueries.find(s => s.type === 'vec')?.query || query)
       : query;
 
-    // Map to CLI output format — use bestChunk for snippet display
     outputResults(results.map(r => ({
       file: r.file,
       displayPath: r.displayPath,
       title: r.title,
-      body: r.bestChunk,
+      body: r.body,
       chunkPos: r.bestChunkPos,
+      chunkLen: r.bestChunk.length,
       score: r.score,
       context: r.context,
       docid: r.docid,
diff --git a/src/mcp/server.ts b/src/mcp/server.ts
index 2f5482f..a9ec99f 100644
--- a/src/mcp/server.ts
+++ b/src/mcp/server.ts
@@ -42,6 +42,7 @@ type SearchResultItem = {
   title: string;
   score: number;
   context: string | null;
+  line: number;   // Absolute line in source markdown
   snippet: string;
 };
 
@@ -242,6 +243,8 @@ async function createMcpServer(store: QMDStore): Promise<McpServer> {
       title: "Query",
       description: `Search the knowledge base using a query document — one or more typed sub-queries combined for best recall.
 
+Each result includes a \`line\` field with the absolute 1-indexed line of the best match in the source markdown. To read more context around a hit, call \`get(file, fromLine = line - 20, maxLines = 80, lineNumbers = true)\`.
+
 ## Query Types
 
 **lex** — BM25 keyword search. Fast, exact, no LLM needed.
@@ -341,13 +344,14 @@ Intent-aware lex (C++ performance, not sports):
         || searches[0]?.query || "";
 
       const filtered: SearchResultItem[] = results.map(r => {
-        const { line, snippet } = extractSnippet(r.bestChunk, primaryQuery, 300, undefined, undefined, intent);
+        const { line, snippet } = extractSnippet(r.body, primaryQuery, 300, r.bestChunkPos, r.bestChunk.length, intent);
         return {
           docid: `#${r.docid}`,
           file: r.displayPath,
           title: r.title,
           score: Math.round(r.score * 100) / 100,
           context: r.context,
+          line,
           snippet: addLineNumbers(snippet, line),
         };
       });
@@ -702,13 +706,14 @@ export async function startMcpHttpServer(
           || params.searches[0]?.query || "";
 
         const formatted = results.map(r => {
-          const { line, snippet } = extractSnippet(r.bestChunk, primaryQuery, 300);
+          const { line, snippet } = extractSnippet(r.body, primaryQuery, 300, r.bestChunkPos, r.bestChunk.length, params.intent);
           return {
             docid: `#${r.docid}`,
             file: r.displayPath,
             title: r.title,
             score: Math.round(r.score * 100) / 100,
             context: r.context,
+            line,
             snippet: addLineNumbers(snippet, line),
           };
         });
diff --git a/src/store.ts b/src/store.ts
index 52dd334..f927ccc 100644
--- a/src/store.ts
+++ b/src/store.ts
@@ -4023,7 +4023,7 @@ export function extractSnippet(body: string, query: string, maxLen = 500, chunkP
   let searchBody = body;
   let lineOffset = 0;
 
-  if (chunkPos && chunkPos > 0) {
+  if (chunkPos !== undefined && chunkPos >= 0) {
     // Search within the chunk region, with some padding for context
     // Use provided chunkLen or fall back to max chunk size (covers variable-length chunks)
     const searchLen = chunkLen || CHUNK_SIZE_CHARS;
@@ -4055,6 +4055,23 @@ export function extractSnippet(body: string, query: string, maxLen = 500, chunkP
     }
   }
 
+  if (chunkPos !== undefined && chunkPos >= 0 && bestScore <= 0) {
+    if (chunkPos === 0) {
+      // chunkPos=0 may be the chunk selector's initialization default for queries
+      // where lexical chunk scoring found no winner (e.g. tokens filtered to empty
+      // by the length>2 guard). Retry with full body so the real match isn't missed.
+      return extractSnippet(body, query, maxLen, undefined, undefined, intent);
+    }
+    // For chunkPos > 0 the reranker actively picked this chunk. Tokens failing to
+    // match literally is most likely a tokenizer limitation (quoted phrases, FTS5
+    // syntax, HYDE passages, semantic hits), so anchor on the chunk start rather
+    // than disregarding the reranker's pick.
+    const contextStart = Math.max(0, chunkPos - 100);
+    bestLine = chunkPos > contextStart
+      ? searchBody.slice(0, chunkPos - contextStart).split('\n').length - 1
+      : 0;
+  }
+
   const start = Math.max(0, bestLine - 1);
   const end = Math.min(lines.length, bestLine + 3);
   const snippetLines = lines.slice(start, end);
diff --git a/test/cli.test.ts b/test/cli.test.ts
index 9c575f8..2535fe4 100644
--- a/test/cli.test.ts
+++ b/test/cli.test.ts
@@ -507,6 +507,16 @@ describe("CLI Search Command", () => {
     // Error message goes to stderr
     expect(stderr).toContain("Usage:");
   });
+
+  test("--json --full includes line field for round-tripping to qmd get", async () => {
+    const { stdout, exitCode } = await runQmd(["search", "meeting", "--json", "--full", "-n", "1"]);
+    expect(exitCode).toBe(0);
+    const results = JSON.parse(stdout);
+    expect(results.length).toBeGreaterThan(0);
+    expect(results[0].line).toBeTypeOf("number");
+    expect(results[0].line).toBeGreaterThan(0);
+    expect(results[0].body).toBeTypeOf("string");
+  });
 });
 
 describe("CLI Get Command", () => {
diff --git a/test/mcp.test.ts b/test/mcp.test.ts
index 3ea87bd..495c624 100644
--- a/test/mcp.test.ts
+++ b/test/mcp.test.ts
@@ -913,6 +913,22 @@ describe.skipIf(!!process.env.CI)("MCP HTTP Transport", () => {
     initTestDatabase(db);
     seedTestData(db);
 
+    // 300 pad lines (37 chars each = 11100 chars) puts the marker past the
+    // first chunk boundary at CHUNK_SIZE_CHARS = 3600.
+    {
+      const padLine = "Pad line for chunk boundary coverage\n";
+      const absLineFixtureBody =
+        padLine.repeat(300) +
+        "UNIQUE_KEYWORD_XYZ marker\n" +
+        padLine.repeat(20);
+      const fixtureHash = "hash-abslines";
+      const now = new Date().toISOString();
+      db.prepare(`INSERT OR IGNORE INTO content (hash, doc, created_at) VALUES (?, ?, ?)`)
+        .run(fixtureHash, absLineFixtureBody, now);
+      db.prepare(`INSERT INTO documents (collection, path, title, hash, created_at, modified_at, active) VALUES ('docs', ?, ?, ?, ?, ?, 1)`)
+        .run("absolute-line-fixture.md", "Absolute Line Fixture", fixtureHash, now, now);
+    }
+
     // Sync config into SQLite
     const httpTestConfig: CollectionConfig = {
       collections: {
@@ -1074,4 +1090,29 @@ describe.skipIf(!!process.env.CI)("MCP HTTP Transport", () => {
     expect(json.result).toBeDefined();
     expect(json.result.content.length).toBeGreaterThan(0);
   });
+
+  test("POST /mcp tools/call query returns absolute source-file line numbers, not chunk-local", async () => {
+    await mcpRequest({
+      jsonrpc: "2.0", id: 1, method: "initialize",
+      params: { protocolVersion: "2025-03-26", capabilities: {}, clientInfo: { name: "test", version: "1.0" } },
+    });
+
+    const { status, json } = await mcpRequest({
+      jsonrpc: "2.0", id: 5, method: "tools/call",
+      params: {
+        name: "query",
+        arguments: {
+          searches: [{ type: "lex", query: "UNIQUE_KEYWORD_XYZ" }],
+          rerank: false,
+        },
+      },
+    });
+    expect(status).toBe(200);
+    const results = json.result.structuredContent.results;
+    expect(results.length).toBeGreaterThan(0);
+    const hit = results.find((r: any) => r.file === "docs/absolute-line-fixture.md");
+    expect(hit).toBeDefined();
+    expect(hit.line).toBe(301);
+    expect(hit.snippet).toMatch(/^\d+: @@ -3\d\d,/);
+  });
 });
diff --git a/test/store.test.ts b/test/store.test.ts
index 8bfaae9..6b3be5b 100644
--- a/test/store.test.ts
+++ b/test/store.test.ts
@@ -2001,6 +2001,33 @@ describe("Snippet Extraction", () => {
     expect(line).toBe(51); // "Target keyword" is line 51
     expect(linesBefore).toBeGreaterThan(40); // Many lines before
   });
+
+  test("extractSnippet anchors on chunkPos when lexical scoring finds no match", () => {
+    // The snippet tokenizer does not strip FTS5 syntax, so a quoted-phrase query
+    // tokenises into terms with embedded quotes that never appear in body text.
+    // bestScore stays at 0 even though the reranker correctly identified a chunk;
+    // the fallback should anchor on chunkPos rather than defaulting to line 1.
+    const padLine = "Lorem ipsum dolor sit amet\n";
+    const padding = padLine.repeat(100);
+    const body = padding + "chunk content here\nmore chunk content\n" + padding;
+    const chunkPos = padding.length;
+
+    const { line } = extractSnippet(body, '"unrelated quoted phrase"', 200, chunkPos);
+
+    expect(line).toBeGreaterThan(50);
+    expect(line).toBeLessThan(110);
+  });
+
+  test("extractSnippet with chunkPos=0 falls back to full-body scan when chunk has no match", () => {
+    // chunkPos=0 may be the chunk selector's bestIdx=0 default rather than a real
+    // first-chunk hit, so the fallback must consider matches outside chunk 0.
+    const padding = "Lorem ipsum dolor sit amet\n".repeat(200);
+    const body = padding + "TARGET_KEYWORD line content\ntail line\n";
+
+    const { line } = extractSnippet(body, "TARGET_KEYWORD", 200, 0);
+
+    expect(line).toBe(201);
+  });
 });
 
 // =============================================================================

From aa1818e1817c6dacbb2ebfa762112e232e442755 Mon Sep 17 00:00:00 2001
From: Riley Shott <riley.shott@shopify.com>
Date: Wed, 13 May 2026 20:07:13 -0700
Subject: [PATCH 2/7] fix: clamp negative fromLine in get to avoid silent tail
 content

The query tool description tells agents to compute fromLine = line - 20
for context around a hit. For hits in lines 1 through 20 that yields a
negative fromLine, which propagated unchanged through:

  MCP get handler -> store.getDocumentBody -> Array.prototype.slice

A negative slice start offsets from the end of the array rather than
clamping to the beginning, so a top-of-file hit on a long document
returned an empty string and on a short document returned content from
the wrong region (e.g. lines 11-30 of a 30-line file in response to a
request for the head of the document). The lineNumbers branch was the
same shape: addLineNumbers(text, -19) emitted "-19:", "-18:" prefixes.

Same buggy slice lived in the CLI getDocument path independently.

Fix in three layers, plus the docstring:

- src/mcp/server.ts: clamp parsedFromLine to >= 1 after parsing input
  args and the :line suffix, before it reaches getDocumentBody and
  addLineNumbers. Also tighten the query tool's recommendation to
  `fromLine = max(1, line - 20)` so following the docstring literally
  produces a valid value.
- src/cli/qmd.ts: same clamp on the CLI getDocument fromLine after
  the colon-suffix parse.
- src/store.ts: defensive Math.max(0, ...) on the slice start in
  getDocumentBody so SDK callers and any future entry points are
  protected without relying on every caller remembering to clamp.
- test/store.test.ts: regression test on getDocumentBody with
  fromLine = -19 returns the head of the document, not the tail.
- test/cli.test.ts: regression test on `qmd get --from -19` matches
  the no-flag baseline (head of document).
---
 src/cli/qmd.ts     |  1 +
 src/mcp/server.ts  |  3 ++-
 src/store.ts       |  2 +-
 test/cli.test.ts   |  7 +++++++
 test/store.test.ts | 15 +++++++++++++++
 5 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/src/cli/qmd.ts b/src/cli/qmd.ts
index cdbc241..8b6ada1 100755
--- a/src/cli/qmd.ts
+++ b/src/cli/qmd.ts
@@ -844,6 +844,7 @@ function getDocument(filename: string, fromLine?: number, maxLines?: number, lin
       inputPath = inputPath.slice(0, -colonMatch[0].length);
     }
   }
+  if (fromLine !== undefined) fromLine = Math.max(1, fromLine);
 
   const parsedIndexPath = isVirtualPath(inputPath) ? parseVirtualPath(inputPath) : null;
   if (parsedIndexPath?.indexName) {
diff --git a/src/mcp/server.ts b/src/mcp/server.ts
index a9ec99f..69c7eff 100644
--- a/src/mcp/server.ts
+++ b/src/mcp/server.ts
@@ -243,7 +243,7 @@ async function createMcpServer(store: QMDStore): Promise<McpServer> {
       title: "Query",
       description: `Search the knowledge base using a query document — one or more typed sub-queries combined for best recall.
 
-Each result includes a \`line\` field with the absolute 1-indexed line of the best match in the source markdown. To read more context around a hit, call \`get(file, fromLine = line - 20, maxLines = 80, lineNumbers = true)\`.
+Each result includes a \`line\` field with the absolute 1-indexed line of the best match in the source markdown. To read more context around a hit, call \`get(file, fromLine = max(1, line - 20), maxLines = 80, lineNumbers = true)\`.
 
 ## Query Types
 
@@ -389,6 +389,7 @@ Intent-aware lex (C++ performance, not sports):
         parsedFromLine = parseInt(colonMatch[1], 10);
         lookup = lookup.slice(0, -colonMatch[0].length);
       }
+      if (parsedFromLine !== undefined) parsedFromLine = Math.max(1, parsedFromLine);
 
       const result = await store.get(lookup, { includeBody: false });
 
diff --git a/src/store.ts b/src/store.ts
index f927ccc..003feca 100644
--- a/src/store.ts
+++ b/src/store.ts
@@ -3800,7 +3800,7 @@ export function getDocumentBody(db: Database, doc: DocumentResult | { filepath:
   let body = row.body;
   if (fromLine !== undefined || maxLines !== undefined) {
     const lines = body.split('\n');
-    const start = (fromLine || 1) - 1;
+    const start = Math.max(0, (fromLine || 1) - 1);
     const end = maxLines !== undefined ? start + maxLines : lines.length;
     body = lines.slice(start, end).join('\n');
   }
diff --git a/test/cli.test.ts b/test/cli.test.ts
index 2535fe4..769db00 100644
--- a/test/cli.test.ts
+++ b/test/cli.test.ts
@@ -542,6 +542,13 @@ describe("CLI Get Command", () => {
     // Should indicate file not found
     expect(exitCode).toBe(1);
   });
+
+  test("clamps negative --from to top of file (no silent tail content)", async () => {
+    const baseline = await runQmd(["get", "README.md"]);
+    const negative = await runQmd(["get", "README.md", "--from", "-19"]);
+    expect(negative.exitCode).toBe(0);
+    expect(negative.stdout).toBe(baseline.stdout);
+  });
 });
 
 describe("CLI Multi-Get Command", () => {
diff --git a/test/store.test.ts b/test/store.test.ts
index 6b3be5b..2adf717 100644
--- a/test/store.test.ts
+++ b/test/store.test.ts
@@ -1713,6 +1713,21 @@ describe("Document Retrieval", () => {
       expect(body).toBeNull();
       await cleanupTestDb(store);
     });
+
+    test("getDocumentBody clamps negative fromLine to top of document", async () => {
+      const store = await createTestStore();
+      const collectionName = await createTestCollection({ pwd: "/path" });
+      await insertTestDocument(store.db, collectionName, {
+        name: "mydoc",
+        displayPath: "mydoc.md",
+        body: "Line 1\nLine 2\nLine 3\nLine 4\nLine 5",
+      });
+
+      const body = store.getDocumentBody({ filepath: "/path/mydoc.md" }, -19, 80);
+      expect(body).toBe("Line 1\nLine 2\nLine 3\nLine 4\nLine 5");
+
+      await cleanupTestDb(store);
+    });
   });
 
   describe("findDocuments (multi-get)", () => {

From dd5d82d52368fd0e7501d3f939233bed7dd617e6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tobi=20L=C3=BCtke?= <tobi@lutke.com>
Date: Sat, 16 May 2026 17:18:06 +0000
Subject: [PATCH 3/7] fix: keep llama GPU fallback noise off JSON stdout

---
 CHANGELOG.md     |  1 +
 README.md        |  1 +
 src/cli/qmd.ts   |  6 ++++
 src/llm.ts       | 66 +++++++++++++++++++++++++++++++----
 test/cli.test.ts |  1 +
 test/llm.test.ts | 90 ++++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 158 insertions(+), 7 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b2757c8..d7378bb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,7 @@
 
 ### Fixes
 
+- GPU: add `QMD_FORCE_CPU=1` / `--no-gpu` to bypass CUDA/Vulkan/Metal probing entirely, and route native llama.cpp stdout noise to stderr so JSON output stays parseable during search/query commands.
 - Embedding: `qmd embed -c <collection>` now scopes pending-doc selection
   to the requested collection instead of embedding global pending work.
   Scoped `--force` clears only collection-owned vectors, preserves shared
diff --git a/README.md b/README.md
index 02e4b1e..7eadb93 100644
--- a/README.md
+++ b/README.md
@@ -798,6 +798,7 @@ llm_cache       -- Cached LLM responses (query expansion, rerank scores)
 |----------|---------|-------------|
 | `XDG_CACHE_HOME` | `~/.cache` | Cache directory location |
 | `QMD_LLAMA_GPU` | `auto` | Force llama.cpp GPU backend (`metal`, `vulkan`, `cuda`) or disable GPU with `false` |
+| `QMD_FORCE_CPU` | unset | Set to `1`/`true` to force CPU mode before any CUDA/Vulkan/Metal probing. Equivalent CLI flag: `--no-gpu`. |
 | `QMD_EMBED_PARALLELISM` | automatic | Override embedding/reranking context parallelism (1-8). Windows CUDA defaults to `1` because parallel CUDA contexts can crash with `ggml-cuda.cu:98`; use Vulkan or raise this only if your driver is stable. |
 
 ## How It Works
diff --git a/src/cli/qmd.ts b/src/cli/qmd.ts
index 01dc540..7df8401 100755
--- a/src/cli/qmd.ts
+++ b/src/cli/qmd.ts
@@ -2562,6 +2562,7 @@ function parseCLI() {
       // Query options
       "candidate-limit": { type: "string", short: "C" },
       "no-rerank": { type: "boolean", default: false },
+      "no-gpu": { type: "boolean", default: false },
       intent: { type: "string" },
       // Chunking options
       "chunk-strategy": { type: "string" },  // "regex" (default) or "auto" (AST for code files)
@@ -2574,6 +2575,10 @@ function parseCLI() {
     strict: false, // Allow unknown options to pass through
   });
 
+  if (values["no-gpu"]) {
+    process.env.QMD_FORCE_CPU = "1";
+  }
+
   // Select index name (default: "index")
   const indexName = values.index as string | undefined;
   if (indexName) {
@@ -2826,6 +2831,7 @@ function showHelp(): void {
   console.log("  --full                     - Output full document instead of snippet");
   console.log("  -C, --candidate-limit <n>  - Max candidates to rerank (default 40, lower = faster)");
   console.log("  --no-rerank                - Skip LLM reranking (use RRF scores only, much faster on CPU)");
+  console.log("  --no-gpu                   - Force CPU mode for llama.cpp operations (same as QMD_FORCE_CPU=1)");
   console.log("  --line-numbers             - Include line numbers in output");
   console.log("  --explain                  - Include retrieval score traces (query --json/CLI)");
   console.log("  --files | --json | --csv | --md | --xml  - Output format");
diff --git a/src/llm.ts b/src/llm.ts
index d469d36..b0b30d4 100644
--- a/src/llm.ts
+++ b/src/llm.ts
@@ -22,10 +22,45 @@ type NodeLlamaCppModule = {
 
 let nodeLlamaCppImport: Promise<NodeLlamaCppModule> | null = null;
 async function loadNodeLlamaCpp(): Promise<NodeLlamaCppModule> {
-  nodeLlamaCppImport ??= import("node-llama-cpp") as Promise<NodeLlamaCppModule>;
+  nodeLlamaCppImport ??= withNativeStdoutRedirectedToStderr(
+    () => import("node-llama-cpp") as Promise<NodeLlamaCppModule>
+  );
   return nodeLlamaCppImport;
 }
 
+export function setNodeLlamaCppModuleForTest(module: NodeLlamaCppModule | null): void {
+  nodeLlamaCppImport = module ? Promise.resolve(module) : null;
+  failedGpuInitModes.clear();
+}
+
+type StdoutWrite = typeof process.stdout.write;
+let nativeStdoutRedirectDepth = 0;
+let originalStdoutWrite: StdoutWrite | null = null;
+
+/**
+ * Some node-llama-cpp native build/probe paths write library noise to stdout.
+ * JSON APIs must reserve stdout for machine-readable payloads, so route that
+ * noise to stderr while native llama initialization is in progress.
+ */
+export async function withNativeStdoutRedirectedToStderr<T>(fn: () => Promise<T>): Promise<T> {
+  if (nativeStdoutRedirectDepth === 0) {
+    originalStdoutWrite = process.stdout.write.bind(process.stdout) as StdoutWrite;
+    process.stdout.write = ((chunk: any, encoding?: any, cb?: any) => {
+      return process.stderr.write(chunk, encoding, cb as any);
+    }) as StdoutWrite;
+  }
+  nativeStdoutRedirectDepth++;
+  try {
+    return await fn();
+  } finally {
+    nativeStdoutRedirectDepth--;
+    if (nativeStdoutRedirectDepth === 0 && originalStdoutWrite) {
+      process.stdout.write = originalStdoutWrite;
+      originalStdoutWrite = null;
+    }
+  }
+}
+
 import { homedir } from "os";
 import { join } from "path";
 import { existsSync, mkdirSync, statSync, unlinkSync, readdirSync, readFileSync, writeFileSync, openSync, readSync, closeSync } from "fs";
@@ -487,7 +522,15 @@ export function resolveSafeParallelism(options: ParallelismOptions): number {
   return Math.max(1, options.computed);
 }
 
-export function resolveLlamaGpuMode(envValue = process.env.QMD_LLAMA_GPU): LlamaGpuMode {
+export function resolveLlamaGpuMode(
+  envValue = process.env.QMD_LLAMA_GPU,
+  forceCpuValue = process.env.QMD_FORCE_CPU
+): LlamaGpuMode {
+  const forceCpu = forceCpuValue?.trim().toLowerCase() ?? "";
+  if (forceCpu && !["false", "off", "none", "disable", "disabled", "0"].includes(forceCpu)) {
+    return false;
+  }
+
   const normalized = envValue?.trim().toLowerCase() ?? "";
   if (!normalized) return "auto";
   if (["false", "off", "none", "disable", "disabled", "0"].includes(normalized)) return false;
@@ -518,6 +561,8 @@ function resolveExpandContextSize(configValue?: number): number {
   return parsed;
 }
 
+const failedGpuInitModes = new Set<LlamaGpuMode>();
+
 export class LlamaCpp implements LLM {
   private readonly _ciMode = !!process.env.CI;
   private llama: Llama | null = null;
@@ -668,22 +713,29 @@ export class LlamaCpp implements LLM {
 
       const { getLlama, LlamaLogLevel } = await loadNodeLlamaCpp();
       const loadLlama = async (gpu: LlamaGpuMode) =>
-        await getLlama({
+        await withNativeStdoutRedirectedToStderr(() => getLlama({
           build: allowBuild ? "autoAttempt" : "never",
           logLevel: LlamaLogLevel.error,
           gpu,
           skipDownload: !allowBuild,
-        });
+        }));
 
       let llama: Llama;
-      if (gpuMode === false) {
+      if (gpuMode === false || failedGpuInitModes.has(gpuMode)) {
+        if (gpuMode !== false && failedGpuInitModes.has(gpuMode)) {
+          process.stderr.write(
+            `QMD Warning: skipping previously failed GPU init${gpuMode === "auto" ? "" : ` for QMD_LLAMA_GPU=${gpuMode}`}, using CPU.\n`
+          );
+        }
         llama = await loadLlama(false);
       } else {
         try {
           llama = await loadLlama(gpuMode);
         } catch (err) {
-          // GPU backend (e.g. Vulkan on headless/driverless machines) can throw at init.
-          // Fall back to CPU so qmd still works.
+          // GPU backend (e.g. Vulkan/CUDA on headless/driverless machines) can throw at init.
+          // Fall back to CPU so qmd still works, and cache the failure to avoid repeated
+          // expensive native build/probe attempts in this process.
+          failedGpuInitModes.add(gpuMode);
           process.stderr.write(
             `QMD Warning: GPU init failed${gpuMode === "auto" ? "" : ` for QMD_LLAMA_GPU=${gpuMode}`} (${err instanceof Error ? err.message : String(err)}), falling back to CPU.\n`
           );
diff --git a/test/cli.test.ts b/test/cli.test.ts
index e4ceb35..aacfff5 100644
--- a/test/cli.test.ts
+++ b/test/cli.test.ts
@@ -233,6 +233,7 @@ describe("CLI Help", () => {
     expect(stdout).toContain("Usage:");
     expect(stdout).toContain("qmd collection add");
     expect(stdout).toContain("qmd search");
+    expect(stdout).toContain("--no-gpu");
     expect(stdout).toContain("qmd skill show/install");
   });
 
diff --git a/test/llm.test.ts b/test/llm.test.ts
index ff22c0c..2fc03cd 100644
--- a/test/llm.test.ts
+++ b/test/llm.test.ts
@@ -13,6 +13,8 @@ import {
   getDefaultLlamaCpp,
   disposeDefaultLlamaCpp,
   resolveLlamaGpuMode,
+  setNodeLlamaCppModuleForTest,
+  withNativeStdoutRedirectedToStderr,
   resolveParallelismOverride,
   resolveSafeParallelism,
   withLLMSession,
@@ -78,6 +80,29 @@ describe("QMD_LLAMA_GPU resolution", () => {
     expect(resolveLlamaGpuMode(" cuda ")).toBe("cuda");
   });
 
+  test("QMD_FORCE_CPU disables GPU before QMD_LLAMA_GPU auto-detection", () => {
+    const prevForceCpu = process.env.QMD_FORCE_CPU;
+    process.env.QMD_FORCE_CPU = "1";
+    try {
+      expect(resolveLlamaGpuMode(undefined)).toBe(false);
+      expect(resolveLlamaGpuMode("cuda")).toBe(false);
+    } finally {
+      if (prevForceCpu === undefined) delete process.env.QMD_FORCE_CPU;
+      else process.env.QMD_FORCE_CPU = prevForceCpu;
+    }
+  });
+
+  test("QMD_FORCE_CPU ignores false-ish values", () => {
+    const prevForceCpu = process.env.QMD_FORCE_CPU;
+    process.env.QMD_FORCE_CPU = "0";
+    try {
+      expect(resolveLlamaGpuMode(undefined)).toBe("auto");
+    } finally {
+      if (prevForceCpu === undefined) delete process.env.QMD_FORCE_CPU;
+      else process.env.QMD_FORCE_CPU = prevForceCpu;
+    }
+  });
+
   test("warns and falls back to auto for unsupported values", () => {
     const stderrSpy = vi.spyOn(process.stderr, "write").mockReturnValue(true);
     try {
@@ -90,6 +115,71 @@ describe("QMD_LLAMA_GPU resolution", () => {
   });
 });
 
+describe("native llama stdout containment", () => {
+  test("redirects native stdout noise to stderr while JSON callers are initializing llama", async () => {
+    const stdoutSpy = vi.spyOn(process.stdout, "write").mockReturnValue(true);
+    const stderrSpy = vi.spyOn(process.stderr, "write").mockReturnValue(true);
+    try {
+      await withNativeStdoutRedirectedToStderr(async () => {
+        process.stdout.write("cmake build spam\n");
+        return "ok";
+      });
+
+      expect(stdoutSpy).not.toHaveBeenCalled();
+      expect(stderrSpy).toHaveBeenCalledWith("cmake build spam\n", undefined, undefined);
+    } finally {
+      stdoutSpy.mockRestore();
+      stderrSpy.mockRestore();
+    }
+  });
+
+  test("keeps native GPU failure noise off stdout and caches failed GPU init", async () => {
+    const prevGpu = process.env.QMD_LLAMA_GPU;
+    const prevForceCpu = process.env.QMD_FORCE_CPU;
+    process.env.QMD_LLAMA_GPU = "cuda";
+    delete process.env.QMD_FORCE_CPU;
+
+    const calls: unknown[] = [];
+    const fakeLlama = { gpu: false, cpuMathCores: 4 };
+    setNodeLlamaCppModuleForTest({
+      LlamaLogLevel: { error: "error" },
+      resolveModelFile: vi.fn(),
+      LlamaChatSession: vi.fn() as any,
+      getLlama: vi.fn(async (options: Record<string, unknown>) => {
+        calls.push(options.gpu);
+        if (options.gpu === "cuda") {
+          process.stdout.write("cmake build spam\n");
+          throw new Error("CUDA unavailable");
+        }
+        return fakeLlama as any;
+      }),
+    });
+
+    const stdoutSpy = vi.spyOn(process.stdout, "write").mockReturnValue(true);
+    const stderrSpy = vi.spyOn(process.stderr, "write").mockReturnValue(true);
+    try {
+      const first = new LlamaCpp();
+      const second = new LlamaCpp();
+
+      await (first as any).ensureLlama();
+      await (second as any).ensureLlama();
+
+      expect(stdoutSpy).not.toHaveBeenCalled();
+      expect(stderrSpy).toHaveBeenCalledWith("cmake build spam\n", undefined, undefined);
+      expect(calls).toEqual(["cuda", false, false]);
+      expect(String(stderrSpy.mock.calls.map(call => call[0]).join(""))).toContain("skipping previously failed GPU init");
+    } finally {
+      stdoutSpy.mockRestore();
+      stderrSpy.mockRestore();
+      setNodeLlamaCppModuleForTest(null);
+      if (prevGpu === undefined) delete process.env.QMD_LLAMA_GPU;
+      else process.env.QMD_LLAMA_GPU = prevGpu;
+      if (prevForceCpu === undefined) delete process.env.QMD_FORCE_CPU;
+      else process.env.QMD_FORCE_CPU = prevForceCpu;
+    }
+  });
+});
+
 describe("LLM context parallelism safety", () => {
   test("defaults Windows CUDA to one context to avoid ggml-cuda.cu:98 crashes", () => {
     expect(resolveSafeParallelism({

From 60c75cb3327df40f930d23c12a4c98c4a0f79a97 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tobi=20L=C3=BCtke?= <tobi@lutke.com>
Date: Sat, 16 May 2026 17:20:21 +0000
Subject: [PATCH 4/7] fix: avoid macOS Metal cleanup abort after JSON query

---
 CHANGELOG.md                    |  1 +
 src/cli/qmd.ts                  | 76 ++++++++++++++++++++++++++++++-
 src/llm.ts                      | 60 ++++++++++++++++++------
 test/cli-exit-lifecycle.test.ts | 81 +++++++++++++++++++++++++++++++++
 4 files changed, 202 insertions(+), 16 deletions(-)
 create mode 100644 test/cli-exit-lifecycle.test.ts

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ac69601..39b811a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,6 +15,7 @@
   flag. Previously it returned only the best matching chunk (~3.6KB max
   per result). Output payload for `--full` queries is now proportional
   to total document size.
+- macOS Metal: `qmd query --json` now flushes successful JSON output and uses a safe immediate-exit path on Darwin to avoid ggml Metal finalizer aborts; other commands still dispose LLM contexts/models before the llama runtime. #368
 - Embedding: `qmd embed -c <collection>` now scopes pending-doc selection
   to the requested collection instead of embedding global pending work.
   Scoped `--force` clears only collection-owned vectors, preserves shared
diff --git a/src/cli/qmd.ts b/src/cli/qmd.ts
index 40bc0dd..df73f36 100755
--- a/src/cli/qmd.ts
+++ b/src/cli/qmd.ts
@@ -207,6 +207,76 @@ const cursor = {
   show() { process.stderr.write('\x1b[?25h'); },
 };
 
+type CliLifecycleWritable = {
+  write(chunk: string | Uint8Array, callback?: (error?: Error | null) => void): boolean;
+};
+
+type FinishSuccessfulCliCommandOptions = {
+  command: string;
+  format?: OutputFormat;
+  cleanup?: () => Promise<void>;
+  exit?: (code: number) => void;
+  immediateExit?: (code: number) => void;
+  stdout?: CliLifecycleWritable;
+  stderr?: CliLifecycleWritable;
+  platform?: NodeJS.Platform;
+};
+
+async function flushWritable(stream: CliLifecycleWritable): Promise<void> {
+  await new Promise<void>((resolve) => {
+    stream.write("", () => resolve());
+  });
+}
+
+function shouldBypassNativeCleanup(options: FinishSuccessfulCliCommandOptions): boolean {
+  return (
+    (options.platform ?? process.platform) === "darwin" &&
+    options.command === "query" &&
+    options.format === "json" &&
+    process.env.QMD_DISABLE_DARWIN_QUERY_JSON_SAFE_EXIT !== "1"
+  );
+}
+
+function immediateProcessExit(code: number): void {
+  const processWithReallyExit = process as NodeJS.Process & { reallyExit?: (code?: number) => void };
+  if (typeof processWithReallyExit.reallyExit === "function") {
+    processWithReallyExit.reallyExit(code);
+    return;
+  }
+  process.exit(code);
+}
+
+/**
+ * Finish a successful CLI command: flush stdout, run cleanup (failures only warn on
+ * stderr), then exit 0. On macOS `query --json` runs, skip native teardown and use
+ * the immediate exit path instead: ggml Metal can abort from C++ finalizers after
+ * valid JSON was produced (#368). QMD_DISABLE_DARWIN_QUERY_JSON_SAFE_EXIT=1 disables
+ * that bypass. Reached only after the command completed; real failures exit earlier.
+ */
+export async function finishSuccessfulCliCommand(options: FinishSuccessfulCliCommandOptions): Promise<void> {
+  const stderr = options.stderr ?? process.stderr;
+  const exit = options.exit ?? ((code: number) => process.exit(code));
+  const immediateExit = options.immediateExit ?? immediateProcessExit;
+
+  await flushWritable(options.stdout ?? process.stdout);
+
+  if (shouldBypassNativeCleanup(options)) {
+    await flushWritable(stderr);
+    immediateExit(0);
+    return;
+  }
+
+  try {
+    await (options.cleanup ?? disposeDefaultLlamaCpp)();
+  } catch (error) {
+    stderr.write(
+      `QMD Warning: cleanup after successful output failed (${error instanceof Error ? error.message : String(error)}); exiting 0 because command output completed.\n`
+    );
+  }
+  await flushWritable(stderr);
+  exit(0);
+}
+
 // Ensure cursor is restored on exit
 process.on('SIGINT', () => { cursor.show(); process.exit(130); });
 process.on('SIGTERM', () => { cursor.show(); process.exit(143); });
@@ -3415,8 +3485,10 @@ if (isMain) {
   }
 
   if (cli.command !== "mcp") {
-    await disposeDefaultLlamaCpp();
-    process.exit(0);
+    await finishSuccessfulCliCommand({
+      command: cli.command,
+      format: cli.opts.format,
+    });
   }
 
 } // end if (main module)
diff --git a/src/llm.ts b/src/llm.ts
index d469d36..f7ec2fd 100644
--- a/src/llm.ts
+++ b/src/llm.ts
@@ -497,6 +497,40 @@ export function resolveLlamaGpuMode(envValue = process.env.QMD_LLAMA_GPU): Llama
   return "auto";
 }
 
+/**
+ * Run a best-effort `dispose()` during shutdown without letting it hang or throw.
+ *
+ * The dispose promise is raced against a timer. A timeout or a rejection is
+ * reported as a `QMD Warning` on stderr and then swallowed so the remaining
+ * teardown steps still run.
+ *
+ * @param resourceName Label used in the warning messages.
+ * @param dispose Starts disposal of the resource.
+ * @param timeoutMs How long to wait before giving up (default 1000 ms).
+ */
+async function disposeWithTimeout(resourceName: string, dispose: () => Promise<void>, timeoutMs = 1000): Promise<void> {
+  // Keep the timer referenced while the race is pending so the timeout still fires
+  // when a hung dispose() leaves nothing else on the event loop, and clear it in
+  // `finally` so a finished dispose() never leaves a stray timer behind.
+  let timer: ReturnType<typeof setTimeout> | undefined;
+  const timeoutPromise = new Promise<"timeout">((resolve) => {
+    timer = setTimeout(() => resolve("timeout"), timeoutMs);
+  });
+
+  try {
+    const result = await Promise.race([dispose(), timeoutPromise]);
+    if (result === "timeout") {
+      process.stderr.write(`QMD Warning: timed out disposing ${resourceName}; continuing shutdown.\n`);
+    }
+  } catch (error) {
+    process.stderr.write(
+      `QMD Warning: failed to dispose ${resourceName} (${error instanceof Error ? error.message : String(error)}); continuing shutdown.\n`
+    );
+  } finally {
+    clearTimeout(timer);
+  }
+}
+
 function resolveExpandContextSize(configValue?: number): number {
   if (configValue !== undefined) {
     if (!Number.isInteger(configValue) || configValue <= 0) {
@@ -1413,22 +1430,37 @@ export class LlamaCpp implements LLM {
       this.inactivityTimer = null;
     }
 
-    // Disposing llama cascades to models and contexts automatically
-    // See: https://node-llama-cpp.withcat.ai/guide/objects-lifecycle
-    // Note: llama.dispose() can hang indefinitely, so we use a timeout
-    if (this.llama) {
-      const disposePromise = this.llama.dispose();
-      const timeoutPromise = new Promise<void>((resolve) => setTimeout(resolve, 1000));
-      await Promise.race([disposePromise, timeoutPromise]);
+    // Explicitly dispose in dependency order: contexts first, then models, then llama.
+    // Relying only on llama.dispose() leaves Metal resource sets alive until process
+    // finalization on Apple Silicon, where ggml_metal_device_free can abort after
+    // otherwise-successful CLI output (#368).
+    for (const ctx of this.embedContexts) {
+      await disposeWithTimeout("embedding context", () => ctx.dispose());
+    }
+    this.embedContexts = [];
+
+    for (const ctx of this.rerankContexts) {
+      await disposeWithTimeout("rerank context", () => ctx.dispose());
+    }
+    this.rerankContexts = [];
+
+    if (this.embedModel) {
+      await disposeWithTimeout("embedding model", () => this.embedModel!.dispose());
+      this.embedModel = null;
+    }
+    if (this.generateModel) {
+      await disposeWithTimeout("generation model", () => this.generateModel!.dispose());
+      this.generateModel = null;
+    }
+    if (this.rerankModel) {
+      await disposeWithTimeout("rerank model", () => this.rerankModel!.dispose());
+      this.rerankModel = null;
     }
 
-    // Clear references
-    this.embedContexts = [];
-    this.rerankContexts = [];
-    this.embedModel = null;
-    this.generateModel = null;
-    this.rerankModel = null;
-    this.llama = null;
+    if (this.llama) {
+      await disposeWithTimeout("llama runtime", () => this.llama!.dispose());
+      this.llama = null;
+    }
 
     // Clear any in-flight load/create promises
     this.embedModelLoadPromise = null;
diff --git a/test/cli-exit-lifecycle.test.ts b/test/cli-exit-lifecycle.test.ts
new file mode 100644
index 0000000..b9328ed
--- /dev/null
+++ b/test/cli-exit-lifecycle.test.ts
@@ -0,0 +1,81 @@
+import { describe, expect, test } from "vitest";
+import { finishSuccessfulCliCommand } from "../src/cli/qmd.ts";
+import { LlamaCpp } from "../src/llm.ts";
+
+describe("CLI successful-exit lifecycle", () => {
+  test("exits 0 after successful JSON output when post-output LLM cleanup fails", async () => {
+    const exitCodes: number[] = [];
+    const stderr: string[] = [];
+    const flushed: string[] = [];
+
+    await finishSuccessfulCliCommand({
+      command: "query",
+      format: "json",
+      cleanup: async () => {
+        throw new Error("ggml_metal_device_free abort simulation");
+      },
+      exit: (code) => {
+        exitCodes.push(code);
+      },
+      stdout: { write: (chunk: string | Uint8Array, cb?: (error?: Error | null) => void) => { flushed.push(String(chunk)); cb?.(); return true; } },
+      stderr: { write: (chunk: string | Uint8Array, cb?: (error?: Error | null) => void) => { stderr.push(String(chunk)); cb?.(); return true; } },
+    });
+
+    expect(exitCodes).toEqual([0]);
+    expect(stderr.join("")).toContain("QMD Warning: cleanup after successful output failed");
+    expect(flushed).toEqual([""]);
+  });
+
+  test("uses immediate exit for successful macOS JSON query after stdout flush", async () => {
+    const calls: string[] = [];
+
+    await finishSuccessfulCliCommand({
+      command: "query",
+      format: "json",
+      platform: "darwin",
+      cleanup: async () => {
+        calls.push("cleanup");
+      },
+      exit: (code) => {
+        calls.push(`exit:${code}`);
+      },
+      immediateExit: (code) => {
+        calls.push(`immediate-exit:${code}`);
+      },
+      stdout: { write: (_chunk: string | Uint8Array, cb?: (error?: Error | null) => void) => { calls.push("stdout-flush"); cb?.(); return true; } },
+      stderr: { write: (_chunk: string | Uint8Array, cb?: (error?: Error | null) => void) => { calls.push("stderr-flush"); cb?.(); return true; } },
+    });
+
+    expect(calls).toEqual(["stdout-flush", "stderr-flush", "immediate-exit:0"]);
+  });
+
+  test("disposes Llama resources in dependency order before CLI exit", async () => {
+    const calls: string[] = [];
+    const llm = new LlamaCpp({ inactivityTimeoutMs: 0 });
+    const disposable = (name: string) => ({
+      dispose: async () => {
+        calls.push(name);
+      },
+    });
+
+    Object.assign(llm as unknown as Record<string, unknown>, {
+      embedContexts: [disposable("embed-context")],
+      rerankContexts: [disposable("rerank-context")],
+      embedModel: disposable("embed-model"),
+      generateModel: disposable("generate-model"),
+      rerankModel: disposable("rerank-model"),
+      llama: disposable("llama"),
+    });
+
+    await llm.dispose();
+
+    expect(calls).toEqual([
+      "embed-context",
+      "rerank-context",
+      "embed-model",
+      "generate-model",
+      "rerank-model",
+      "llama",
+    ]);
+  });
+});

From b59ba6ab1ed35631b17ac914bfceeea588d67ef7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tobi=20L=C3=BCtke?= <tobi@lutke.com>
Date: Sat, 16 May 2026 17:32:45 +0000
Subject: [PATCH 5/7] test: keep cleanup lifecycle regression portable

---
 test/cli-exit-lifecycle.test.ts | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/cli-exit-lifecycle.test.ts b/test/cli-exit-lifecycle.test.ts
index b9328ed..8558596 100644
--- a/test/cli-exit-lifecycle.test.ts
+++ b/test/cli-exit-lifecycle.test.ts
@@ -11,6 +11,7 @@ describe("CLI successful-exit lifecycle", () => {
     await finishSuccessfulCliCommand({
       command: "query",
       format: "json",
+      platform: "linux",
       cleanup: async () => {
         throw new Error("ggml_metal_device_free abort simulation");
       },

From dc49ccff1e014aa2cb085d82c1777417de05e8a3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tobi=20L=C3=BCtke?= <tobi@lutke.com>
Date: Sat, 16 May 2026 17:17:40 +0000
Subject: [PATCH 6/7] test: cover qmd bin wrapper install layouts

---
 CHANGELOG.md             |   3 +
 test/bin-wrapper.test.ts | 164 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 167 insertions(+)
 create mode 100644 test/bin-wrapper.test.ts

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5abb2ae..69c05be 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -45,6 +45,9 @@
 - Packaging: install AST grammar WASM packages as required dependencies so
   Bun global installs include TypeScript/TSX/JavaScript grammars, and add a
   `smoke:package-grammars` verification command. #595
+- Launcher: add wrapper smoke coverage for scoped package, npm/npx,
+  Homebrew/Linuxbrew, Bun global symlink layouts, and `$BUN_INSTALL`
+  false-positive runtime selection regressions. #351 #353 #354 #356 #358 #359
 
 ## [2.1.0] - 2026-04-05
 
diff --git a/test/bin-wrapper.test.ts b/test/bin-wrapper.test.ts
new file mode 100644
index 0000000..82796d3
--- /dev/null
+++ b/test/bin-wrapper.test.ts
@@ -0,0 +1,164 @@
+import { afterEach, describe, expect, test } from "vitest";
+import { chmodSync, copyFileSync, mkdtempSync, mkdirSync, readFileSync, realpathSync, rmSync, symlinkSync, writeFileSync } from "node:fs";
+import { tmpdir } from "node:os";
+import { dirname, join, relative } from "node:path";
+import { execFileSync } from "node:child_process";
+import { fileURLToPath } from "node:url";
+
+const repoRoot = fileURLToPath(new URL("..", import.meta.url));
+const fixtures: string[] = [];
+
+function makeTempFixture() {
+  const root = mkdtempSync(join(tmpdir(), "qmd-bin-wrapper-"));
+  fixtures.push(root);
+  const capturePath = join(root, "capture.txt");
+  const runtimeBin = join(root, "runtime-bin");
+  mkdirSync(runtimeBin, { recursive: true });
+
+  for (const runtime of ["node", "bun"]) {
+    const runtimePath = join(runtimeBin, runtime);
+    writeFileSync(
+      runtimePath,
+      `#!/bin/sh\n{\n  printf '%s\\n' '${runtime}'\n  printf '%s\\n' "$1"\n  shift\n  printf '%s\\n' "$@"\n} > "$QMD_WRAPPER_CAPTURE"\n`,
+    );
+    chmodSync(runtimePath, 0o755);
+  }
+
+  return { root, capturePath, runtimeBin };
+}
+
+function makePackage(root: string, packagePath: string, lockfiles: string[] = []) {
+  const packageRoot = join(root, packagePath);
+  mkdirSync(join(packageRoot, "bin"), { recursive: true });
+  mkdirSync(join(packageRoot, "dist", "cli"), { recursive: true });
+  copyFileSync(join(repoRoot, "bin", "qmd"), join(packageRoot, "bin", "qmd"));
+  chmodSync(join(packageRoot, "bin", "qmd"), 0o755);
+  writeFileSync(join(packageRoot, "dist", "cli", "qmd.js"), "// fixture\n");
+  for (const lockfile of lockfiles) {
+    writeFileSync(join(packageRoot, lockfile), "");
+  }
+  return packageRoot;
+}
+
+function symlinkRelative(target: string, linkPath: string) {
+  mkdirSync(dirname(linkPath), { recursive: true });
+  symlinkSync(relative(dirname(linkPath), target), linkPath);
+}
+
+function runWrapper(commandPath: string, runtimeBin: string, capturePath: string, env: Record<string, string> = {}) {
+  rmSync(capturePath, { force: true });
+  execFileSync(commandPath, ["--version"], {
+    env: {
+      ...process.env,
+      ...env,
+      PATH: `${runtimeBin}:${process.env.PATH ?? ""}`,
+      QMD_WRAPPER_CAPTURE: capturePath,
+    },
+    stdio: ["ignore", "pipe", "pipe"],
+  });
+  const [runtime, scriptPath, ...args] = readFileSync(capturePath, "utf8").trimEnd().split("\n");
+  return { runtime, scriptPath, args };
+}
+
+afterEach(() => {
+  for (const fixture of fixtures.splice(0)) {
+    rmSync(fixture, { recursive: true, force: true });
+  }
+});
+
+describe("bin/qmd package wrapper", () => {
+  test("direct package invocation resolves dist/cli/qmd.js from the package root", () => {
+    const { root, runtimeBin, capturePath } = makeTempFixture();
+    const packageRoot = makePackage(root, "node_modules/@tobilu/qmd");
+
+    const result = runWrapper(join(packageRoot, "bin", "qmd"), runtimeBin, capturePath);
+
+    expect(result.runtime).toBe("node");
+    expect(result.scriptPath).toBe(realpathSync(join(packageRoot, "dist", "cli", "qmd.js")));
+    expect(result.args).toEqual(["--version"]);
+  });
+
+  test("npm/Homebrew global bin symlink resolves scoped package path", () => {
+    const { root, runtimeBin, capturePath } = makeTempFixture();
+    const packageRoot = makePackage(root, "opt/homebrew/lib/node_modules/@tobilu/qmd");
+    const globalBin = join(root, "opt", "homebrew", "bin", "qmd");
+    symlinkRelative(join(packageRoot, "bin", "qmd"), globalBin);
+
+    const result = runWrapper(globalBin, runtimeBin, capturePath);
+
+    expect(result.runtime).toBe("node");
+    expect(result.scriptPath).toBe(realpathSync(join(packageRoot, "dist", "cli", "qmd.js")));
+  });
+
+  test("multi-hop global bin symlink chain resolves to the real package root", () => {
+    const { root, runtimeBin, capturePath } = makeTempFixture();
+    const packageRoot = makePackage(root, "opt/homebrew/lib/node_modules/@tobilu/qmd");
+    const globalBin = join(root, "opt", "homebrew", "bin", "qmd");
+    const shim = join(root, "opt", "homebrew", "Cellar", "qmd", "current", "bin", "qmd");
+    symlinkRelative(join(packageRoot, "bin", "qmd"), shim);
+    symlinkRelative(shim, globalBin);
+
+    const result = runWrapper(globalBin, runtimeBin, capturePath);
+
+    expect(result.runtime).toBe("node");
+    expect(result.scriptPath).toBe(realpathSync(join(packageRoot, "dist", "cli", "qmd.js")));
+  });
+
+  test("linuxbrew global bin symlink resolves lib/node_modules scoped package path", () => {
+    const { root, runtimeBin, capturePath } = makeTempFixture();
+    const packageRoot = makePackage(root, "home/linuxbrew/.linuxbrew/lib/node_modules/@tobilu/qmd");
+    const globalBin = join(root, "home", "linuxbrew", ".linuxbrew", "bin", "qmd");
+    symlinkRelative(join(packageRoot, "bin", "qmd"), globalBin);
+
+    const result = runWrapper(globalBin, runtimeBin, capturePath);
+
+    expect(result.runtime).toBe("node");
+    expect(result.scriptPath).toBe(realpathSync(join(packageRoot, "dist", "cli", "qmd.js")));
+  });
+
+  test("npx scoped package .bin symlink resolves @tobilu/qmd package path", () => {
+    const { root, runtimeBin, capturePath } = makeTempFixture();
+    const packageRoot = makePackage(root, "npm/_npx/abc123/node_modules/@tobilu/qmd");
+    const npxBin = join(root, "npm", "_npx", "abc123", "node_modules", ".bin", "qmd");
+    symlinkRelative(join(packageRoot, "bin", "qmd"), npxBin);
+
+    const result = runWrapper(npxBin, runtimeBin, capturePath);
+
+    expect(result.runtime).toBe("node");
+    expect(result.scriptPath).toBe(realpathSync(join(packageRoot, "dist", "cli", "qmd.js")));
+  });
+
+  test("bun global symlink uses bun when package-local bun lockfile exists", () => {
+    const { root, runtimeBin, capturePath } = makeTempFixture();
+    const packageRoot = makePackage(root, "home/user/.bun/install/global/node_modules/@tobilu/qmd", ["bun.lock"]);
+    const bunBin = join(root, "home", "user", ".bun", "bin", "qmd");
+    symlinkRelative(join(packageRoot, "bin", "qmd"), bunBin);
+
+    const result = runWrapper(bunBin, runtimeBin, capturePath);
+
+    expect(result.runtime).toBe("bun");
+    expect(result.scriptPath).toBe(realpathSync(join(packageRoot, "dist", "cli", "qmd.js")));
+  });
+
+  test("ambient BUN_INSTALL alone does not select bun for an npm-installed package", () => {
+    const { root, runtimeBin, capturePath } = makeTempFixture();
+    const packageRoot = makePackage(root, "opt/homebrew/lib/node_modules/@tobilu/qmd");
+    const globalBin = join(root, "opt", "homebrew", "bin", "qmd");
+    symlinkRelative(join(packageRoot, "bin", "qmd"), globalBin);
+
+    const result = runWrapper(globalBin, runtimeBin, capturePath, { BUN_INSTALL: join(root, ".bun") });
+
+    expect(result.runtime).toBe("node");
+    expect(result.scriptPath).toBe(realpathSync(join(packageRoot, "dist", "cli", "qmd.js")));
+  });
+
+  test("package-lock.json takes priority over bun lockfiles", () => {
+    const { root, runtimeBin, capturePath } = makeTempFixture();
+    const packageRoot = makePackage(root, "node_modules/@tobilu/qmd", ["package-lock.json", "bun.lock"]);
+
+    const result = runWrapper(join(packageRoot, "bin", "qmd"), runtimeBin, capturePath);
+
+    expect(result.runtime).toBe("node");
+    expect(result.scriptPath).toBe(realpathSync(join(packageRoot, "dist", "cli", "qmd.js")));
+  });
+});

From 910ca07fd9df70616e4692547bc389dfc2965bb6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tobi=20L=C3=BCtke?= <tobi@lutke.com>
Date: Sat, 16 May 2026 17:36:06 +0000
Subject: [PATCH 7/7] fix: keep partial embeddings pending

---
 CHANGELOG.md       |   4 ++
 src/cli/qmd.ts     |   2 +-
 src/store.ts       | 121 ++++++++++++++++++++++++++++++++++-----------
 test/store.test.ts |  82 ++++++++++++++++++++++++++++++
 4 files changed, 178 insertions(+), 31 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9ee8337..d1f26af 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,6 +17,10 @@
   per result). Output payload for `--full` queries is now proportional
   to total document size.
 - macOS Metal: `qmd query --json` now flushes successful JSON output and uses a safe immediate-exit path on Darwin to avoid ggml Metal finalizer aborts; other commands still dispose LLM contexts/models before the llama runtime. #368
+- Embedding: require complete chunk coverage before treating a document as
+  embedded, remove partial vectors when chunk/session failures leave a
+  document incomplete, and keep `qmd status` pending counts honest after
+  interrupted long embed runs. #637 #378
 - Embedding: `qmd embed -c <collection>` now scopes pending-doc selection
   to the requested collection instead of embedding global pending work.
   Scoped `--force` clears only collection-owned vectors, preserves shared
diff --git a/src/cli/qmd.ts b/src/cli/qmd.ts
index fda6d8e..2ff3796 100755
--- a/src/cli/qmd.ts
+++ b/src/cli/qmd.ts
@@ -1806,7 +1806,7 @@ async function vectorIndex(
   }
 
   // Check if there's work to do before starting
-  const hashesToEmbed = getHashesNeedingEmbedding(db, batchOptions?.collection);
+  const hashesToEmbed = getHashesNeedingEmbedding(db, batchOptions?.collection, model);
   if (hashesToEmbed === 0 && !force) {
     console.log(`${c.green}✓ All content hashes already have embeddings.${c.reset}`);
     closeDb();
diff --git a/src/store.ts b/src/store.ts
index 003feca..5323245 100644
--- a/src/store.ts
+++ b/src/store.ts
@@ -871,10 +871,15 @@ function initializeDatabase(db: Database): void {
       seq INTEGER NOT NULL DEFAULT 0,
       pos INTEGER NOT NULL DEFAULT 0,
       model TEXT NOT NULL,
+      total_chunks INTEGER NOT NULL DEFAULT 1,
       embedded_at TEXT NOT NULL,
       PRIMARY KEY (hash, seq)
     )
   `);
+  const cvInfoAfterCreate = db.prepare(`PRAGMA table_info(content_vectors)`).all() as { name: string }[];
+  if (!cvInfoAfterCreate.some(col => col.name === 'total_chunks')) {
+    db.exec(`ALTER TABLE content_vectors ADD COLUMN total_chunks INTEGER NOT NULL DEFAULT 1`);
+  }
 
   // Store collections — makes the DB self-contained (no external config needed)
   db.exec(`
@@ -1167,9 +1172,9 @@ export type Store = {
   ensureVecTable: (dimensions: number) => void;
 
   // Index health
-  getHashesNeedingEmbedding: () => number;
-  getIndexHealth: () => IndexHealthInfo;
-  getStatus: () => IndexStatus;
+  getHashesNeedingEmbedding: (model?: string) => number;
+  getIndexHealth: (model?: string) => IndexHealthInfo;
+  getStatus: (model?: string) => IndexStatus;
 
   // Caching
   getCacheKey: typeof getCacheKey;
@@ -1229,7 +1234,7 @@ export type Store = {
   // Vector/embedding operations
   getHashesForEmbedding: () => { hash: string; body: string; path: string }[];
   clearAllEmbeddings: () => void;
-  insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string) => void;
+  insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string, totalChunks?: number) => void;
 };
 
 // =============================================================================
@@ -1420,18 +1425,31 @@ function resolveEmbedOptions(options?: EmbedOptions): Required<Pick<EmbedOptions
   };
 }
 
-function getPendingEmbeddingDocs(db: Database, collection?: string): PendingEmbeddingDoc[] {
+function contentVectorExpectedChunksExpr(db: Database): string {
+  const columns = db.prepare(`PRAGMA table_info(content_vectors)`).all() as { name: string }[];
+  return columns.some(col => col.name === 'total_chunks') ? 'MAX(total_chunks)' : '1';
+}
+
+function getPendingEmbeddingDocs(db: Database, collection?: string, model: string = DEFAULT_EMBED_MODEL): PendingEmbeddingDoc[] {
   const collectionFilter = collection ? `AND d.collection = ?` : ``;
+  const expectedChunksExpr = contentVectorExpectedChunksExpr(db);
   const stmt = db.prepare(`
     SELECT d.hash, MIN(d.path) as path, length(CAST(c.doc AS BLOB)) as bytes
     FROM documents d
     JOIN content c ON d.hash = c.hash
-    LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
-    WHERE d.active = 1 AND v.hash IS NULL ${collectionFilter}
+    LEFT JOIN (
+      SELECT hash, model, COUNT(*) AS chunk_count, ${expectedChunksExpr} AS expected_chunks
+      FROM content_vectors
+      WHERE model = ?
+      GROUP BY hash, model
+    ) v ON d.hash = v.hash
+    WHERE d.active = 1
+      AND (v.hash IS NULL OR v.chunk_count < v.expected_chunks)
+      ${collectionFilter}
     GROUP BY d.hash
     ORDER BY MIN(d.path)
   `);
-  return (collection ? stmt.all(collection) : stmt.all()) as PendingEmbeddingDoc[];
+  return (collection ? stmt.all(model, collection) : stmt.all(model)) as PendingEmbeddingDoc[];
 }
 
 function buildEmbeddingBatches(
@@ -1502,7 +1520,7 @@ export async function generateEmbeddings(
     clearAllEmbeddings(db, options?.collection);
   }
 
-  const docsToEmbed = getPendingEmbeddingDocs(db, options?.collection);
+  const docsToEmbed = getPendingEmbeddingDocs(db, options?.collection, model);
 
   if (docsToEmbed.length === 0) {
     return { docsProcessed: 0, chunksEmbedded: 0, errors: 0, durationMs: 0 };
@@ -1533,6 +1551,7 @@ export async function generateEmbeddings(
 
       const batchDocs = getEmbeddingDocsForBatch(db, batchMeta);
       const batchChunks: ChunkItem[] = [];
+      const expectedChunksByHash = new Map<string, number>();
       const batchBytes = batchMeta.reduce((sum, doc) => sum + Math.max(0, doc.bytes), 0);
 
       for (const doc of batchDocs) {
@@ -1558,6 +1577,7 @@ export async function generateEmbeddings(
             bytes: encoder.encode(chunks[seq]!.text).length,
           });
         }
+        expectedChunksByHash.set(doc.hash, chunks.length);
       }
 
       totalChunks += batchChunks.length;
@@ -1610,7 +1630,7 @@ export async function generateEmbeddings(
             const chunk = chunkBatch[i]!;
             const embedding = embeddings[i];
             if (embedding) {
-              insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embedding.embedding), model, now);
+              insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embedding.embedding), model, now, expectedChunksByHash.get(chunk.hash) ?? 1);
               chunksEmbedded++;
             } else {
               errors++;
@@ -1629,7 +1649,7 @@ export async function generateEmbeddings(
                 const text = formatDocForEmbedding(chunk.text, chunk.title, embedModelUri);
                 const result = await session.embed(text, { model });
                 if (result) {
-                  insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), model, now);
+                  insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), model, now, expectedChunksByHash.get(chunk.hash) ?? 1);
                   chunksEmbedded++;
                 } else {
                   errors++;
@@ -1654,6 +1674,11 @@ export async function generateEmbeddings(
         });
       }
 
+      const removedPartialChunks = removeIncompleteEmbeddings(db, expectedChunksByHash, model);
+      if (removedPartialChunks > 0) {
+        chunksEmbedded = Math.max(0, chunksEmbedded - removedPartialChunks);
+      }
+
       bytesProcessed += batchBytes;
       options?.onProgress?.({ chunksEmbedded, totalChunks, bytesProcessed, totalBytes, errors });
     }
@@ -1688,9 +1713,9 @@ export function createStore(dbPath?: string): Store {
     ensureVecTable: (dimensions: number) => ensureVecTableInternal(db, dimensions),
 
     // Index health
-    getHashesNeedingEmbedding: () => getHashesNeedingEmbedding(db),
-    getIndexHealth: () => getIndexHealth(db),
-    getStatus: () => getStatus(db),
+    getHashesNeedingEmbedding: (model?: string) => getHashesNeedingEmbedding(db, undefined, model ?? store.llm?.embedModelName ?? DEFAULT_EMBED_MODEL),
+    getIndexHealth: (model?: string) => getIndexHealth(db, model ?? store.llm?.embedModelName ?? DEFAULT_EMBED_MODEL),
+    getStatus: (model?: string) => getStatus(db, model ?? store.llm?.embedModelName ?? DEFAULT_EMBED_MODEL),
 
     // Caching
     getCacheKey,
@@ -1750,7 +1775,7 @@ export function createStore(dbPath?: string): Store {
     // Vector/embedding operations
     getHashesForEmbedding: () => getHashesForEmbedding(db),
     clearAllEmbeddings: () => clearAllEmbeddings(db),
-    insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string) => insertEmbedding(db, hash, seq, pos, embedding, model, embeddedAt),
+    insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string, totalChunks?: number) => insertEmbedding(db, hash, seq, pos, embedding, model, embeddedAt, totalChunks),
   };
 
   return store;
@@ -1949,15 +1974,23 @@ export type IndexStatus = {
 // Index health
 // =============================================================================
 
-export function getHashesNeedingEmbedding(db: Database, collection?: string): number {
+export function getHashesNeedingEmbedding(db: Database, collection?: string, model: string = DEFAULT_EMBED_MODEL): number {
   const collectionFilter = collection ? `AND d.collection = ?` : ``;
+  const expectedChunksExpr = contentVectorExpectedChunksExpr(db);
   const stmt = db.prepare(`
     SELECT COUNT(DISTINCT d.hash) as count
     FROM documents d
-    LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
-    WHERE d.active = 1 AND v.hash IS NULL ${collectionFilter}
+    LEFT JOIN (
+      SELECT hash, model, COUNT(*) AS chunk_count, ${expectedChunksExpr} AS expected_chunks
+      FROM content_vectors
+      WHERE model = ?
+      GROUP BY hash, model
+    ) v ON d.hash = v.hash
+    WHERE d.active = 1
+      AND (v.hash IS NULL OR v.chunk_count < v.expected_chunks)
+      ${collectionFilter}
   `);
-  const result = (collection ? stmt.get(collection) : stmt.get()) as { count: number };
+  const result = (collection ? stmt.get(model, collection) : stmt.get(model)) as { count: number };
   return result.count;
 }
 
@@ -1967,8 +2000,8 @@ export type IndexHealthInfo = {
   daysStale: number | null;
 };
 
-export function getIndexHealth(db: Database): IndexHealthInfo {
-  const needsEmbedding = getHashesNeedingEmbedding(db);
+export function getIndexHealth(db: Database, model: string = DEFAULT_EMBED_MODEL): IndexHealthInfo {
+  const needsEmbedding = getHashesNeedingEmbedding(db, undefined, model);
   const totalDocs = (db.prepare(`SELECT COUNT(*) as count FROM documents WHERE active = 1`).get() as { count: number }).count;
 
   const mostRecent = db.prepare(`SELECT MAX(modified_at) as latest FROM documents WHERE active = 1`).get() as { latest: string | null };
@@ -3316,15 +3349,22 @@ async function getEmbedding(text: string, model: string, isQuery: boolean, sessi
  * Get all unique content hashes that need embeddings (from active documents).
  * Returns hash, document body, and a sample path for display purposes.
  */
-export function getHashesForEmbedding(db: Database): { hash: string; body: string; path: string }[] {
+export function getHashesForEmbedding(db: Database, model: string = DEFAULT_EMBED_MODEL): { hash: string; body: string; path: string }[] {
+  const expectedChunksExpr = contentVectorExpectedChunksExpr(db);
   return db.prepare(`
     SELECT d.hash, c.doc as body, MIN(d.path) as path
     FROM documents d
     JOIN content c ON d.hash = c.hash
-    LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
-    WHERE d.active = 1 AND v.hash IS NULL
+    LEFT JOIN (
+      SELECT hash, model, COUNT(*) AS chunk_count, ${expectedChunksExpr} AS expected_chunks
+      FROM content_vectors
+      WHERE model = ?
+      GROUP BY hash, model
+    ) v ON d.hash = v.hash
+    WHERE d.active = 1
+      AND (v.hash IS NULL OR v.chunk_count < v.expected_chunks)
     GROUP BY d.hash
-  `).all() as { hash: string; body: string; path: string }[];
+  `).all(model) as { hash: string; body: string; path: string }[];
 }
 
 /**
@@ -3409,13 +3449,14 @@ export function insertEmbedding(
   pos: number,
   embedding: Float32Array,
   model: string,
-  embeddedAt: string
+  embeddedAt: string,
+  totalChunks: number = 1
 ): void {
   const hashSeq = `${hash}_${seq}`;
 
   // Insert content_vectors first — crash-safe ordering (see getHashesForEmbedding)
-  const insertContentVectorStmt = db.prepare(`INSERT OR REPLACE INTO content_vectors (hash, seq, pos, model, embedded_at) VALUES (?, ?, ?, ?, ?)`);
-  insertContentVectorStmt.run(hash, seq, pos, model, embeddedAt);
+  const insertContentVectorStmt = db.prepare(`INSERT OR REPLACE INTO content_vectors (hash, seq, pos, model, total_chunks, embedded_at) VALUES (?, ?, ?, ?, ?, ?)`);
+  insertContentVectorStmt.run(hash, seq, pos, model, totalChunks, embeddedAt);
 
   // vec0 virtual tables don't support OR REPLACE — use DELETE + INSERT
   const deleteVecStmt = db.prepare(`DELETE FROM vectors_vec WHERE hash_seq = ?`);
@@ -3424,6 +3465,26 @@ export function insertEmbedding(
   insertVecStmt.run(hashSeq, embedding);
 }
 
+function removeIncompleteEmbeddings(db: Database, expectedChunksByHash: Map<string, number>, model: string): number {
+  let removed = 0;
+  const rowsStmt = db.prepare(`SELECT seq FROM content_vectors WHERE hash = ? AND model = ?`);
+  const deleteContentStmt = db.prepare(`DELETE FROM content_vectors WHERE hash = ? AND model = ?`);
+  const deleteVecStmt = db.prepare(`DELETE FROM vectors_vec WHERE hash_seq = ?`);
+
+  for (const [hash, expectedChunks] of expectedChunksByHash) {
+    const rows = rowsStmt.all(hash, model) as { seq: number }[];
+    if (rows.length === 0 || rows.length === expectedChunks) continue;
+
+    for (const row of rows) {
+      deleteVecStmt.run(`${hash}_${row.seq}`);
+    }
+    deleteContentStmt.run(hash, model);
+    removed += rows.length;
+  }
+
+  return removed;
+}
+
 // =============================================================================
 // Query expansion
 // =============================================================================
@@ -3922,7 +3983,7 @@ export function findDocuments(
 // Status
 // =============================================================================
 
-export function getStatus(db: Database): IndexStatus {
+export function getStatus(db: Database, model: string = DEFAULT_EMBED_MODEL): IndexStatus {
   // DB is source of truth for collections — config provides supplementary metadata
   const dbCollections = db.prepare(`
     SELECT
@@ -3957,7 +4018,7 @@ export function getStatus(db: Database): IndexStatus {
   });
 
   const totalDocs = (db.prepare(`SELECT COUNT(*) as c FROM documents WHERE active = 1`).get() as { c: number }).c;
-  const needsEmbedding = getHashesNeedingEmbedding(db);
+  const needsEmbedding = getHashesNeedingEmbedding(db, undefined, model);
   const hasVectors = !!db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
 
   return {
diff --git a/test/store.test.ts b/test/store.test.ts
index 2adf717..9f132f8 100644
--- a/test/store.test.ts
+++ b/test/store.test.ts
@@ -2281,6 +2281,26 @@ describe("Index Status", () => {
     await cleanupTestDb(store);
   });
 
+  test("embedding health is scoped to the active embed model", async () => {
+    const store = await createTestStore();
+    const collectionName = await createTestCollection();
+    const activeModel = "hf:active/embed-model.gguf";
+    const staleModel = "hf:stale/embed-model.gguf";
+    const now = new Date().toISOString();
+
+    store.llm = { embedModelName: activeModel } as any;
+    store.ensureVecTable(3);
+    await insertTestDocument(store.db, collectionName, { name: "doc1", hash: "hash1" });
+    store.insertEmbedding("hash1", 0, 0, new Float32Array([1, 2, 3]), staleModel, now, 1);
+
+    expect(store.getHashesNeedingEmbedding()).toBe(1);
+    expect(store.getStatus().needsEmbedding).toBe(1);
+    expect(store.getIndexHealth().needsEmbedding).toBe(1);
+    expect(store.getHashesNeedingEmbedding(staleModel)).toBe(0);
+
+    await cleanupTestDb(store);
+  });
+
   test("getIndexHealth returns health info", async () => {
     const store = await createTestStore();
     const collectionName = await createTestCollection();
@@ -3093,6 +3113,68 @@ describe("Embedding batching", () => {
     }
   });
 
+  test("generateEmbeddings does not mark a partially embedded multi-chunk document complete", async () => {
+    const store = await createTestStore();
+    const db = store.db;
+    const fakeLlm = {
+      async embed(_text: string, _options?: { model?: string }) {
+        return { embedding: [0.1, 0.2, 0.3], model: "fake-embed" };
+      },
+      async embedBatch(texts: string[], _options?: { model?: string }) {
+        return texts.map((_text, index) => index === 0
+          ? { embedding: [1, 2, 3], model: "fake-embed" }
+          : null
+        );
+      },
+    };
+
+    setDefaultLlamaCpp(createFakeTokenizer() as any);
+    store.llm = fakeLlm as any;
+
+    try {
+      await insertTestDocument(db, "docs", {
+        name: "long-doc",
+        body: "# Long doc\n\n" + "partial embedding regression ".repeat(260),
+      });
+
+      const result = await generateEmbeddings(store);
+
+      expect(result.errors).toBeGreaterThan(0);
+      expect(db.prepare(`SELECT COUNT(*) as count FROM content_vectors`).get()).toEqual({ count: 0 });
+      expect(db.prepare(`SELECT COUNT(*) as count FROM vectors_vec`).get()).toEqual({ count: 0 });
+      expect(store.getHashesNeedingEmbedding()).toBe(1);
+      expect(store.getStatus().needsEmbedding).toBe(1);
+    } finally {
+      setDefaultLlamaCpp(null);
+      await cleanupTestDb(store);
+    }
+  });
+
+  test("generateEmbeddings opens a long-lived LLM session for embed runs", async () => {
+    const store = await createTestStore();
+    const fakeLlm = createFakeEmbedLlm();
+    const sessionSpy = vi.spyOn(llmModule, "withLLMSessionForLlm");
+
+    setDefaultLlamaCpp(createFakeTokenizer() as any);
+    store.llm = fakeLlm as any;
+
+    try {
+      await insertTestDocument(store.db, "docs", { name: "one", body: "# One\n\nAlpha" });
+
+      await generateEmbeddings(store);
+
+      expect(sessionSpy).toHaveBeenCalledWith(
+        fakeLlm,
+        expect.any(Function),
+        expect.objectContaining({ maxDuration: 30 * 60 * 1000, name: "generateEmbeddings" }),
+      );
+    } finally {
+      sessionSpy.mockRestore();
+      setDefaultLlamaCpp(null);
+      await cleanupTestDb(store);
+    }
+  });
+
   test("vectorSearchQuery uses the active llm embed model for vector lookups", async () => {
     const store = await createTestStore();
     const model = "hf:Qwen/Qwen3-Embedding-0.6B-GGUF/Qwen3-Embedding-0.6B-Q8_0.gguf";