From 1f522cffe2c39042a626d827dbad63b325befba4 Mon Sep 17 00:00:00 2001 From: Riley Shott Date: Wed, 13 May 2026 16:15:58 -0700 Subject: [PATCH 1/7] fix: return absolute line numbers from qmd_query The MCP `query` tool, HTTP `/query` endpoint, and CLI `qmd query` all returned chunk-local line numbers in their snippet output, so the line could not be passed back to `qmd_get` as `fromLine` without an out-of-band lookup. Pass the full document body plus `bestChunkPos` to `extractSnippet` instead of the chunk text alone so it can compute absolute line offsets while still scoping the keyword scan to the reranker-chosen chunk window (preserves #149). Also restores documented behavior of `qmd query --full`, which was emitting the best chunk (~3.6KB max) instead of the full document. extractSnippet now also falls back to a full-body scan when given a chunkPos but the chunk window contains no positive matches. The upstream chunk selector leaves bestIdx=0 as its initialization default whenever scoring fails to find a winner (e.g. queryTerms filtered to empty by the length>2 guard, or semantic-only matches with no lex overlap), so an unconditional chunk-scoped scan would land on chunk 0 instead of where the actual match lives. - src/mcp/server.ts: SearchResultItem gains `line: number`; both MCP and HTTP `/query` handlers populate it - src/cli/qmd.ts: OutputRow.body now sources from r.body - src/store.ts: extractSnippet falls back to full-body scan when chunk-scoped pass finds no positive match - test/mcp.test.ts: new fixture asserts absolute line 301 for a marker placed past the first chunk boundary - test/store.test.ts: regression test for the bestScore<=0 fallback --- CHANGELOG.md | 11 +++++++++++ src/cli/qmd.ts | 26 ++++++++++++++------------ src/mcp/server.ts | 9 +++++++-- src/store.ts | 19 ++++++++++++++++++- test/cli.test.ts | 10 ++++++++++ test/mcp.test.ts | 41 +++++++++++++++++++++++++++++++++++++++++ test/store.test.ts | 27 +++++++++++++++++++++++++++ 7 files changed, 128 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b2757c8..ac69601 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,17 @@ ### Fixes +- Snippet line numbers: `qmd_query` (MCP), HTTP `/query`, and `qmd query` + (CLI JSON output and snippet headers) now return absolute source-file + line numbers instead of chunk-local ones, so the `line` field can be + passed back to `qmd_get` as `fromLine` without a separate lookup. + Snippet selection remains scoped to the best matching chunk + (preserves #149). +- CLI: `qmd query --full` now emits the full document body in all output + formats (json, csv, md, xml), restoring the documented behavior of the + flag. Previously it returned only the best matching chunk (~3.6KB max + per result). Output payload for `--full` queries is now proportional + to total document size. - Embedding: `qmd embed -c ` now scopes pending-doc selection to the requested collection instead of embedding global pending work. Scoped `--force` clears only collection-owned vectors, preserves shared diff --git a/src/cli/qmd.ts b/src/cli/qmd.ts index f576cde..cdbc241 100755 --- a/src/cli/qmd.ts +++ b/src/cli/qmd.ts @@ -1886,6 +1886,7 @@ type OutputRow = { score: number; context?: string | null; chunkPos?: number; + chunkLen?: number; hash?: string; docid?: string; explain?: HybridQueryExplain; @@ -1968,9 +1969,9 @@ function outputResults(results: OutputRow[], query: string, opts: OutputOptions) // JSON output for LLM consumption const output = filtered.map(row => { const docid = row.docid || (row.hash ? row.hash.slice(0, 6) : undefined); + const snippetInfo = extractSnippet(row.body, query, 300, row.chunkPos, row.chunkLen, opts.intent); let body = opts.full ? row.body : undefined; - const snippetInfo = !opts.full ? extractSnippet(row.body, query, 300, row.chunkPos, undefined, opts.intent) : undefined; - let snippet = snippetInfo?.snippet; + let snippet = !opts.full ? snippetInfo.snippet : undefined; if (opts.lineNumbers) { if (body) body = addLineNumbers(body); if (snippet) snippet = addLineNumbers(snippet); @@ -1979,7 +1980,7 @@ function outputResults(results: OutputRow[], query: string, opts: OutputOptions) ...(docid && { docid: `#${docid}` }), score: Math.round(row.score * 100) / 100, file: toQmdPath(row.displayPath), - ...(snippetInfo && { line: snippetInfo.line }), + line: snippetInfo.line, title: row.title, ...(row.context && { context: row.context }), ...(body && { body }), @@ -2002,7 +2003,7 @@ function outputResults(results: OutputRow[], query: string, opts: OutputOptions) for (let i = 0; i < filtered.length; i++) { const row = filtered[i]; if (!row) continue; - const { line, snippet } = extractSnippet(row.body, query, 500, row.chunkPos, undefined, opts.intent); + const { line, snippet } = extractSnippet(row.body, query, 500, row.chunkPos, row.chunkLen, opts.intent); const docid = row.docid || (row.hash ? row.hash.slice(0, 6) : undefined); // Line 1: filepath with docid @@ -2066,8 +2067,9 @@ function outputResults(results: OutputRow[], query: string, opts: OutputOptions) console.log(); // Snippet with highlighting (diff-style header included) - let displaySnippet = opts.lineNumbers ? addLineNumbers(snippet, line) : snippet; - const highlighted = highlightTerms(displaySnippet, query); + const content = opts.full ? row.body : snippet; + const displayContent = opts.lineNumbers ? addLineNumbers(content, opts.full ? 1 : line) : content; + const highlighted = highlightTerms(displayContent, query); console.log(highlighted); // Double empty line between results @@ -2079,7 +2081,7 @@ function outputResults(results: OutputRow[], query: string, opts: OutputOptions) if (!row) continue; const heading = row.title || row.displayPath; const docid = row.docid || (row.hash ? row.hash.slice(0, 6) : undefined); - let content = opts.full ? row.body : extractSnippet(row.body, query, 500, row.chunkPos, undefined, opts.intent).snippet; + let content = opts.full ? row.body : extractSnippet(row.body, query, 500, row.chunkPos, row.chunkLen, opts.intent).snippet; if (opts.lineNumbers) { content = addLineNumbers(content); } @@ -2092,7 +2094,7 @@ function outputResults(results: OutputRow[], query: string, opts: OutputOptions) const titleAttr = row.title ? ` title="${row.title.replace(/"/g, '"')}"` : ""; const contextAttr = row.context ? ` context="${row.context.replace(/"/g, '"')}"` : ""; const docid = row.docid || (row.hash ? row.hash.slice(0, 6) : ""); - let content = opts.full ? row.body : extractSnippet(row.body, query, 500, row.chunkPos, undefined, opts.intent).snippet; + let content = opts.full ? row.body : extractSnippet(row.body, query, 500, row.chunkPos, row.chunkLen, opts.intent).snippet; if (opts.lineNumbers) { content = addLineNumbers(content); } @@ -2102,10 +2104,10 @@ function outputResults(results: OutputRow[], query: string, opts: OutputOptions) // CSV format console.log("docid,score,file,title,context,line,snippet"); for (const row of filtered) { - const { line, snippet } = extractSnippet(row.body, query, 500, row.chunkPos, undefined, opts.intent); + const { line, snippet } = extractSnippet(row.body, query, 500, row.chunkPos, row.chunkLen, opts.intent); let content = opts.full ? row.body : snippet; if (opts.lineNumbers) { - content = addLineNumbers(content, line); + content = addLineNumbers(content, opts.full ? 1 : line); } const docid = row.docid || (row.hash ? row.hash.slice(0, 6) : ""); const snippetText = content || ""; @@ -2461,13 +2463,13 @@ async function querySearch(query: string, opts: OutputOptions, _embedModel: stri ? (structuredQueries.find(s => s.type === 'lex')?.query || structuredQueries.find(s => s.type === 'vec')?.query || query) : query; - // Map to CLI output format — use bestChunk for snippet display outputResults(results.map(r => ({ file: r.file, displayPath: r.displayPath, title: r.title, - body: r.bestChunk, + body: r.body, chunkPos: r.bestChunkPos, + chunkLen: r.bestChunk.length, score: r.score, context: r.context, docid: r.docid, diff --git a/src/mcp/server.ts b/src/mcp/server.ts index 2f5482f..a9ec99f 100644 --- a/src/mcp/server.ts +++ b/src/mcp/server.ts @@ -42,6 +42,7 @@ type SearchResultItem = { title: string; score: number; context: string | null; + line: number; // Absolute line in source markdown snippet: string; }; @@ -242,6 +243,8 @@ async function createMcpServer(store: QMDStore): Promise { title: "Query", description: `Search the knowledge base using a query document — one or more typed sub-queries combined for best recall. +Each result includes a \`line\` field with the absolute 1-indexed line of the best match in the source markdown. To read more context around a hit, call \`get(file, fromLine = line - 20, maxLines = 80, lineNumbers = true)\`. + ## Query Types **lex** — BM25 keyword search. Fast, exact, no LLM needed. @@ -341,13 +344,14 @@ Intent-aware lex (C++ performance, not sports): || searches[0]?.query || ""; const filtered: SearchResultItem[] = results.map(r => { - const { line, snippet } = extractSnippet(r.bestChunk, primaryQuery, 300, undefined, undefined, intent); + const { line, snippet } = extractSnippet(r.body, primaryQuery, 300, r.bestChunkPos, r.bestChunk.length, intent); return { docid: `#${r.docid}`, file: r.displayPath, title: r.title, score: Math.round(r.score * 100) / 100, context: r.context, + line, snippet: addLineNumbers(snippet, line), }; }); @@ -702,13 +706,14 @@ export async function startMcpHttpServer( || params.searches[0]?.query || ""; const formatted = results.map(r => { - const { line, snippet } = extractSnippet(r.bestChunk, primaryQuery, 300); + const { line, snippet } = extractSnippet(r.body, primaryQuery, 300, r.bestChunkPos, r.bestChunk.length, params.intent); return { docid: `#${r.docid}`, file: r.displayPath, title: r.title, score: Math.round(r.score * 100) / 100, context: r.context, + line, snippet: addLineNumbers(snippet, line), }; }); diff --git a/src/store.ts b/src/store.ts index 52dd334..f927ccc 100644 --- a/src/store.ts +++ b/src/store.ts @@ -4023,7 +4023,7 @@ export function extractSnippet(body: string, query: string, maxLen = 500, chunkP let searchBody = body; let lineOffset = 0; - if (chunkPos && chunkPos > 0) { + if (chunkPos !== undefined && chunkPos >= 0) { // Search within the chunk region, with some padding for context // Use provided chunkLen or fall back to max chunk size (covers variable-length chunks) const searchLen = chunkLen || CHUNK_SIZE_CHARS; @@ -4055,6 +4055,23 @@ export function extractSnippet(body: string, query: string, maxLen = 500, chunkP } } + if (chunkPos !== undefined && chunkPos >= 0 && bestScore <= 0) { + if (chunkPos === 0) { + // chunkPos=0 may be the chunk selector's initialization default for queries + // where lexical chunk scoring found no winner (e.g. tokens filtered to empty + // by the length>2 guard). Retry with full body so the real match isn't missed. + return extractSnippet(body, query, maxLen, undefined, undefined, intent); + } + // For chunkPos > 0 the reranker actively picked this chunk. Tokens failing to + // match literally is most likely a tokenizer limitation (quoted phrases, FTS5 + // syntax, HYDE passages, semantic hits), so anchor on the chunk start rather + // than disregarding the reranker's pick. + const contextStart = Math.max(0, chunkPos - 100); + bestLine = chunkPos > contextStart + ? searchBody.slice(0, chunkPos - contextStart).split('\n').length - 1 + : 0; + } + const start = Math.max(0, bestLine - 1); const end = Math.min(lines.length, bestLine + 3); const snippetLines = lines.slice(start, end); diff --git a/test/cli.test.ts b/test/cli.test.ts index 9c575f8..2535fe4 100644 --- a/test/cli.test.ts +++ b/test/cli.test.ts @@ -507,6 +507,16 @@ describe("CLI Search Command", () => { // Error message goes to stderr expect(stderr).toContain("Usage:"); }); + + test("--json --full includes line field for round-tripping to qmd get", async () => { + const { stdout, exitCode } = await runQmd(["search", "meeting", "--json", "--full", "-n", "1"]); + expect(exitCode).toBe(0); + const results = JSON.parse(stdout); + expect(results.length).toBeGreaterThan(0); + expect(results[0].line).toBeTypeOf("number"); + expect(results[0].line).toBeGreaterThan(0); + expect(results[0].body).toBeTypeOf("string"); + }); }); describe("CLI Get Command", () => { diff --git a/test/mcp.test.ts b/test/mcp.test.ts index 3ea87bd..495c624 100644 --- a/test/mcp.test.ts +++ b/test/mcp.test.ts @@ -913,6 +913,22 @@ describe.skipIf(!!process.env.CI)("MCP HTTP Transport", () => { initTestDatabase(db); seedTestData(db); + // 300 pad lines (37 chars each = 11100 chars) puts the marker past the + // first chunk boundary at CHUNK_SIZE_CHARS = 3600. + { + const padLine = "Pad line for chunk boundary coverage\n"; + const absLineFixtureBody = + padLine.repeat(300) + + "UNIQUE_KEYWORD_XYZ marker\n" + + padLine.repeat(20); + const fixtureHash = "hash-abslines"; + const now = new Date().toISOString(); + db.prepare(`INSERT OR IGNORE INTO content (hash, doc, created_at) VALUES (?, ?, ?)`) + .run(fixtureHash, absLineFixtureBody, now); + db.prepare(`INSERT INTO documents (collection, path, title, hash, created_at, modified_at, active) VALUES ('docs', ?, ?, ?, ?, ?, 1)`) + .run("absolute-line-fixture.md", "Absolute Line Fixture", fixtureHash, now, now); + } + // Sync config into SQLite const httpTestConfig: CollectionConfig = { collections: { @@ -1074,4 +1090,29 @@ describe.skipIf(!!process.env.CI)("MCP HTTP Transport", () => { expect(json.result).toBeDefined(); expect(json.result.content.length).toBeGreaterThan(0); }); + + test("POST /mcp tools/call query returns absolute source-file line numbers, not chunk-local", async () => { + await mcpRequest({ + jsonrpc: "2.0", id: 1, method: "initialize", + params: { protocolVersion: "2025-03-26", capabilities: {}, clientInfo: { name: "test", version: "1.0" } }, + }); + + const { status, json } = await mcpRequest({ + jsonrpc: "2.0", id: 5, method: "tools/call", + params: { + name: "query", + arguments: { + searches: [{ type: "lex", query: "UNIQUE_KEYWORD_XYZ" }], + rerank: false, + }, + }, + }); + expect(status).toBe(200); + const results = json.result.structuredContent.results; + expect(results.length).toBeGreaterThan(0); + const hit = results.find((r: any) => r.file === "docs/absolute-line-fixture.md"); + expect(hit).toBeDefined(); + expect(hit.line).toBe(301); + expect(hit.snippet).toMatch(/^\d+: @@ -3\d\d,/); + }); }); diff --git a/test/store.test.ts b/test/store.test.ts index 8bfaae9..6b3be5b 100644 --- a/test/store.test.ts +++ b/test/store.test.ts @@ -2001,6 +2001,33 @@ describe("Snippet Extraction", () => { expect(line).toBe(51); // "Target keyword" is line 51 expect(linesBefore).toBeGreaterThan(40); // Many lines before }); + + test("extractSnippet anchors on chunkPos when lexical scoring finds no match", () => { + // The snippet tokenizer does not strip FTS5 syntax, so a quoted-phrase query + // tokenises into terms with embedded quotes that never appear in body text. + // bestScore stays at 0 even though the reranker correctly identified a chunk; + // the fallback should anchor on chunkPos rather than defaulting to line 1. + const padLine = "Lorem ipsum dolor sit amet\n"; + const padding = padLine.repeat(100); + const body = padding + "chunk content here\nmore chunk content\n" + padding; + const chunkPos = padding.length; + + const { line } = extractSnippet(body, '"unrelated quoted phrase"', 200, chunkPos); + + expect(line).toBeGreaterThan(50); + expect(line).toBeLessThan(110); + }); + + test("extractSnippet with chunkPos=0 falls back to full-body scan when chunk has no match", () => { + // chunkPos=0 may be the chunk selector's bestIdx=0 default rather than a real + // first-chunk hit, so the fallback must consider matches outside chunk 0. + const padding = "Lorem ipsum dolor sit amet\n".repeat(200); + const body = padding + "TARGET_KEYWORD line content\ntail line\n"; + + const { line } = extractSnippet(body, "TARGET_KEYWORD", 200, 0); + + expect(line).toBe(201); + }); }); // ============================================================================= From aa1818e1817c6dacbb2ebfa762112e232e442755 Mon Sep 17 00:00:00 2001 From: Riley Shott Date: Wed, 13 May 2026 20:07:13 -0700 Subject: [PATCH 2/7] fix: clamp negative fromLine in get to avoid silent tail content The query tool description tells agents to compute fromLine = line - 20 for context around a hit. For hits in lines 1 through 20 that yields a negative fromLine, which propagated unchanged through: MCP get handler -> store.getDocumentBody -> Array.prototype.slice A negative slice start offsets from the end of the array rather than clamping to the beginning, so a top-of-file hit on a long document returned an empty string and on a short document returned content from the wrong region (e.g. lines 11-30 of a 30-line file in response to a request for the head of the document). The lineNumbers branch was the same shape: addLineNumbers(text, -19) emitted "-19:", "-18:" prefixes. Same buggy slice lived in the CLI getDocument path independently. Fix in three layers, plus the docstring: - src/mcp/server.ts: clamp parsedFromLine to >= 1 after parsing input args and the :line suffix, before it reaches getDocumentBody and addLineNumbers. Also tighten the query tool's recommendation to `fromLine = max(1, line - 20)` so following the docstring literally produces a valid value. - src/cli/qmd.ts: same clamp on the CLI getDocument fromLine after the colon-suffix parse. - src/store.ts: defensive Math.max(0, ...) on the slice start in getDocumentBody so SDK callers and any future entry points are protected without relying on every caller remembering to clamp. - test/store.test.ts: regression test on getDocumentBody with fromLine = -19 returns the head of the document, not the tail. - test/cli.test.ts: regression test on `qmd get --from -19` matches the no-flag baseline (head of document). --- src/cli/qmd.ts | 1 + src/mcp/server.ts | 3 ++- src/store.ts | 2 +- test/cli.test.ts | 7 +++++++ test/store.test.ts | 15 +++++++++++++++ 5 files changed, 26 insertions(+), 2 deletions(-) diff --git a/src/cli/qmd.ts b/src/cli/qmd.ts index cdbc241..8b6ada1 100755 --- a/src/cli/qmd.ts +++ b/src/cli/qmd.ts @@ -844,6 +844,7 @@ function getDocument(filename: string, fromLine?: number, maxLines?: number, lin inputPath = inputPath.slice(0, -colonMatch[0].length); } } + if (fromLine !== undefined) fromLine = Math.max(1, fromLine); const parsedIndexPath = isVirtualPath(inputPath) ? parseVirtualPath(inputPath) : null; if (parsedIndexPath?.indexName) { diff --git a/src/mcp/server.ts b/src/mcp/server.ts index a9ec99f..69c7eff 100644 --- a/src/mcp/server.ts +++ b/src/mcp/server.ts @@ -243,7 +243,7 @@ async function createMcpServer(store: QMDStore): Promise { title: "Query", description: `Search the knowledge base using a query document — one or more typed sub-queries combined for best recall. -Each result includes a \`line\` field with the absolute 1-indexed line of the best match in the source markdown. To read more context around a hit, call \`get(file, fromLine = line - 20, maxLines = 80, lineNumbers = true)\`. +Each result includes a \`line\` field with the absolute 1-indexed line of the best match in the source markdown. To read more context around a hit, call \`get(file, fromLine = max(1, line - 20), maxLines = 80, lineNumbers = true)\`. ## Query Types @@ -389,6 +389,7 @@ Intent-aware lex (C++ performance, not sports): parsedFromLine = parseInt(colonMatch[1], 10); lookup = lookup.slice(0, -colonMatch[0].length); } + if (parsedFromLine !== undefined) parsedFromLine = Math.max(1, parsedFromLine); const result = await store.get(lookup, { includeBody: false }); diff --git a/src/store.ts b/src/store.ts index f927ccc..003feca 100644 --- a/src/store.ts +++ b/src/store.ts @@ -3800,7 +3800,7 @@ export function getDocumentBody(db: Database, doc: DocumentResult | { filepath: let body = row.body; if (fromLine !== undefined || maxLines !== undefined) { const lines = body.split('\n'); - const start = (fromLine || 1) - 1; + const start = Math.max(0, (fromLine || 1) - 1); const end = maxLines !== undefined ? start + maxLines : lines.length; body = lines.slice(start, end).join('\n'); } diff --git a/test/cli.test.ts b/test/cli.test.ts index 2535fe4..769db00 100644 --- a/test/cli.test.ts +++ b/test/cli.test.ts @@ -542,6 +542,13 @@ describe("CLI Get Command", () => { // Should indicate file not found expect(exitCode).toBe(1); }); + + test("clamps negative --from to top of file (no silent tail content)", async () => { + const baseline = await runQmd(["get", "README.md"]); + const negative = await runQmd(["get", "README.md", "--from", "-19"]); + expect(negative.exitCode).toBe(0); + expect(negative.stdout).toBe(baseline.stdout); + }); }); describe("CLI Multi-Get Command", () => { diff --git a/test/store.test.ts b/test/store.test.ts index 6b3be5b..2adf717 100644 --- a/test/store.test.ts +++ b/test/store.test.ts @@ -1713,6 +1713,21 @@ describe("Document Retrieval", () => { expect(body).toBeNull(); await cleanupTestDb(store); }); + + test("getDocumentBody clamps negative fromLine to top of document", async () => { + const store = await createTestStore(); + const collectionName = await createTestCollection({ pwd: "/path" }); + await insertTestDocument(store.db, collectionName, { + name: "mydoc", + displayPath: "mydoc.md", + body: "Line 1\nLine 2\nLine 3\nLine 4\nLine 5", + }); + + const body = store.getDocumentBody({ filepath: "/path/mydoc.md" }, -19, 80); + expect(body).toBe("Line 1\nLine 2\nLine 3\nLine 4\nLine 5"); + + await cleanupTestDb(store); + }); }); describe("findDocuments (multi-get)", () => { From dd5d82d52368fd0e7501d3f939233bed7dd617e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tobi=20L=C3=BCtke?= Date: Sat, 16 May 2026 17:18:06 +0000 Subject: [PATCH 3/7] fix: keep llama GPU fallback noise off JSON stdout --- CHANGELOG.md | 1 + README.md | 1 + src/cli/qmd.ts | 6 ++++ src/llm.ts | 66 +++++++++++++++++++++++++++++++---- test/cli.test.ts | 1 + test/llm.test.ts | 90 ++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 158 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b2757c8..d7378bb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ ### Fixes +- GPU: add `QMD_FORCE_CPU=1` / `--no-gpu` to bypass CUDA/Vulkan/Metal probing entirely, and route native llama.cpp stdout noise to stderr so JSON output stays parseable during search/query commands. - Embedding: `qmd embed -c ` now scopes pending-doc selection to the requested collection instead of embedding global pending work. Scoped `--force` clears only collection-owned vectors, preserves shared diff --git a/README.md b/README.md index 02e4b1e..7eadb93 100644 --- a/README.md +++ b/README.md @@ -798,6 +798,7 @@ llm_cache -- Cached LLM responses (query expansion, rerank scores) |----------|---------|-------------| | `XDG_CACHE_HOME` | `~/.cache` | Cache directory location | | `QMD_LLAMA_GPU` | `auto` | Force llama.cpp GPU backend (`metal`, `vulkan`, `cuda`) or disable GPU with `false` | +| `QMD_FORCE_CPU` | unset | Set to `1`/`true` to force CPU mode before any CUDA/Vulkan/Metal probing. Equivalent CLI flag: `--no-gpu`. | | `QMD_EMBED_PARALLELISM` | automatic | Override embedding/reranking context parallelism (1-8). Windows CUDA defaults to `1` because parallel CUDA contexts can crash with `ggml-cuda.cu:98`; use Vulkan or raise this only if your driver is stable. | ## How It Works diff --git a/src/cli/qmd.ts b/src/cli/qmd.ts index 01dc540..7df8401 100755 --- a/src/cli/qmd.ts +++ b/src/cli/qmd.ts @@ -2562,6 +2562,7 @@ function parseCLI() { // Query options "candidate-limit": { type: "string", short: "C" }, "no-rerank": { type: "boolean", default: false }, + "no-gpu": { type: "boolean", default: false }, intent: { type: "string" }, // Chunking options "chunk-strategy": { type: "string" }, // "regex" (default) or "auto" (AST for code files) @@ -2574,6 +2575,10 @@ function parseCLI() { strict: false, // Allow unknown options to pass through }); + if (values["no-gpu"]) { + process.env.QMD_FORCE_CPU = "1"; + } + // Select index name (default: "index") const indexName = values.index as string | undefined; if (indexName) { @@ -2826,6 +2831,7 @@ function showHelp(): void { console.log(" --full - Output full document instead of snippet"); console.log(" -C, --candidate-limit - Max candidates to rerank (default 40, lower = faster)"); console.log(" --no-rerank - Skip LLM reranking (use RRF scores only, much faster on CPU)"); + console.log(" --no-gpu - Force CPU mode for llama.cpp operations (same as QMD_FORCE_CPU=1)"); console.log(" --line-numbers - Include line numbers in output"); console.log(" --explain - Include retrieval score traces (query --json/CLI)"); console.log(" --files | --json | --csv | --md | --xml - Output format"); diff --git a/src/llm.ts b/src/llm.ts index d469d36..b0b30d4 100644 --- a/src/llm.ts +++ b/src/llm.ts @@ -22,10 +22,45 @@ type NodeLlamaCppModule = { let nodeLlamaCppImport: Promise | null = null; async function loadNodeLlamaCpp(): Promise { - nodeLlamaCppImport ??= import("node-llama-cpp") as Promise; + nodeLlamaCppImport ??= withNativeStdoutRedirectedToStderr( + () => import("node-llama-cpp") as Promise + ); return nodeLlamaCppImport; } +export function setNodeLlamaCppModuleForTest(module: NodeLlamaCppModule | null): void { + nodeLlamaCppImport = module ? Promise.resolve(module) : null; + failedGpuInitModes.clear(); +} + +type StdoutWrite = typeof process.stdout.write; +let nativeStdoutRedirectDepth = 0; +let originalStdoutWrite: StdoutWrite | null = null; + +/** + * Some node-llama-cpp native build/probe paths write library noise to stdout. + * JSON APIs must reserve stdout for machine-readable payloads, so route that + * noise to stderr while native llama initialization is in progress. + */ +export async function withNativeStdoutRedirectedToStderr(fn: () => Promise): Promise { + if (nativeStdoutRedirectDepth === 0) { + originalStdoutWrite = process.stdout.write.bind(process.stdout) as StdoutWrite; + process.stdout.write = ((chunk: any, encoding?: any, cb?: any) => { + return process.stderr.write(chunk, encoding, cb as any); + }) as StdoutWrite; + } + nativeStdoutRedirectDepth++; + try { + return await fn(); + } finally { + nativeStdoutRedirectDepth--; + if (nativeStdoutRedirectDepth === 0 && originalStdoutWrite) { + process.stdout.write = originalStdoutWrite; + originalStdoutWrite = null; + } + } +} + import { homedir } from "os"; import { join } from "path"; import { existsSync, mkdirSync, statSync, unlinkSync, readdirSync, readFileSync, writeFileSync, openSync, readSync, closeSync } from "fs"; @@ -487,7 +522,15 @@ export function resolveSafeParallelism(options: ParallelismOptions): number { return Math.max(1, options.computed); } -export function resolveLlamaGpuMode(envValue = process.env.QMD_LLAMA_GPU): LlamaGpuMode { +export function resolveLlamaGpuMode( + envValue = process.env.QMD_LLAMA_GPU, + forceCpuValue = process.env.QMD_FORCE_CPU +): LlamaGpuMode { + const forceCpu = forceCpuValue?.trim().toLowerCase() ?? ""; + if (forceCpu && !["false", "off", "none", "disable", "disabled", "0"].includes(forceCpu)) { + return false; + } + const normalized = envValue?.trim().toLowerCase() ?? ""; if (!normalized) return "auto"; if (["false", "off", "none", "disable", "disabled", "0"].includes(normalized)) return false; @@ -518,6 +561,8 @@ function resolveExpandContextSize(configValue?: number): number { return parsed; } +const failedGpuInitModes = new Set(); + export class LlamaCpp implements LLM { private readonly _ciMode = !!process.env.CI; private llama: Llama | null = null; @@ -668,22 +713,29 @@ export class LlamaCpp implements LLM { const { getLlama, LlamaLogLevel } = await loadNodeLlamaCpp(); const loadLlama = async (gpu: LlamaGpuMode) => - await getLlama({ + await withNativeStdoutRedirectedToStderr(() => getLlama({ build: allowBuild ? "autoAttempt" : "never", logLevel: LlamaLogLevel.error, gpu, skipDownload: !allowBuild, - }); + })); let llama: Llama; - if (gpuMode === false) { + if (gpuMode === false || failedGpuInitModes.has(gpuMode)) { + if (gpuMode !== false && failedGpuInitModes.has(gpuMode)) { + process.stderr.write( + `QMD Warning: skipping previously failed GPU init${gpuMode === "auto" ? "" : ` for QMD_LLAMA_GPU=${gpuMode}`}, using CPU.\n` + ); + } llama = await loadLlama(false); } else { try { llama = await loadLlama(gpuMode); } catch (err) { - // GPU backend (e.g. Vulkan on headless/driverless machines) can throw at init. - // Fall back to CPU so qmd still works. + // GPU backend (e.g. Vulkan/CUDA on headless/driverless machines) can throw at init. + // Fall back to CPU so qmd still works, and cache the failure to avoid repeated + // expensive native build/probe attempts in this process. + failedGpuInitModes.add(gpuMode); process.stderr.write( `QMD Warning: GPU init failed${gpuMode === "auto" ? "" : ` for QMD_LLAMA_GPU=${gpuMode}`} (${err instanceof Error ? err.message : String(err)}), falling back to CPU.\n` ); diff --git a/test/cli.test.ts b/test/cli.test.ts index e4ceb35..aacfff5 100644 --- a/test/cli.test.ts +++ b/test/cli.test.ts @@ -233,6 +233,7 @@ describe("CLI Help", () => { expect(stdout).toContain("Usage:"); expect(stdout).toContain("qmd collection add"); expect(stdout).toContain("qmd search"); + expect(stdout).toContain("--no-gpu"); expect(stdout).toContain("qmd skill show/install"); }); diff --git a/test/llm.test.ts b/test/llm.test.ts index ff22c0c..2fc03cd 100644 --- a/test/llm.test.ts +++ b/test/llm.test.ts @@ -13,6 +13,8 @@ import { getDefaultLlamaCpp, disposeDefaultLlamaCpp, resolveLlamaGpuMode, + setNodeLlamaCppModuleForTest, + withNativeStdoutRedirectedToStderr, resolveParallelismOverride, resolveSafeParallelism, withLLMSession, @@ -78,6 +80,29 @@ describe("QMD_LLAMA_GPU resolution", () => { expect(resolveLlamaGpuMode(" cuda ")).toBe("cuda"); }); + test("QMD_FORCE_CPU disables GPU before QMD_LLAMA_GPU auto-detection", () => { + const prevForceCpu = process.env.QMD_FORCE_CPU; + process.env.QMD_FORCE_CPU = "1"; + try { + expect(resolveLlamaGpuMode(undefined)).toBe(false); + expect(resolveLlamaGpuMode("cuda")).toBe(false); + } finally { + if (prevForceCpu === undefined) delete process.env.QMD_FORCE_CPU; + else process.env.QMD_FORCE_CPU = prevForceCpu; + } + }); + + test("QMD_FORCE_CPU ignores false-ish values", () => { + const prevForceCpu = process.env.QMD_FORCE_CPU; + process.env.QMD_FORCE_CPU = "0"; + try { + expect(resolveLlamaGpuMode(undefined)).toBe("auto"); + } finally { + if (prevForceCpu === undefined) delete process.env.QMD_FORCE_CPU; + else process.env.QMD_FORCE_CPU = prevForceCpu; + } + }); + test("warns and falls back to auto for unsupported values", () => { const stderrSpy = vi.spyOn(process.stderr, "write").mockReturnValue(true); try { @@ -90,6 +115,71 @@ describe("QMD_LLAMA_GPU resolution", () => { }); }); +describe("native llama stdout containment", () => { + test("redirects native stdout noise to stderr while JSON callers are initializing llama", async () => { + const stdoutSpy = vi.spyOn(process.stdout, "write").mockReturnValue(true); + const stderrSpy = vi.spyOn(process.stderr, "write").mockReturnValue(true); + try { + await withNativeStdoutRedirectedToStderr(async () => { + process.stdout.write("cmake build spam\n"); + return "ok"; + }); + + expect(stdoutSpy).not.toHaveBeenCalled(); + expect(stderrSpy).toHaveBeenCalledWith("cmake build spam\n", undefined, undefined); + } finally { + stdoutSpy.mockRestore(); + stderrSpy.mockRestore(); + } + }); + + test("keeps native GPU failure noise off stdout and caches failed GPU init", async () => { + const prevGpu = process.env.QMD_LLAMA_GPU; + const prevForceCpu = process.env.QMD_FORCE_CPU; + process.env.QMD_LLAMA_GPU = "cuda"; + delete process.env.QMD_FORCE_CPU; + + const calls: unknown[] = []; + const fakeLlama = { gpu: false, cpuMathCores: 4 }; + setNodeLlamaCppModuleForTest({ + LlamaLogLevel: { error: "error" }, + resolveModelFile: vi.fn(), + LlamaChatSession: vi.fn() as any, + getLlama: vi.fn(async (options: Record) => { + calls.push(options.gpu); + if (options.gpu === "cuda") { + process.stdout.write("cmake build spam\n"); + throw new Error("CUDA unavailable"); + } + return fakeLlama as any; + }), + }); + + const stdoutSpy = vi.spyOn(process.stdout, "write").mockReturnValue(true); + const stderrSpy = vi.spyOn(process.stderr, "write").mockReturnValue(true); + try { + const first = new LlamaCpp(); + const second = new LlamaCpp(); + + await (first as any).ensureLlama(); + await (second as any).ensureLlama(); + + expect(stdoutSpy).not.toHaveBeenCalled(); + expect(stderrSpy).toHaveBeenCalledWith("cmake build spam\n", undefined, undefined); + expect(calls).toEqual(["cuda", false, false]); + expect(String(stderrSpy.mock.calls.map(call => call[0]).join(""))).toContain("skipping previously failed GPU init"); + } finally { + stdoutSpy.mockRestore(); + stderrSpy.mockRestore(); + setNodeLlamaCppModuleForTest(null); + if (prevGpu === undefined) delete process.env.QMD_LLAMA_GPU; + else process.env.QMD_LLAMA_GPU = prevGpu; + if (prevForceCpu === undefined) delete process.env.QMD_FORCE_CPU; + else process.env.QMD_FORCE_CPU = prevForceCpu; + } + }); +}); + describe("LLM context parallelism safety", () => { test("defaults Windows CUDA to one context to avoid ggml-cuda.cu:98 crashes", () => { expect(resolveSafeParallelism({ From 60c75cb3327df40f930d23c12a4c98c4a0f79a97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tobi=20L=C3=BCtke?= Date: Sat, 16 May 2026 17:20:21 +0000 Subject: [PATCH 4/7] fix: avoid macOS Metal cleanup abort after JSON query --- CHANGELOG.md | 1 + src/cli/qmd.ts | 76 ++++++++++++++++++++++++++++++- src/llm.ts | 60 ++++++++++++++++++------ test/cli-exit-lifecycle.test.ts | 81 +++++++++++++++++++++++++++++++++ 4 files changed, 202 insertions(+), 16 deletions(-) create mode 100644 test/cli-exit-lifecycle.test.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index ac69601..39b811a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ flag. Previously it returned only the best matching chunk (~3.6KB max per result). Output payload for `--full` queries is now proportional to total document size. +- macOS Metal: `qmd query --json` now flushes successful JSON output and uses a safe immediate-exit path on Darwin to avoid ggml Metal finalizer aborts; other commands still dispose LLM contexts/models before the llama runtime. #368 - Embedding: `qmd embed -c ` now scopes pending-doc selection to the requested collection instead of embedding global pending work. Scoped `--force` clears only collection-owned vectors, preserves shared diff --git a/src/cli/qmd.ts b/src/cli/qmd.ts index 40bc0dd..df73f36 100755 --- a/src/cli/qmd.ts +++ b/src/cli/qmd.ts @@ -207,6 +207,76 @@ const cursor = { show() { process.stderr.write('\x1b[?25h'); }, }; +type CliLifecycleWritable = { + write(chunk: string | Uint8Array, callback?: (error?: Error | null) => void): boolean; +}; + +type FinishSuccessfulCliCommandOptions = { + command: string; + format?: OutputFormat; + cleanup?: () => Promise; + exit?: (code: number) => void; + immediateExit?: (code: number) => void; + stdout?: CliLifecycleWritable; + stderr?: CliLifecycleWritable; + platform?: NodeJS.Platform; +}; + +async function flushWritable(stream: CliLifecycleWritable): Promise { + await new Promise((resolve) => { + stream.write("", () => resolve()); + }); +} + +function shouldBypassNativeCleanup(options: FinishSuccessfulCliCommandOptions): boolean { + return ( + (options.platform ?? process.platform) === "darwin" && + options.command === "query" && + options.format === "json" && + process.env.QMD_DISABLE_DARWIN_QUERY_JSON_SAFE_EXIT !== "1" + ); +} + +function immediateProcessExit(code: number): void { + const processWithReallyExit = process as NodeJS.Process & { reallyExit?: (code?: number) => void }; + if (typeof processWithReallyExit.reallyExit === "function") { + processWithReallyExit.reallyExit(code); + return; + } + process.exit(code); +} + +/** + * Finish a successful CLI command after output has been flushed. On macOS JSON + * query runs, skip normal native teardown and use Node/Bun's immediate exit path: + * ggml Metal can abort from C++ finalizers after valid JSON has already been + * produced (#368). This wrapper is only reached after the command completed, so + * real query failures still exit through the normal error path before this runs. + */ +export async function finishSuccessfulCliCommand(options: FinishSuccessfulCliCommandOptions): Promise { + const stderr = options.stderr ?? process.stderr; + const exit = options.exit ?? ((code: number) => process.exit(code)); + const immediateExit = options.immediateExit ?? immediateProcessExit; + + await flushWritable(options.stdout ?? process.stdout); + + if (shouldBypassNativeCleanup(options)) { + await flushWritable(stderr); + immediateExit(0); + return; + } + + try { + await (options.cleanup ?? disposeDefaultLlamaCpp)(); + } catch (error) { + stderr.write( + `QMD Warning: cleanup after successful output failed (${error instanceof Error ? error.message : String(error)}); exiting 0 because command output completed.\n` + ); + } + await flushWritable(stderr); + exit(0); +} + // Ensure cursor is restored on exit process.on('SIGINT', () => { cursor.show(); process.exit(130); }); process.on('SIGTERM', () => { cursor.show(); process.exit(143); }); @@ -3415,8 +3485,10 @@ if (isMain) { } if (cli.command !== "mcp") { - await disposeDefaultLlamaCpp(); - process.exit(0); + await finishSuccessfulCliCommand({ + command: cli.command, + format: cli.opts.format, + }); } } // end if (main module) diff --git a/src/llm.ts b/src/llm.ts index d469d36..f7ec2fd 100644 --- a/src/llm.ts +++ b/src/llm.ts @@ -497,6 +497,23 @@ export function resolveLlamaGpuMode(envValue = process.env.QMD_LLAMA_GPU): Llama return "auto"; } +async function disposeWithTimeout(resourceName: string, dispose: () => Promise, timeoutMs = 1000): Promise { + const timeoutPromise = new Promise<"timeout">((resolve) => { + setTimeout(() => resolve("timeout"), timeoutMs).unref(); + }); + + try { + const result = await Promise.race([dispose(), timeoutPromise]); + if (result === "timeout") { + process.stderr.write(`QMD Warning: timed out disposing ${resourceName}; continuing shutdown.\n`); + } + } catch (error) { + process.stderr.write( + `QMD Warning: failed to dispose ${resourceName} (${error instanceof Error ? error.message : String(error)}); continuing shutdown.\n` + ); + } +} + function resolveExpandContextSize(configValue?: number): number { if (configValue !== undefined) { if (!Number.isInteger(configValue) || configValue <= 0) { @@ -1413,22 +1430,37 @@ export class LlamaCpp implements LLM { this.inactivityTimer = null; } - // Disposing llama cascades to models and contexts automatically - // See: https://node-llama-cpp.withcat.ai/guide/objects-lifecycle - // Note: llama.dispose() can hang indefinitely, so we use a timeout - if (this.llama) { - const disposePromise = this.llama.dispose(); - const timeoutPromise = new Promise((resolve) => setTimeout(resolve, 1000)); - await Promise.race([disposePromise, timeoutPromise]); + // Explicitly dispose in dependency order: contexts first, then models, then llama. + // Relying only on llama.dispose() leaves Metal resource sets alive until process + // finalization on Apple Silicon, where ggml_metal_device_free can abort after + // otherwise-successful CLI output (#368). + for (const ctx of this.embedContexts) { + await disposeWithTimeout("embedding context", () => ctx.dispose()); + } + this.embedContexts = []; + + for (const ctx of this.rerankContexts) { + await disposeWithTimeout("rerank context", () => ctx.dispose()); + } + this.rerankContexts = []; + + if (this.embedModel) { + await disposeWithTimeout("embedding model", () => this.embedModel!.dispose()); + this.embedModel = null; + } + if (this.generateModel) { + await disposeWithTimeout("generation model", () => this.generateModel!.dispose()); + this.generateModel = null; + } + if (this.rerankModel) { + await disposeWithTimeout("rerank model", () => this.rerankModel!.dispose()); + this.rerankModel = null; } - // Clear references - this.embedContexts = []; - this.rerankContexts = []; - this.embedModel = null; - this.generateModel = null; - this.rerankModel = null; - this.llama = null; + if (this.llama) { + await disposeWithTimeout("llama runtime", () => this.llama!.dispose()); + this.llama = null; + } // Clear any in-flight load/create promises this.embedModelLoadPromise = null; diff --git a/test/cli-exit-lifecycle.test.ts b/test/cli-exit-lifecycle.test.ts new file mode 100644 index 0000000..b9328ed --- /dev/null +++ b/test/cli-exit-lifecycle.test.ts @@ -0,0 +1,81 @@ +import { describe, expect, test } from "vitest"; +import { finishSuccessfulCliCommand } from "../src/cli/qmd.ts"; +import { LlamaCpp } from "../src/llm.ts"; + +describe("CLI successful-exit lifecycle", () => { + test("exits 0 after successful JSON output when post-output LLM cleanup fails", async () => { + const exitCodes: number[] = []; + const stderr: string[] = []; + const flushed: string[] = []; + + await finishSuccessfulCliCommand({ + command: "query", + format: "json", + cleanup: async () => { + throw new Error("ggml_metal_device_free abort simulation"); + }, + exit: (code) => { + exitCodes.push(code); + }, + stdout: { write: (chunk: string | Uint8Array, cb?: (error?: Error | null) => void) => { flushed.push(String(chunk)); cb?.(); return true; } }, + stderr: { write: (chunk: string | Uint8Array, cb?: (error?: Error | null) => void) => { stderr.push(String(chunk)); cb?.(); return true; } }, + }); + + expect(exitCodes).toEqual([0]); + expect(stderr.join("")).toContain("QMD Warning: cleanup after successful output failed"); + expect(flushed).toEqual([""]); + }); + + test("uses immediate exit for successful macOS JSON query after stdout flush", async () => { + const calls: string[] = []; + + await finishSuccessfulCliCommand({ + command: "query", + format: "json", + platform: "darwin", + cleanup: async () => { + calls.push("cleanup"); + }, + exit: (code) => { + calls.push(`exit:${code}`); + }, + immediateExit: (code) => { + calls.push(`immediate-exit:${code}`); + }, + stdout: { write: (_chunk: string | Uint8Array, cb?: (error?: Error | null) => void) => { calls.push("stdout-flush"); cb?.(); return true; } }, + stderr: { write: (_chunk: string | Uint8Array, cb?: (error?: Error | null) => void) => { calls.push("stderr-flush"); cb?.(); return true; } }, + }); + + expect(calls).toEqual(["stdout-flush", "stderr-flush", "immediate-exit:0"]); + }); + + test("disposes Llama resources in dependency order before CLI exit", async () => { + const calls: string[] = []; + const llm = new LlamaCpp({ inactivityTimeoutMs: 0 }); + const disposable = (name: string) => ({ + dispose: async () => { + calls.push(name); + }, + }); + + Object.assign(llm as unknown as Record, { + embedContexts: [disposable("embed-context")], + rerankContexts: [disposable("rerank-context")], + embedModel: disposable("embed-model"), + generateModel: disposable("generate-model"), + rerankModel: disposable("rerank-model"), + llama: disposable("llama"), + }); + + await llm.dispose(); + + expect(calls).toEqual([ + "embed-context", + "rerank-context", + "embed-model", + "generate-model", + "rerank-model", + "llama", + ]); + }); +}); From b59ba6ab1ed35631b17ac914bfceeea588d67ef7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tobi=20L=C3=BCtke?= Date: Sat, 16 May 2026 17:32:45 +0000 Subject: [PATCH 5/7] test: keep cleanup lifecycle regression portable --- test/cli-exit-lifecycle.test.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/test/cli-exit-lifecycle.test.ts b/test/cli-exit-lifecycle.test.ts index b9328ed..8558596 100644 --- a/test/cli-exit-lifecycle.test.ts +++ b/test/cli-exit-lifecycle.test.ts @@ -11,6 +11,7 @@ describe("CLI successful-exit lifecycle", () => { await finishSuccessfulCliCommand({ command: "query", format: "json", + platform: "linux", cleanup: async () => { throw new Error("ggml_metal_device_free abort simulation"); }, From dc49ccff1e014aa2cb085d82c1777417de05e8a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tobi=20L=C3=BCtke?= Date: Sat, 16 May 2026 17:17:40 +0000 Subject: [PATCH 6/7] test: cover qmd bin wrapper install layouts --- CHANGELOG.md | 3 + test/bin-wrapper.test.ts | 164 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 167 insertions(+) create mode 100644 test/bin-wrapper.test.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 5abb2ae..69c05be 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -45,6 +45,9 @@ - Packaging: install AST grammar WASM packages as required dependencies so Bun global installs include TypeScript/TSX/JavaScript grammars, and add a `smoke:package-grammars` verification command. #595 +- Launcher: add wrapper smoke coverage for scoped package, npm/npx, + Homebrew/Linuxbrew, Bun global symlink layouts, and `$BUN_INSTALL` + false-positive runtime selection regressions. #351 #353 #354 #356 #358 #359 ## [2.1.0] - 2026-04-05 diff --git a/test/bin-wrapper.test.ts b/test/bin-wrapper.test.ts new file mode 100644 index 0000000..82796d3 --- /dev/null +++ b/test/bin-wrapper.test.ts @@ -0,0 +1,164 @@ +import { afterEach, describe, expect, test } from "vitest"; +import { chmodSync, copyFileSync, mkdtempSync, mkdirSync, readFileSync, realpathSync, rmSync, symlinkSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { dirname, join, relative } from "node:path"; +import { execFileSync } from "node:child_process"; +import { fileURLToPath } from "node:url"; + +const repoRoot = fileURLToPath(new URL("..", import.meta.url)); +const fixtures: string[] = []; + +function makeTempFixture() { + const root = mkdtempSync(join(tmpdir(), "qmd-bin-wrapper-")); + fixtures.push(root); + const capturePath = join(root, "capture.txt"); + const runtimeBin = join(root, "runtime-bin"); + mkdirSync(runtimeBin, { recursive: true }); + + for (const runtime of ["node", "bun"]) { + const runtimePath = join(runtimeBin, runtime); + writeFileSync( + runtimePath, + `#!/bin/sh\n{\n printf '%s\\n' '${runtime}'\n printf '%s\\n' "$1"\n shift\n printf '%s\\n' "$@"\n} > "$QMD_WRAPPER_CAPTURE"\n`, + ); + chmodSync(runtimePath, 0o755); + } + + return { root, capturePath, runtimeBin }; +} + +function makePackage(root: string, packagePath: string, lockfiles: string[] = []) { + const packageRoot = join(root, packagePath); + mkdirSync(join(packageRoot, "bin"), { recursive: true }); + mkdirSync(join(packageRoot, "dist", "cli"), { recursive: true }); + copyFileSync(join(repoRoot, "bin", "qmd"), join(packageRoot, "bin", "qmd")); + chmodSync(join(packageRoot, "bin", "qmd"), 0o755); + writeFileSync(join(packageRoot, "dist", "cli", "qmd.js"), "// fixture\n"); + for (const lockfile of lockfiles) { + writeFileSync(join(packageRoot, lockfile), ""); + } + return packageRoot; +} + +function symlinkRelative(target: string, linkPath: string) { + mkdirSync(dirname(linkPath), { recursive: true }); + symlinkSync(relative(dirname(linkPath), target), linkPath); +} + +function runWrapper(commandPath: string, runtimeBin: string, capturePath: string, env: Record = {}) { + rmSync(capturePath, { force: true }); + execFileSync(commandPath, ["--version"], { + env: { + ...process.env, + ...env, + PATH: `${runtimeBin}:${process.env.PATH ?? ""}`, + QMD_WRAPPER_CAPTURE: capturePath, + }, + stdio: ["ignore", "pipe", "pipe"], + }); + const [runtime, scriptPath, ...args] = readFileSync(capturePath, "utf8").trimEnd().split("\n"); + return { runtime, scriptPath, args }; +} + +afterEach(() => { + for (const fixture of fixtures.splice(0)) { + rmSync(fixture, { recursive: true, force: true }); + } +}); + +describe("bin/qmd package wrapper", () => { + test("direct package invocation resolves dist/cli/qmd.js from the package root", () => { + const { root, runtimeBin, capturePath } = makeTempFixture(); + const packageRoot = makePackage(root, "node_modules/@tobilu/qmd"); + + const result = runWrapper(join(packageRoot, "bin", "qmd"), runtimeBin, capturePath); + + expect(result.runtime).toBe("node"); + expect(result.scriptPath).toBe(realpathSync(join(packageRoot, "dist", "cli", "qmd.js"))); + expect(result.args).toEqual(["--version"]); + }); + + test("npm/Homebrew global bin symlink resolves scoped package path", () => { + const { root, runtimeBin, capturePath } = makeTempFixture(); + const packageRoot = makePackage(root, "opt/homebrew/lib/node_modules/@tobilu/qmd"); + const globalBin = join(root, "opt", "homebrew", "bin", "qmd"); + symlinkRelative(join(packageRoot, "bin", "qmd"), globalBin); + + const result = runWrapper(globalBin, runtimeBin, capturePath); + + expect(result.runtime).toBe("node"); + expect(result.scriptPath).toBe(realpathSync(join(packageRoot, "dist", "cli", "qmd.js"))); + }); + + test("multi-hop global bin symlink chain resolves to the real package root", () => { + const { root, runtimeBin, capturePath } = makeTempFixture(); + const packageRoot = makePackage(root, "opt/homebrew/lib/node_modules/@tobilu/qmd"); + const globalBin = join(root, "opt", "homebrew", "bin", "qmd"); + const shim = join(root, "opt", "homebrew", "Cellar", "qmd", "current", "bin", "qmd"); + symlinkRelative(join(packageRoot, "bin", "qmd"), shim); + symlinkRelative(shim, globalBin); + + const result = runWrapper(globalBin, runtimeBin, capturePath); + + expect(result.runtime).toBe("node"); + expect(result.scriptPath).toBe(realpathSync(join(packageRoot, "dist", "cli", "qmd.js"))); + }); + + test("linuxbrew global bin symlink resolves lib/node_modules scoped package path", () => { + const { root, runtimeBin, capturePath } = makeTempFixture(); + const packageRoot = makePackage(root, "home/linuxbrew/.linuxbrew/lib/node_modules/@tobilu/qmd"); + const globalBin = join(root, "home", "linuxbrew", ".linuxbrew", "bin", "qmd"); + symlinkRelative(join(packageRoot, "bin", "qmd"), globalBin); + + const result = runWrapper(globalBin, runtimeBin, capturePath); + + expect(result.runtime).toBe("node"); + expect(result.scriptPath).toBe(realpathSync(join(packageRoot, "dist", "cli", "qmd.js"))); + }); + + test("npx scoped package .bin symlink resolves @tobilu/qmd package path", () => { + const { root, runtimeBin, capturePath } = makeTempFixture(); + const packageRoot = makePackage(root, "npm/_npx/abc123/node_modules/@tobilu/qmd"); + const npxBin = join(root, "npm", "_npx", "abc123", "node_modules", ".bin", "qmd"); + symlinkRelative(join(packageRoot, "bin", "qmd"), npxBin); + + const result = runWrapper(npxBin, runtimeBin, capturePath); + + expect(result.runtime).toBe("node"); + expect(result.scriptPath).toBe(realpathSync(join(packageRoot, "dist", "cli", "qmd.js"))); + }); + + test("bun global symlink uses bun when package-local bun lockfile exists", () => { + const { root, runtimeBin, capturePath } = makeTempFixture(); + const packageRoot = makePackage(root, "home/user/.bun/install/global/node_modules/@tobilu/qmd", ["bun.lock"]); + const bunBin = join(root, "home", "user", ".bun", "bin", "qmd"); + symlinkRelative(join(packageRoot, "bin", "qmd"), bunBin); + + const result = runWrapper(bunBin, runtimeBin, capturePath); + + expect(result.runtime).toBe("bun"); + expect(result.scriptPath).toBe(realpathSync(join(packageRoot, "dist", "cli", "qmd.js"))); + }); + + test("ambient BUN_INSTALL alone does not select bun for an npm-installed package", () => { + const { root, runtimeBin, capturePath } = makeTempFixture(); + const packageRoot = makePackage(root, "opt/homebrew/lib/node_modules/@tobilu/qmd"); + const globalBin = join(root, "opt", "homebrew", "bin", "qmd"); + symlinkRelative(join(packageRoot, "bin", "qmd"), globalBin); + + const result = runWrapper(globalBin, runtimeBin, capturePath, { BUN_INSTALL: join(root, ".bun") }); + + expect(result.runtime).toBe("node"); + expect(result.scriptPath).toBe(realpathSync(join(packageRoot, "dist", "cli", "qmd.js"))); + }); + + test("package-lock.json takes priority over bun lockfiles", () => { + const { root, runtimeBin, capturePath } = makeTempFixture(); + const packageRoot = makePackage(root, "node_modules/@tobilu/qmd", ["package-lock.json", "bun.lock"]); + + const result = runWrapper(join(packageRoot, "bin", "qmd"), runtimeBin, capturePath); + + expect(result.runtime).toBe("node"); + expect(result.scriptPath).toBe(realpathSync(join(packageRoot, "dist", "cli", "qmd.js"))); + }); +}); From 910ca07fd9df70616e4692547bc389dfc2965bb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tobi=20L=C3=BCtke?= Date: Sat, 16 May 2026 17:36:06 +0000 Subject: [PATCH 7/7] fix: keep partial embeddings pending --- CHANGELOG.md | 4 ++ src/cli/qmd.ts | 2 +- src/store.ts | 121 ++++++++++++++++++++++++++++++++++----------- test/store.test.ts | 82 ++++++++++++++++++++++++++++++ 4 files changed, 178 insertions(+), 31 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9ee8337..d1f26af 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,10 @@ per result). Output payload for `--full` queries is now proportional to total document size. - macOS Metal: `qmd query --json` now flushes successful JSON output and uses a safe immediate-exit path on Darwin to avoid ggml Metal finalizer aborts; other commands still dispose LLM contexts/models before the llama runtime. #368 +- Embedding: require complete chunk coverage before treating a document as + embedded, remove partial vectors when chunk/session failures leave a + document incomplete, and keep `qmd status` pending counts honest after + interrupted long embed runs. #637 #378 - Embedding: `qmd embed -c ` now scopes pending-doc selection to the requested collection instead of embedding global pending work. Scoped `--force` clears only collection-owned vectors, preserves shared diff --git a/src/cli/qmd.ts b/src/cli/qmd.ts index fda6d8e..2ff3796 100755 --- a/src/cli/qmd.ts +++ b/src/cli/qmd.ts @@ -1806,7 +1806,7 @@ async function vectorIndex( } // Check if there's work to do before starting - const hashesToEmbed = getHashesNeedingEmbedding(db, batchOptions?.collection); + const hashesToEmbed = getHashesNeedingEmbedding(db, batchOptions?.collection, model); if (hashesToEmbed === 0 && !force) { console.log(`${c.green}✓ All content hashes already have embeddings.${c.reset}`); closeDb(); diff --git a/src/store.ts b/src/store.ts index 003feca..5323245 100644 --- a/src/store.ts +++ b/src/store.ts @@ -871,10 +871,15 @@ function initializeDatabase(db: Database): void { seq INTEGER NOT NULL DEFAULT 0, pos INTEGER NOT NULL DEFAULT 0, model TEXT NOT NULL, + total_chunks INTEGER NOT NULL DEFAULT 1, embedded_at TEXT NOT NULL, PRIMARY KEY (hash, seq) ) `); + const cvInfoAfterCreate = db.prepare(`PRAGMA table_info(content_vectors)`).all() as { name: string }[]; + if (!cvInfoAfterCreate.some(col => col.name === 'total_chunks')) { + db.exec(`ALTER TABLE content_vectors ADD COLUMN total_chunks INTEGER NOT NULL DEFAULT 1`); + } // Store collections — makes the DB self-contained (no external config needed) db.exec(` @@ -1167,9 +1172,9 @@ export type Store = { ensureVecTable: (dimensions: number) => void; // Index health - getHashesNeedingEmbedding: () => number; - getIndexHealth: () => IndexHealthInfo; - getStatus: () => IndexStatus; + getHashesNeedingEmbedding: (model?: string) => number; + getIndexHealth: (model?: string) => IndexHealthInfo; + getStatus: (model?: string) => IndexStatus; // Caching getCacheKey: typeof getCacheKey; @@ -1229,7 +1234,7 @@ export type Store = { // Vector/embedding operations getHashesForEmbedding: () => { hash: string; body: string; path: string }[]; clearAllEmbeddings: () => void; - insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string) => void; + insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string, totalChunks?: number) => void; }; // ============================================================================= @@ -1420,18 +1425,31 @@ function resolveEmbedOptions(options?: EmbedOptions): Required col.name === 'total_chunks') ? 'MAX(total_chunks)' : '1'; +} + +function getPendingEmbeddingDocs(db: Database, collection?: string, model: string = DEFAULT_EMBED_MODEL): PendingEmbeddingDoc[] { const collectionFilter = collection ? `AND d.collection = ?` : ``; + const expectedChunksExpr = contentVectorExpectedChunksExpr(db); const stmt = db.prepare(` SELECT d.hash, MIN(d.path) as path, length(CAST(c.doc AS BLOB)) as bytes FROM documents d JOIN content c ON d.hash = c.hash - LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0 - WHERE d.active = 1 AND v.hash IS NULL ${collectionFilter} + LEFT JOIN ( + SELECT hash, model, COUNT(*) AS chunk_count, ${expectedChunksExpr} AS expected_chunks + FROM content_vectors + WHERE model = ? + GROUP BY hash, model + ) v ON d.hash = v.hash + WHERE d.active = 1 + AND (v.hash IS NULL OR v.chunk_count < v.expected_chunks) + ${collectionFilter} GROUP BY d.hash ORDER BY MIN(d.path) `); - return (collection ? stmt.all(collection) : stmt.all()) as PendingEmbeddingDoc[]; + return (collection ? stmt.all(model, collection) : stmt.all(model)) as PendingEmbeddingDoc[]; } function buildEmbeddingBatches( @@ -1502,7 +1520,7 @@ export async function generateEmbeddings( clearAllEmbeddings(db, options?.collection); } - const docsToEmbed = getPendingEmbeddingDocs(db, options?.collection); + const docsToEmbed = getPendingEmbeddingDocs(db, options?.collection, model); if (docsToEmbed.length === 0) { return { docsProcessed: 0, chunksEmbedded: 0, errors: 0, durationMs: 0 }; @@ -1533,6 +1551,7 @@ export async function generateEmbeddings( const batchDocs = getEmbeddingDocsForBatch(db, batchMeta); const batchChunks: ChunkItem[] = []; + const expectedChunksByHash = new Map(); const batchBytes = batchMeta.reduce((sum, doc) => sum + Math.max(0, doc.bytes), 0); for (const doc of batchDocs) { @@ -1558,6 +1577,7 @@ export async function generateEmbeddings( bytes: encoder.encode(chunks[seq]!.text).length, }); } + expectedChunksByHash.set(doc.hash, chunks.length); } totalChunks += batchChunks.length; @@ -1610,7 +1630,7 @@ export async function generateEmbeddings( const chunk = chunkBatch[i]!; const embedding = embeddings[i]; if (embedding) { - insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embedding.embedding), model, now); + insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embedding.embedding), model, now, expectedChunksByHash.get(chunk.hash) ?? 1); chunksEmbedded++; } else { errors++; @@ -1629,7 +1649,7 @@ export async function generateEmbeddings( const text = formatDocForEmbedding(chunk.text, chunk.title, embedModelUri); const result = await session.embed(text, { model }); if (result) { - insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), model, now); + insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), model, now, expectedChunksByHash.get(chunk.hash) ?? 1); chunksEmbedded++; } else { errors++; @@ -1654,6 +1674,11 @@ export async function generateEmbeddings( }); } + const removedPartialChunks = removeIncompleteEmbeddings(db, expectedChunksByHash, model); + if (removedPartialChunks > 0) { + chunksEmbedded = Math.max(0, chunksEmbedded - removedPartialChunks); + } + bytesProcessed += batchBytes; options?.onProgress?.({ chunksEmbedded, totalChunks, bytesProcessed, totalBytes, errors }); } @@ -1688,9 +1713,9 @@ export function createStore(dbPath?: string): Store { ensureVecTable: (dimensions: number) => ensureVecTableInternal(db, dimensions), // Index health - getHashesNeedingEmbedding: () => getHashesNeedingEmbedding(db), - getIndexHealth: () => getIndexHealth(db), - getStatus: () => getStatus(db), + getHashesNeedingEmbedding: (model?: string) => getHashesNeedingEmbedding(db, undefined, model ?? store.llm?.embedModelName ?? DEFAULT_EMBED_MODEL), + getIndexHealth: (model?: string) => getIndexHealth(db, model ?? store.llm?.embedModelName ?? DEFAULT_EMBED_MODEL), + getStatus: (model?: string) => getStatus(db, model ?? store.llm?.embedModelName ?? DEFAULT_EMBED_MODEL), // Caching getCacheKey, @@ -1750,7 +1775,7 @@ export function createStore(dbPath?: string): Store { // Vector/embedding operations getHashesForEmbedding: () => getHashesForEmbedding(db), clearAllEmbeddings: () => clearAllEmbeddings(db), - insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string) => insertEmbedding(db, hash, seq, pos, embedding, model, embeddedAt), + insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string, totalChunks?: number) => insertEmbedding(db, hash, seq, pos, embedding, model, embeddedAt, totalChunks), }; return store; @@ -1949,15 +1974,23 @@ export type IndexStatus = { // Index health // ============================================================================= -export function getHashesNeedingEmbedding(db: Database, collection?: string): number { +export function getHashesNeedingEmbedding(db: Database, collection?: string, model: string = DEFAULT_EMBED_MODEL): number { const collectionFilter = collection ? `AND d.collection = ?` : ``; + const expectedChunksExpr = contentVectorExpectedChunksExpr(db); const stmt = db.prepare(` SELECT COUNT(DISTINCT d.hash) as count FROM documents d - LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0 - WHERE d.active = 1 AND v.hash IS NULL ${collectionFilter} + LEFT JOIN ( + SELECT hash, model, COUNT(*) AS chunk_count, ${expectedChunksExpr} AS expected_chunks + FROM content_vectors + WHERE model = ? + GROUP BY hash, model + ) v ON d.hash = v.hash + WHERE d.active = 1 + AND (v.hash IS NULL OR v.chunk_count < v.expected_chunks) + ${collectionFilter} `); - const result = (collection ? stmt.get(collection) : stmt.get()) as { count: number }; + const result = (collection ? stmt.get(model, collection) : stmt.get(model)) as { count: number }; return result.count; } @@ -1967,8 +2000,8 @@ export type IndexHealthInfo = { daysStale: number | null; }; -export function getIndexHealth(db: Database): IndexHealthInfo { - const needsEmbedding = getHashesNeedingEmbedding(db); +export function getIndexHealth(db: Database, model: string = DEFAULT_EMBED_MODEL): IndexHealthInfo { + const needsEmbedding = getHashesNeedingEmbedding(db, undefined, model); const totalDocs = (db.prepare(`SELECT COUNT(*) as count FROM documents WHERE active = 1`).get() as { count: number }).count; const mostRecent = db.prepare(`SELECT MAX(modified_at) as latest FROM documents WHERE active = 1`).get() as { latest: string | null }; @@ -3316,15 +3349,22 @@ async function getEmbedding(text: string, model: string, isQuery: boolean, sessi * Get all unique content hashes that need embeddings (from active documents). * Returns hash, document body, and a sample path for display purposes. */ -export function getHashesForEmbedding(db: Database): { hash: string; body: string; path: string }[] { +export function getHashesForEmbedding(db: Database, model: string = DEFAULT_EMBED_MODEL): { hash: string; body: string; path: string }[] { + const expectedChunksExpr = contentVectorExpectedChunksExpr(db); return db.prepare(` SELECT d.hash, c.doc as body, MIN(d.path) as path FROM documents d JOIN content c ON d.hash = c.hash - LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0 - WHERE d.active = 1 AND v.hash IS NULL + LEFT JOIN ( + SELECT hash, model, COUNT(*) AS chunk_count, ${expectedChunksExpr} AS expected_chunks + FROM content_vectors + WHERE model = ? + GROUP BY hash, model + ) v ON d.hash = v.hash + WHERE d.active = 1 + AND (v.hash IS NULL OR v.chunk_count < v.expected_chunks) GROUP BY d.hash - `).all() as { hash: string; body: string; path: string }[]; + `).all(model) as { hash: string; body: string; path: string }[]; } /** @@ -3409,13 +3449,14 @@ export function insertEmbedding( pos: number, embedding: Float32Array, model: string, - embeddedAt: string + embeddedAt: string, + totalChunks: number = 1 ): void { const hashSeq = `${hash}_${seq}`; // Insert content_vectors first — crash-safe ordering (see getHashesForEmbedding) - const insertContentVectorStmt = db.prepare(`INSERT OR REPLACE INTO content_vectors (hash, seq, pos, model, embedded_at) VALUES (?, ?, ?, ?, ?)`); - insertContentVectorStmt.run(hash, seq, pos, model, embeddedAt); + const insertContentVectorStmt = db.prepare(`INSERT OR REPLACE INTO content_vectors (hash, seq, pos, model, total_chunks, embedded_at) VALUES (?, ?, ?, ?, ?, ?)`); + insertContentVectorStmt.run(hash, seq, pos, model, totalChunks, embeddedAt); // vec0 virtual tables don't support OR REPLACE — use DELETE + INSERT const deleteVecStmt = db.prepare(`DELETE FROM vectors_vec WHERE hash_seq = ?`); @@ -3424,6 +3465,26 @@ export function insertEmbedding( insertVecStmt.run(hashSeq, embedding); } +function removeIncompleteEmbeddings(db: Database, expectedChunksByHash: Map, model: string): number { + let removed = 0; + const rowsStmt = db.prepare(`SELECT seq FROM content_vectors WHERE hash = ? AND model = ?`); + const deleteContentStmt = db.prepare(`DELETE FROM content_vectors WHERE hash = ? AND model = ?`); + const deleteVecStmt = db.prepare(`DELETE FROM vectors_vec WHERE hash_seq = ?`); + + for (const [hash, expectedChunks] of expectedChunksByHash) { + const rows = rowsStmt.all(hash, model) as { seq: number }[]; + if (rows.length === 0 || rows.length === expectedChunks) continue; + + for (const row of rows) { + deleteVecStmt.run(`${hash}_${row.seq}`); + } + deleteContentStmt.run(hash, model); + removed += rows.length; + } + + return removed; +} + // ============================================================================= // Query expansion // ============================================================================= @@ -3922,7 +3983,7 @@ export function findDocuments( // Status // ============================================================================= -export function getStatus(db: Database): IndexStatus { +export function getStatus(db: Database, model: string = DEFAULT_EMBED_MODEL): IndexStatus { // DB is source of truth for collections — config provides supplementary metadata const dbCollections = db.prepare(` SELECT @@ -3957,7 +4018,7 @@ export function getStatus(db: Database): IndexStatus { }); const totalDocs = (db.prepare(`SELECT COUNT(*) as c FROM documents WHERE active = 1`).get() as { c: number }).c; - const needsEmbedding = getHashesNeedingEmbedding(db); + const needsEmbedding = getHashesNeedingEmbedding(db, undefined, model); const hasVectors = !!db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get(); return { diff --git a/test/store.test.ts b/test/store.test.ts index 2adf717..9f132f8 100644 --- a/test/store.test.ts +++ b/test/store.test.ts @@ -2281,6 +2281,26 @@ describe("Index Status", () => { await cleanupTestDb(store); }); + test("embedding health is scoped to the active embed model", async () => { + const store = await createTestStore(); + const collectionName = await createTestCollection(); + const activeModel = "hf:active/embed-model.gguf"; + const staleModel = "hf:stale/embed-model.gguf"; + const now = new Date().toISOString(); + + store.llm = { embedModelName: activeModel } as any; + store.ensureVecTable(3); + await insertTestDocument(store.db, collectionName, { name: "doc1", hash: "hash1" }); + store.insertEmbedding("hash1", 0, 0, new Float32Array([1, 2, 3]), staleModel, now, 1); + + expect(store.getHashesNeedingEmbedding()).toBe(1); + expect(store.getStatus().needsEmbedding).toBe(1); + expect(store.getIndexHealth().needsEmbedding).toBe(1); + expect(store.getHashesNeedingEmbedding(staleModel)).toBe(0); + + await cleanupTestDb(store); + }); + test("getIndexHealth returns health info", async () => { const store = await createTestStore(); const collectionName = await createTestCollection(); @@ -3093,6 +3113,68 @@ describe("Embedding batching", () => { } }); + test("generateEmbeddings does not mark a partially embedded multi-chunk document complete", async () => { + const store = await createTestStore(); + const db = store.db; + const fakeLlm = { + async embed(_text: string, _options?: { model?: string }) { + return { embedding: [0.1, 0.2, 0.3], model: "fake-embed" }; + }, + async embedBatch(texts: string[], _options?: { model?: string }) { + return texts.map((_text, index) => index === 0 + ? { embedding: [1, 2, 3], model: "fake-embed" } + : null + ); + }, + }; + + setDefaultLlamaCpp(createFakeTokenizer() as any); + store.llm = fakeLlm as any; + + try { + await insertTestDocument(db, "docs", { + name: "long-doc", + body: "# Long doc\n\n" + "partial embedding regression ".repeat(260), + }); + + const result = await generateEmbeddings(store); + + expect(result.errors).toBeGreaterThan(0); + expect(db.prepare(`SELECT COUNT(*) as count FROM content_vectors`).get()).toEqual({ count: 0 }); + expect(db.prepare(`SELECT COUNT(*) as count FROM vectors_vec`).get()).toEqual({ count: 0 }); + expect(store.getHashesNeedingEmbedding()).toBe(1); + expect(store.getStatus().needsEmbedding).toBe(1); + } finally { + setDefaultLlamaCpp(null); + await cleanupTestDb(store); + } + }); + + test("generateEmbeddings opens a long-lived LLM session for embed runs", async () => { + const store = await createTestStore(); + const fakeLlm = createFakeEmbedLlm(); + const sessionSpy = vi.spyOn(llmModule, "withLLMSessionForLlm"); + + setDefaultLlamaCpp(createFakeTokenizer() as any); + store.llm = fakeLlm as any; + + try { + await insertTestDocument(store.db, "docs", { name: "one", body: "# One\n\nAlpha" }); + + await generateEmbeddings(store); + + expect(sessionSpy).toHaveBeenCalledWith( + fakeLlm, + expect.any(Function), + expect.objectContaining({ maxDuration: 30 * 60 * 1000, name: "generateEmbeddings" }), + ); + } finally { + sessionSpy.mockRestore(); + setDefaultLlamaCpp(null); + await cleanupTestDb(store); + } + }); + test("vectorSearchQuery uses the active llm embed model for vector lookups", async () => { const store = await createTestStore(); const model = "hf:Qwen/Qwen3-Embedding-0.6B-GGUF/Qwen3-Embedding-0.6B-Q8_0.gguf";