Merge remote-tracking branch 'origin/main' into feat/local-qmd-index-bench

# Conflicts: # src/cli/qmd.ts
2026-05-16 18:27:49 +00:00 · 2026-05-16 18:27:49 +00:00 · b2550d273a
commit b2550d273a
parent 2e0c74310c 87520252a5
12 changed files with 859 additions and 70 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -4,6 +4,23 @@

 ### Fixes

+- GPU: add `QMD_FORCE_CPU=1` / `--no-gpu` to bypass CUDA/Vulkan/Metal probing entirely, and route native llama.cpp stdout noise to stderr so JSON output stays parseable during search/query commands.
+- Snippet line numbers: `qmd_query` (MCP), HTTP `/query`, and `qmd query`
+  (CLI JSON output and snippet headers) now return absolute source-file
+  line numbers instead of chunk-local ones, so the `line` field can be
+  passed back to `qmd_get` as `fromLine` without a separate lookup.
+  Snippet selection remains scoped to the best matching chunk
+  (preserves #149).
+- CLI: `qmd query --full` now emits the full document body in all output
+  formats (json, csv, md, xml), restoring the documented behavior of the
+  flag. Previously it returned only the best matching chunk (~3.6KB max
+  per result). Output payload for `--full` queries is now proportional
+  to total document size.
+- macOS Metal: `qmd query --json` now flushes successful JSON output and uses a safe immediate-exit path on Darwin to avoid ggml Metal finalizer aborts; other commands still dispose LLM contexts/models before the llama runtime. #368
+- Embedding: require complete chunk coverage before treating a document as
+  embedded, remove partial vectors when chunk/session failures leave a
+  document incomplete, and keep `qmd status` pending counts honest after
+  interrupted long embed runs. #637 #378
 - Embedding: `qmd embed -c <collection>` now scopes pending-doc selection
  to the requested collection instead of embedding global pending work.
  Scoped `--force` clears only collection-owned vectors, preserves shared
@ -33,6 +50,9 @@
 - Packaging: install AST grammar WASM packages as required dependencies so
  Bun global installs include TypeScript/TSX/JavaScript grammars, and add a
  `smoke:package-grammars` verification command. #595
+- Launcher: add wrapper smoke coverage for scoped package, npm/npx,
+  Homebrew/Linuxbrew, Bun global symlink layouts, and `$BUN_INSTALL`
+  false-positive runtime selection regressions. #351 #353 #354 #356 #358 #359

 ## [2.1.0] - 2026-04-05

--- a/README.md
+++ b/README.md
@ -798,6 +798,7 @@ llm_cache       -- Cached LLM responses (query expansion, rerank scores)
 |----------|---------|-------------|
 | `XDG_CACHE_HOME` | `~/.cache` | Cache directory location |
 | `QMD_LLAMA_GPU` | `auto` | Force llama.cpp GPU backend (`metal`, `vulkan`, `cuda`) or disable GPU with `false` |
+| `QMD_FORCE_CPU` | unset | Set to `1`/`true` to force CPU mode before any CUDA/Vulkan/Metal probing. Equivalent CLI flag: `--no-gpu`. |
 | `QMD_EMBED_PARALLELISM` | automatic | Override embedding/reranking context parallelism (1-8). Windows CUDA defaults to `1` because parallel CUDA contexts can crash with `ggml-cuda.cu:98`; use Vulkan or raise this only if your driver is stable. |

 ## How It Works
--- a/src/cli/qmd.ts
+++ b/src/cli/qmd.ts
@ -212,6 +212,76 @@ const cursor = {
  show() { process.stderr.write('\x1b[?25h'); },
 };

+type CliLifecycleWritable = { // Minimal writable shape the CLI exit helpers depend on: a single write method with an optional completion callback.
+  write(chunk: string | Uint8Array, callback?: (error?: Error | null) => void): boolean;
+};
+
+type FinishSuccessfulCliCommandOptions = { // Inputs to finishSuccessfulCliCommand; only command is required, the rest are overridable defaults.
+  command: string; // CLI command name; compared against "query" when choosing the exit path
+  format?: OutputFormat; // output format; compared against "json" when choosing the exit path
+  cleanup?: () => Promise<void>; // teardown hook for the normal path; defaults to disposeDefaultLlamaCpp
+  exit?: (code: number) => void; // normal exit; defaults to a process.exit wrapper
+  immediateExit?: (code: number) => void; // exit used when cleanup is bypassed; defaults to immediateProcessExit
+  stdout?: CliLifecycleWritable; // flushed first; defaults to process.stdout
+  stderr?: CliLifecycleWritable; // receives the cleanup warning and is flushed before exit; defaults to process.stderr
+  platform?: NodeJS.Platform; // overrides process.platform in the darwin check
+};
+
+async function flushWritable(stream: CliLifecycleWritable): Promise<void> { // Resolves once the stream invokes the callback of an empty-string write.
+  await new Promise<void>((resolve) => {
+    stream.write("", () => resolve()); // the callback's error argument is ignored, so this promise never rejects through it
+  });
+}
+
+function shouldBypassNativeCleanup(options: FinishSuccessfulCliCommandOptions): boolean { // True only for the "query" command with "json" format on darwin, unless the opt-out env var is set.
+  return (
+    (options.platform ?? process.platform) === "darwin" && // an explicit platform option takes precedence over process.platform
+    options.command === "query" &&
+    options.format === "json" &&
+    process.env.QMD_DISABLE_DARWIN_QUERY_JSON_SAFE_EXIT !== "1" // only the exact value "1" disables the bypass
+  );
+}
+
+function immediateProcessExit(code: number): void { // Exits through process.reallyExit when the runtime exposes it, otherwise through process.exit.
+  const processWithReallyExit = process as NodeJS.Process & { reallyExit?: (code?: number) => void }; // cast adds an optional reallyExit member so it can be feature-tested below
+  if (typeof processWithReallyExit.reallyExit === "function") {
+    processWithReallyExit.reallyExit(code);
+    return; // NOTE(review): only reached if reallyExit returns; presumably it terminates the process - confirm
+  }
+  process.exit(code);
+}
+
+/**
+ * Finish a successful CLI command after output has been flushed. On macOS JSON
+ * query runs, skip normal native teardown and use Node/Bun's immediate exit path:
+ * ggml Metal can abort from C++ finalizers after valid JSON has already been
+ * produced (#368). This wrapper is only reached after the command completed, so
+ * real query failures still exit through the normal error path before this runs.
+ */
+export async function finishSuccessfulCliCommand(options: FinishSuccessfulCliCommandOptions): Promise<void> {
+  const stderr = options.stderr ?? process.stderr;
+  const exit = options.exit ?? ((code: number) => process.exit(code));
+  const immediateExit = options.immediateExit ?? immediateProcessExit;
+
+  await flushWritable(options.stdout ?? process.stdout); // stdout is flushed before any teardown or exit on both paths
+
+  if (shouldBypassNativeCleanup(options)) {
+    await flushWritable(stderr);
+    immediateExit(0); // the cleanup hook is deliberately not called on this path
+    return; // only reachable when immediateExit returns, e.g. an injected stub
+  }
+
+  try {
+    await (options.cleanup ?? disposeDefaultLlamaCpp)();
+  } catch (error) { // a cleanup failure is reported as a warning; the exit code below stays 0
+    stderr.write(
+      `QMD Warning: cleanup after successful output failed (${error instanceof Error ? error.message : String(error)}); exiting 0 because command output completed.\n`
+    );
+  }
+  await flushWritable(stderr); // lets any warning written above drain before exiting
+  exit(0);
+}
+
 // Ensure cursor is restored on exit
 process.on('SIGINT', () => { cursor.show(); process.exit(130); });
 process.on('SIGTERM', () => { cursor.show(); process.exit(143); });
@ -849,6 +919,7 @@ function getDocument(filename: string, fromLine?: number, maxLines?: number, lin
      inputPath = inputPath.slice(0, -colonMatch[0].length);
    }
  }
+  if (fromLine !== undefined) fromLine = Math.max(1, fromLine);

  const parsedIndexPath = isVirtualPath(inputPath) ? parseVirtualPath(inputPath) : null;
  if (parsedIndexPath?.indexName) {
@ -1740,7 +1811,7 @@ async function vectorIndex(
  }

  // Check if there's work to do before starting
-  const hashesToEmbed = getHashesNeedingEmbedding(db, batchOptions?.collection);
+  const hashesToEmbed = getHashesNeedingEmbedding(db, batchOptions?.collection, model);
  if (hashesToEmbed === 0 && !force) {
    console.log(`${c.green}✓ All content hashes already have embeddings.${c.reset}`);
    closeDb();
@ -1930,6 +2001,7 @@ type OutputRow = {
  score: number;
  context?: string | null;
  chunkPos?: number;
+  chunkLen?: number;
  hash?: string;
  docid?: string;
  explain?: HybridQueryExplain;
@ -2012,9 +2084,9 @@ function outputResults(results: OutputRow[], query: string, opts: OutputOptions)
    // JSON output for LLM consumption
    const output = filtered.map(row => {
      const docid = row.docid || (row.hash ? row.hash.slice(0, 6) : undefined);
+      const snippetInfo = extractSnippet(row.body, query, 300, row.chunkPos, row.chunkLen, opts.intent);
      let body = opts.full ? row.body : undefined;
-      const snippetInfo = !opts.full ? extractSnippet(row.body, query, 300, row.chunkPos, undefined, opts.intent) : undefined;
-      let snippet = snippetInfo?.snippet;
+      let snippet = !opts.full ? snippetInfo.snippet : undefined;
      if (opts.lineNumbers) {
        if (body) body = addLineNumbers(body);
        if (snippet) snippet = addLineNumbers(snippet);
@ -2023,7 +2095,7 @@ function outputResults(results: OutputRow[], query: string, opts: OutputOptions)
        ...(docid && { docid: `#${docid}` }),
        score: Math.round(row.score * 100) / 100,
        file: toQmdPath(row.displayPath),
-        ...(snippetInfo && { line: snippetInfo.line }),
+        line: snippetInfo.line,
        title: row.title,
        ...(row.context && { context: row.context }),
        ...(body && { body }),
@ -2046,7 +2118,7 @@ function outputResults(results: OutputRow[], query: string, opts: OutputOptions)
    for (let i = 0; i < filtered.length; i++) {
      const row = filtered[i];
      if (!row) continue;
-      const { line, snippet } = extractSnippet(row.body, query, 500, row.chunkPos, undefined, opts.intent);
+      const { line, snippet } = extractSnippet(row.body, query, 500, row.chunkPos, row.chunkLen, opts.intent);
      const docid = row.docid || (row.hash ? row.hash.slice(0, 6) : undefined);

      // Line 1: filepath with docid
@ -2110,8 +2182,9 @@ function outputResults(results: OutputRow[], query: string, opts: OutputOptions)
      console.log();

      // Snippet with highlighting (diff-style header included)
-      let displaySnippet = opts.lineNumbers ? addLineNumbers(snippet, line) : snippet;
-      const highlighted = highlightTerms(displaySnippet, query);
+      const content = opts.full ? row.body : snippet;
+      const displayContent = opts.lineNumbers ? addLineNumbers(content, opts.full ? 1 : line) : content;
+      const highlighted = highlightTerms(displayContent, query);
      console.log(highlighted);

      // Double empty line between results
@ -2123,7 +2196,7 @@ function outputResults(results: OutputRow[], query: string, opts: OutputOptions)
      if (!row) continue;
      const heading = row.title || row.displayPath;
      const docid = row.docid || (row.hash ? row.hash.slice(0, 6) : undefined);
-      let content = opts.full ? row.body : extractSnippet(row.body, query, 500, row.chunkPos, undefined, opts.intent).snippet;
+      let content = opts.full ? row.body : extractSnippet(row.body, query, 500, row.chunkPos, row.chunkLen, opts.intent).snippet;
      if (opts.lineNumbers) {
        content = addLineNumbers(content);
      }
@ -2136,7 +2209,7 @@ function outputResults(results: OutputRow[], query: string, opts: OutputOptions)
      const titleAttr = row.title ? ` title="${row.title.replace(/"/g, '&quot;')}"` : "";
      const contextAttr = row.context ? ` context="${row.context.replace(/"/g, '&quot;')}"` : "";
      const docid = row.docid || (row.hash ? row.hash.slice(0, 6) : "");
-      let content = opts.full ? row.body : extractSnippet(row.body, query, 500, row.chunkPos, undefined, opts.intent).snippet;
+      let content = opts.full ? row.body : extractSnippet(row.body, query, 500, row.chunkPos, row.chunkLen, opts.intent).snippet;
      if (opts.lineNumbers) {
        content = addLineNumbers(content);
      }
@ -2146,10 +2219,10 @@ function outputResults(results: OutputRow[], query: string, opts: OutputOptions)
    // CSV format
    console.log("docid,score,file,title,context,line,snippet");
    for (const row of filtered) {
-      const { line, snippet } = extractSnippet(row.body, query, 500, row.chunkPos, undefined, opts.intent);
+      const { line, snippet } = extractSnippet(row.body, query, 500, row.chunkPos, row.chunkLen, opts.intent);
      let content = opts.full ? row.body : snippet;
      if (opts.lineNumbers) {
-        content = addLineNumbers(content, line);
+        content = addLineNumbers(content, opts.full ? 1 : line);
      }
      const docid = row.docid || (row.hash ? row.hash.slice(0, 6) : "");
      const snippetText = content || "";
@ -2505,13 +2578,13 @@ async function querySearch(query: string, opts: OutputOptions, _embedModel: stri
      ? (structuredQueries.find(s => s.type === 'lex')?.query || structuredQueries.find(s => s.type === 'vec')?.query || query)
      : query;

-    // Map to CLI output format — use bestChunk for snippet display
    outputResults(results.map(r => ({
      file: r.file,
      displayPath: r.displayPath,
      title: r.title,
-      body: r.bestChunk,
+      body: r.body,
      chunkPos: r.bestChunkPos,
+      chunkLen: r.bestChunk.length,
      score: r.score,
      context: r.context,
      docid: r.docid,
@ -2567,6 +2640,7 @@ function parseCLI() {
      // Query options
      "candidate-limit": { type: "string", short: "C" },
      "no-rerank": { type: "boolean", default: false },
+      "no-gpu": { type: "boolean", default: false },
      intent: { type: "string" },
      // Chunking options
      "chunk-strategy": { type: "string" },  // "regex" (default) or "auto" (AST for code files)
@ -2579,6 +2653,10 @@ function parseCLI() {
    strict: false, // Allow unknown options to pass through
  });

+  if (values["no-gpu"]) {
+    process.env.QMD_FORCE_CPU = "1";
+  }
+
  // Select index name (default: "index"). If no explicit --index is supplied,
  // a project-local .qmd/index.yaml overrides the global config/cache paths.
  const indexName = values.index as string | undefined;
@ -2842,6 +2920,7 @@ function showHelp(): void {
  console.log("  --full                     - Output full document instead of snippet");
  console.log("  -C, --candidate-limit <n>  - Max candidates to rerank (default 40, lower = faster)");
  console.log("  --no-rerank                - Skip LLM reranking (use RRF scores only, much faster on CPU)");
+  console.log("  --no-gpu                   - Force CPU mode for llama.cpp operations (same as QMD_FORCE_CPU=1)");
  console.log("  --line-numbers             - Include line numbers in output");
  console.log("  --explain                  - Include retrieval score traces (query --json/CLI)");
  console.log("  --files | --json | --csv | --md | --xml  - Output format");
@ -3430,8 +3509,10 @@ if (isMain) {
  }

  if (cli.command !== "mcp") {
-    await disposeDefaultLlamaCpp();
-    process.exit(0);
+    await finishSuccessfulCliCommand({
+      command: cli.command,
+      format: cli.opts.format,
+    });
  }

 } // end if (main module)
--- a/src/llm.ts
+++ b/src/llm.ts
@ -22,10 +22,45 @@ type NodeLlamaCppModule = {

 let nodeLlamaCppImport: Promise<NodeLlamaCppModule> | null = null;
 async function loadNodeLlamaCpp(): Promise<NodeLlamaCppModule> {
-  nodeLlamaCppImport ??= import("node-llama-cpp") as Promise<NodeLlamaCppModule>;
+  nodeLlamaCppImport ??= withNativeStdoutRedirectedToStderr( // first call starts the import with stdout redirected; later calls reuse the cached promise
+    () => import("node-llama-cpp") as Promise<NodeLlamaCppModule>
+  );
   return nodeLlamaCppImport;
 }

+export function setNodeLlamaCppModuleForTest(module: NodeLlamaCppModule | null): void { // Test hook: replaces the cached node-llama-cpp import promise, or clears it when given null.
+  nodeLlamaCppImport = module ? Promise.resolve(module) : null; // null makes the next loadNodeLlamaCpp call import again
+  failedGpuInitModes.clear(); // also forgets GPU modes previously recorded as failed
+}
+
+type StdoutWrite = typeof process.stdout.write;
+let nativeStdoutRedirectDepth = 0; // number of withNativeStdoutRedirectedToStderr calls currently in flight
+let originalStdoutWrite: StdoutWrite | null = null; // bound original stdout.write while the redirect is installed; null otherwise
+
+/**
+ * Some node-llama-cpp native build/probe paths write library noise to stdout.
+ * JSON APIs must reserve stdout for machine-readable payloads, so route that
+ * noise to stderr while native llama initialization is in progress.
+ */
+export async function withNativeStdoutRedirectedToStderr<T>(fn: () => Promise<T>): Promise<T> {
+  if (nativeStdoutRedirectDepth === 0) { // only the outermost in-flight call installs the redirect
+    originalStdoutWrite = process.stdout.write.bind(process.stdout) as StdoutWrite;
+    process.stdout.write = ((chunk: any, encoding?: any, cb?: any) => { // NOTE(review): only process.stdout.write is replaced; output written straight to fd 1 would presumably bypass this - confirm
+      return process.stderr.write(chunk, encoding, cb as any); // arguments are forwarded unchanged
+    }) as StdoutWrite;
+  }
+  nativeStdoutRedirectDepth++;
+  try {
+    return await fn();
+  } finally { // runs whether fn resolves or rejects
+    nativeStdoutRedirectDepth--;
+    if (nativeStdoutRedirectDepth === 0 && originalStdoutWrite) { // restore once the last in-flight call has finished
+      process.stdout.write = originalStdoutWrite;
+      originalStdoutWrite = null;
+    }
+  }
+}
+
 import { homedir } from "os";
 import { join } from "path";
 import { existsSync, mkdirSync, statSync, unlinkSync, readdirSync, readFileSync, writeFileSync, openSync, readSync, closeSync } from "fs";
@ -487,7 +522,15 @@ export function resolveSafeParallelism(options: ParallelismOptions): number {
  return Math.max(1, options.computed);
 }

-export function resolveLlamaGpuMode(envValue = process.env.QMD_LLAMA_GPU): LlamaGpuMode {
+export function resolveLlamaGpuMode(
+  envValue = process.env.QMD_LLAMA_GPU,
+  forceCpuValue = process.env.QMD_FORCE_CPU
+): LlamaGpuMode {
+  const forceCpu = forceCpuValue?.trim().toLowerCase() ?? "";
+  if (forceCpu && !["false", "off", "none", "disable", "disabled", "0"].includes(forceCpu)) {
+    return false;
+  }
+
  const normalized = envValue?.trim().toLowerCase() ?? "";
  if (!normalized) return "auto";
  if (["false", "off", "none", "disable", "disabled", "0"].includes(normalized)) return false;
@ -497,6 +540,23 @@ export function resolveLlamaGpuMode(envValue = process.env.QMD_LLAMA_GPU): Llama
  return "auto";
 }

+async function disposeWithTimeout(resourceName: string, dispose: () => Promise<void>, timeoutMs = 1000): Promise<void> { // Runs dispose and waits at most timeoutMs for it; a timeout or a dispose error is reported on stderr instead of being thrown.
+  const timeoutPromise = new Promise<"timeout">((resolve) => {
+    setTimeout(() => resolve("timeout"), timeoutMs).unref(); // unref so a pending timer does not by itself keep the process alive
+  });
+
+  try {
+    const result = await Promise.race([dispose(), timeoutPromise]);
+    if (result === "timeout") { // dispose() is not cancelled here; it is simply no longer awaited
+      process.stderr.write(`QMD Warning: timed out disposing ${resourceName}; continuing shutdown.\n`);
+    }
+  } catch (error) { // covers a synchronous throw from dispose() as well as a rejected promise
+    process.stderr.write(
+      `QMD Warning: failed to dispose ${resourceName} (${error instanceof Error ? error.message : String(error)}); continuing shutdown.\n`
+    );
+  }
+}
+
 function resolveExpandContextSize(configValue?: number): number {
  if (configValue !== undefined) {
    if (!Number.isInteger(configValue) || configValue <= 0) {
@ -518,6 +578,8 @@ function resolveExpandContextSize(configValue?: number): number {
  return parsed;
 }

+const failedGpuInitModes = new Set<LlamaGpuMode>(); // GPU modes whose init threw; module-level, so the record is shared across LlamaCpp instances in this process
+
 export class LlamaCpp implements LLM {
  private readonly _ciMode = !!process.env.CI;
  private llama: Llama | null = null;
@ -668,22 +730,29 @@ export class LlamaCpp implements LLM {

      const { getLlama, LlamaLogLevel } = await loadNodeLlamaCpp();
      const loadLlama = async (gpu: LlamaGpuMode) =>
-        await getLlama({
+        await withNativeStdoutRedirectedToStderr(() => getLlama({
          build: allowBuild ? "autoAttempt" : "never",
          logLevel: LlamaLogLevel.error,
          gpu,
          skipDownload: !allowBuild,
-        });
+        }));

      let llama: Llama;
-      if (gpuMode === false) {
+      if (gpuMode === false || failedGpuInitModes.has(gpuMode)) {
+        if (gpuMode !== false && failedGpuInitModes.has(gpuMode)) {
+          process.stderr.write(
+            `QMD Warning: skipping previously failed GPU init${gpuMode === "auto" ? "" : ` for QMD_LLAMA_GPU=${gpuMode}`}, using CPU.\n`
+          );
+        }
        llama = await loadLlama(false);
      } else {
        try {
          llama = await loadLlama(gpuMode);
        } catch (err) {
-          // GPU backend (e.g. Vulkan on headless/driverless machines) can throw at init.
-          // Fall back to CPU so qmd still works.
+          // GPU backend (e.g. Vulkan/CUDA on headless/driverless machines) can throw at init.
+          // Fall back to CPU so qmd still works, and cache the failure to avoid repeated
+          // expensive native build/probe attempts in this process.
+          failedGpuInitModes.add(gpuMode);
          process.stderr.write(
            `QMD Warning: GPU init failed${gpuMode === "auto" ? "" : ` for QMD_LLAMA_GPU=${gpuMode}`} (${err instanceof Error ? err.message : String(err)}), falling back to CPU.\n`
          );
@ -1413,22 +1482,37 @@ export class LlamaCpp implements LLM {
      this.inactivityTimer = null;
    }

-    // Disposing llama cascades to models and contexts automatically
-    // See: https://node-llama-cpp.withcat.ai/guide/objects-lifecycle
-    // Note: llama.dispose() can hang indefinitely, so we use a timeout
-    if (this.llama) {
-      const disposePromise = this.llama.dispose();
-      const timeoutPromise = new Promise<void>((resolve) => setTimeout(resolve, 1000));
-      await Promise.race([disposePromise, timeoutPromise]);
+    // Explicitly dispose in dependency order: contexts first, then models, then llama.
+    // Relying only on llama.dispose() leaves Metal resource sets alive until process
+    // finalization on Apple Silicon, where ggml_metal_device_free can abort after
+    // otherwise-successful CLI output (#368).
+    for (const ctx of this.embedContexts) {
+      await disposeWithTimeout("embedding context", () => ctx.dispose());
+    }
+    this.embedContexts = [];
+
+    for (const ctx of this.rerankContexts) {
+      await disposeWithTimeout("rerank context", () => ctx.dispose());
+    }
+    this.rerankContexts = [];
+
+    if (this.embedModel) {
+      await disposeWithTimeout("embedding model", () => this.embedModel!.dispose());
+      this.embedModel = null;
+    }
+    if (this.generateModel) {
+      await disposeWithTimeout("generation model", () => this.generateModel!.dispose());
+      this.generateModel = null;
+    }
+    if (this.rerankModel) {
+      await disposeWithTimeout("rerank model", () => this.rerankModel!.dispose());
+      this.rerankModel = null;
    }

-    // Clear references
-    this.embedContexts = [];
-    this.rerankContexts = [];
-    this.embedModel = null;
-    this.generateModel = null;
-    this.rerankModel = null;
-    this.llama = null;
+    if (this.llama) {
+      await disposeWithTimeout("llama runtime", () => this.llama!.dispose());
+      this.llama = null;
+    }

    // Clear any in-flight load/create promises
    this.embedModelLoadPromise = null;
--- a/src/mcp/server.ts
+++ b/src/mcp/server.ts
@ -42,6 +42,7 @@ type SearchResultItem = {
  title: string;
  score: number;
  context: string | null;
+  line: number;   // Absolute line in source markdown
  snippet: string;
 };

@ -239,6 +240,8 @@ async function createMcpServer(store: QMDStore): Promise<McpServer> {
      title: "Query",
      description: `Search the knowledge base using a query document — one or more typed sub-queries combined for best recall.

+Each result includes a \`line\` field with the absolute 1-indexed line of the best match in the source markdown. To read more context around a hit, call \`get(file, fromLine = max(1, line - 20), maxLines = 80, lineNumbers = true)\`.
+
 ## Query Types

 **lex** — BM25 keyword search. Fast, exact, no LLM needed.
@ -339,13 +342,14 @@ Intent-aware lex (C++ performance, not sports):
        || searches[0]?.query || "";

      const filtered: SearchResultItem[] = results.map(r => {
-        const { line, snippet } = extractSnippet(r.bestChunk, primaryQuery, 300, undefined, undefined, intent);
+        const { line, snippet } = extractSnippet(r.body, primaryQuery, 300, r.bestChunkPos, r.bestChunk.length, intent);
        return {
          docid: `#${r.docid}`,
          file: r.displayPath,
          title: r.title,
          score: Math.round(r.score * 100) / 100,
          context: r.context,
+          line,
          snippet: addLineNumbers(snippet, line),
        };
      });
@ -383,6 +387,7 @@ Intent-aware lex (C++ performance, not sports):
        parsedFromLine = parseInt(colonMatch[1], 10);
        lookup = lookup.slice(0, -colonMatch[0].length);
      }
+      if (parsedFromLine !== undefined) parsedFromLine = Math.max(1, parsedFromLine);

      const result = await store.get(lookup, { includeBody: false });

@ -701,13 +706,14 @@ export async function startMcpHttpServer(
          || params.searches[0]?.query || "";

        const formatted = results.map(r => {
-          const { line, snippet } = extractSnippet(r.bestChunk, primaryQuery, 300);
+          const { line, snippet } = extractSnippet(r.body, primaryQuery, 300, r.bestChunkPos, r.bestChunk.length, params.intent);
          return {
            docid: `#${r.docid}`,
            file: r.displayPath,
            title: r.title,
            score: Math.round(r.score * 100) / 100,
            context: r.context,
+            line,
            snippet: addLineNumbers(snippet, line),
          };
        });
--- a/src/store.ts
+++ b/src/store.ts
@ -871,10 +871,15 @@ function initializeDatabase(db: Database): void {
      seq INTEGER NOT NULL DEFAULT 0,
      pos INTEGER NOT NULL DEFAULT 0,
      model TEXT NOT NULL,
+      total_chunks INTEGER NOT NULL DEFAULT 1,
      embedded_at TEXT NOT NULL,
      PRIMARY KEY (hash, seq)
    )
  `);
+  const cvInfoAfterCreate = db.prepare(`PRAGMA table_info(content_vectors)`).all() as { name: string }[];
+  if (!cvInfoAfterCreate.some(col => col.name === 'total_chunks')) {
+    db.exec(`ALTER TABLE content_vectors ADD COLUMN total_chunks INTEGER NOT NULL DEFAULT 1`);
+  }

  // Store collections — makes the DB self-contained (no external config needed)
  db.exec(`
@ -1167,9 +1172,9 @@ export type Store = {
  ensureVecTable: (dimensions: number) => void;

  // Index health
-  getHashesNeedingEmbedding: () => number;
-  getIndexHealth: () => IndexHealthInfo;
-  getStatus: () => IndexStatus;
+  getHashesNeedingEmbedding: (model?: string) => number;
+  getIndexHealth: (model?: string) => IndexHealthInfo;
+  getStatus: (model?: string) => IndexStatus;

  // Caching
  getCacheKey: typeof getCacheKey;
@ -1229,7 +1234,7 @@ export type Store = {
  // Vector/embedding operations
  getHashesForEmbedding: () => { hash: string; body: string; path: string }[];
  clearAllEmbeddings: () => void;
-  insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string) => void;
+  insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string, totalChunks?: number) => void;
 };

 // =============================================================================
@ -1420,18 +1425,31 @@ function resolveEmbedOptions(options?: EmbedOptions): Required<Pick<EmbedOptions
  };
 }

-function getPendingEmbeddingDocs(db: Database, collection?: string): PendingEmbeddingDoc[] {
+function contentVectorExpectedChunksExpr(db: Database): string { // Returns the SQL expression for expected chunks per hash: 'MAX(total_chunks)' when content_vectors has that column, otherwise the literal '1'.
+  const columns = db.prepare(`PRAGMA table_info(content_vectors)`).all() as { name: string }[]; // rows are read only for their name field
+  return columns.some(col => col.name === 'total_chunks') ? 'MAX(total_chunks)' : '1';
+}
+
+function getPendingEmbeddingDocs(db: Database, collection?: string, model: string = DEFAULT_EMBED_MODEL): PendingEmbeddingDoc[] { // Lists active documents with no vector rows for model, or fewer rows than expected; optionally limited to one collection.
  const collectionFilter = collection ? `AND d.collection = ?` : ``;
+  const expectedChunksExpr = contentVectorExpectedChunksExpr(db); // interpolated into the SQL below; one of two fixed strings, never caller input
  const stmt = db.prepare(`
    SELECT d.hash, MIN(d.path) as path, length(CAST(c.doc AS BLOB)) as bytes
    FROM documents d
    JOIN content c ON d.hash = c.hash
-    LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
-    WHERE d.active = 1 AND v.hash IS NULL ${collectionFilter}
+    LEFT JOIN (
+      SELECT hash, model, COUNT(*) AS chunk_count, ${expectedChunksExpr} AS expected_chunks
+      FROM content_vectors
+      WHERE model = ?
+      GROUP BY hash, model
+    ) v ON d.hash = v.hash
+    WHERE d.active = 1
+      AND (v.hash IS NULL OR v.chunk_count < v.expected_chunks)
+      ${collectionFilter}
    GROUP BY d.hash
    ORDER BY MIN(d.path)
  `);
-  return (collection ? stmt.all(collection) : stmt.all()) as PendingEmbeddingDoc[];
+  return (collection ? stmt.all(model, collection) : stmt.all(model)) as PendingEmbeddingDoc[]; // bind order follows the placeholders: model in the subquery first, then collection
 }

 function buildEmbeddingBatches(
@ -1502,7 +1520,7 @@ export async function generateEmbeddings(
    clearAllEmbeddings(db, options?.collection);
  }

-  const docsToEmbed = getPendingEmbeddingDocs(db, options?.collection);
+  const docsToEmbed = getPendingEmbeddingDocs(db, options?.collection, model);

  if (docsToEmbed.length === 0) {
    return { docsProcessed: 0, chunksEmbedded: 0, errors: 0, durationMs: 0 };
@ -1533,6 +1551,7 @@ export async function generateEmbeddings(

      const batchDocs = getEmbeddingDocsForBatch(db, batchMeta);
      const batchChunks: ChunkItem[] = [];
+      const expectedChunksByHash = new Map<string, number>();
      const batchBytes = batchMeta.reduce((sum, doc) => sum + Math.max(0, doc.bytes), 0);

      for (const doc of batchDocs) {
@ -1558,6 +1577,7 @@ export async function generateEmbeddings(
            bytes: encoder.encode(chunks[seq]!.text).length,
          });
        }
+        expectedChunksByHash.set(doc.hash, chunks.length);
      }

      totalChunks += batchChunks.length;
@ -1610,7 +1630,7 @@ export async function generateEmbeddings(
            const chunk = chunkBatch[i]!;
            const embedding = embeddings[i];
            if (embedding) {
-              insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embedding.embedding), model, now);
+              insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embedding.embedding), model, now, expectedChunksByHash.get(chunk.hash) ?? 1);
              chunksEmbedded++;
            } else {
              errors++;
@ -1629,7 +1649,7 @@ export async function generateEmbeddings(
                const text = formatDocForEmbedding(chunk.text, chunk.title, embedModelUri);
                const result = await session.embed(text, { model });
                if (result) {
-                  insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), model, now);
+                  insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), model, now, expectedChunksByHash.get(chunk.hash) ?? 1);
                  chunksEmbedded++;
                } else {
                  errors++;
@ -1654,6 +1674,11 @@ export async function generateEmbeddings(
        });
      }

+      const removedPartialChunks = removeIncompleteEmbeddings(db, expectedChunksByHash, model);
+      if (removedPartialChunks > 0) {
+        chunksEmbedded = Math.max(0, chunksEmbedded - removedPartialChunks);
+      }
+
      bytesProcessed += batchBytes;
      options?.onProgress?.({ chunksEmbedded, totalChunks, bytesProcessed, totalBytes, errors });
    }
@ -1688,9 +1713,9 @@ export function createStore(dbPath?: string): Store {
    ensureVecTable: (dimensions: number) => ensureVecTableInternal(db, dimensions),

    // Index health
-    getHashesNeedingEmbedding: () => getHashesNeedingEmbedding(db),
-    getIndexHealth: () => getIndexHealth(db),
-    getStatus: () => getStatus(db),
+    getHashesNeedingEmbedding: (model?: string) => getHashesNeedingEmbedding(db, undefined, model ?? store.llm?.embedModelName ?? DEFAULT_EMBED_MODEL),
+    getIndexHealth: (model?: string) => getIndexHealth(db, model ?? store.llm?.embedModelName ?? DEFAULT_EMBED_MODEL),
+    getStatus: (model?: string) => getStatus(db, model ?? store.llm?.embedModelName ?? DEFAULT_EMBED_MODEL),

    // Caching
    getCacheKey,
@ -1750,7 +1775,7 @@ export function createStore(dbPath?: string): Store {
    // Vector/embedding operations
    getHashesForEmbedding: () => getHashesForEmbedding(db),
    clearAllEmbeddings: () => clearAllEmbeddings(db),
-    insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string) => insertEmbedding(db, hash, seq, pos, embedding, model, embeddedAt),
+    insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string, totalChunks?: number) => insertEmbedding(db, hash, seq, pos, embedding, model, embeddedAt, totalChunks),
  };

  return store;
@ -1949,15 +1974,23 @@ export type IndexStatus = {
 // Index health
 // =============================================================================

-export function getHashesNeedingEmbedding(db: Database, collection?: string): number {
+export function getHashesNeedingEmbedding(db: Database, collection?: string, model: string = DEFAULT_EMBED_MODEL): number {
  const collectionFilter = collection ? `AND d.collection = ?` : ``;
+  const expectedChunksExpr = contentVectorExpectedChunksExpr(db);
  const stmt = db.prepare(`
    SELECT COUNT(DISTINCT d.hash) as count
    FROM documents d
-    LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
-    WHERE d.active = 1 AND v.hash IS NULL ${collectionFilter}
+    LEFT JOIN (
+      SELECT hash, model, COUNT(*) AS chunk_count, ${expectedChunksExpr} AS expected_chunks
+      FROM content_vectors
+      WHERE model = ?
+      GROUP BY hash, model
+    ) v ON d.hash = v.hash
+    WHERE d.active = 1
+      AND (v.hash IS NULL OR v.chunk_count < v.expected_chunks)
+      ${collectionFilter}
  `);
-  const result = (collection ? stmt.get(collection) : stmt.get()) as { count: number };
+  const result = (collection ? stmt.get(model, collection) : stmt.get(model)) as { count: number };
  return result.count;
 }

@ -1967,8 +2000,8 @@ export type IndexHealthInfo = {
  daysStale: number | null;
 };

-export function getIndexHealth(db: Database): IndexHealthInfo {
-  const needsEmbedding = getHashesNeedingEmbedding(db);
+export function getIndexHealth(db: Database, model: string = DEFAULT_EMBED_MODEL): IndexHealthInfo {
+  const needsEmbedding = getHashesNeedingEmbedding(db, undefined, model);
  const totalDocs = (db.prepare(`SELECT COUNT(*) as count FROM documents WHERE active = 1`).get() as { count: number }).count;

  const mostRecent = db.prepare(`SELECT MAX(modified_at) as latest FROM documents WHERE active = 1`).get() as { latest: string | null };
@ -3316,15 +3349,22 @@ async function getEmbedding(text: string, model: string, isQuery: boolean, sessi
 * Get all unique content hashes that need embeddings (from active documents).
 * Returns hash, document body, and a sample path for display purposes.
 */
-export function getHashesForEmbedding(db: Database): { hash: string; body: string; path: string }[] {
+export function getHashesForEmbedding(db: Database, model: string = DEFAULT_EMBED_MODEL): { hash: string; body: string; path: string }[] {
+  const expectedChunksExpr = contentVectorExpectedChunksExpr(db);
  return db.prepare(`
    SELECT d.hash, c.doc as body, MIN(d.path) as path
    FROM documents d
    JOIN content c ON d.hash = c.hash
-    LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
-    WHERE d.active = 1 AND v.hash IS NULL
+    LEFT JOIN (
+      SELECT hash, model, COUNT(*) AS chunk_count, ${expectedChunksExpr} AS expected_chunks
+      FROM content_vectors
+      WHERE model = ?
+      GROUP BY hash, model
+    ) v ON d.hash = v.hash
+    WHERE d.active = 1
+      AND (v.hash IS NULL OR v.chunk_count < v.expected_chunks)
    GROUP BY d.hash
-  `).all() as { hash: string; body: string; path: string }[];
+  `).all(model) as { hash: string; body: string; path: string }[];
 }

 /**
@ -3409,13 +3449,14 @@ export function insertEmbedding(
  pos: number,
  embedding: Float32Array,
  model: string,
-  embeddedAt: string
+  embeddedAt: string,
+  totalChunks: number = 1
 ): void {
  const hashSeq = `${hash}_${seq}`;

  // Insert content_vectors first — crash-safe ordering (see getHashesForEmbedding)
-  const insertContentVectorStmt = db.prepare(`INSERT OR REPLACE INTO content_vectors (hash, seq, pos, model, embedded_at) VALUES (?, ?, ?, ?, ?)`);
-  insertContentVectorStmt.run(hash, seq, pos, model, embeddedAt);
+  const insertContentVectorStmt = db.prepare(`INSERT OR REPLACE INTO content_vectors (hash, seq, pos, model, total_chunks, embedded_at) VALUES (?, ?, ?, ?, ?, ?)`);
+  insertContentVectorStmt.run(hash, seq, pos, model, totalChunks, embeddedAt);

  // vec0 virtual tables don't support OR REPLACE — use DELETE + INSERT
  const deleteVecStmt = db.prepare(`DELETE FROM vectors_vec WHERE hash_seq = ?`);
@ -3424,6 +3465,26 @@ export function insertEmbedding(
  insertVecStmt.run(hashSeq, embedding);
 }

+function removeIncompleteEmbeddings(db: Database, expectedChunksByHash: Map<string, number>, model: string): number {
+  let removed = 0;
+  const rowsStmt = db.prepare(`SELECT seq FROM content_vectors WHERE hash = ? AND model = ?`);
+  const deleteContentStmt = db.prepare(`DELETE FROM content_vectors WHERE hash = ? AND model = ?`);
+  const deleteVecStmt = db.prepare(`DELETE FROM vectors_vec WHERE hash_seq = ?`);
+
+  for (const [hash, expectedChunks] of expectedChunksByHash) {
+    const rows = rowsStmt.all(hash, model) as { seq: number }[];
+    if (rows.length === 0 || rows.length === expectedChunks) continue;
+
+    for (const row of rows) {
+      deleteVecStmt.run(`${hash}_${row.seq}`);
+    }
+    deleteContentStmt.run(hash, model);
+    removed += rows.length;
+  }
+
+  return removed;
+}
+
 // =============================================================================
 // Query expansion
 // =============================================================================
@ -3800,7 +3861,7 @@ export function getDocumentBody(db: Database, doc: DocumentResult | { filepath:
  let body = row.body;
  if (fromLine !== undefined || maxLines !== undefined) {
    const lines = body.split('\n');
-    const start = (fromLine || 1) - 1;
+    const start = Math.max(0, (fromLine || 1) - 1);
    const end = maxLines !== undefined ? start + maxLines : lines.length;
    body = lines.slice(start, end).join('\n');
  }
@ -3922,7 +3983,7 @@ export function findDocuments(
 // Status
 // =============================================================================

-export function getStatus(db: Database): IndexStatus {
+export function getStatus(db: Database, model: string = DEFAULT_EMBED_MODEL): IndexStatus {
  // DB is source of truth for collections — config provides supplementary metadata
  const dbCollections = db.prepare(`
    SELECT
@ -3957,7 +4018,7 @@ export function getStatus(db: Database): IndexStatus {
  });

  const totalDocs = (db.prepare(`SELECT COUNT(*) as c FROM documents WHERE active = 1`).get() as { c: number }).c;
-  const needsEmbedding = getHashesNeedingEmbedding(db);
+  const needsEmbedding = getHashesNeedingEmbedding(db, undefined, model);
  const hasVectors = !!db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();

  return {
@ -4023,7 +4084,7 @@ export function extractSnippet(body: string, query: string, maxLen = 500, chunkP
  let searchBody = body;
  let lineOffset = 0;

-  if (chunkPos && chunkPos > 0) {
+  if (chunkPos !== undefined && chunkPos >= 0) {
    // Search within the chunk region, with some padding for context
    // Use provided chunkLen or fall back to max chunk size (covers variable-length chunks)
    const searchLen = chunkLen || CHUNK_SIZE_CHARS;
@ -4055,6 +4116,23 @@ export function extractSnippet(body: string, query: string, maxLen = 500, chunkP
    }
  }

+  if (chunkPos !== undefined && chunkPos >= 0 && bestScore <= 0) {
+    if (chunkPos === 0) {
+      // chunkPos=0 may be the chunk selector's initialization default for queries
+      // where lexical chunk scoring found no winner (e.g. tokens filtered to empty
+      // by the length>2 guard). Retry with full body so the real match isn't missed.
+      return extractSnippet(body, query, maxLen, undefined, undefined, intent);
+    }
+    // For chunkPos > 0 the reranker actively picked this chunk. Tokens failing to
+    // match literally is most likely a tokenizer limitation (quoted phrases, FTS5
+    // syntax, HYDE passages, semantic hits), so anchor on the chunk start rather
+    // than disregarding the reranker's pick.
+    const contextStart = Math.max(0, chunkPos - 100);
+    bestLine = chunkPos > contextStart
+      ? searchBody.slice(0, chunkPos - contextStart).split('\n').length - 1
+      : 0;
+  }
+
  const start = Math.max(0, bestLine - 1);
  const end = Math.min(lines.length, bestLine + 3);
  const snippetLines = lines.slice(start, end);
--- a/test/bin-wrapper.test.ts
+++ b/test/bin-wrapper.test.ts
@ -0,0 +1,164 @@
+import { afterEach, describe, expect, test } from "vitest";
+import { chmodSync, copyFileSync, mkdtempSync, mkdirSync, readFileSync, realpathSync, rmSync, symlinkSync, writeFileSync } from "node:fs";
+import { tmpdir } from "node:os";
+import { dirname, join, relative } from "node:path";
+import { execFileSync } from "node:child_process";
+import { fileURLToPath } from "node:url";
+
+const repoRoot = fileURLToPath(new URL("..", import.meta.url));
+const fixtures: string[] = [];
+
+function makeTempFixture() {
+  const root = mkdtempSync(join(tmpdir(), "qmd-bin-wrapper-"));
+  fixtures.push(root);
+  const capturePath = join(root, "capture.txt");
+  const runtimeBin = join(root, "runtime-bin");
+  mkdirSync(runtimeBin, { recursive: true });
+
+  for (const runtime of ["node", "bun"]) {
+    const runtimePath = join(runtimeBin, runtime);
+    writeFileSync(
+      runtimePath,
+      `#!/bin/sh\n{\n  printf '%s\\n' '${runtime}'\n  printf '%s\\n' "$1"\n  shift\n  printf '%s\\n' "$@"\n} > "$QMD_WRAPPER_CAPTURE"\n`,
+    );
+    chmodSync(runtimePath, 0o755);
+  }
+
+  return { root, capturePath, runtimeBin };
+}
+
+function makePackage(root: string, packagePath: string, lockfiles: string[] = []) {
+  const packageRoot = join(root, packagePath);
+  mkdirSync(join(packageRoot, "bin"), { recursive: true });
+  mkdirSync(join(packageRoot, "dist", "cli"), { recursive: true });
+  copyFileSync(join(repoRoot, "bin", "qmd"), join(packageRoot, "bin", "qmd"));
+  chmodSync(join(packageRoot, "bin", "qmd"), 0o755);
+  writeFileSync(join(packageRoot, "dist", "cli", "qmd.js"), "// fixture\n");
+  for (const lockfile of lockfiles) {
+    writeFileSync(join(packageRoot, lockfile), "");
+  }
+  return packageRoot;
+}
+
+function symlinkRelative(target: string, linkPath: string) {
+  mkdirSync(dirname(linkPath), { recursive: true });
+  symlinkSync(relative(dirname(linkPath), target), linkPath);
+}
+
+function runWrapper(commandPath: string, runtimeBin: string, capturePath: string, env: Record<string, string> = {}) {
+  rmSync(capturePath, { force: true });
+  execFileSync(commandPath, ["--version"], {
+    env: {
+      ...process.env,
+      ...env,
+      PATH: `${runtimeBin}:${process.env.PATH ?? ""}`,
+      QMD_WRAPPER_CAPTURE: capturePath,
+    },
+    stdio: ["ignore", "pipe", "pipe"],
+  });
+  const [runtime, scriptPath, ...args] = readFileSync(capturePath, "utf8").trimEnd().split("\n");
+  return { runtime, scriptPath, args };
+}
+
+afterEach(() => {
+  for (const fixture of fixtures.splice(0)) {
+    rmSync(fixture, { recursive: true, force: true });
+  }
+});
+
+describe("bin/qmd package wrapper", () => {
+  test("direct package invocation resolves dist/cli/qmd.js from the package root", () => {
+    const { root, runtimeBin, capturePath } = makeTempFixture();
+    const packageRoot = makePackage(root, "node_modules/@tobilu/qmd");
+
+    const result = runWrapper(join(packageRoot, "bin", "qmd"), runtimeBin, capturePath);
+
+    expect(result.runtime).toBe("node");
+    expect(result.scriptPath).toBe(realpathSync(join(packageRoot, "dist", "cli", "qmd.js")));
+    expect(result.args).toEqual(["--version"]);
+  });
+
+  test("npm/Homebrew global bin symlink resolves scoped package path", () => {
+    const { root, runtimeBin, capturePath } = makeTempFixture();
+    const packageRoot = makePackage(root, "opt/homebrew/lib/node_modules/@tobilu/qmd");
+    const globalBin = join(root, "opt", "homebrew", "bin", "qmd");
+    symlinkRelative(join(packageRoot, "bin", "qmd"), globalBin);
+
+    const result = runWrapper(globalBin, runtimeBin, capturePath);
+
+    expect(result.runtime).toBe("node");
+    expect(result.scriptPath).toBe(realpathSync(join(packageRoot, "dist", "cli", "qmd.js")));
+  });
+
+  test("multi-hop global bin symlink chain resolves to the real package root", () => {
+    const { root, runtimeBin, capturePath } = makeTempFixture();
+    const packageRoot = makePackage(root, "opt/homebrew/lib/node_modules/@tobilu/qmd");
+    const globalBin = join(root, "opt", "homebrew", "bin", "qmd");
+    const shim = join(root, "opt", "homebrew", "Cellar", "qmd", "current", "bin", "qmd");
+    symlinkRelative(join(packageRoot, "bin", "qmd"), shim);
+    symlinkRelative(shim, globalBin);
+
+    const result = runWrapper(globalBin, runtimeBin, capturePath);
+
+    expect(result.runtime).toBe("node");
+    expect(result.scriptPath).toBe(realpathSync(join(packageRoot, "dist", "cli", "qmd.js")));
+  });
+
+  test("linuxbrew global bin symlink resolves lib/node_modules scoped package path", () => {
+    const { root, runtimeBin, capturePath } = makeTempFixture();
+    const packageRoot = makePackage(root, "home/linuxbrew/.linuxbrew/lib/node_modules/@tobilu/qmd");
+    const globalBin = join(root, "home", "linuxbrew", ".linuxbrew", "bin", "qmd");
+    symlinkRelative(join(packageRoot, "bin", "qmd"), globalBin);
+
+    const result = runWrapper(globalBin, runtimeBin, capturePath);
+
+    expect(result.runtime).toBe("node");
+    expect(result.scriptPath).toBe(realpathSync(join(packageRoot, "dist", "cli", "qmd.js")));
+  });
+
+  test("npx scoped package .bin symlink resolves @tobilu/qmd package path", () => {
+    const { root, runtimeBin, capturePath } = makeTempFixture();
+    const packageRoot = makePackage(root, "npm/_npx/abc123/node_modules/@tobilu/qmd");
+    const npxBin = join(root, "npm", "_npx", "abc123", "node_modules", ".bin", "qmd");
+    symlinkRelative(join(packageRoot, "bin", "qmd"), npxBin);
+
+    const result = runWrapper(npxBin, runtimeBin, capturePath);
+
+    expect(result.runtime).toBe("node");
+    expect(result.scriptPath).toBe(realpathSync(join(packageRoot, "dist", "cli", "qmd.js")));
+  });
+
+  test("bun global symlink uses bun when package-local bun lockfile exists", () => {
+    const { root, runtimeBin, capturePath } = makeTempFixture();
+    const packageRoot = makePackage(root, "home/user/.bun/install/global/node_modules/@tobilu/qmd", ["bun.lock"]);
+    const bunBin = join(root, "home", "user", ".bun", "bin", "qmd");
+    symlinkRelative(join(packageRoot, "bin", "qmd"), bunBin);
+
+    const result = runWrapper(bunBin, runtimeBin, capturePath);
+
+    expect(result.runtime).toBe("bun");
+    expect(result.scriptPath).toBe(realpathSync(join(packageRoot, "dist", "cli", "qmd.js")));
+  });
+
+  test("ambient BUN_INSTALL alone does not select bun for an npm-installed package", () => {
+    const { root, runtimeBin, capturePath } = makeTempFixture();
+    const packageRoot = makePackage(root, "opt/homebrew/lib/node_modules/@tobilu/qmd");
+    const globalBin = join(root, "opt", "homebrew", "bin", "qmd");
+    symlinkRelative(join(packageRoot, "bin", "qmd"), globalBin);
+
+    const result = runWrapper(globalBin, runtimeBin, capturePath, { BUN_INSTALL: join(root, ".bun") });
+
+    expect(result.runtime).toBe("node");
+    expect(result.scriptPath).toBe(realpathSync(join(packageRoot, "dist", "cli", "qmd.js")));
+  });
+
+  test("package-lock.json takes priority over bun lockfiles", () => {
+    const { root, runtimeBin, capturePath } = makeTempFixture();
+    const packageRoot = makePackage(root, "node_modules/@tobilu/qmd", ["package-lock.json", "bun.lock"]);
+
+    const result = runWrapper(join(packageRoot, "bin", "qmd"), runtimeBin, capturePath);
+
+    expect(result.runtime).toBe("node");
+    expect(result.scriptPath).toBe(realpathSync(join(packageRoot, "dist", "cli", "qmd.js")));
+  });
+});
--- a/test/cli-exit-lifecycle.test.ts
+++ b/test/cli-exit-lifecycle.test.ts
@ -0,0 +1,82 @@
+import { describe, expect, test } from "vitest";
+import { finishSuccessfulCliCommand } from "../src/cli/qmd.ts";
+import { LlamaCpp } from "../src/llm.ts";
+
+describe("CLI successful-exit lifecycle", () => {
+  test("exits 0 after successful JSON output when post-output LLM cleanup fails", async () => {
+    const exitCodes: number[] = [];
+    const stderr: string[] = [];
+    const flushed: string[] = [];
+
+    await finishSuccessfulCliCommand({
+      command: "query",
+      format: "json",
+      platform: "linux",
+      cleanup: async () => {
+        throw new Error("ggml_metal_device_free abort simulation");
+      },
+      exit: (code) => {
+        exitCodes.push(code);
+      },
+      stdout: { write: (chunk: string | Uint8Array, cb?: (error?: Error | null) => void) => { flushed.push(String(chunk)); cb?.(); return true; } },
+      stderr: { write: (chunk: string | Uint8Array, cb?: (error?: Error | null) => void) => { stderr.push(String(chunk)); cb?.(); return true; } },
+    });
+
+    expect(exitCodes).toEqual([0]);
+    expect(stderr.join("")).toContain("QMD Warning: cleanup after successful output failed");
+    expect(flushed).toEqual([""]);
+  });
+
+  test("uses immediate exit for successful macOS JSON query after stdout flush", async () => {
+    const calls: string[] = [];
+
+    await finishSuccessfulCliCommand({
+      command: "query",
+      format: "json",
+      platform: "darwin",
+      cleanup: async () => {
+        calls.push("cleanup");
+      },
+      exit: (code) => {
+        calls.push(`exit:${code}`);
+      },
+      immediateExit: (code) => {
+        calls.push(`immediate-exit:${code}`);
+      },
+      stdout: { write: (_chunk: string | Uint8Array, cb?: (error?: Error | null) => void) => { calls.push("stdout-flush"); cb?.(); return true; } },
+      stderr: { write: (_chunk: string | Uint8Array, cb?: (error?: Error | null) => void) => { calls.push("stderr-flush"); cb?.(); return true; } },
+    });
+
+    expect(calls).toEqual(["stdout-flush", "stderr-flush", "immediate-exit:0"]);
+  });
+
+  test("disposes Llama resources in dependency order before CLI exit", async () => {
+    const calls: string[] = [];
+    const llm = new LlamaCpp({ inactivityTimeoutMs: 0 });
+    const disposable = (name: string) => ({
+      dispose: async () => {
+        calls.push(name);
+      },
+    });
+
+    Object.assign(llm as unknown as Record<string, unknown>, {
+      embedContexts: [disposable("embed-context")],
+      rerankContexts: [disposable("rerank-context")],
+      embedModel: disposable("embed-model"),
+      generateModel: disposable("generate-model"),
+      rerankModel: disposable("rerank-model"),
+      llama: disposable("llama"),
+    });
+
+    await llm.dispose();
+
+    expect(calls).toEqual([
+      "embed-context",
+      "rerank-context",
+      "embed-model",
+      "generate-model",
+      "rerank-model",
+      "llama",
+    ]);
+  });
+});
--- a/test/cli.test.ts
+++ b/test/cli.test.ts
@ -233,6 +233,7 @@ describe("CLI Help", () => {
    expect(stdout).toContain("Usage:");
    expect(stdout).toContain("qmd collection add");
    expect(stdout).toContain("qmd search");
+    expect(stdout).toContain("--no-gpu");
    expect(stdout).toContain("qmd skill show/install");
  });

@ -507,6 +508,16 @@ describe("CLI Search Command", () => {
    // Error message goes to stderr
    expect(stderr).toContain("Usage:");
  });
+
+  test("--json --full includes line field for round-tripping to qmd get", async () => {
+    const { stdout, exitCode } = await runQmd(["search", "meeting", "--json", "--full", "-n", "1"]);
+    expect(exitCode).toBe(0);
+    const results = JSON.parse(stdout);
+    expect(results.length).toBeGreaterThan(0);
+    expect(results[0].line).toBeTypeOf("number");
+    expect(results[0].line).toBeGreaterThan(0);
+    expect(results[0].body).toBeTypeOf("string");
+  });
 });

 describe("CLI Get Command", () => {
@ -532,6 +543,13 @@ describe("CLI Get Command", () => {
    // Should indicate file not found
    expect(exitCode).toBe(1);
  });
+
+  test("clamps negative --from to top of file (no silent tail content)", async () => {
+    const baseline = await runQmd(["get", "README.md"]);
+    const negative = await runQmd(["get", "README.md", "--from", "-19"]);
+    expect(negative.exitCode).toBe(0);
+    expect(negative.stdout).toBe(baseline.stdout);
+  });
 });

 describe("CLI Multi-Get Command", () => {
--- a/test/llm.test.ts
+++ b/test/llm.test.ts
@ -13,6 +13,8 @@ import {
  getDefaultLlamaCpp,
  disposeDefaultLlamaCpp,
  resolveLlamaGpuMode,
+  setNodeLlamaCppModuleForTest,
+  withNativeStdoutRedirectedToStderr,
  resolveParallelismOverride,
  resolveSafeParallelism,
  withLLMSession,
@ -78,6 +80,29 @@ describe("QMD_LLAMA_GPU resolution", () => {
    expect(resolveLlamaGpuMode(" cuda ")).toBe("cuda");
  });

+  test("QMD_FORCE_CPU disables GPU before QMD_LLAMA_GPU auto-detection", () => {
+    const prevForceCpu = process.env.QMD_FORCE_CPU;
+    process.env.QMD_FORCE_CPU = "1";
+    try {
+      expect(resolveLlamaGpuMode(undefined)).toBe(false);
+      expect(resolveLlamaGpuMode("cuda")).toBe(false);
+    } finally {
+      if (prevForceCpu === undefined) delete process.env.QMD_FORCE_CPU;
+      else process.env.QMD_FORCE_CPU = prevForceCpu;
+    }
+  });
+
+  test("QMD_FORCE_CPU ignores false-ish values", () => {
+    const prevForceCpu = process.env.QMD_FORCE_CPU;
+    process.env.QMD_FORCE_CPU = "0";
+    try {
+      expect(resolveLlamaGpuMode(undefined)).toBe("auto");
+    } finally {
+      if (prevForceCpu === undefined) delete process.env.QMD_FORCE_CPU;
+      else process.env.QMD_FORCE_CPU = prevForceCpu;
+    }
+  });
+
  test("warns and falls back to auto for unsupported values", () => {
    const stderrSpy = vi.spyOn(process.stderr, "write").mockReturnValue(true);
    try {
@ -90,6 +115,71 @@ describe("QMD_LLAMA_GPU resolution", () => {
  });
 });

+describe("native llama stdout containment", () => {
+  test("redirects native stdout noise to stderr while JSON callers are initializing llama", async () => {
+    const stdoutSpy = vi.spyOn(process.stdout, "write").mockReturnValue(true);
+    const stderrSpy = vi.spyOn(process.stderr, "write").mockReturnValue(true);
+    try {
+      await withNativeStdoutRedirectedToStderr(async () => {
+        process.stdout.write("cmake build spam\n");
+        return "ok";
+      });
+
+      expect(stdoutSpy).not.toHaveBeenCalled();
+      expect(stderrSpy).toHaveBeenCalledWith("cmake build spam\n", undefined, undefined);
+    } finally {
+      stdoutSpy.mockRestore();
+      stderrSpy.mockRestore();
+    }
+  });
+
+  test("keeps native GPU failure noise off stdout and caches failed GPU init", async () => {
+    const prevGpu = process.env.QMD_LLAMA_GPU;
+    const prevForceCpu = process.env.QMD_FORCE_CPU;
+    process.env.QMD_LLAMA_GPU = "cuda";
+    delete process.env.QMD_FORCE_CPU;
+
+    const calls: unknown[] = [];
+    const fakeLlama = { gpu: false, cpuMathCores: 4 };
+    setNodeLlamaCppModuleForTest({
+      LlamaLogLevel: { error: "error" },
+      resolveModelFile: vi.fn(),
+      LlamaChatSession: vi.fn() as any,
+      getLlama: vi.fn(async (options: Record<string, unknown>) => {
+        calls.push(options.gpu);
+        if (options.gpu === "cuda") {
+          process.stdout.write("cmake build spam\n");
+          throw new Error("CUDA unavailable");
+        }
+        return fakeLlama as any;
+      }),
+    });
+
+    const stdoutSpy = vi.spyOn(process.stdout, "write").mockReturnValue(true);
+    const stderrSpy = vi.spyOn(process.stderr, "write").mockReturnValue(true);
+    try {
+      const first = new LlamaCpp();
+      const second = new LlamaCpp();
+
+      await (first as any).ensureLlama();
+      await (second as any).ensureLlama();
+
+      expect(stdoutSpy).not.toHaveBeenCalled();
+      expect(stderrSpy).toHaveBeenCalledWith("cmake build spam\n", undefined, undefined);
+      expect(calls).toEqual(["cuda", false, false]);
+      expect(String(stderrSpy.mock.calls.map(call => call[0]).join(""))).toContain("skipping previously failed GPU init");
+    } finally {
+      stdoutSpy.mockRestore();
+      stderrSpy.mockRestore();
+      setNodeLlamaCppModuleForTest(null);
+      if (prevGpu === undefined) delete process.env.QMD_LLAMA_GPU;
+      else process.env.QMD_LLAMA_GPU = prevGpu;
+      if (prevForceCpu === undefined) delete process.env.QMD_FORCE_CPU;
+      else process.env.QMD_FORCE_CPU = prevForceCpu;
+    }
+  });
+});
+
 describe("LLM context parallelism safety", () => {
  test("defaults Windows CUDA to one context to avoid ggml-cuda.cu:98 crashes", () => {
    expect(resolveSafeParallelism({
--- a/test/mcp.test.ts
+++ b/test/mcp.test.ts
@ -913,6 +913,22 @@ describe.skipIf(!!process.env.CI)("MCP HTTP Transport", () => {
    initTestDatabase(db);
    seedTestData(db);

+    // 300 pad lines (37 chars each = 11100 chars) puts the marker past the
+    // first chunk boundary at CHUNK_SIZE_CHARS = 3600.
+    {
+      const padLine = "Pad line for chunk boundary coverage\n";
+      const absLineFixtureBody =
+        padLine.repeat(300) +
+        "UNIQUE_KEYWORD_XYZ marker\n" +
+        padLine.repeat(20);
+      const fixtureHash = "hash-abslines";
+      const now = new Date().toISOString();
+      db.prepare(`INSERT OR IGNORE INTO content (hash, doc, created_at) VALUES (?, ?, ?)`)
+        .run(fixtureHash, absLineFixtureBody, now);
+      db.prepare(`INSERT INTO documents (collection, path, title, hash, created_at, modified_at, active) VALUES ('docs', ?, ?, ?, ?, ?, 1)`)
+        .run("absolute-line-fixture.md", "Absolute Line Fixture", fixtureHash, now, now);
+    }
+
    // Sync config into SQLite
    const httpTestConfig: CollectionConfig = {
      collections: {
@ -1074,4 +1090,29 @@ describe.skipIf(!!process.env.CI)("MCP HTTP Transport", () => {
    expect(json.result).toBeDefined();
    expect(json.result.content.length).toBeGreaterThan(0);
  });
+
+  test("POST /mcp tools/call query returns absolute source-file line numbers, not chunk-local", async () => {
+    await mcpRequest({
+      jsonrpc: "2.0", id: 1, method: "initialize",
+      params: { protocolVersion: "2025-03-26", capabilities: {}, clientInfo: { name: "test", version: "1.0" } },
+    });
+
+    const { status, json } = await mcpRequest({
+      jsonrpc: "2.0", id: 5, method: "tools/call",
+      params: {
+        name: "query",
+        arguments: {
+          searches: [{ type: "lex", query: "UNIQUE_KEYWORD_XYZ" }],
+          rerank: false,
+        },
+      },
+    });
+    expect(status).toBe(200);
+    const results = json.result.structuredContent.results;
+    expect(results.length).toBeGreaterThan(0);
+    const hit = results.find((r: any) => r.file === "docs/absolute-line-fixture.md");
+    expect(hit).toBeDefined();
+    expect(hit.line).toBe(301);
+    expect(hit.snippet).toMatch(/^\d+: @@ -3\d\d,/);
+  });
 });
--- a/test/store.test.ts
+++ b/test/store.test.ts
@ -1713,6 +1713,21 @@ describe("Document Retrieval", () => {
      expect(body).toBeNull();
      await cleanupTestDb(store);
    });
+
+    test("getDocumentBody clamps negative fromLine to top of document", async () => {
+      const store = await createTestStore();
+      const collectionName = await createTestCollection({ pwd: "/path" });
+      await insertTestDocument(store.db, collectionName, {
+        name: "mydoc",
+        displayPath: "mydoc.md",
+        body: "Line 1\nLine 2\nLine 3\nLine 4\nLine 5",
+      });
+
+      const body = store.getDocumentBody({ filepath: "/path/mydoc.md" }, -19, 80);
+      expect(body).toBe("Line 1\nLine 2\nLine 3\nLine 4\nLine 5");
+
+      await cleanupTestDb(store);
+    });
  });

  describe("findDocuments (multi-get)", () => {
@ -2001,6 +2016,33 @@ describe("Snippet Extraction", () => {
    expect(line).toBe(51); // "Target keyword" is line 51
    expect(linesBefore).toBeGreaterThan(40); // Many lines before
  });
+
+  test("extractSnippet anchors on chunkPos when lexical scoring finds no match", () => {
+    // The snippet tokenizer does not strip FTS5 syntax, so a quoted-phrase query
+    // tokenises into terms with embedded quotes that never appear in body text.
+    // bestScore stays at 0 even though the reranker correctly identified a chunk;
+    // the fallback should anchor on chunkPos rather than defaulting to line 1.
+    const padLine = "Lorem ipsum dolor sit amet\n";
+    const padding = padLine.repeat(100);
+    const body = padding + "chunk content here\nmore chunk content\n" + padding;
+    const chunkPos = padding.length;
+
+    const { line } = extractSnippet(body, '"unrelated quoted phrase"', 200, chunkPos);
+
+    expect(line).toBeGreaterThan(50);
+    expect(line).toBeLessThan(110);
+  });
+
+  test("extractSnippet with chunkPos=0 falls back to full-body scan when chunk has no match", () => {
+    // chunkPos=0 may be the chunk selector's bestIdx=0 default rather than a real
+    // first-chunk hit, so the fallback must consider matches outside chunk 0.
+    const padding = "Lorem ipsum dolor sit amet\n".repeat(200);
+    const body = padding + "TARGET_KEYWORD line content\ntail line\n";
+
+    const { line } = extractSnippet(body, "TARGET_KEYWORD", 200, 0);
+
+    expect(line).toBe(201);
+  });
 });

 // =============================================================================
@ -2239,6 +2281,26 @@ describe("Index Status", () => {
    await cleanupTestDb(store);
  });

+  test("embedding health is scoped to the active embed model", async () => {
+    const store = await createTestStore();
+    const collectionName = await createTestCollection();
+    const activeModel = "hf:active/embed-model.gguf";
+    const staleModel = "hf:stale/embed-model.gguf";
+    const now = new Date().toISOString();
+
+    store.llm = { embedModelName: activeModel } as any;
+    store.ensureVecTable(3);
+    await insertTestDocument(store.db, collectionName, { name: "doc1", hash: "hash1" });
+    store.insertEmbedding("hash1", 0, 0, new Float32Array([1, 2, 3]), staleModel, now, 1);
+
+    expect(store.getHashesNeedingEmbedding()).toBe(1);
+    expect(store.getStatus().needsEmbedding).toBe(1);
+    expect(store.getIndexHealth().needsEmbedding).toBe(1);
+    expect(store.getHashesNeedingEmbedding(staleModel)).toBe(0);
+
+    await cleanupTestDb(store);
+  });
+
  test("getIndexHealth returns health info", async () => {
    const store = await createTestStore();
    const collectionName = await createTestCollection();
@ -3051,6 +3113,68 @@ describe("Embedding batching", () => {
    }
  });

+  test("generateEmbeddings does not mark a partially embedded multi-chunk document complete", async () => {
+    const store = await createTestStore();
+    const db = store.db;
+
+    // Stub LLM: single embeds always succeed, but a batch call yields a vector
+    // only for its first text and null for every other one.
+    const partiallyFailingLlm = {
+      async embed(_text: string, _options?: { model?: string }) {
+        return { embedding: [0.1, 0.2, 0.3], model: "fake-embed" };
+      },
+      async embedBatch(texts: string[], _options?: { model?: string }) {
+        return texts.map((_text, position) => {
+          if (position === 0) {
+            return { embedding: [1, 2, 3], model: "fake-embed" };
+          }
+          return null;
+        });
+      },
+    };
+
+    setDefaultLlamaCpp(createFakeTokenizer() as any);
+    store.llm = partiallyFailingLlm as any;
+
+    try {
+      // Long body; expected to be split into several chunks (per the test name).
+      const longBody = "# Long doc\n\n" + "partial embedding regression ".repeat(260);
+      await insertTestDocument(db, "docs", { name: "long-doc", body: longBody });
+
+      const outcome = await generateEmbeddings(store);
+
+      // The run must report errors and leave no vectors behind in either table.
+      expect(outcome.errors).toBeGreaterThan(0);
+      const contentVectorRows = db.prepare(`SELECT COUNT(*) as count FROM content_vectors`).get();
+      expect(contentVectorRows).toEqual({ count: 0 });
+      const vecTableRows = db.prepare(`SELECT COUNT(*) as count FROM vectors_vec`).get();
+      expect(vecTableRows).toEqual({ count: 0 });
+
+      // The document is still reported as needing embedding.
+      expect(store.getHashesNeedingEmbedding()).toBe(1);
+      expect(store.getStatus().needsEmbedding).toBe(1);
+    } finally {
+      setDefaultLlamaCpp(null);
+      await cleanupTestDb(store);
+    }
+  });
+
+  test("generateEmbeddings opens a long-lived LLM session for embed runs", async () => {
+    const store = await createTestStore();
+    const embedLlm = createFakeEmbedLlm();
+    const withSessionSpy = vi.spyOn(llmModule, "withLLMSessionForLlm");
+    const thirtyMinutesMs = 30 * 60 * 1000;
+
+    setDefaultLlamaCpp(createFakeTokenizer() as any);
+    store.llm = embedLlm as any;
+
+    try {
+      await insertTestDocument(store.db, "docs", { name: "one", body: "# One\n\nAlpha" });
+
+      await generateEmbeddings(store);
+
+      // The session must be opened for the store's LLM, with a callback and
+      // options carrying the 30-minute max duration and the run's name.
+      const expectedOptions = expect.objectContaining({
+        maxDuration: thirtyMinutesMs,
+        name: "generateEmbeddings",
+      });
+      expect(withSessionSpy).toHaveBeenCalledWith(embedLlm, expect.any(Function), expectedOptions);
+    } finally {
+      // Undo the spy and global tokenizer override before dropping the test DB.
+      withSessionSpy.mockRestore();
+      setDefaultLlamaCpp(null);
+      await cleanupTestDb(store);
+    }
+  });
+
  test("vectorSearchQuery uses the active llm embed model for vector lookups", async () => {
    const store = await createTestStore();
    const model = "hf:Qwen/Qwen3-Embedding-0.6B-GGUF/Qwen3-Embedding-0.6B-Q8_0.gguf";