Improve qmd diagnostics and embed resilience

2026-05-19 21:39:48 +00:00 · 2026-05-19 21:39:48 +00:00 · b5f156c313
commit b5f156c313
parent 105c577b3b
15 changed files with 450 additions and 170 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -10,6 +10,13 @@

 ### Fixes

+- Launcher: prefer runnable TypeScript source in git checkouts even when ignored `dist/` artifacts exist, while packaged installs continue to run `dist/`.
+- GPU: keep node-llama-cpp's documented `gpu: "auto"` initialization as the primary path, then perform no-build packaged CUDA/Vulkan/Metal probes only if auto falls back to CPU.
+- CLI: move GPU/CPU runtime diagnostics out of `qmd status`; use `qmd doctor` for device probing and related environment guidance.
+- CLI: point unexpected command/setup failures toward `qmd doctor` so diagnostics are the default next step when QMD behaves incorrectly.
+- Doctor: explicitly warn when `content_vectors` contains multiple non-empty embedding fingerprint names, with the per-fingerprint document/chunk breakdown.
+- Embed: make the TTY progress line label byte-based input progress explicitly, show embedded chunks as a count, and shorten the displayed model name.
+- Embed: retain per-chunk failure details, retry failed chunks after later successful embeds and again when no other chunks remain, clear recovered errors, and cap retries to avoid endless loops.
 - Embedding: fingerprint vector metadata using the active embedding model and formatting/chunking parameters so stale vectors are treated as pending after search semantics change. Legacy `content_vectors` columns are migrated lazily on first vector-health/write use to preserve fast QMD startup.

 - Skill: expand the packaged QMD skill with retrieval-first workflows, structured query examples, wiki/source collection guidance, and safe fallbacks when model-backed search is unavailable.
--- a/bin/qmd
+++ b/bin/qmd
@ -26,27 +26,28 @@ if [ "$1" = "mcp" ]; then
 fi

 JS="$DIR/dist/cli/qmd.js"
+TS="$DIR/src/cli/qmd.ts"

-# In published packages dist/ is always present. In a fresh checkout, however,
-# people often run ./bin/qmd before building. Prefer a source-mode fallback when
-# dependencies are installed; otherwise fail with an actionable message instead
-# of a low-level "Module not found" from Node/Bun.
-if [ ! -f "$JS" ]; then
-  TS="$DIR/src/cli/qmd.ts"
-  if [ -f "$TS" ]; then
-    if [ -f "$DIR/bun.lock" ] || [ -f "$DIR/bun.lockb" ]; then
-      if command -v bun >/dev/null 2>&1; then
-        exec bun "$TS" "$@"
-      fi
-    fi
-    if [ -f "$DIR/node_modules/tsx/dist/cli.mjs" ]; then
-      exec node "$DIR/node_modules/tsx/dist/cli.mjs" "$TS" "$@"
+# In published packages, bin/qmd must run dist/. In a git checkout, however,
+# dist/ is often ignored and can be stale after git reset or branch switches.
+# Prefer source mode only for checkouts so ./bin/qmd reflects the checked-out
+# source without changing packaged/runtime behavior.
+if [ -e "$DIR/.git" ] && [ -f "$TS" ]; then
+  if [ -f "$DIR/bun.lock" ] || [ -f "$DIR/bun.lockb" ]; then
+    if command -v bun >/dev/null 2>&1; then
+      exec bun "$TS" "$@"
    fi
  fi
+  if [ -f "$DIR/node_modules/tsx/dist/cli.mjs" ]; then
+    exec node "$DIR/node_modules/tsx/dist/cli.mjs" "$TS" "$@"
+  fi
+fi

+if [ ! -f "$JS" ]; then
  echo "qmd is not built: missing $JS" >&2
  echo "Run: bun install && bun run build" >&2
  echo "Or:  npm install && npm run build" >&2
+  echo "After building, run: qmd doctor" >&2
  exit 1
 fi

--- a/src/bench-rerank.ts
+++ b/src/bench-rerank.ts
@ -260,16 +260,18 @@ async function main() {
      const r = await benchmarkConfig(model, llama, docs, p, true);
      results.push(r);
      process.stdout.write(` ${r.medianMs.toFixed(0)}ms (${r.docsPerSec.toFixed(1)} docs/s)\n`);
-    } catch (e: any) {
-      process.stdout.write(` failed: ${e.message}\n`);
+    } catch (e: unknown) {
+      const message = e instanceof Error ? e.message : String(e);
+      process.stdout.write(` failed: ${message}\n`);
      // Try without flash
      process.stdout.write(`  [${p} ctx, no flash] running...`);
      try {
        const r = await benchmarkConfig(model, llama, docs, p, false);
        results.push(r);
        process.stdout.write(` ${r.medianMs.toFixed(0)}ms (${r.docsPerSec.toFixed(1)} docs/s)\n`);
-      } catch (e2: any) {
-        process.stdout.write(` failed: ${e2.message}\n`);
+      } catch (e2: unknown) {
+        const message = e2 instanceof Error ? e2.message : String(e2);
+        process.stdout.write(` failed: ${message}\n`);
      }
    }
  }
--- a/src/bench/bench.ts
+++ b/src/bench/bench.ts
@ -176,7 +176,7 @@ async function runQuery(
  let resultFiles: string[];
  try {
    resultFiles = await backend.run(store, query, limit, collection);
-  } catch (err: any) {
+  } catch {
    // Backend may not be available (e.g., no embeddings for vector search)
    return {
      precision_at_k: 0,
--- a/src/cli/qmd.ts
+++ b/src/cli/qmd.ts
@ -1,5 +1,5 @@
 import { isBun, openDatabase } from "../db.js";
-import type { Database } from "../db.js";
+import type { Database, SQLiteValue } from "../db.js";
 import fastGlob from "fast-glob";
 import { execSync, spawn as nodeSpawn } from "child_process";
 import { fileURLToPath } from "url";
@ -623,40 +623,6 @@ async function showStatus(): Promise<void> {
    console.log(`  Generation:  ${hfLink(activeModels.generate)}`);
  }

-  // Device / GPU info
-  // Important: probing node-llama-cpp can abort the whole process on machines with
-  // incompatible GPU drivers (for example Vulkan loader present but no usable driver).
-  // Keep the native probe opt-in, but always show how QMD is configured and how to probe.
-  console.log(`\n${c.bold}Device${c.reset}`);
-  const configuredGpuMode = configuredGpuModeLabel();
-  console.log(`  Mode:     ${configuredGpuMode}`);
-  if (process.env.QMD_STATUS_DEVICE_PROBE !== "1") {
-    console.log(`  Status:   ${c.dim}not probed${c.reset} (set QMD_STATUS_DEVICE_PROBE=1 to test GPU/CPU backend)`);
-  } else {
-    console.log(`  Status:   probing native llama backend...`);
-    try {
-      const llm = getDefaultLlamaCpp();
-      const device = await llm.getDeviceInfo({ allowBuild: false });
-      if (device.gpu) {
-        console.log(`  GPU:      ${c.green}${device.gpu}${c.reset} (offloading: ${device.gpuOffloading ? 'yes' : 'no'})`);
-        if (device.gpuDevices.length > 0) {
-          console.log(`  Devices:  ${summarizeDeviceNames(device.gpuDevices)}`);
-        }
-        if (device.vram) {
-          console.log(`  VRAM:     ${formatBytes(device.vram.free)} free / ${formatBytes(device.vram.total)} total`);
-        }
-      } else {
-        console.log(`  GPU:      ${c.yellow}none${c.reset} (running on CPU — models will be slow)`);
-        console.log(`  ${c.dim}Tip: Install CUDA, Vulkan, or Metal support for GPU acceleration.${c.reset}`);
-      }
-      console.log(`  CPU:      ${device.cpuCores} math cores`);
-    } catch (error) {
-      console.log(`  Status:   ${c.dim}probe failed${c.reset}`);
-      if (error instanceof Error && error.message) {
-        console.log(`  ${c.dim}${sanitizeDiagnosticMessage(error.message)}${c.reset}`);
-      }
-    }
-  }

  // Tips section
  const tips: string[] = [];
@ -1514,7 +1480,7 @@ function listFiles(pathArg?: string): void {

  // List files in the collection with size and modification time
  let query: string;
-  let params: any[];
+  let params: SQLiteValue[];

  if (pathPrefix) {
    // List files under a specific path
@ -1764,7 +1730,7 @@ async function indexFiles(pwd?: string, globPattern: string = DEFAULT_GLOB, coll
    let content: string;
    try {
      content = readFileSync(filepath, "utf-8");
-    } catch (err: any) {
+    } catch {
      // Skip files that can't be read (e.g. iCloud evicted files returning EAGAIN)
      processed++;
      progress.set((processed / total) * 100);
@ -1929,7 +1895,7 @@ async function vectorIndex(
    return;
  }

-  console.log(`${c.dim}Model: ${model}${c.reset}\n`);
+  console.log(`${c.dim}Model: ${shortModelName(model)}${c.reset}\n`);
  if (batchOptions?.maxDocsPerBatch !== undefined || batchOptions?.maxBatchBytes !== undefined) {
    const maxDocsPerBatch = batchOptions.maxDocsPerBatch ?? DEFAULT_EMBED_MAX_DOCS_PER_BATCH;
    const maxBatchBytes = batchOptions.maxBatchBytes ?? DEFAULT_EMBED_MAX_BATCH_BYTES;
@ -1949,21 +1915,28 @@ async function vectorIndex(
    chunkStrategy: batchOptions?.chunkStrategy,
    onProgress: (info) => {
      if (info.totalBytes === 0) return;
-      const percent = (info.bytesProcessed / info.totalBytes) * 100;
+      // Progress is measured by input bytes, not by chunks. The final chunk
+      // count is discovered lazily batch-by-batch, so displaying
+      // chunksEmbedded/totalChunks makes the percent look wrong when a few
+      // large documents remain. Show chunks as a count and label the byte
+      // percentage explicitly as input progress.
+      const percent = Math.min(100, (info.bytesProcessed / info.totalBytes) * 100);
      progress.set(percent);

      const elapsed = (Date.now() - startTime) / 1000;
-      const bytesPerSec = info.bytesProcessed / elapsed;
-      const remainingBytes = info.totalBytes - info.bytesProcessed;
-      const etaSec = remainingBytes / bytesPerSec;
+      const bytesPerSec = elapsed > 0 ? info.bytesProcessed / elapsed : 0;
+      const remainingBytes = Math.max(0, info.totalBytes - info.bytesProcessed);
+      const etaSec = bytesPerSec > 0 ? remainingBytes / bytesPerSec : Number.POSITIVE_INFINITY;

      const bar = renderProgressBar(percent);
      const percentStr = percent.toFixed(0).padStart(3);
-      const throughput = `${formatBytes(bytesPerSec)}/s`;
-      const eta = elapsed > 2 ? formatETA(etaSec) : "...";
-      const errStr = info.errors > 0 ? ` ${c.yellow}${info.errors} err${c.reset}` : "";
+      const throughput = bytesPerSec > 0 ? `${formatBytes(bytesPerSec)}/s` : ".../s";
+      const eta = elapsed > 2 && Number.isFinite(etaSec) ? formatETA(etaSec) : "...";
+      const inputStr = `${formatBytes(info.bytesProcessed)}/${formatBytes(info.totalBytes)} input`;
+      const chunkStr = `${formatCount(info.chunksEmbedded)} chunks`;
+      const errStr = info.errors > 0 ? ` ${c.yellow}${formatCount(info.errors)} err${c.reset}` : "";

-      if (isTTY) process.stderr.write(`\r${c.cyan}${bar}${c.reset} ${c.bold}${percentStr}%${c.reset} ${c.dim}${info.chunksEmbedded}/${info.totalChunks}${c.reset}${errStr} ${c.dim}${throughput} ETA ${eta}${c.reset}   `);
+      if (isTTY) process.stderr.write(`\r${c.cyan}${bar}${c.reset} ${c.bold}${percentStr}% input${c.reset} ${c.dim}${chunkStr}${errStr} · ${inputStr} · ${throughput} · ETA ${eta}${c.reset}   `);
    },
  });

@ -1978,7 +1951,13 @@ async function vectorIndex(
    console.log(`\r${c.green}${renderProgressBar(100)}${c.reset} ${c.bold}100%${c.reset}                                    `);
    console.log(`\n${c.green}✓ Done!${c.reset} Embedded ${c.bold}${result.chunksEmbedded}${c.reset} chunks from ${c.bold}${result.docsProcessed}${c.reset} documents in ${c.bold}${formatETA(totalTimeSec)}${c.reset}`);
    if (result.errors > 0) {
-      console.log(`${c.yellow}⚠ ${result.errors} chunks failed${c.reset}`);
+      console.log(`${c.yellow}⚠ ${formatCount(result.errors)} chunks still failed after retries${c.reset}`);
+      for (const failure of (result.failures ?? []).slice(0, 8)) {
+        console.log(`  ${c.dim}${failure.path}#${failure.seq} (${failure.attempts} attempts): ${failure.reason}${c.reset}`);
+      }
+      if ((result.failures?.length ?? 0) > 8) {
+        console.log(`  ${c.dim}...and ${formatCount((result.failures?.length ?? 0) - 8)} more${c.reset}`);
+      }
    }
  }

@ -3457,7 +3436,6 @@ function collectEnvironmentOverrides(activeModels: { embed: string; generate: st
  add("QMD_FORCE_CPU", "forces llama.cpp to bypass GPU backends; embeddings/query will be slower but GPU crashes are avoided");
  add("QMD_LLAMA_GPU", "selects llama.cpp GPU backend (metal/cuda/vulkan) or disables GPU when set to false/off/0");
  add("QMD_DOCTOR_DEVICE_PROBE", "controls qmd doctor native device probing; 0/off skips GPU probing");
-  add("QMD_STATUS_DEVICE_PROBE", "controls qmd status native device probing only; qmd doctor probes independently");
  add("QMD_EMBED_PARALLELISM", "overrides embedding parallel context count; too high can exhaust RAM/VRAM");
  add("QMD_EXPAND_CONTEXT_SIZE", "overrides query expansion context size; larger values use more memory");
  add("QMD_RERANK_CONTEXT_SIZE", "overrides reranker context size; larger values use more memory");
@ -3655,6 +3633,60 @@ async function checkEmbeddingVectorSamples(db: Database, model: string, fingerpr
  };
 }

+function hasLibraryInDirs(libraryBaseName: string, dirs: string[]): boolean {
+  for (const dir of dirs) {
+    if (!dir || !existsSync(dir)) continue;
+    try {
+      for (const entry of readdirSync(dir)) {
+        if (entry === libraryBaseName || entry.startsWith(`${libraryBaseName}.`)) return true;
+      }
+    } catch { /* ignore unreadable system library dirs */ }
+  }
+  return false;
+}
+
+function linuxCudaRuntimeDiagnostic(): string | null {
+  if (process.platform !== "linux") return null;
+
+  const dirs = new Set<string>();
+  for (const value of [process.env.LD_LIBRARY_PATH, process.env.CUDA_PATH]) {
+    for (const part of (value ?? "").split(":")) {
+      if (part) dirs.add(part);
+    }
+  }
+  if (process.env.CUDA_PATH) {
+    dirs.add(pathJoin(process.env.CUDA_PATH, "lib64"));
+    dirs.add(pathJoin(process.env.CUDA_PATH, "targets", "x86_64-linux", "lib"));
+  }
+  for (const dir of ["/usr/lib", "/usr/lib64", "/usr/lib/x86_64-linux-gnu", "/usr/local/cuda/lib64", "/usr/local/cuda/targets/x86_64-linux/lib"]) {
+    dirs.add(dir);
+  }
+  try {
+    for (const entry of readdirSync("/usr/local")) {
+      if (!entry.toLowerCase().startsWith("cuda-")) continue;
+      const cudaRoot = pathJoin("/usr/local", entry);
+      dirs.add(pathJoin(cudaRoot, "lib64"));
+      dirs.add(pathJoin(cudaRoot, "targets", "x86_64-linux", "lib"));
+    }
+  } catch { /* /usr/local may not be readable in restricted environments */ }
+
+  const searchDirs = [...dirs];
+  const hasDriver = hasLibraryInDirs("libcuda.so", searchDirs) || hasLibraryInDirs("libnvidia-ml.so", searchDirs);
+  if (!hasDriver) return null;
+
+  const cudaLibraries: [library: string, label: string][] = [
+    ["libcudart.so", "CUDA runtime"],
+    ["libcublas.so", "cuBLAS"],
+    ["libcublasLt.so", "cuBLASLt"],
+  ];
+  const missing = cudaLibraries
+    .filter(([library]) => !hasLibraryInDirs(library, searchDirs))
+    .map(([, label]) => label);
+
+  if (missing.length === 0) return null;
+  return `NVIDIA driver libraries are visible, but CUDA user-space libraries are missing from loader paths (${missing.join(", ")})`;
+}
+
 async function runDoctorDeviceChecks(nextSteps: string[]): Promise<void> {
  const mode = configuredGpuModeLabel();
  doctorCheck("device mode", true, mode);
@ -3691,8 +3723,14 @@ async function runDoctorDeviceChecks(nextSteps: string[]): Promise<void> {
        nextSteps.push("GPU was detected but offloading is disabled; check `QMD_LLAMA_GPU=metal|cuda|vulkan` and rerun `qmd doctor`.");
      }
    } else {
-      doctorCheck("device probe", false, `running on CPU (${device.cpuCores} math cores). Next: install/configure Metal, CUDA, or Vulkan for faster embeddings, or set QMD_FORCE_CPU=1 to make CPU mode explicit`);
-      nextSteps.push("Vector operations are running on CPU; install/configure Metal, CUDA, or Vulkan if embedding/query performance is too slow.");
+      const cudaDiagnostic = linuxCudaRuntimeDiagnostic();
+      const diagnosticSuffix = cudaDiagnostic ? ` ${cudaDiagnostic}.` : "";
+      doctorCheck("device probe", false, `running on CPU (${device.cpuCores} math cores).${diagnosticSuffix} Next: install/configure Metal, CUDA, or Vulkan for faster embeddings, or set QMD_FORCE_CPU=1 to make CPU mode explicit`);
+      if (cudaDiagnostic) {
+        nextSteps.push(`${cudaDiagnostic}; install CUDA runtime/cuBLAS libraries or add their directory to LD_LIBRARY_PATH, then rerun \`qmd doctor\`.`);
+      } else {
+        nextSteps.push("Vector operations are running on CPU; install/configure Metal, CUDA, or Vulkan if embedding/query performance is too slow.");
+      }
    }
  } catch (error) {
    if (process.stdout.isTTY) {
@ -3779,6 +3817,15 @@ async function showDoctor(): Promise<void> {
      const label = row.fingerprint === fingerprint ? "current" : (row.fingerprint || "legacy");
      return `${shortModelName(row.model)}:${label} ${formatCount(row.docs)} docs/${formatCount(row.chunks)} chunks`;
    }).join("; ");
+    const namedFingerprintRows = rows.filter(row => row.fingerprint);
+    const namedFingerprints = [...new Set(namedFingerprintRows.map(row => row.fingerprint))];
+    if (namedFingerprints.length > 1) {
+      const namedGroups = namedFingerprintRows
+        .map(row => `${row.fingerprint}${row.fingerprint === fingerprint ? " (current)" : ""}: ${shortModelName(row.model)} ${formatCount(row.docs)} docs/${formatCount(row.chunks)} chunks`)
+        .join("; ");
+      doctorCheck("mixed named embedding fingerprints", false, `content_vectors contains ${namedFingerprints.length} named fingerprints: ${namedGroups}. Next: \`qmd embed\` or \`qmd embed --force\``);
+      nextSteps.push("Run `qmd embed` to converge mixed named embedding fingerprints; use `qmd embed --force` if old named fingerprints or vector sample mismatches remain.");
+    }
    const details = rows.length === 0
      ? `no vectors yet; current fingerprint ${fingerprint}`
      : ok
@ -3815,7 +3862,23 @@ async function showDoctor(): Promise<void> {
  closeDb();
 }

-function readPackageJson(): any {
+function printDoctorHint(): void {
+  console.error("If qmd still behaves unexpectedly, run 'qmd doctor' for diagnostics.");
+}
+
+function exitWithError(error: unknown, code = 1): never {
+  console.error(error instanceof Error ? error.message : String(error));
+  printDoctorHint();
+  process.exit(code);
+}
+
+type PackageJson = {
+  version: string;
+  dependencies?: Record<string, string>;
+  devDependencies?: Record<string, string>;
+};
+
+function readPackageJson(): PackageJson {
  const scriptDir = dirname(fileURLToPath(import.meta.url));
  const pkgPath = resolve(scriptDir, "..", "..", "package.json");
  return JSON.parse(readFileSync(pkgPath, "utf-8"));
@ -4122,6 +4185,7 @@ if (isMain) {
        default:
          console.error(`Unknown subcommand: ${subcommand}`);
          console.error("Run 'qmd collection help' for usage");
+          printDoctorHint();
          process.exit(1);
      }
      break;
@ -4131,8 +4195,7 @@ if (isMain) {
      try {
        initLocalIndex();
      } catch (error) {
-        console.error(error instanceof Error ? error.message : String(error));
-        process.exit(1);
+        exitWithError(error);
      }
      break;

@ -4166,8 +4229,7 @@ if (isMain) {
          collection: embedCollection,
        });
      } catch (error) {
-        console.error(error instanceof Error ? error.message : String(error));
-        process.exit(1);
+        exitWithError(error);
      }
      break;

@ -4314,8 +4376,8 @@ if (isMain) {
        const { startMcpHttpServer } = await import("../mcp/server.js");
        try {
          await startMcpHttpServer(port, { dbPath: getDbPath() });
-        } catch (e: any) {
-          if (e?.code === "EADDRINUSE") {
+        } catch (e: unknown) {
+          if (typeof e === "object" && e !== null && "code" in e && e.code === "EADDRINUSE") {
            console.error(`Port ${port} already in use. Try a different port with --port.`);
            process.exit(1);
          }
@ -4359,8 +4421,7 @@ if (isMain) {
          try {
            await installSkill(Boolean(cli.values.global), Boolean(cli.values.force), Boolean(cli.values.yes));
          } catch (error) {
-            console.error(error instanceof Error ? error.message : String(error));
-            process.exit(1);
+            exitWithError(error);
          }
          break;
        }
@ -4383,6 +4444,7 @@ if (isMain) {
        default:
          console.error(`Unknown subcommand: ${subcommand}`);
          console.error("Run 'qmd skill help' for usage");
+          printDoctorHint();
          process.exit(1);
      }
      break;
@ -4420,6 +4482,7 @@ if (isMain) {
    default:
      console.error(`Unknown command: ${cli.command}`);
      console.error("Run 'qmd --help' for usage.");
+      printDoctorHint();
      process.exit(1);
  }

--- a/src/db.ts
+++ b/src/db.ts
@ -11,10 +11,16 @@
 * SQLite build before creating any database instances.
 */

-export const isBun = typeof globalThis.Bun !== "undefined";
+export const isBun = "Bun" in globalThis;

-let _Database: any;
-let _sqliteVecLoad: ((db: any) => void) | null;
+export type SQLiteValue = string | number | bigint | Buffer | Uint8Array | Float32Array | null;
+export type SQLiteParams = readonly SQLiteValue[];
+
+type DatabaseConstructor = new (path: string) => Database;
+type LoadableSqliteDatabase = Pick<Database, "loadExtension">;
+
+let _Database: DatabaseConstructor;
+let _sqliteVecLoad: ((db: LoadableSqliteDatabase) => void) | null;

 if (isBun) {
  // Dynamic string prevents tsc from resolving bun:sqlite on Node.js builds
@ -44,15 +50,15 @@ if (isBun) {
    const testDb = new BunDatabase(":memory:");
    testDb.loadExtension(vecPath);
    testDb.close();
-    _sqliteVecLoad = (db: any) => db.loadExtension(vecPath);
+    _sqliteVecLoad = (db: LoadableSqliteDatabase) => db.loadExtension(vecPath);
  } catch {
    // Vector search won't work, but BM25 and other operations are unaffected.
    _sqliteVecLoad = null;
  }
 } else {
-  _Database = (await import("better-sqlite3")).default;
+  _Database = (await import("better-sqlite3")).default as unknown as DatabaseConstructor;
  const sqliteVec = await import("sqlite-vec");
-  _sqliteVecLoad = (db: any) => sqliteVec.load(db);
+  _sqliteVecLoad = (db: LoadableSqliteDatabase) => sqliteVec.load(db as Parameters<typeof sqliteVec.load>[0]);
 }

 /**
@ -69,14 +75,14 @@ export interface Database {
  exec(sql: string): void;
  prepare(sql: string): Statement;
  loadExtension(path: string): void;
-  transaction<T extends (...args: any[]) => any>(fn: T): T;
+  transaction<T extends (...args: SQLiteValue[]) => unknown>(fn: T): T;
  close(): void;
 }

 export interface Statement {
-  run(...params: any[]): { changes: number; lastInsertRowid: number | bigint };
-  get(...params: any[]): any;
-  all(...params: any[]): any[];
+  run(...params: SQLiteValue[]): { changes: number; lastInsertRowid: number | bigint };
+  get<T = unknown>(...params: SQLiteValue[]): T | undefined;
+  all<T = unknown>(...params: SQLiteValue[]): T[];
 }

 /**
--- a/src/llm.ts
+++ b/src/llm.ts
@ -11,8 +11,12 @@ import type {
  Token as LlamaToken,
 } from "node-llama-cpp";

+type StdoutChunk = string | Uint8Array;
+type WriteCallback = (err?: Error | null) => void;
+
 type NodeLlamaCppModule = {
  getLlama: (options: Record<string, unknown>) => Promise<Llama>;
+  getLlamaGpuTypes?: (include?: "supported" | "allValid") => Promise<LlamaGpuMode[]>;
  resolveModelFile: (model: string, cacheDir: string) => Promise<string>;
  LlamaChatSession: new (options: { contextSequence: unknown }) => {
    prompt: (prompt: string, options?: Record<string, unknown>) => Promise<string>;
@ -47,8 +51,11 @@ let originalStdoutWrite: StdoutWrite | null = null;
 export async function withNativeStdoutRedirectedToStderr<T>(fn: () => Promise<T>): Promise<T> {
  if (nativeStdoutRedirectDepth === 0) {
    originalStdoutWrite = process.stdout.write.bind(process.stdout) as StdoutWrite;
-    process.stdout.write = ((chunk: any, encoding?: any, cb?: any) => {
-      return process.stderr.write(chunk, encoding, cb as any);
+    process.stdout.write = ((chunk: StdoutChunk, encodingOrCallback?: BufferEncoding | WriteCallback, callback?: WriteCallback) => {
+      if (typeof encodingOrCallback === "function") {
+        return process.stderr.write(chunk, encodingOrCallback);
+      }
+      return process.stderr.write(chunk, encodingOrCallback, callback);
    }) as StdoutWrite;
  }
  nativeStdoutRedirectDepth++;
@ -839,14 +846,15 @@ export class LlamaCpp implements LLM {
    if (!this.llama) {
      const gpuMode = resolveLlamaGpuMode();

-      const { getLlama, LlamaLogLevel } = await loadNodeLlamaCpp();
-      const loadLlama = async (gpu: LlamaGpuMode, sourceBuildAllowed = allowBuild) =>
+      const { getLlama, getLlamaGpuTypes, LlamaLogLevel } = await loadNodeLlamaCpp();
+      const loadLlama = async (gpu: LlamaGpuMode, sourceBuildAllowed = allowBuild, buildOverride?: "auto" | "never") =>
        await withNativeStdoutRedirectedToStderr(() => getLlama({
          // Prefer packaged prebuilt bindings before compiling llama.cpp locally.
-          // "autoAttempt" can try to compile a missing requested backend before
-          // falling back to another prebuilt backend; "auto" uses prebuilt/local
-          // binaries first and only builds when none are usable.
-          build: sourceBuildAllowed ? "auto" : "never",
+          // node-llama-cpp documents gpu:"auto" as the best default: Metal on
+          // Apple Silicon, CUDA when fully available, Vulkan where available,
+          // then CPU. Use build:"auto" for normal loads and build:"never" for
+          // diagnostic/probe paths that must not compile llama.cpp.
+          build: buildOverride ?? (sourceBuildAllowed ? "auto" : "never"),
          logLevel: LlamaLogLevel.error,
          gpu,
          progressLogs: false,
@ -881,6 +889,30 @@ export class LlamaCpp implements LLM {
      } else {
        try {
          llama = await loadLlama(gpuMode);
+
+          // If node-llama-cpp auto-detection chose CPU, do one no-build pass
+          // over all OS-valid packaged GPU backends. This preserves the
+          // documented auto mode for Metal/CUDA/Vulkan while recovering on
+          // systems where a packaged backend can load but detection is too
+          // conservative. Never compile during these extra probes.
+          if (gpuMode === "auto" && llama.gpu === false && getLlamaGpuTypes) {
+            const candidates = (await getLlamaGpuTypes("allValid"))
+              .filter((candidate): candidate is Exclude<LlamaGpuMode, "auto" | false> => candidate !== false && candidate !== "auto");
+            for (const candidate of candidates) {
+              if (failedGpuInitModes.has(candidate)) continue;
+              try {
+                const gpuLlama = await loadLlama(candidate, false, "never");
+                if (gpuLlama.gpu !== false) {
+                  await disposeWithTimeout("CPU llama runtime", () => llama.dispose());
+                  llama = gpuLlama;
+                  break;
+                }
+                await disposeWithTimeout(`${candidate} probe runtime`, () => gpuLlama.dispose());
+              } catch {
+                failedGpuInitModes.add(candidate);
+              }
+            }
+          }
        } catch (err) {
          // GPU backend (e.g. Vulkan/CUDA on headless/driverless machines) can throw at init.
          // Fall back to CPU so qmd still works, and cache the failure to avoid repeated
@ -896,7 +928,7 @@ export class LlamaCpp implements LLM {
      if (llama.gpu === false && !noGpuAccelerationWarningShown) {
        noGpuAccelerationWarningShown = true;
        process.stderr.write(
-          "QMD Warning: no GPU acceleration, running on CPU (slow). Run 'QMD_STATUS_DEVICE_PROBE=1 qmd status' for device details.\n"
+          "QMD Warning: no GPU acceleration, running on CPU (slow). Run 'qmd doctor' for device diagnostics.\n"
        );
      }
      this.llama = llama;
@ -1143,9 +1175,8 @@ export class LlamaCpp implements LLM {
        try {
          this.rerankContexts.push(await model.createRankingContext({
            contextSize: LlamaCpp.RERANK_CONTEXT_SIZE,
-            flashAttention: true,
            ...(threads > 0 ? { threads } : {}),
-          } as any));
+          }));
        } catch {
          if (this.rerankContexts.length === 0) {
            // Flash attention might not be supported — retry without it
@ -1359,7 +1390,7 @@ export class LlamaCpp implements LLM {
        temperature,
        topK: 20,
        topP: 0.8,
-        onTextChunk: (text) => {
+        onTextChunk: (text: string) => {
          result += text;
        },
      });
--- a/src/mcp/server.ts
+++ b/src/mcp/server.ts
@ -626,9 +626,21 @@ export async function startMcpHttpServer(
    return new Date().toISOString().slice(11, 23); // HH:mm:ss.SSS
  }

+  type JsonRpcLikeBody = {
+    method?: unknown;
+    params?: {
+      name?: unknown;
+      arguments?: Record<string, unknown>;
+    };
+  };
+  type RestSearchInput = {
+    type?: unknown;
+    query?: unknown;
+  };
+
  /** Extract a human-readable label from a JSON-RPC body */
-  function describeRequest(body: any): string {
-    const method = body?.method ?? "unknown";
+  function describeRequest(body: JsonRpcLikeBody): string {
+    const method = typeof body.method === "string" ? body.method : "unknown";
    if (method === "tools/call") {
      const tool = body.params?.name ?? "?";
      const args = body.params?.arguments;
@ -672,7 +684,7 @@ export async function startMcpHttpServer(
      // REST endpoint: POST /query (alias: /search) — structured search without MCP protocol
      if ((pathname === "/query" || pathname === "/search") && nodeReq.method === "POST") {
        const rawBody = await collectBody(nodeReq);
-        const params = JSON.parse(rawBody);
+        const params = JSON.parse(rawBody) as Record<string, unknown>;

        // Validate required fields
        if (!params.searches || !Array.isArray(params.searches)) {
@ -682,31 +694,32 @@ export async function startMcpHttpServer(
        }

        // Map to internal format
-        const queries: ExpandedQuery[] = params.searches.map((s: any) => ({
+        const searches = params.searches as RestSearchInput[];
+        const queries: ExpandedQuery[] = searches.map((s) => ({
          type: s.type as 'lex' | 'vec' | 'hyde',
          query: String(s.query || ""),
        }));

        // Use default collections if none specified
-        const effectiveCollections = params.collections ?? defaultCollectionNames;
+        const effectiveCollections = Array.isArray(params.collections) ? params.collections.map(String) : defaultCollectionNames;

        const results = await store.search({
          queries,
          collections: effectiveCollections.length > 0 ? effectiveCollections : undefined,
-          limit: params.limit ?? 10,
-          minScore: params.minScore ?? 0,
-          candidateLimit: params.candidateLimit,
-          intent: params.intent,
-          rerank: params.rerank,
+          limit: typeof params.limit === "number" ? params.limit : 10,
+          minScore: typeof params.minScore === "number" ? params.minScore : 0,
+          candidateLimit: typeof params.candidateLimit === "number" ? params.candidateLimit : undefined,
+          intent: typeof params.intent === "string" ? params.intent : undefined,
+          rerank: typeof params.rerank === "boolean" ? params.rerank : undefined,
        });

        // Use first lex or vec query for snippet extraction
-        const primaryQuery = params.searches.find((s: any) => s.type === 'lex')?.query
-          || params.searches.find((s: any) => s.type === 'vec')?.query
-          || params.searches[0]?.query || "";
+        const primaryQuery = searches.find((s) => s.type === 'lex')?.query
+          || searches.find((s) => s.type === 'vec')?.query
+          || searches[0]?.query || "";

        const formatted = results.map(r => {
-          const { line, snippet } = extractSnippet(r.body, primaryQuery, 300, r.bestChunkPos, r.bestChunk.length, params.intent);
+          const { line, snippet } = extractSnippet(r.body, String(primaryQuery), 300, r.bestChunkPos, r.bestChunk.length, typeof params.intent === "string" ? params.intent : undefined);
          return {
            docid: `#${r.docid}`,
            file: r.displayPath,
--- a/src/store.ts
+++ b/src/store.ts
@ -1371,18 +1371,30 @@ export async function reindexCollection(
  return { indexed, updated, unchanged, removed, orphanedCleaned };
 }

+export type EmbedFailure = {
+  path: string;
+  hash: string;
+  seq: number;
+  attempts: number;
+  reason: string;
+};
+
 export type EmbedProgress = {
  chunksEmbedded: number;
  totalChunks: number;
  bytesProcessed: number;
  totalBytes: number;
+  /** Active failed chunks still awaiting a successful retry. */
  errors: number;
+  failures?: EmbedFailure[];
 };

 export type EmbedResult = {
  docsProcessed: number;
  chunksEmbedded: number;
+  /** Active failed chunks that did not recover after retries. */
  errors: number;
+  failures?: EmbedFailure[];
  durationMs: number;
 };

@ -1412,12 +1424,14 @@ type EmbeddingDoc = PendingEmbeddingDoc & {

 type ChunkItem = {
  hash: string;
+  path: string;
  title: string;
  text: string;
  seq: number;
  pos: number;
  tokens: number;
  bytes: number;
+  expectedTotalChunks: number;
 };

 function validatePositiveIntegerOption(name: string, value: number | undefined, fallback: number): number {
@ -1591,11 +1605,81 @@ export async function generateEmbeddings(
  // Create a session manager for this llm instance
  const result = await withLLMSessionForLlm(llm, async (session) => {
    let chunksEmbedded = 0;
-    let errors = 0;
    let bytesProcessed = 0;
    let totalChunks = 0;
    let vectorTableInitialized = false;
    const BATCH_SIZE = 32;
+    const RETRY_AFTER_SUCCESSFUL_CHUNKS = 64;
+    const MAX_RETRY_ATTEMPTS = 3;
+    const failures = new Map<string, EmbedFailure>();
+    const retryQueue = new Map<string, ChunkItem>();
+    let successesSinceRetry = 0;
+
+    const failureList = () => [...failures.values()];
+    const activeErrorCount = () => failures.size;
+    const chunkKey = (chunk: ChunkItem) => `${chunk.hash}:${chunk.seq}`;
+    const reasonFromError = (error: unknown) => {
+      const raw = error instanceof Error ? error.message : String(error);
+      return raw.length > 180 ? `${raw.slice(0, 177)}...` : raw;
+    };
+    const recordFailure = (chunk: ChunkItem, reason: string) => {
+      const key = chunkKey(chunk);
+      const previous = failures.get(key);
+      failures.set(key, {
+        path: chunk.path,
+        hash: chunk.hash,
+        seq: chunk.seq,
+        attempts: (previous?.attempts ?? 0) + 1,
+        reason,
+      });
+      retryQueue.set(key, chunk);
+    };
+    const clearFailure = (chunk: ChunkItem) => {
+      const key = chunkKey(chunk);
+      failures.delete(key);
+      retryQueue.delete(key);
+    };
+    const tryEmbedChunk = async (chunk: ChunkItem): Promise<boolean> => {
+      try {
+        const text = formatDocForEmbedding(chunk.text, chunk.title, embedModelUri);
+        const result = await session.embed(text, { model });
+        if (!result) {
+          recordFailure(chunk, "embedding returned no vector");
+          return false;
+        }
+        insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), model, now, chunk.expectedTotalChunks, fingerprint);
+        chunksEmbedded++;
+        successesSinceRetry++;
+        clearFailure(chunk);
+        return true;
+      } catch (error) {
+        recordFailure(chunk, reasonFromError(error));
+        return false;
+      }
+    };
+    const retryFailedChunks = async (force = false) => {
+      if (!session.isValid || retryQueue.size === 0) return;
+      if (!force && successesSinceRetry < RETRY_AFTER_SUCCESSFUL_CHUNKS) return;
+      successesSinceRetry = 0;
+
+      // Normal mode: one retry pass after enough unrelated chunks succeeded.
+      // Force mode: we have run out of other chunks for this batch, so keep
+      // retrying outstanding failures until they recover or hit the cap. The
+      // cap prevents endless loops on permanently bad chunks.
+      do {
+        let retried = 0;
+        for (const [key, chunk] of [...retryQueue]) {
+          const failure = failures.get(key);
+          if (!failure || failure.attempts >= MAX_RETRY_ATTEMPTS) continue;
+          retried++;
+          await tryEmbedChunk(chunk);
+        }
+        if (!force || retried === 0) break;
+      } while (session.isValid && [...retryQueue].some(([key]) => {
+        const failure = failures.get(key);
+        return !!failure && failure.attempts < MAX_RETRY_ATTEMPTS;
+      }));
+    };
    const batches = buildEmbeddingBatches(docsToEmbed, maxDocsPerBatch, maxBatchBytes);

    for (const batchMeta of batches) {
@ -1625,12 +1709,14 @@ export async function generateEmbeddings(
        for (let seq = 0; seq < chunks.length; seq++) {
          batchChunks.push({
            hash: doc.hash,
+            path: doc.path,
            title,
            text: chunks[seq]!.text,
            seq,
            pos: chunks[seq]!.pos,
            tokens: chunks[seq]!.tokens,
            bytes: encoder.encode(chunks[seq]!.text).length,
+            expectedTotalChunks: chunks.length,
          });
        }
        expectedChunksByHash.set(doc.hash, chunks.length);
@ -1640,7 +1726,7 @@ export async function generateEmbeddings(

      if (batchChunks.length === 0) {
        bytesProcessed += batchBytes;
-        options?.onProgress?.({ chunksEmbedded, totalChunks, bytesProcessed, totalBytes, errors });
+        options?.onProgress?.({ chunksEmbedded, totalChunks, bytesProcessed, totalBytes, errors: activeErrorCount(), failures: failureList() });
        continue;
      }

@ -1661,18 +1747,18 @@ export async function generateEmbeddings(
      for (let batchStart = 0; batchStart < batchChunks.length; batchStart += BATCH_SIZE) {
        // Abort early if session has been invalidated (e.g. max duration exceeded)
        if (!session.isValid) {
-          const remaining = batchChunks.length - batchStart;
-          errors += remaining;
-          console.warn(`⚠ Session expired — skipping ${remaining} remaining chunks`);
+          const remainingChunks = batchChunks.slice(batchStart);
+          for (const chunk of remainingChunks) recordFailure(chunk, "LLM session expired before embedding chunk");
+          console.warn(`⚠ Session expired — skipping ${remainingChunks.length} remaining chunks`);
          break;
        }

-        // Abort early if error rate is too high (>80% of processed chunks failed)
-        const processed = chunksEmbedded + errors;
-        if (processed >= BATCH_SIZE && errors > processed * 0.8) {
-          const remaining = batchChunks.length - batchStart;
-          errors += remaining;
-          console.warn(`⚠ Error rate too high (${errors}/${processed}) — aborting embedding`);
+        // Abort early if active error rate is too high (>80% of attempted chunks failed)
+        const processed = chunksEmbedded + activeErrorCount();
+        if (processed >= BATCH_SIZE && activeErrorCount() > processed * 0.8) {
+          const remainingChunks = batchChunks.slice(batchStart);
+          for (const chunk of remainingChunks) recordFailure(chunk, "embedding aborted because error rate was too high");
+          console.warn(`⚠ Error rate too high (${activeErrorCount()}/${processed}) — aborting embedding`);
          break;
        }

@ -1686,34 +1772,29 @@ export async function generateEmbeddings(
            const chunk = chunkBatch[i]!;
            const embedding = embeddings[i];
            if (embedding) {
-              insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embedding.embedding), model, now, expectedChunksByHash.get(chunk.hash) ?? 1, fingerprint);
+              insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embedding.embedding), model, now, chunk.expectedTotalChunks, fingerprint);
              chunksEmbedded++;
+              successesSinceRetry++;
+              clearFailure(chunk);
            } else {
-              errors++;
+              recordFailure(chunk, "batch embedding returned no vector");
            }
            batchChunkBytesProcessed += chunk.bytes;
          }
-        } catch {
-          // Batch failed — try individual embeddings as fallback
-          // But skip if session is already invalid (avoids N doomed retries)
+          await retryFailedChunks();
+        } catch (error) {
+          // Batch failed — try individual embeddings as fallback. If an
+          // individual retry succeeds, any prior failure for that chunk is
+          // cleared, so the visible error count reflects outstanding failures.
+          const batchReason = reasonFromError(error);
          if (!session.isValid) {
-            errors += chunkBatch.length;
+            for (const chunk of chunkBatch) recordFailure(chunk, `batch failed and session expired: ${batchReason}`);
            batchChunkBytesProcessed += chunkBatch.reduce((sum, c) => sum + c.bytes, 0);
          } else {
            for (const chunk of chunkBatch) {
-              try {
-                const text = formatDocForEmbedding(chunk.text, chunk.title, embedModelUri);
-                const result = await session.embed(text, { model });
-                if (result) {
-                  insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), model, now, expectedChunksByHash.get(chunk.hash) ?? 1, fingerprint);
-                  chunksEmbedded++;
-                } else {
-                  errors++;
-                }
-              } catch {
-                errors++;
-              }
+              await tryEmbedChunk(chunk);
              batchChunkBytesProcessed += chunk.bytes;
+              await retryFailedChunks();
            }
          }
        }
@ -1726,26 +1807,30 @@ export async function generateEmbeddings(
          totalChunks,
          bytesProcessed: bytesProcessed + proportionalBytes,
          totalBytes,
-          errors,
+          errors: activeErrorCount(),
+          failures: failureList(),
        });
      }

+      await retryFailedChunks(true);
+
      const removedPartialChunks = removeIncompleteEmbeddings(db, expectedChunksByHash, model);
      if (removedPartialChunks > 0) {
        chunksEmbedded = Math.max(0, chunksEmbedded - removedPartialChunks);
      }

      bytesProcessed += batchBytes;
-      options?.onProgress?.({ chunksEmbedded, totalChunks, bytesProcessed, totalBytes, errors });
+      options?.onProgress?.({ chunksEmbedded, totalChunks, bytesProcessed, totalBytes, errors: activeErrorCount(), failures: failureList() });
    }

-    return { chunksEmbedded, errors };
+    return { chunksEmbedded, errors: activeErrorCount(), failures: failureList() };
  }, { maxDuration: 30 * 60 * 1000, name: 'generateEmbeddings' });

  return {
    docsProcessed: totalDocs,
    chunksEmbedded: result.chunksEmbedded,
    errors: result.errors,
+    failures: result.failures,
    durationMs: Date.now() - startTime,
  };
 }
@ -3635,12 +3720,14 @@ export async function expandQuery(query: string, model: string = DEFAULT_QUERY_M
  const cached = getCachedResult(db, cacheKey);
  if (cached) {
    try {
-      const parsed = JSON.parse(cached) as any[];
+      const parsed = JSON.parse(cached) as unknown;
+      if (!Array.isArray(parsed)) return [];
+      const rows = parsed as Array<Record<string, unknown>>;
      // Migrate old cache format: { type, text } → { type, query }
-      if (parsed.length > 0 && parsed[0].query) {
-        return parsed as ExpandedQuery[];
-      } else if (parsed.length > 0 && parsed[0].text) {
-        return parsed.map((r: any) => ({ type: r.type, query: r.text }));
+      if (rows.length > 0 && typeof rows[0]?.query === "string") {
+        return rows.map((r) => ({ type: r.type as ExpandedQuery["type"], query: String(r.query) }));
+      } else if (rows.length > 0 && typeof rows[0]?.text === "string") {
+        return rows.map((r) => ({ type: r.type as ExpandedQuery["type"], query: String(r.text) }));
      }
    } catch {
      // Old cache format (pre-typed, newline-separated text) — re-expand
--- a/src/types/picomatch.d.ts
+++ b/src/types/picomatch.d.ts
@ -0,0 +1,4 @@
+declare module "picomatch" {
+  export type Matcher = (input: string) => boolean;
+  export default function picomatch(pattern: string | string[], options?: Record<string, unknown>): Matcher;
+}
--- a/test/bin-wrapper.test.ts
+++ b/test/bin-wrapper.test.ts
@ -27,7 +27,7 @@ function makeTempFixture() {
  return { root, capturePath, runtimeBin };
 }

-function makePackage(root: string, packagePath: string, lockfiles: string[] = [], options: { dist?: boolean; source?: boolean; tsx?: boolean } = {}) {
+function makePackage(root: string, packagePath: string, lockfiles: string[] = [], options: { dist?: boolean; source?: boolean; tsx?: boolean; git?: boolean } = {}) {
  const packageRoot = join(root, packagePath);
  const includeDist = options.dist ?? true;
  mkdirSync(join(packageRoot, "bin"), { recursive: true });
@ -45,6 +45,9 @@ function makePackage(root: string, packagePath: string, lockfiles: string[] = []
    mkdirSync(join(packageRoot, "node_modules", "tsx", "dist"), { recursive: true });
    writeFileSync(join(packageRoot, "node_modules", "tsx", "dist", "cli.mjs"), "// tsx fixture\n");
  }
+  if (options.git) {
+    mkdirSync(join(packageRoot, ".git"), { recursive: true });
+  }
  for (const lockfile of lockfiles) {
    writeFileSync(join(packageRoot, lockfile), "");
  }
@ -173,9 +176,19 @@ describe("bin/qmd package wrapper", () => {
    expect(result.scriptPath).toBe(realpathSync(join(packageRoot, "dist", "cli", "qmd.js")));
  });

-  test("falls back to source with bun in an unbuilt Bun checkout", () => {
+  test("packaged tree uses dist even if source files are present", () => {
    const { root, runtimeBin, capturePath } = makeTempFixture();
-    const packageRoot = makePackage(root, "qmd", ["bun.lock"], { dist: false, source: true });
+    const packageRoot = makePackage(root, "node_modules/@tobilu/qmd", ["bun.lock"], { source: true });
+
+    const result = runWrapper(join(packageRoot, "bin", "qmd"), runtimeBin, capturePath);
+
+    expect(result.runtime).toBe("bun");
+    expect(result.scriptPath).toBe(realpathSync(join(packageRoot, "dist", "cli", "qmd.js")));
+  });
+
+  test("prefers source with bun in a Bun checkout even when dist exists", () => {
+    const { root, runtimeBin, capturePath } = makeTempFixture();
+    const packageRoot = makePackage(root, "qmd", ["bun.lock"], { source: true, git: true });

    const result = runWrapper(join(packageRoot, "bin", "qmd"), runtimeBin, capturePath);

@ -184,9 +197,9 @@ describe("bin/qmd package wrapper", () => {
    expect(result.args).toEqual(["--version"]);
  });

-  test("falls back to source through tsx in an unbuilt Node checkout", () => {
+  test("prefers source through tsx in a Node checkout even when dist exists", () => {
    const { root, runtimeBin, capturePath } = makeTempFixture();
-    const packageRoot = makePackage(root, "qmd", [], { dist: false, source: true, tsx: true });
+    const packageRoot = makePackage(root, "qmd", [], { source: true, tsx: true, git: true });

    const result = runWrapper(join(packageRoot, "bin", "qmd"), runtimeBin, capturePath);

@ -212,5 +225,6 @@ describe("bin/qmd package wrapper", () => {
    expect(result.stderr).toContain("qmd is not built");
    expect(result.stderr).toContain("bun install && bun run build");
    expect(result.stderr).toContain("npm install && npm run build");
+    expect(result.stderr).toContain("qmd doctor");
  });
 });
--- a/test/cli.test.ts
+++ b/test/cli.test.ts
@ -630,7 +630,6 @@ describe("CLI Status Command", () => {
    const overrides = {
      XDG_CACHE_HOME: join(env.configDir, "cache"),
      QMD_DOCTOR_DEVICE_PROBE: "0",
-      QMD_STATUS_DEVICE_PROBE: "1",
      QMD_FORCE_CPU: "1",
      QMD_LLAMA_GPU: "metal",
      QMD_EMBED_PARALLELISM: "2",
@ -665,15 +664,21 @@ describe("CLI Status Command", () => {
  test("qmd doctor flags mixed embedding fingerprints", async () => {
    const db = openDatabase(testDbPath);
    const doc = db.prepare(`SELECT hash FROM documents WHERE active = 1 LIMIT 1`).get() as { hash: string };
+    const now = new Date().toISOString();
    db.prepare(`
      INSERT OR REPLACE INTO content_vectors (hash, seq, pos, model, embed_fingerprint, total_chunks, embedded_at)
-      VALUES (?, 0, 0, ?, 'stale1', 1, ?)
-    `).run(doc.hash, resolveEmbedModelForCli(), new Date().toISOString());
+      VALUES (?, 0, 0, ?, 'stale1', 2, ?)
+    `).run(doc.hash, resolveEmbedModelForCli(), now);
+    db.prepare(`
+      INSERT OR REPLACE INTO content_vectors (hash, seq, pos, model, embed_fingerprint, total_chunks, embedded_at)
+      VALUES (?, 1, 1, ?, 'stale2', 2, ?)
+    `).run(doc.hash, resolveEmbedModelForCli(), now);
    db.close();

    const { stdout, exitCode } = await runQmd(["doctor"]);
    expect(exitCode).toBe(0);
    expect(stdout).toContain("embedding fingerprints");
+    expect(stdout).toContain("mixed named embedding fingerprints");
    expect(stdout).toContain("stale1");
  }, 20000);

@ -684,13 +689,12 @@ describe("CLI Status Command", () => {
    expect(stdout).toContain("Collection");
  });

-  test("shows device mode without native probing by default", async () => {
+  test("status omits device probing details; doctor owns GPU diagnostics", async () => {
    const { stdout, exitCode } = await runQmd(["status"]);
    expect(exitCode).toBe(0);
-    expect(stdout).toContain("Device");
-    expect(stdout).toContain("Mode:");
-    expect(stdout).toContain("not probed");
-    expect(stdout).toContain("QMD_STATUS_DEVICE_PROBE=1");
+    expect(stdout).not.toContain("Device");
+    expect(stdout).not.toContain("QMD_STATUS_DEVICE_PROBE");
+    expect(stdout).not.toContain("not probed");
  });
 });

@ -973,8 +977,9 @@ describe("CLI Error Handling", () => {
  test("handles unknown command", async () => {
    const { stderr, exitCode } = await runQmd(["unknowncommand"]);
    expect(exitCode).toBe(1);
-    // Should indicate unknown command
+    // Should indicate unknown command and point users to diagnostics
    expect(stderr).toContain("Unknown command");
+    expect(stderr).toContain("qmd doctor");
  });

  test("uses INDEX_PATH environment variable", async () => {
@ -1750,11 +1755,15 @@ describe("status and collection list hide filesystem paths", () => {
  });

  test("doctor does not show full filesystem paths", async () => {
-    const { stdout, exitCode } = await runQmd(["doctor"], { dbPath: localDbPath, configDir: localConfigDir });
+    const { stdout, exitCode } = await runQmd(["doctor"], {
+      dbPath: localDbPath,
+      configDir: localConfigDir,
+      env: { QMD_DOCTOR_DEVICE_PROBE: "0" },
+    });
    expect(exitCode).toBe(0);

    expect(stdout).toContain("QMD Doctor");
-    const lines = stdout.split('\n').filter(l => !l.includes('Index:'));
+    const lines = stdout.split('\n').filter(l => !l.includes('Index:') && !l.includes('INDEX_PATH=') && !l.includes('QMD_CONFIG_DIR='));
    const pathLines = lines.filter(l => l.includes('/Users/') || l.includes('/home/') || l.includes('/tmp/'));
    expect(pathLines.length).toBe(0);
  }, 20000);
@ -2079,6 +2088,7 @@ describe("mcp stdio launcher", () => {
    try {
      await mkdir(join(tempPackage, "bin"), { recursive: true });
      await mkdir(join(tempPackage, "dist", "cli"), { recursive: true });
+      await writeFile(join(tempPackage, "dist", "cli", "qmd.js"), "// fixture\n");
      await mkdir(join(tempPackage, "fake-bin"), { recursive: true });

      const qmdBin = join(tempPackage, "bin", "qmd");
--- a/test/llm.test.ts
+++ b/test/llm.test.ts
@ -263,7 +263,8 @@ describe("native llama stdout containment", () => {

      const stderr = String(stderrSpy.mock.calls.map(call => call[0]).join(""));
      expect(stderr.match(/no GPU acceleration/g)?.length).toBe(1);
-      expect(stderr).toContain("QMD_STATUS_DEVICE_PROBE=1 qmd status");
+      expect(stderr).toContain("qmd doctor");
+      expect(stderr).not.toContain("QMD_STATUS_DEVICE_PROBE");
    } finally {
      stderrSpy.mockRestore();
      setNodeLlamaCppModuleForTest(null);
--- a/test/store.test.ts
+++ b/test/store.test.ts
@ -3242,9 +3242,13 @@ describe("Embedding batching", () => {
  test("generateEmbeddings does not mark a partially embedded multi-chunk document complete", async () => {
    const store = await createTestStore();
    const db = store.db;
+    let embedCalls = 0;
    const fakeLlm = {
      async embed(_text: string, _options?: { model?: string }) {
-        return { embedding: [0.1, 0.2, 0.3], model: "fake-embed" };
+        embedCalls++;
+        return embedCalls === 1
+          ? { embedding: [0.1, 0.2, 0.3], model: "fake-embed" }
+          : null;
      },
      async embedBatch(texts: string[], _options?: { model?: string }) {
        return texts.map((_text, index) => index === 0
@ -3266,6 +3270,7 @@ describe("Embedding batching", () => {
      const result = await generateEmbeddings(store);

      expect(result.errors).toBeGreaterThan(0);
+      expect(result.failures?.[0]?.attempts).toBe(3);
      expect(db.prepare(`SELECT COUNT(*) as count FROM content_vectors`).get()).toEqual({ count: 0 });
      expect(db.prepare(`SELECT COUNT(*) as count FROM vectors_vec`).get()).toEqual({ count: 0 });
      expect(store.getHashesNeedingEmbedding()).toBe(1);
@ -3276,6 +3281,42 @@ describe("Embedding batching", () => {
    }
  });

+  test("generateEmbeddings clears chunk errors after successful retry", async () => {
+    const store = await createTestStore();
+    const db = store.db;
+    const fakeLlm = {
+      async embed(_text: string, _options?: { model?: string }) {
+        return { embedding: [0.1, 0.2, 0.3], model: "fake-embed" };
+      },
+      async embedBatch(texts: string[], _options?: { model?: string }) {
+        return texts.map((_text, index) => index === 0
+          ? { embedding: [1, 2, 3], model: "fake-embed" }
+          : null
+        );
+      },
+    };
+
+    setDefaultLlamaCpp(createFakeTokenizer() as any);
+    store.llm = fakeLlm as any;
+
+    try {
+      await insertTestDocument(db, "docs", {
+        name: "retry-doc",
+        body: "# Retry doc\n\n" + "transient embedding failure ".repeat(260),
+      });
+
+      const result = await generateEmbeddings(store);
+
+      expect(result.errors).toBe(0);
+      expect(result.failures).toEqual([]);
+      expect(db.prepare(`SELECT COUNT(*) as count FROM content_vectors`).get()).toEqual({ count: result.chunksEmbedded });
+      expect(store.getHashesNeedingEmbedding()).toBe(0);
+    } finally {
+      setDefaultLlamaCpp(null);
+      await cleanupTestDb(store);
+    }
+  });
+
  test("generateEmbeddings opens a long-lived LLM session for embed runs", async () => {
    const store = await createTestStore();
    const fakeLlm = createFakeEmbedLlm();
--- a/tsconfig.build.json
+++ b/tsconfig.build.json
@ -4,7 +4,7 @@
    "noEmit": false,
    "outDir": "dist",
    "declaration": true,
-    "noImplicitAny": false
+    "noImplicitAny": true
  },
  "include": ["src/**/*.ts"],
  "exclude": ["src/**/*.test.ts", "src/test-preload.ts", "src/bench-*.ts"]