feat: add qmd doctor vector diagnostics

2026-05-18 01:52:05 +00:00 · 2026-05-18 01:52:05 +00:00 · ac6b154f0c
commit ac6b154f0c
parent ddbd6bd8be
6 changed files with 408 additions and 72 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -5,9 +5,12 @@
 ### Changes

 - Agent skills: add `qmd skills list|get|path` to serve version-matched runtime skill instructions from the installed CLI, and make `qmd skill install` write a stable discovery stub so installed agent skills do not go stale after QMD upgrades.
+- CLI: add `qmd doctor` for index/runtime diagnostics, including SQLite/sqlite-vec versions, embedding fingerprint freshness, mixed-fingerprint detection, safe legacy fingerprint adoption, and content-hash sampling.

 ### Fixes

+- Embedding: fingerprint vector metadata using the active embedding model and formatting/chunking parameters so stale vectors are treated as pending after search semantics change. Legacy `content_vectors` columns are migrated lazily on first vector-health/write use to preserve fast QMD startup.
+
 - Skill: expand the packaged QMD skill with retrieval-first workflows, structured query examples, wiki/source collection guidance, and safe fallbacks when model-backed search is unavailable.
 - Tests: make `bun run test` execute the local unit suite under both Node/Vitest and Bun (`test:node` + `test:bun`) so runtime-specific regressions are caught before CI.
 - Model config: centralize embedding/rerank/generation model resolution so `qmd embed`, `status`, `query`, `vsearch`, `pull`, SDK vector search, and `bench` use the same active `.qmd/index.yaml` model hints and environment fallbacks.
--- a/src/cli/qmd.ts
+++ b/src/cli/qmd.ts
@ -1,4 +1,4 @@
-import { openDatabase } from "../db.js";
+import { isBun, openDatabase } from "../db.js";
 import type { Database } from "../db.js";
 import fastGlob from "fast-glob";
 import { execSync, spawn as nodeSpawn } from "child_process";
@ -31,6 +31,7 @@ import {
  hashContent,
  extractTitle,
  formatDocForEmbedding,
+  getEmbeddingFingerprint,
  chunkDocumentByTokens,
  clearCache,
  getCacheKey,
@ -74,6 +75,7 @@ import {
  getDefaultDbPath,
  reindexCollection,
  generateEmbeddings,
+  maybeAdoptLegacyEmbeddingFingerprint,
  syncConfigToDb,
  type ReindexResult,
  type ChunkStrategy,
@ -3228,10 +3230,103 @@ function showHelp(): void {
  console.log(`Index: ${getDbPath()}`);
 }

-async function showVersion(): Promise<void> {
+/**
+ * Print a single doctor result line in the form `<mark> <label>: <details>`.
+ * The mark is a check sign wrapped in `c.green` when `ok` is true and a
+ * warning sign wrapped in `c.yellow` otherwise.
+ */
+function doctorCheck(label: string, ok: boolean, details: string): void {
+  let mark: string;
+  if (ok) {
+    mark = `${c.green}✓${c.reset}`;
+  } else {
+    mark = `${c.yellow}⚠${c.reset}`;
+  }
+  console.log(`${mark} ${label}: ${details}`);
+}
+
+/**
+ * Run the `qmd doctor` diagnostics and print one result line per check.
+ *
+ * Checks, in order: SQLite runtime version, declared better-sqlite3
+ * dependency, sqlite-vec version, legacy fingerprint adoption, embedding
+ * freshness, the fingerprint mix stored in `content_vectors`, and a re-hash of
+ * one randomly chosen active document.
+ *
+ * Each check is wrapped in its own try/catch so a failing probe is reported as
+ * a warning instead of aborting the remaining checks, and the database is
+ * closed in a `finally` so it is released even when setup throws.
+ */
+async function showDoctor(): Promise<void> {
+  const describeError = (error: unknown): string => (error instanceof Error ? error.message : String(error));
+  const storeInstance = getStore();
+
+  try {
+    const db = storeInstance.db;
+    const pkg = readPackageJson();
+    const embedModel = resolveEmbedModelForCli();
+    const fingerprint = getEmbeddingFingerprint(embedModel);
+
+    console.log(`${c.bold}QMD Doctor${c.reset}\n`);
+    console.log(`Index: ${getDbPath()}`);
+    console.log(`Runtime: ${isBun ? "bun:sqlite" : "better-sqlite3"}`);
+
+    try {
+      const row = db.prepare(`SELECT sqlite_version() AS version`).get() as { version: string };
+      doctorCheck("SQLite runtime", true, row.version);
+    } catch (error) {
+      doctorCheck("SQLite runtime", false, describeError(error));
+    }
+
+    // This is the range declared in package.json, not a probed runtime version.
+    const betterSqliteVersion = pkg.dependencies?.["better-sqlite3"] ?? pkg.devDependencies?.["better-sqlite3"] ?? "not declared";
+    doctorCheck("better_sqlite version", true, String(betterSqliteVersion));
+
+    try {
+      const row = db.prepare(`SELECT vec_version() AS version`).get() as { version: string };
+      doctorCheck("sqlite-vec", true, row.version);
+    } catch (error) {
+      doctorCheck("sqlite-vec", false, describeError(error));
+    }
+
+    try {
+      const adoption = await maybeAdoptLegacyEmbeddingFingerprint(storeInstance, embedModel);
+      const adopted = adoption.adopted > 0;
+      // Stay silent when there was nothing to check and nothing was adopted.
+      if (adoption.checked || adopted) {
+        const details = adopted ? `adopted ${adoption.adopted} legacy chunks; ${adoption.reason}` : adoption.reason;
+        doctorCheck("legacy fingerprint adoption", adopted, details);
+      }
+    } catch (error) {
+      doctorCheck("legacy fingerprint adoption", false, describeError(error));
+    }
+
+    try {
+      const pending = getHashesNeedingEmbedding(db, undefined, embedModel);
+      doctorCheck("embedding freshness", pending === 0, pending === 0 ? "all active documents match current fingerprint" : `${pending} active documents need embedding`);
+    } catch (error) {
+      doctorCheck("embedding freshness", false, describeError(error));
+    }
+
+    try {
+      const rows = db.prepare(`
+        SELECT model, embed_fingerprint AS fingerprint, COUNT(DISTINCT hash) AS docs, COUNT(*) AS chunks
+        FROM content_vectors
+        GROUP BY model, embed_fingerprint
+        ORDER BY chunks DESC, model, embed_fingerprint
+      `).all() as { model: string; fingerprint: string; docs: number; chunks: number }[];
+      const uniqueFingerprints = new Set(rows.map(row => row.fingerprint));
+      const offCurrent = rows.filter(row => row.model === embedModel && row.fingerprint !== fingerprint);
+      // Healthy means: no vectors at all, or exactly one fingerprint and it is the current one.
+      const ok = rows.length === 0 || (uniqueFingerprints.size === 1 && rows[0]?.fingerprint === fingerprint && offCurrent.length === 0);
+      const details = rows.length === 0
+        ? `none yet; current ${fingerprint}`
+        : rows.map(row => {
+            const label = row.fingerprint === fingerprint ? "current" : (row.fingerprint || "legacy");
+            return `${row.model}:${label} ${row.docs} docs/${row.chunks} chunks`;
+          }).join("; ");
+      doctorCheck("embedding fingerprints", ok, details);
+    } catch (error) {
+      doctorCheck("embedding fingerprints", false, describeError(error));
+    }
+
+    try {
+      const sample = db.prepare(`
+        SELECT c.hash, c.doc
+        FROM documents d
+        JOIN content c ON c.hash = d.hash
+        WHERE d.active = 1
+        ORDER BY random()
+        LIMIT 1
+      `).get() as { hash: string; doc: string } | undefined;
+      if (sample) {
+        const rehashed = await hashContent(sample.doc);
+        const matches = rehashed === sample.hash;
+        doctorCheck("content hash sample", matches, `${sample.hash.slice(0, 12)} ${matches ? "matches" : `!= ${rehashed.slice(0, 12)}`}`);
+      } else {
+        doctorCheck("content hash sample", true, "no active documents indexed");
+      }
+    } catch (error) {
+      doctorCheck("content hash sample", false, describeError(error));
+    }
+  } finally {
+    closeDb();
+  }
+}
+
+/**
+ * Read and parse the `package.json` found two directories above the directory
+ * of this module. The parsed JSON is returned as-is (untyped, unvalidated);
+ * read or parse failures propagate to the caller.
+ */
+function readPackageJson(): any {
  const scriptDir = dirname(fileURLToPath(import.meta.url));
  const pkgPath = resolve(scriptDir, "..", "..", "package.json");
-  const pkg = JSON.parse(readFileSync(pkgPath, "utf-8"));
+  return JSON.parse(readFileSync(pkgPath, "utf-8"));
+}
+
+async function showVersion(): Promise<void> {
+  const scriptDir = dirname(fileURLToPath(import.meta.url));
+  const pkg = readPackageJson();

  let commit = "";
  try {
@ -3539,6 +3634,10 @@ if (isMain) {
      await showStatus();
      break;

+    case "doctor":
+      await showDoctor();
+      break;
+
    case "update":
      await updateCollections();
      break;
--- a/src/store.ts
+++ b/src/store.ts
@ -50,6 +50,10 @@ export const DEFAULT_MULTI_GET_MAX_BYTES = 10 * 1024; // 10KB
 export const DEFAULT_EMBED_MAX_DOCS_PER_BATCH = 64;
 export const DEFAULT_EMBED_MAX_BATCH_BYTES = 64 * 1024 * 1024; // 64MB

+// Fixed probe inputs that getEmbeddingFingerprint runs through the query and
+// document formatters, so the formatter output becomes part of the fingerprint.
+const EMBED_FINGERPRINT_PROBE_QUERY = "__qmd_embedding_query_probe__";
+const EMBED_FINGERPRINT_PROBE_TITLE = "__qmd_embedding_title_probe__";
+const EMBED_FINGERPRINT_PROBE_DOC = "__qmd_embedding_document_probe__";
+
 // Chunking: 900 tokens per chunk with 15% overlap
 // Increased from 800 to accommodate smart chunking finding natural break points
 export const CHUNK_SIZE_TOKENS = 900;
@ -61,6 +65,17 @@ export const CHUNK_OVERLAP_CHARS = CHUNK_OVERLAP_TOKENS * 4;  // 540 chars
 export const CHUNK_WINDOW_TOKENS = 200;
 export const CHUNK_WINDOW_CHARS = CHUNK_WINDOW_TOKENS * 4;  // 800 chars

+/**
+ * Derive a short fingerprint for the embedding setup of `model`.
+ *
+ * The fingerprint is the first 6 hex characters of a SHA-256 digest computed
+ * over the model name, the formatted probe query, the formatted probe
+ * document, and the chunk size/overlap token constants, so it changes when any
+ * of those inputs change.
+ *
+ * @param model Embedding model identifier; defaults to DEFAULT_EMBED_MODEL.
+ * @returns A 6-character lowercase hex string.
+ */
+export function getEmbeddingFingerprint(model: string = DEFAULT_EMBED_MODEL): string {
+  const probeQuery = formatQueryForEmbedding(EMBED_FINGERPRINT_PROBE_QUERY, model);
+  const probeDoc = formatDocForEmbedding(EMBED_FINGERPRINT_PROBE_DOC, EMBED_FINGERPRINT_PROBE_TITLE, model);
+  // Newline-separated "key:value" lines, in a fixed order.
+  const significant =
+    `model:${model}\n` +
+    `query:${probeQuery}\n` +
+    `doc:${probeDoc}\n` +
+    `chunk_tokens:${CHUNK_SIZE_TOKENS}\n` +
+    `chunk_overlap_tokens:${CHUNK_OVERLAP_TOKENS}`;
+  const digest = createHash("sha256").update(significant).digest("hex");
+  return digest.slice(0, 6);
+}
+
 /**
 * Get the LlamaCpp instance for a store — prefers the store's own instance,
 * falls back to the global singleton.
@ -861,28 +876,20 @@ function initializeDatabase(db: Database): void {
    )
  `);

-  // Content vectors
-  const cvInfo = db.prepare(`PRAGMA table_info(content_vectors)`).all() as { name: string }[];
-  const hasSeqColumn = cvInfo.some(col => col.name === 'seq');
-  if (cvInfo.length > 0 && !hasSeqColumn) {
-    db.exec(`DROP TABLE IF EXISTS content_vectors`);
-    db.exec(`DROP TABLE IF EXISTS vectors_vec`);
-  }
+  // Content vectors. Avoid PRAGMA schema probes during startup; legacy vector
+  // columns are repaired lazily when a vector/embedding query first needs them.
  db.exec(`
    CREATE TABLE IF NOT EXISTS content_vectors (
      hash TEXT NOT NULL,
      seq INTEGER NOT NULL DEFAULT 0,
      pos INTEGER NOT NULL DEFAULT 0,
      model TEXT NOT NULL,
+      embed_fingerprint TEXT NOT NULL DEFAULT '',
      total_chunks INTEGER NOT NULL DEFAULT 1,
      embedded_at TEXT NOT NULL,
      PRIMARY KEY (hash, seq)
    )
  `);
-  const cvInfoAfterCreate = db.prepare(`PRAGMA table_info(content_vectors)`).all() as { name: string }[];
-  if (!cvInfoAfterCreate.some(col => col.name === 'total_chunks')) {
-    db.exec(`ALTER TABLE content_vectors ADD COLUMN total_chunks INTEGER NOT NULL DEFAULT 1`);
-  }

  // Store collections — makes the DB self-contained (no external config needed)
  db.exec(`
@ -1237,7 +1244,7 @@ export type Store = {
  // Vector/embedding operations
  getHashesForEmbedding: () => { hash: string; body: string; path: string }[];
  clearAllEmbeddings: () => void;
-  insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string, totalChunks?: number) => void;
+  insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string, totalChunks?: number, fingerprint?: string) => void;
 };

 // =============================================================================
@ -1428,31 +1435,77 @@ function resolveEmbedOptions(options?: EmbedOptions): Required<Pick<EmbedOptions
  };
 }

-function contentVectorExpectedChunksExpr(db: Database): string {
-  const columns = db.prepare(`PRAGMA table_info(content_vectors)`).all() as { name: string }[];
-  return columns.some(col => col.name === 'total_chunks') ? 'MAX(total_chunks)' : '1';
+/**
+ * Decide, from an error's message text alone, whether it reports a missing
+ * `embed_fingerprint` or `total_chunks` column.
+ *
+ * @param error Any thrown value; non-Error values are stringified.
+ * @returns The missing column's name, or null when the message matches neither
+ *   column. `embed_fingerprint` is tested first.
+ */
+function contentVectorSchemaRepairFor(error: unknown): "embed_fingerprint" | "total_chunks" | null {
+  const text = error instanceof Error ? error.message : String(error);
+  const repairable = ["embed_fingerprint", "total_chunks"] as const;
+  for (const column of repairable) {
+    const noSuchColumn = text.includes(`no such column: ${column}`);
+    const noColumnNamed = text.includes(`has no column named ${column}`);
+    if (noSuchColumn || noColumnNamed) {
+      return column;
+    }
+  }
+  return null;
+}
+
+/**
+ * Add the given column to `content_vectors` with ALTER TABLE.
+ *
+ * A failure whose message contains "duplicate column name" is ignored, since
+ * it means the column is already there; any other failure is rethrown.
+ */
+function repairContentVectorColumn(db: Database, column: "embed_fingerprint" | "total_chunks"): void {
+  const alterSql = column === "embed_fingerprint"
+    ? `ALTER TABLE content_vectors ADD COLUMN embed_fingerprint TEXT NOT NULL DEFAULT ''`
+    : `ALTER TABLE content_vectors ADD COLUMN total_chunks INTEGER NOT NULL DEFAULT 1`;
+  try {
+    db.exec(alterSql);
+  } catch (failure) {
+    const text = failure instanceof Error ? failure.message : String(failure);
+    // Another caller may have added the column between the failing statement and this ALTER.
+    const alreadyPresent = text.includes("duplicate column name");
+    if (alreadyPresent) {
+      return;
+    }
+    throw failure;
+  }
+}
+
+/**
+ * Run `operation`, adding a missing `content_vectors` column on demand.
+ *
+ * When the operation throws an error that contentVectorSchemaRepairFor maps to
+ * a column, that column is added and the operation is run again. Each column
+ * is repaired at most once per call; an error that is not repairable, or that
+ * recurs for an already-repaired column, is rethrown unchanged.
+ *
+ * @returns Whatever `operation` returns on its first successful run.
+ */
+function withLazyContentVectorMigration<T>(db: Database, operation: () => T): T {
+  const alreadyRepaired = new Set<string>();
+  for (;;) {
+    try {
+      return operation();
+    } catch (failure) {
+      const missingColumn = contentVectorSchemaRepairFor(failure);
+      if (missingColumn === null || alreadyRepaired.has(missingColumn)) {
+        throw failure;
+      }
+      repairContentVectorColumn(db, missingColumn);
+      alreadyRepaired.add(missingColumn);
+    }
+  }
 }

+/**
+ * List active documents that still need embedding for `model`.
+ *
+ * A document is pending when its content hash has no `content_vectors` rows
+ * for the model and its current fingerprint, or has fewer such rows than the
+ * largest `total_chunks` recorded for them. Each row carries the hash, the
+ * smallest path for that hash, and the document size in bytes; rows are
+ * ordered by that path. When `collection` is given, only that collection's
+ * documents are considered.
+ */
 function getPendingEmbeddingDocs(db: Database, collection?: string, model: string = DEFAULT_EMBED_MODEL): PendingEmbeddingDoc[] {
  const collectionFilter = collection ? `AND d.collection = ?` : ``;
-  const expectedChunksExpr = contentVectorExpectedChunksExpr(db);
-  const stmt = db.prepare(`
-    SELECT d.hash, MIN(d.path) as path, length(CAST(c.doc AS BLOB)) as bytes
-    FROM documents d
-    JOIN content c ON d.hash = c.hash
-    LEFT JOIN (
-      SELECT hash, model, COUNT(*) AS chunk_count, ${expectedChunksExpr} AS expected_chunks
-      FROM content_vectors
-      WHERE model = ?
-      GROUP BY hash, model
-    ) v ON d.hash = v.hash
-    WHERE d.active = 1
-      AND (v.hash IS NULL OR v.chunk_count < v.expected_chunks)
-      ${collectionFilter}
-    GROUP BY d.hash
-    ORDER BY MIN(d.path)
-  `);
-  return (collection ? stmt.all(model, collection) : stmt.all(model)) as PendingEmbeddingDoc[];
+  const fingerprint = getEmbeddingFingerprint(model);
+  // Prepare inside the wrapper so a missing-column error from prepare or
+  // execution triggers the lazy column repair and a retry.
+  return withLazyContentVectorMigration(db, () => {
+    const stmt = db.prepare(`
+      SELECT d.hash, MIN(d.path) as path, length(CAST(c.doc AS BLOB)) as bytes
+      FROM documents d
+      JOIN content c ON d.hash = c.hash
+      LEFT JOIN (
+        SELECT hash, model, COUNT(*) AS chunk_count, MAX(total_chunks) AS expected_chunks
+        FROM content_vectors
+        WHERE model = ? AND embed_fingerprint = ?
+        GROUP BY hash, model, embed_fingerprint
+      ) v ON d.hash = v.hash
+      WHERE d.active = 1
+        AND (v.hash IS NULL OR v.chunk_count < v.expected_chunks)
+        ${collectionFilter}
+      GROUP BY d.hash
+      ORDER BY MIN(d.path)
+    `);
+    return (collection ? stmt.all(model, fingerprint, collection) : stmt.all(model, fingerprint)) as PendingEmbeddingDoc[];
+  });
 }

 function buildEmbeddingBatches(
@ -1515,6 +1568,7 @@ export async function generateEmbeddings(
  const db = store.db;
  const llm = getLlm(store);
  const model = options?.model ?? llm.embedModelName ?? DEFAULT_EMBED_MODEL;
+  const fingerprint = getEmbeddingFingerprint(model);
  const now = new Date().toISOString();
  const { maxDocsPerBatch, maxBatchBytes } = resolveEmbedOptions(options);
  const encoder = new TextEncoder();
@ -1633,7 +1687,7 @@ export async function generateEmbeddings(
            const chunk = chunkBatch[i]!;
            const embedding = embeddings[i];
            if (embedding) {
-              insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embedding.embedding), model, now, expectedChunksByHash.get(chunk.hash) ?? 1);
+              insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embedding.embedding), model, now, expectedChunksByHash.get(chunk.hash) ?? 1, fingerprint);
              chunksEmbedded++;
            } else {
              errors++;
@ -1652,7 +1706,7 @@ export async function generateEmbeddings(
                const text = formatDocForEmbedding(chunk.text, chunk.title, embedModelUri);
                const result = await session.embed(text, { model });
                if (result) {
-                  insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), model, now, expectedChunksByHash.get(chunk.hash) ?? 1);
+                  insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), model, now, expectedChunksByHash.get(chunk.hash) ?? 1, fingerprint);
                  chunksEmbedded++;
                } else {
                  errors++;
@ -1778,7 +1832,7 @@ export function createStore(dbPath?: string): Store {
    // Vector/embedding operations
    getHashesForEmbedding: () => getHashesForEmbedding(db),
    clearAllEmbeddings: () => clearAllEmbeddings(db),
-    insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string, totalChunks?: number) => insertEmbedding(db, hash, seq, pos, embedding, model, embeddedAt, totalChunks),
+    insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string, totalChunks?: number, fingerprint?: string) => insertEmbedding(db, hash, seq, pos, embedding, model, embeddedAt, totalChunks, fingerprint),
  };

  return store;
@ -1979,22 +2033,24 @@ export type IndexStatus = {

+/**
+ * Count distinct content hashes of active documents that still need embedding
+ * for `model`: hashes with no `content_vectors` rows for the model and its
+ * current fingerprint, or with fewer such rows than the largest recorded
+ * `total_chunks`. When `collection` is given, only that collection is counted.
+ */
 export function getHashesNeedingEmbedding(db: Database, collection?: string, model: string = DEFAULT_EMBED_MODEL): number {
  const collectionFilter = collection ? `AND d.collection = ?` : ``;
-  const expectedChunksExpr = contentVectorExpectedChunksExpr(db);
-  const stmt = db.prepare(`
-    SELECT COUNT(DISTINCT d.hash) as count
-    FROM documents d
-    LEFT JOIN (
-      SELECT hash, model, COUNT(*) AS chunk_count, ${expectedChunksExpr} AS expected_chunks
-      FROM content_vectors
-      WHERE model = ?
-      GROUP BY hash, model
-    ) v ON d.hash = v.hash
-    WHERE d.active = 1
-      AND (v.hash IS NULL OR v.chunk_count < v.expected_chunks)
-      ${collectionFilter}
-  `);
-  const result = (collection ? stmt.get(model, collection) : stmt.get(model)) as { count: number };
-  return result.count;
+  const fingerprint = getEmbeddingFingerprint(model);
+  // Prepare inside the wrapper so a missing-column error from prepare or
+  // execution triggers the lazy column repair and a retry.
+  return withLazyContentVectorMigration(db, () => {
+    const stmt = db.prepare(`
+      SELECT COUNT(DISTINCT d.hash) as count
+      FROM documents d
+      LEFT JOIN (
+        SELECT hash, model, COUNT(*) AS chunk_count, MAX(total_chunks) AS expected_chunks
+        FROM content_vectors
+        WHERE model = ? AND embed_fingerprint = ?
+        GROUP BY hash, model, embed_fingerprint
+      ) v ON d.hash = v.hash
+      WHERE d.active = 1
+        AND (v.hash IS NULL OR v.chunk_count < v.expected_chunks)
+        ${collectionFilter}
+    `);
+    const result = (collection ? stmt.get(model, fingerprint, collection) : stmt.get(model, fingerprint)) as { count: number };
+    return result.count;
+  });
 }

 export type IndexHealthInfo = {
@ -2003,6 +2059,79 @@ export type IndexHealthInfo = {
  daysStale: number | null;
 };

+/** Outcome of maybeAdoptLegacyEmbeddingFingerprint. */
+export type LegacyFingerprintAdoptionResult = {
+  // True once a legacy sample reached the re-embedding comparison; false when
+  // the function returned early (no legacy rows, no active sample, no vectors_vec table).
+  checked: boolean;
+  // Number of content_vectors rows updated to the current fingerprint (0 when nothing was adopted).
+  adopted: number;
+  // Human-readable explanation of the outcome.
+  reason: string;
+};
+
+/**
+ * Stamp legacy vectors (rows with an empty `embed_fingerprint`) for `model`
+ * with the current fingerprint, but only after verifying one sample.
+ *
+ * One legacy chunk that belongs to an active document is re-chunked and
+ * re-embedded with the current settings. If its nearest neighbour in
+ * `vectors_vec` is that same chunk within a small distance threshold, every
+ * empty-fingerprint row for the model is updated to the current fingerprint.
+ * Otherwise nothing is changed and the rows stay pending.
+ *
+ * @param store Store whose database and LLM are used.
+ * @param model Embedding model whose legacy rows are considered.
+ * @returns Whether a sample reached verification, how many rows were updated,
+ *   and a human-readable reason for the outcome.
+ */
+export async function maybeAdoptLegacyEmbeddingFingerprint(store: Store, model: string = DEFAULT_EMBED_MODEL): Promise<LegacyFingerprintAdoptionResult> {
+  const db = store.db;
+  const fingerprint = getEmbeddingFingerprint(model);
+  // This first query also adds the embed_fingerprint column lazily when it is missing.
+  const legacyCount = withLazyContentVectorMigration(db, () => {
+    const row = db.prepare(`SELECT COUNT(DISTINCT hash) AS count FROM content_vectors WHERE model = ? AND embed_fingerprint = ''`).get(model) as { count: number };
+    return row.count;
+  });
+  if (legacyCount === 0) {
+    return { checked: false, adopted: 0, reason: "no legacy empty-fingerprint embeddings" };
+  }
+
+  // Select only what is needed to rebuild the sample chunk. In particular
+  // total_chunks is not referenced: a legacy table may lack that column, and
+  // this statement runs outside the lazy-migration wrapper.
+  const sample = db.prepare(`
+    SELECT cv.hash, cv.seq, c.doc AS body, MIN(d.path) AS path
+    FROM content_vectors cv
+    JOIN documents d ON d.hash = cv.hash AND d.active = 1
+    JOIN content c ON c.hash = cv.hash
+    WHERE cv.model = ? AND cv.embed_fingerprint = ''
+    GROUP BY cv.hash, cv.seq, c.doc
+    ORDER BY cv.hash, cv.seq
+    LIMIT 1
+  `).get(model) as { hash: string; seq: number; body: string; path: string } | undefined;
+
+  if (!sample) {
+    return { checked: false, adopted: 0, reason: `${legacyCount} legacy docs have no active sample` };
+  }
+
+  const tableExists = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
+  if (!tableExists) {
+    return { checked: false, adopted: 0, reason: "vectors_vec table is missing" };
+  }
+
+  const expectedHashSeq = `${sample.hash}_${sample.seq}`;
+  const title = extractTitle(sample.body, sample.path);
+  const llm = getLlm(store);
+
+  return await withLLMSessionForLlm(llm, async (session) => {
+    // Re-chunk with the current settings and pick the chunk at the stored sequence number.
+    const chunks = await chunkDocumentByTokens(sample.body, undefined, undefined, undefined, sample.path, undefined, session.signal);
+    const chunk = chunks[sample.seq];
+    if (!chunk) {
+      return { checked: true, adopted: 0, reason: `sample chunk ${expectedHashSeq} no longer exists` };
+    }
+
+    const result = await session.embed(formatDocForEmbedding(chunk.text, title, model), { model });
+    if (!result) {
+      return { checked: true, adopted: 0, reason: "failed to embed legacy sample" };
+    }
+
+    // Single nearest neighbour of the freshly computed vector.
+    const nearest = db.prepare(`
+      SELECT hash_seq, distance
+      FROM vectors_vec
+      WHERE embedding MATCH ? AND k = 1
+    `).get(new Float32Array(result.embedding)) as { hash_seq: string; distance: number } | undefined;
+
+    if (!nearest) {
+      return { checked: true, adopted: 0, reason: "legacy sample vector not found" };
+    }
+
+    // Adopt only when the stored vector for this exact chunk is (nearly) identical to the new one.
+    const threshold = 0.0001;
+    if (nearest.hash_seq !== expectedHashSeq || nearest.distance > threshold) {
+      return { checked: true, adopted: 0, reason: `legacy sample differs from current fingerprint (nearest ${nearest.hash_seq}, distance ${nearest.distance.toFixed(6)})` };
+    }
+
+    const update = db.prepare(`UPDATE content_vectors SET embed_fingerprint = ? WHERE model = ? AND embed_fingerprint = ''`).run(fingerprint, model);
+    return { checked: true, adopted: update.changes, reason: `sample ${expectedHashSeq} matched current fingerprint at distance ${nearest.distance.toFixed(6)}` };
+  });
+}
+
 export function getIndexHealth(db: Database, model: string = DEFAULT_EMBED_MODEL): IndexHealthInfo {
  const needsEmbedding = getHashesNeedingEmbedding(db, undefined, model);
  const totalDocs = (db.prepare(`SELECT COUNT(*) as count FROM documents WHERE active = 1`).get() as { count: number }).count;
@ -3353,21 +3482,21 @@ async function getEmbedding(text: string, model: string, isQuery: boolean, sessi
 * Returns hash, document body, and a sample path for display purposes.
 */
 export function getHashesForEmbedding(db: Database, model: string = DEFAULT_EMBED_MODEL): { hash: string; body: string; path: string }[] {
-  const expectedChunksExpr = contentVectorExpectedChunksExpr(db);
-  return db.prepare(`
+  // Only vectors stored under the model's current fingerprint count as done;
+  // a missing content_vectors column is added lazily and the query retried.
+  const fingerprint = getEmbeddingFingerprint(model);
+  return withLazyContentVectorMigration(db, () => db.prepare(`
    SELECT d.hash, c.doc as body, MIN(d.path) as path
    FROM documents d
    JOIN content c ON d.hash = c.hash
    LEFT JOIN (
-      SELECT hash, model, COUNT(*) AS chunk_count, ${expectedChunksExpr} AS expected_chunks
+      SELECT hash, model, COUNT(*) AS chunk_count, MAX(total_chunks) AS expected_chunks
      FROM content_vectors
-      WHERE model = ?
-      GROUP BY hash, model
+      WHERE model = ? AND embed_fingerprint = ?
+      GROUP BY hash, model, embed_fingerprint
    ) v ON d.hash = v.hash
    WHERE d.active = 1
      AND (v.hash IS NULL OR v.chunk_count < v.expected_chunks)
    GROUP BY d.hash
-  `).all(model) as { hash: string; body: string; path: string }[];
+  `).all(model, fingerprint) as { hash: string; body: string; path: string }[]);
 }

 /**
@ -3453,19 +3582,22 @@ export function insertEmbedding(
  embedding: Float32Array,
  model: string,
  embeddedAt: string,
-  totalChunks: number = 1
+  totalChunks: number = 1,
+  fingerprint: string = getEmbeddingFingerprint(model)
 ): void {
  const hashSeq = `${hash}_${seq}`;

-  // Insert content_vectors first — crash-safe ordering (see getHashesForEmbedding)
-  const insertContentVectorStmt = db.prepare(`INSERT OR REPLACE INTO content_vectors (hash, seq, pos, model, total_chunks, embedded_at) VALUES (?, ?, ?, ?, ?, ?)`);
-  insertContentVectorStmt.run(hash, seq, pos, model, totalChunks, embeddedAt);
+  withLazyContentVectorMigration(db, () => {
+    // Insert content_vectors first — crash-safe ordering (see getHashesForEmbedding)
+    const insertContentVectorStmt = db.prepare(`INSERT OR REPLACE INTO content_vectors (hash, seq, pos, model, embed_fingerprint, total_chunks, embedded_at) VALUES (?, ?, ?, ?, ?, ?, ?)`);
+    insertContentVectorStmt.run(hash, seq, pos, model, fingerprint, totalChunks, embeddedAt);

-  // vec0 virtual tables don't support OR REPLACE — use DELETE + INSERT
-  const deleteVecStmt = db.prepare(`DELETE FROM vectors_vec WHERE hash_seq = ?`);
-  const insertVecStmt = db.prepare(`INSERT INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`);
-  deleteVecStmt.run(hashSeq);
-  insertVecStmt.run(hashSeq, embedding);
+    // vec0 virtual tables don't support OR REPLACE — use DELETE + INSERT
+    const deleteVecStmt = db.prepare(`DELETE FROM vectors_vec WHERE hash_seq = ?`);
+    const insertVecStmt = db.prepare(`INSERT INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`);
+    deleteVecStmt.run(hashSeq);
+    insertVecStmt.run(hashSeq, embedding);
+  });
 }

 function removeIncompleteEmbeddings(db: Database, expectedChunksByHash: Map<string, number>, model: string): number {
--- a/test/cli.test.ts
+++ b/test/cli.test.ts
@ -14,6 +14,7 @@ import { fileURLToPath } from "url";
 import { spawn } from "child_process";
 import { setTimeout as sleep } from "timers/promises";
 import { buildEditorUri, termLink, resolveEmbedModelForCli } from "../src/cli/qmd.ts";
+import { openDatabase } from "../src/db.ts";
 import { DEFAULT_EMBED_MODEL_URI } from "../src/llm.ts";

 // Test fixtures directory and database path
@ -465,6 +466,32 @@ describe("CLI Status Command", () => {
    await runQmd(["collection", "add", "."]);
  });

+  // Smoke test: doctor exits cleanly and prints a line for each core check label.
+  test("qmd doctor reports core index health checks", async () => {
+    const { stdout, exitCode } = await runQmd(["doctor"]);
+    expect(exitCode).toBe(0);
+    expect(stdout).toContain("QMD Doctor");
+    expect(stdout).toContain("SQLite runtime");
+    expect(stdout).toContain("sqlite-vec");
+    expect(stdout).toContain("embedding freshness");
+    expect(stdout).toContain("embedding fingerprints");
+    expect(stdout).toContain("content hash sample");
+  });
+
+  // Seeds a vector row with a non-current fingerprint and checks that doctor
+  // reports the fingerprint check as a warning, not merely that it lists the value.
+  test("qmd doctor flags mixed embedding fingerprints", async () => {
+    const db = openDatabase(testDbPath);
+    try {
+      const doc = db.prepare(`SELECT hash FROM documents WHERE active = 1 LIMIT 1`).get() as { hash: string } | undefined;
+      if (!doc) {
+        throw new Error("expected at least one active document in the test index");
+      }
+      db.prepare(`
+        INSERT OR REPLACE INTO content_vectors (hash, seq, pos, model, embed_fingerprint, total_chunks, embedded_at)
+        VALUES (?, 0, 0, ?, 'stale1', 1, ?)
+      `).run(doc.hash, resolveEmbedModelForCli(), new Date().toISOString());
+    } finally {
+      // Release the handle even when seeding fails so the CLI subprocess can open the file.
+      db.close();
+    }
+
+    const { stdout, exitCode } = await runQmd(["doctor"]);
+    expect(exitCode).toBe(0);
+    const fingerprintLine = stdout.split("\n").find(line => line.includes("embedding fingerprints"));
+    expect(fingerprintLine).toBeDefined();
+    expect(fingerprintLine).toContain("⚠");
+    expect(fingerprintLine).toContain("stale1");
+  });
+
  test("shows index status", async () => {
    const { stdout, exitCode } = await runQmd(["status"]);
    expect(exitCode).toBe(0);
--- a/test/mcp.test.ts
+++ b/test/mcp.test.ts
@ -80,6 +80,7 @@ function initTestDatabase(db: Database): void {
      seq INTEGER NOT NULL DEFAULT 0,
      pos INTEGER NOT NULL DEFAULT 0,
      model TEXT NOT NULL,
+      embed_fingerprint TEXT NOT NULL DEFAULT '',
      embedded_at TEXT NOT NULL,
      PRIMARY KEY (hash, seq)
    )
@ -186,7 +187,7 @@ function seedTestData(db: Database): void {
  for (let i = 0; i < 768; i++) embedding[i] = Math.random();

  for (const doc of docs.slice(0, 4)) { // Skip large file for embeddings
-    db.prepare(`INSERT INTO content_vectors (hash, seq, pos, model, embedded_at) VALUES (?, 0, 0, ?, ?)`).run(doc.hash, DEFAULT_EMBED_MODEL, now);
+    db.prepare(`INSERT INTO content_vectors (hash, seq, pos, model, embed_fingerprint, embedded_at) VALUES (?, 0, 0, ?, ?, ?)`).run(doc.hash, DEFAULT_EMBED_MODEL, getEmbeddingFingerprint(DEFAULT_EMBED_MODEL), now);
    db.prepare(`INSERT INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`).run(`${doc.hash}_0`, embedding);
  }
 }
@ -211,6 +212,7 @@ import {
  findDocuments,
  getStatus,
  DEFAULT_EMBED_MODEL,
+  getEmbeddingFingerprint,
  DEFAULT_QUERY_MODEL,
  DEFAULT_RERANK_MODEL,
  DEFAULT_MULTI_GET_MAX_BYTES,
--- a/test/store.test.ts
+++ b/test/store.test.ts
@ -26,6 +26,7 @@ import {
  extractTitle,
  formatQueryForEmbedding,
  formatDocForEmbedding,
+  getEmbeddingFingerprint,
  chunkDocument,
  chunkDocumentByTokens,
  chunkDocumentAsync,
@ -311,19 +312,74 @@ describe("Store Creation", () => {

    // Check tables exist
    const tables = store.db.prepare(`
-      SELECT name FROM sqlite_master WHERE type='table' ORDER BY name
+      SELECT name FROM sqlite_master
+      WHERE type='table'
+      ORDER BY name
    `).all() as { name: string }[];

    const tableNames = tables.map(t => t.name);
    expect(tableNames).toContain("documents");
    expect(tableNames).toContain("documents_fts");
    expect(tableNames).toContain("content_vectors");
+    expect(tableNames).toContain("content");
    expect(tableNames).toContain("llm_cache");
    // Note: path_contexts table removed in favor of YAML-based context storage

    await cleanupTestDb(store);
  });

+  // Verifies the lazy migration: opening a store on a legacy database leaves
+  // content_vectors untouched, and the first embedding-health query adds the
+  // embed_fingerprint column with existing rows defaulting to ''.
+  test("createStore defers content_vectors embed_fingerprint migration until embedding health needs it", async () => {
+    const dbPath = join(testDir, `legacy-${Date.now()}-${Math.random().toString(36).slice(2)}.sqlite`);
+    const model = "hf:test/embed-model.gguf";
+    const legacyDb = openDatabase(dbPath);
+    // Legacy schema: content_vectors has total_chunks but no embed_fingerprint column.
+    legacyDb.exec(`
+      CREATE TABLE content (
+        hash TEXT PRIMARY KEY,
+        doc TEXT NOT NULL,
+        created_at TEXT NOT NULL
+      );
+      CREATE TABLE documents (
+        id INTEGER PRIMARY KEY AUTOINCREMENT,
+        collection TEXT NOT NULL,
+        path TEXT NOT NULL,
+        title TEXT,
+        hash TEXT NOT NULL,
+        created_at TEXT NOT NULL,
+        modified_at TEXT NOT NULL,
+        active INTEGER NOT NULL DEFAULT 1,
+        FOREIGN KEY (hash) REFERENCES content(hash) ON DELETE CASCADE,
+        UNIQUE(collection, path)
+      );
+      CREATE TABLE content_vectors (
+        hash TEXT NOT NULL,
+        seq INTEGER NOT NULL DEFAULT 0,
+        pos INTEGER NOT NULL DEFAULT 0,
+        model TEXT NOT NULL,
+        total_chunks INTEGER NOT NULL DEFAULT 1,
+        embedded_at TEXT NOT NULL,
+        PRIMARY KEY (hash, seq)
+      )
+    `);
+    const now = new Date().toISOString();
+    legacyDb.prepare(`INSERT INTO content (hash, doc, created_at) VALUES (?, ?, ?)`).run("hash1", "# Legacy\nbody", now);
+    legacyDb.prepare(`INSERT INTO documents (collection, path, title, hash, created_at, modified_at, active) VALUES (?, ?, ?, ?, ?, ?, 1)`).run("test", "legacy.md", "Legacy", "hash1", now, now);
+    legacyDb.prepare(`INSERT INTO content_vectors (hash, seq, pos, model, total_chunks, embedded_at) VALUES (?, ?, ?, ?, ?, ?)`).run("hash1", 0, 0, model, 1, now);
+    legacyDb.close();
+
+    // Opening the store alone must not add the column.
+    const store = createStore(dbPath);
+    let columns = store.db.prepare(`PRAGMA table_info(content_vectors)`).all() as { name: string }[];
+    expect(columns.map(col => col.name)).not.toContain("embed_fingerprint");
+
+    // The legacy row has no fingerprint, so its document counts as needing embedding.
+    expect(store.getHashesNeedingEmbedding(model)).toBe(1);
+
+    // After the health query the column exists and the old row carries the '' default.
+    columns = store.db.prepare(`PRAGMA table_info(content_vectors)`).all() as { name: string }[];
+    const migratedRow = store.db.prepare(`SELECT embed_fingerprint FROM content_vectors WHERE hash = ?`).get("hash1") as { embed_fingerprint: string };
+    expect(columns.map(col => col.name)).toContain("embed_fingerprint");
+    expect(migratedRow.embed_fingerprint).toBe("");
+
+    await cleanupTestDb(store);
+  });
+
  test("createStore sets WAL journal mode", async () => {
    const store = await createTestStore();
    const result = store.db.prepare("PRAGMA journal_mode").get() as { journal_mode: string };
@ -2301,6 +2357,23 @@ describe("Index Status", () => {
    await cleanupTestDb(store);
  });

+  // A vector stored under a fingerprint other than the current one must not
+  // count as embedded: the document is still reported as needing embedding.
+  test("embedding health treats stale fingerprints as needing re-embedding", async () => {
+    const store = await createTestStore();
+    const collectionName = await createTestCollection();
+    const model = "hf:test/embed-model.gguf";
+    const now = new Date().toISOString();
+
+    store.llm = { embedModelName: model } as any;
+    store.ensureVecTable(3);
+    await insertTestDocument(store.db, collectionName, { name: "doc1", hash: "hash1" });
+    // "stale1" stands in for a fingerprint produced by older embedding settings.
+    store.insertEmbedding("hash1", 0, 0, new Float32Array([1, 2, 3]), model, now, 1, "stale1");
+
+    expect(getEmbeddingFingerprint(model)).toMatch(/^[a-f0-9]{6}$/);
+    expect(store.getHashesNeedingEmbedding()).toBe(1);
+
+    await cleanupTestDb(store);
+  });
+
  test("getIndexHealth returns health info", async () => {
    const store = await createTestStore();
    const collectionName = await createTestCollection();