feat: add qmd doctor vector diagnostics

This commit is contained in:
Tobi Lütke 2026-05-18 01:52:05 +00:00
parent ddbd6bd8be
commit ac6b154f0c
No known key found for this signature in database
6 changed files with 408 additions and 72 deletions

View File

@ -5,9 +5,12 @@
### Changes
- Agent skills: add `qmd skills list|get|path` to serve version-matched runtime skill instructions from the installed CLI, and make `qmd skill install` write a stable discovery stub so installed agent skills do not go stale after QMD upgrades.
- CLI: add `qmd doctor` for index/runtime diagnostics, including SQLite/sqlite-vec versions, embedding fingerprint freshness, mixed-fingerprint detection, safe legacy fingerprint adoption, and content-hash sampling.
### Fixes
- Embedding: fingerprint vector metadata using the active embedding model and formatting/chunking parameters so stale vectors are treated as pending after search semantics change. Legacy `content_vectors` columns are migrated lazily on first vector-health/write use to preserve fast QMD startup.
- Skill: expand the packaged QMD skill with retrieval-first workflows, structured query examples, wiki/source collection guidance, and safe fallbacks when model-backed search is unavailable.
- Tests: make `bun run test` execute the local unit suite under both Node/Vitest and Bun (`test:node` + `test:bun`) so runtime-specific regressions are caught before CI.
- Model config: centralize embedding/rerank/generation model resolution so `qmd embed`, `status`, `query`, `vsearch`, `pull`, SDK vector search, and `bench` use the same active `.qmd/index.yaml` model hints and environment fallbacks.

View File

@ -1,4 +1,4 @@
import { openDatabase } from "../db.js";
import { isBun, openDatabase } from "../db.js";
import type { Database } from "../db.js";
import fastGlob from "fast-glob";
import { execSync, spawn as nodeSpawn } from "child_process";
@ -31,6 +31,7 @@ import {
hashContent,
extractTitle,
formatDocForEmbedding,
getEmbeddingFingerprint,
chunkDocumentByTokens,
clearCache,
getCacheKey,
@ -74,6 +75,7 @@ import {
getDefaultDbPath,
reindexCollection,
generateEmbeddings,
maybeAdoptLegacyEmbeddingFingerprint,
syncConfigToDb,
type ReindexResult,
type ChunkStrategy,
@ -3228,10 +3230,103 @@ function showHelp(): void {
console.log(`Index: ${getDbPath()}`);
}
async function showVersion(): Promise<void> {
/**
 * Print one `qmd doctor` result line in the form `<mark> <label>: <details>`.
 *
 * @param label - Short name of the check being reported.
 * @param ok - `true` renders the green pass mark, `false` the yellow warning mark.
 * @param details - Free-form text describing the outcome of the check.
 */
function doctorCheck(label: string, ok: boolean, details: string): void {
  // The glyph itself carries the result, so a passing and a failing line stay
  // distinguishable even where the colour escapes are not rendered.
  const mark = ok ? `${c.green}✓${c.reset}` : `${c.yellow}!${c.reset}`;
  console.log(`${mark} ${label}: ${details}`);
}
/**
 * `qmd doctor`: print index and runtime diagnostics for the active index.
 *
 * Checks, in order: SQLite version, declared better-sqlite3 dependency,
 * sqlite-vec availability, legacy fingerprint adoption, embedding freshness,
 * the fingerprint mix stored in `content_vectors`, and a re-hash of one random
 * active document. Each check reports its own failure as a warning line
 * instead of aborting the run, and the database is closed even if something
 * outside the individual checks throws.
 *
 * Note: the legacy adoption step may UPDATE `content_vectors` rows.
 */
async function showDoctor(): Promise<void> {
  const describeError = (error: unknown): string =>
    error instanceof Error ? error.message : String(error);

  const storeInstance = getStore();
  try {
    const db = storeInstance.db;
    const pkg = readPackageJson();
    const embedModel = resolveEmbedModelForCli();
    const fingerprint = getEmbeddingFingerprint(embedModel);

    console.log(`${c.bold}QMD Doctor${c.reset}\n`);
    console.log(`Index: ${getDbPath()}`);
    console.log(`Runtime: ${isBun ? "bun:sqlite" : "better-sqlite3"}`);

    try {
      const row = db.prepare(`SELECT sqlite_version() AS version`).get() as { version: string };
      doctorCheck("SQLite runtime", true, row.version);
    } catch (error) {
      doctorCheck("SQLite runtime", false, describeError(error));
    }

    // This is the version range declared in package.json, not a value probed
    // from the loaded driver, so it is always reported as passing.
    const betterSqliteVersion = pkg.dependencies?.["better-sqlite3"] ?? pkg.devDependencies?.["better-sqlite3"] ?? "not declared";
    doctorCheck("better_sqlite version", true, String(betterSqliteVersion));

    try {
      const row = db.prepare(`SELECT vec_version() AS version`).get() as { version: string };
      doctorCheck("sqlite-vec", true, row.version);
    } catch (error) {
      doctorCheck("sqlite-vec", false, describeError(error));
    }

    try {
      const adoption = await maybeAdoptLegacyEmbeddingFingerprint(storeInstance, embedModel);
      // Stay silent when there was nothing to examine (no legacy rows, no sample, no vector table).
      if (adoption.checked || adoption.adopted > 0) {
        doctorCheck("legacy fingerprint adoption", adoption.adopted > 0, adoption.adopted > 0 ? `adopted ${adoption.adopted} legacy chunks; ${adoption.reason}` : adoption.reason);
      }
    } catch (error) {
      doctorCheck("legacy fingerprint adoption", false, describeError(error));
    }

    try {
      const pending = getHashesNeedingEmbedding(db, undefined, embedModel);
      doctorCheck("embedding freshness", pending === 0, pending === 0 ? "all active documents match current fingerprint" : `${pending} active documents need embedding`);
    } catch (error) {
      doctorCheck("embedding freshness", false, describeError(error));
    }

    try {
      const rows = db.prepare(`
        SELECT model, embed_fingerprint AS fingerprint, COUNT(DISTINCT hash) AS docs, COUNT(*) AS chunks
        FROM content_vectors
        GROUP BY model, embed_fingerprint
        ORDER BY chunks DESC, model, embed_fingerprint
      `).all() as { model: string; fingerprint: string; docs: number; chunks: number }[];
      const uniqueFingerprints = new Set(rows.map(row => row.fingerprint));
      const offCurrent = rows.filter(row => row.model === embedModel && row.fingerprint !== fingerprint);
      // Healthy means: no vectors at all, or exactly one fingerprint stored and it is the current one.
      const ok = rows.length === 0 || (uniqueFingerprints.size === 1 && rows[0]?.fingerprint === fingerprint && offCurrent.length === 0);
      const details = rows.length === 0
        ? `none yet; current ${fingerprint}`
        : rows.map(row => {
            // An empty stored fingerprint is shown as "legacy".
            const label = row.fingerprint === fingerprint ? "current" : (row.fingerprint || "legacy");
            return `${row.model}:${label} ${row.docs} docs/${row.chunks} chunks`;
          }).join("; ");
      doctorCheck("embedding fingerprints", ok, details);
    } catch (error) {
      doctorCheck("embedding fingerprints", false, describeError(error));
    }

    // Re-hash one random active document and compare with its stored hash.
    // Guarded like the other checks so a query failure becomes a warning line.
    try {
      const sample = db.prepare(`
        SELECT c.hash, c.doc
        FROM documents d
        JOIN content c ON c.hash = d.hash
        WHERE d.active = 1
        ORDER BY random()
        LIMIT 1
      `).get() as { hash: string; doc: string } | undefined;
      if (sample) {
        const rehashed = await hashContent(sample.doc);
        const matches = rehashed === sample.hash;
        doctorCheck("content hash sample", matches, `${sample.hash.slice(0, 12)} ${matches ? "matches" : `!= ${rehashed.slice(0, 12)}`}`);
      } else {
        doctorCheck("content hash sample", true, "no active documents indexed");
      }
    } catch (error) {
      doctorCheck("content hash sample", false, describeError(error));
    }
  } finally {
    closeDb();
  }
}
/**
 * Load and parse the `package.json` located two directories above this module.
 *
 * @returns The parsed manifest, untyped; callers should treat every field as optional.
 * @throws If the file cannot be read or does not contain valid JSON.
 */
function readPackageJson(): any {
  const scriptDir = dirname(fileURLToPath(import.meta.url));
  const pkgPath = resolve(scriptDir, "..", "..", "package.json");
  // Read and parse exactly once; the result is returned directly.
  return JSON.parse(readFileSync(pkgPath, "utf-8"));
}
async function showVersion(): Promise<void> {
const scriptDir = dirname(fileURLToPath(import.meta.url));
const pkg = readPackageJson();
let commit = "";
try {
@ -3539,6 +3634,10 @@ if (isMain) {
await showStatus();
break;
case "doctor":
await showDoctor();
break;
case "update":
await updateCollections();
break;

View File

@ -50,6 +50,10 @@ export const DEFAULT_MULTI_GET_MAX_BYTES = 10 * 1024; // 10KB
export const DEFAULT_EMBED_MAX_DOCS_PER_BATCH = 64;
export const DEFAULT_EMBED_MAX_BATCH_BYTES = 64 * 1024 * 1024; // 64MB
// Fixed placeholder inputs that getEmbeddingFingerprint() passes through the
// query/document formatters, so the fingerprint changes whenever the text
// formatting applied for a model changes.
const EMBED_FINGERPRINT_PROBE_QUERY = "__qmd_embedding_query_probe__";
const EMBED_FINGERPRINT_PROBE_TITLE = "__qmd_embedding_title_probe__";
const EMBED_FINGERPRINT_PROBE_DOC = "__qmd_embedding_document_probe__";
// Chunking: 900 tokens per chunk with 15% overlap
// Increased from 800 to accommodate smart chunking finding natural break points
export const CHUNK_SIZE_TOKENS = 900;
@ -61,6 +65,17 @@ export const CHUNK_OVERLAP_CHARS = CHUNK_OVERLAP_TOKENS * 4; // 540 chars
export const CHUNK_WINDOW_TOKENS = 200;
export const CHUNK_WINDOW_CHARS = CHUNK_WINDOW_TOKENS * 4; // 800 chars
/**
 * Derive a short identifier for "how vectors are produced" with a given model.
 *
 * The identifier covers the model name, the formatted form of fixed probe
 * query/document strings, and the chunk size/overlap token constants. It is
 * the first 6 hex characters of a SHA-256 digest over those values.
 *
 * @param model - Embedding model name; defaults to `DEFAULT_EMBED_MODEL`.
 * @returns A 6-character lowercase hex string.
 */
export function getEmbeddingFingerprint(model: string = DEFAULT_EMBED_MODEL): string {
  const formattedQuery = formatQueryForEmbedding(EMBED_FINGERPRINT_PROBE_QUERY, model);
  const formattedDoc = formatDocForEmbedding(EMBED_FINGERPRINT_PROBE_DOC, EMBED_FINGERPRINT_PROBE_TITLE, model);

  // One "key:value" entry per line, newline-separated with no trailing newline.
  const payload =
    `model:${model}\n` +
    `query:${formattedQuery}\n` +
    `doc:${formattedDoc}\n` +
    `chunk_tokens:${CHUNK_SIZE_TOKENS}\n` +
    `chunk_overlap_tokens:${CHUNK_OVERLAP_TOKENS}`;

  const hasher = createHash("sha256");
  hasher.update(payload);
  const hexDigest = hasher.digest("hex");
  return hexDigest.substring(0, 6);
}
/**
* Get the LlamaCpp instance for a store — prefers the store's own instance,
* falls back to the global singleton.
@ -861,28 +876,20 @@ function initializeDatabase(db: Database): void {
)
`);
// Content vectors
const cvInfo = db.prepare(`PRAGMA table_info(content_vectors)`).all() as { name: string }[];
const hasSeqColumn = cvInfo.some(col => col.name === 'seq');
if (cvInfo.length > 0 && !hasSeqColumn) {
db.exec(`DROP TABLE IF EXISTS content_vectors`);
db.exec(`DROP TABLE IF EXISTS vectors_vec`);
}
// Content vectors. Avoid PRAGMA schema probes during startup; legacy vector
// columns are repaired lazily when a vector/embedding query first needs them.
db.exec(`
CREATE TABLE IF NOT EXISTS content_vectors (
hash TEXT NOT NULL,
seq INTEGER NOT NULL DEFAULT 0,
pos INTEGER NOT NULL DEFAULT 0,
model TEXT NOT NULL,
embed_fingerprint TEXT NOT NULL DEFAULT '',
total_chunks INTEGER NOT NULL DEFAULT 1,
embedded_at TEXT NOT NULL,
PRIMARY KEY (hash, seq)
)
`);
const cvInfoAfterCreate = db.prepare(`PRAGMA table_info(content_vectors)`).all() as { name: string }[];
if (!cvInfoAfterCreate.some(col => col.name === 'total_chunks')) {
db.exec(`ALTER TABLE content_vectors ADD COLUMN total_chunks INTEGER NOT NULL DEFAULT 1`);
}
// Store collections — makes the DB self-contained (no external config needed)
db.exec(`
@ -1237,7 +1244,7 @@ export type Store = {
// Vector/embedding operations
getHashesForEmbedding: () => { hash: string; body: string; path: string }[];
clearAllEmbeddings: () => void;
insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string, totalChunks?: number) => void;
insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string, totalChunks?: number, fingerprint?: string) => void;
};
// =============================================================================
@ -1428,31 +1435,77 @@ function resolveEmbedOptions(options?: EmbedOptions): Required<Pick<EmbedOptions
};
}
function contentVectorExpectedChunksExpr(db: Database): string {
const columns = db.prepare(`PRAGMA table_info(content_vectors)`).all() as { name: string }[];
return columns.some(col => col.name === 'total_chunks') ? 'MAX(total_chunks)' : '1';
/**
 * Inspect an error and decide whether it reports a missing `content_vectors`
 * column that can be added on demand.
 *
 * @param error - Any thrown value; non-Error values are stringified.
 * @returns The name of the missing column, or `null` when the error is unrelated.
 */
function contentVectorSchemaRepairFor(error: unknown): "embed_fingerprint" | "total_chunks" | null {
  const text = error instanceof Error ? error.message : String(error);
  // Checked in this order: embed_fingerprint wins if a message names both columns.
  const repairable = ["embed_fingerprint", "total_chunks"] as const;
  for (const column of repairable) {
    const unknownColumn = `no such column: ${column}`;
    const noColumnNamed = `has no column named ${column}`;
    if (text.includes(unknownColumn) || text.includes(noColumnNamed)) {
      return column;
    }
  }
  return null;
}
/**
 * Add one missing column to `content_vectors` via `ALTER TABLE ... ADD COLUMN`.
 *
 * A "duplicate column name" failure is treated as success, since it means the
 * column is already present; any other failure is rethrown unchanged.
 *
 * @param db - Database handle used to execute the ALTER statement.
 * @param column - Which column to add.
 */
function repairContentVectorColumn(db: Database, column: "embed_fingerprint" | "total_chunks"): void {
  const ddl = column === "embed_fingerprint"
    ? `ALTER TABLE content_vectors ADD COLUMN embed_fingerprint TEXT NOT NULL DEFAULT ''`
    : `ALTER TABLE content_vectors ADD COLUMN total_chunks INTEGER NOT NULL DEFAULT 1`;

  try {
    db.exec(ddl);
  } catch (failure) {
    const text = failure instanceof Error ? failure.message : String(failure);
    // Another caller may have already repaired the column between error and ALTER.
    if (text.includes("duplicate column name")) {
      return;
    }
    throw failure;
  }
}
/**
 * Run `operation`, adding missing `content_vectors` columns on demand.
 *
 * When the operation fails with an error that names a repairable column, that
 * column is added and the operation is run again from the start. Each column
 * is repaired at most once per call; a repeat failure for the same column, or
 * any unrelated error, is rethrown.
 *
 * @param db - Database handle used for the column repair.
 * @param operation - Work to run; it may be invoked more than once.
 * @returns Whatever `operation` returns on its first successful run.
 */
function withLazyContentVectorMigration<T>(db: Database, operation: () => T): T {
  const alreadyRepaired = new Set<string>();

  const attempt = (): T => {
    try {
      return operation();
    } catch (failure) {
      const missing = contentVectorSchemaRepairFor(failure);
      if (missing === null || alreadyRepaired.has(missing)) {
        throw failure;
      }
      repairContentVectorColumn(db, missing);
      alreadyRepaired.add(missing);
      return attempt();
    }
  };

  return attempt();
}
/**
 * List active documents that still need embedding for `model` under the
 * current embedding fingerprint.
 *
 * A document is pending when it has no `content_vectors` rows for the given
 * model and current fingerprint, or fewer rows than the recorded
 * `total_chunks`. Vectors stored under a different fingerprint therefore do
 * not count as up to date.
 *
 * @param db - Database handle.
 * @param collection - Optional collection name to restrict the result to.
 * @param model - Embedding model name; defaults to `DEFAULT_EMBED_MODEL`.
 * @returns One row per content hash (hash, a sample path, byte length), ordered by path.
 */
function getPendingEmbeddingDocs(db: Database, collection?: string, model: string = DEFAULT_EMBED_MODEL): PendingEmbeddingDoc[] {
  const collectionFilter = collection ? `AND d.collection = ?` : ``;
  const fingerprint = getEmbeddingFingerprint(model);
  // Prepared inside the retry wrapper so a missing legacy column is added and
  // the statement is prepared again against the repaired schema.
  return withLazyContentVectorMigration(db, () => {
    const stmt = db.prepare(`
      SELECT d.hash, MIN(d.path) as path, length(CAST(c.doc AS BLOB)) as bytes
      FROM documents d
      JOIN content c ON d.hash = c.hash
      LEFT JOIN (
        SELECT hash, model, COUNT(*) AS chunk_count, MAX(total_chunks) AS expected_chunks
        FROM content_vectors
        WHERE model = ? AND embed_fingerprint = ?
        GROUP BY hash, model, embed_fingerprint
      ) v ON d.hash = v.hash
      WHERE d.active = 1
        AND (v.hash IS NULL OR v.chunk_count < v.expected_chunks)
        ${collectionFilter}
      GROUP BY d.hash
      ORDER BY MIN(d.path)
    `);
    return (collection ? stmt.all(model, fingerprint, collection) : stmt.all(model, fingerprint)) as PendingEmbeddingDoc[];
  });
}
function buildEmbeddingBatches(
@ -1515,6 +1568,7 @@ export async function generateEmbeddings(
const db = store.db;
const llm = getLlm(store);
const model = options?.model ?? llm.embedModelName ?? DEFAULT_EMBED_MODEL;
const fingerprint = getEmbeddingFingerprint(model);
const now = new Date().toISOString();
const { maxDocsPerBatch, maxBatchBytes } = resolveEmbedOptions(options);
const encoder = new TextEncoder();
@ -1633,7 +1687,7 @@ export async function generateEmbeddings(
const chunk = chunkBatch[i]!;
const embedding = embeddings[i];
if (embedding) {
insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embedding.embedding), model, now, expectedChunksByHash.get(chunk.hash) ?? 1);
insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embedding.embedding), model, now, expectedChunksByHash.get(chunk.hash) ?? 1, fingerprint);
chunksEmbedded++;
} else {
errors++;
@ -1652,7 +1706,7 @@ export async function generateEmbeddings(
const text = formatDocForEmbedding(chunk.text, chunk.title, embedModelUri);
const result = await session.embed(text, { model });
if (result) {
insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), model, now, expectedChunksByHash.get(chunk.hash) ?? 1);
insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), model, now, expectedChunksByHash.get(chunk.hash) ?? 1, fingerprint);
chunksEmbedded++;
} else {
errors++;
@ -1778,7 +1832,7 @@ export function createStore(dbPath?: string): Store {
// Vector/embedding operations
getHashesForEmbedding: () => getHashesForEmbedding(db),
clearAllEmbeddings: () => clearAllEmbeddings(db),
insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string, totalChunks?: number) => insertEmbedding(db, hash, seq, pos, embedding, model, embeddedAt, totalChunks),
insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string, totalChunks?: number, fingerprint?: string) => insertEmbedding(db, hash, seq, pos, embedding, model, embeddedAt, totalChunks, fingerprint),
};
return store;
@ -1979,22 +2033,24 @@ export type IndexStatus = {
/**
 * Count active documents (by distinct content hash) that still need embedding
 * for `model` under the current embedding fingerprint.
 *
 * A document counts when it has no `content_vectors` rows for the given model
 * and current fingerprint, or fewer rows than the recorded `total_chunks`.
 *
 * @param db - Database handle.
 * @param collection - Optional collection name to restrict the count to.
 * @param model - Embedding model name; defaults to `DEFAULT_EMBED_MODEL`.
 * @returns Number of distinct hashes needing (re-)embedding.
 */
export function getHashesNeedingEmbedding(db: Database, collection?: string, model: string = DEFAULT_EMBED_MODEL): number {
  const collectionFilter = collection ? `AND d.collection = ?` : ``;
  const fingerprint = getEmbeddingFingerprint(model);
  // Prepared inside the retry wrapper so a missing legacy column is added and
  // the statement is prepared again against the repaired schema.
  return withLazyContentVectorMigration(db, () => {
    const stmt = db.prepare(`
      SELECT COUNT(DISTINCT d.hash) as count
      FROM documents d
      LEFT JOIN (
        SELECT hash, model, COUNT(*) AS chunk_count, MAX(total_chunks) AS expected_chunks
        FROM content_vectors
        WHERE model = ? AND embed_fingerprint = ?
        GROUP BY hash, model, embed_fingerprint
      ) v ON d.hash = v.hash
      WHERE d.active = 1
        AND (v.hash IS NULL OR v.chunk_count < v.expected_chunks)
        ${collectionFilter}
    `);
    const result = (collection ? stmt.get(model, fingerprint, collection) : stmt.get(model, fingerprint)) as { count: number };
    return result.count;
  });
}
export type IndexHealthInfo = {
@ -2003,6 +2059,79 @@ export type IndexHealthInfo = {
daysStale: number | null;
};
/** Outcome of one attempt to adopt legacy (empty-fingerprint) embeddings. */
export type LegacyFingerprintAdoptionResult = {
  /** `true` once a legacy sample was located and the comparison was attempted. */
  checked: boolean;
  /** Number of `content_vectors` rows re-labelled with the current fingerprint. */
  adopted: number;
  /** Human-readable explanation of the outcome. */
  reason: string;
};
/**
 * Re-label legacy vectors (empty `embed_fingerprint`) with the current
 * fingerprint when a sampled chunk shows they were produced the same way.
 *
 * One legacy chunk of an active document is re-chunked and re-embedded with
 * the current settings. If its nearest neighbour in `vectors_vec` is that very
 * chunk within a small distance threshold, every legacy row for `model` is
 * updated to the current fingerprint. Otherwise nothing is changed.
 *
 * @param store - Store providing the database and the LLM used for embedding.
 * @param model - Embedding model name; defaults to `DEFAULT_EMBED_MODEL`.
 * @returns Whether a comparison ran, how many rows were adopted, and why.
 */
export async function maybeAdoptLegacyEmbeddingFingerprint(store: Store, model: string = DEFAULT_EMBED_MODEL): Promise<LegacyFingerprintAdoptionResult> {
  const db = store.db;
  const fingerprint = getEmbeddingFingerprint(model);

  // Runs through the lazy migration so `embed_fingerprint` exists from here on.
  const legacyCount = withLazyContentVectorMigration(db, () => {
    const row = db.prepare(`SELECT COUNT(DISTINCT hash) AS count FROM content_vectors WHERE model = ? AND embed_fingerprint = ''`).get(model) as { count: number };
    return row.count;
  });
  if (legacyCount === 0) {
    return { checked: false, adopted: 0, reason: "no legacy empty-fingerprint embeddings" };
  }

  // Only hash, seq, body and path are needed below. Columns that are never read
  // (such as total_chunks, which a legacy table may not have yet) are left out
  // so this query cannot fail on an unmigrated schema.
  const sample = db.prepare(`
    SELECT cv.hash, cv.seq, c.doc AS body, MIN(d.path) AS path
    FROM content_vectors cv
    JOIN documents d ON d.hash = cv.hash AND d.active = 1
    JOIN content c ON c.hash = cv.hash
    WHERE cv.model = ? AND cv.embed_fingerprint = ''
    GROUP BY cv.hash, cv.seq, c.doc
    ORDER BY cv.hash, cv.seq
    LIMIT 1
  `).get(model) as { hash: string; seq: number; body: string; path: string } | undefined;
  if (!sample) {
    return { checked: false, adopted: 0, reason: `${legacyCount} legacy docs have no active sample` };
  }

  const tableExists = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
  if (!tableExists) {
    return { checked: false, adopted: 0, reason: "vectors_vec table is missing" };
  }

  const expectedHashSeq = `${sample.hash}_${sample.seq}`;
  const title = extractTitle(sample.body, sample.path);
  const llm = getLlm(store);

  return await withLLMSessionForLlm(llm, async (session) => {
    const chunks = await chunkDocumentByTokens(sample.body, undefined, undefined, undefined, sample.path, undefined, session.signal);
    // The stored seq is used as the index into the freshly computed chunk list.
    const chunk = chunks[sample.seq];
    if (!chunk) {
      return { checked: true, adopted: 0, reason: `sample chunk ${expectedHashSeq} no longer exists` };
    }

    const result = await session.embed(formatDocForEmbedding(chunk.text, title, model), { model });
    if (!result) {
      return { checked: true, adopted: 0, reason: "failed to embed legacy sample" };
    }

    const nearest = db.prepare(`
      SELECT hash_seq, distance
      FROM vectors_vec
      WHERE embedding MATCH ? AND k = 1
    `).get(new Float32Array(result.embedding)) as { hash_seq: string; distance: number } | undefined;
    if (!nearest) {
      return { checked: true, adopted: 0, reason: "legacy sample vector not found" };
    }

    // Adopt only when the closest stored vector is the sampled chunk itself
    // and it is essentially identical to the fresh embedding.
    const threshold = 0.0001;
    if (nearest.hash_seq !== expectedHashSeq || nearest.distance > threshold) {
      return { checked: true, adopted: 0, reason: `legacy sample differs from current fingerprint (nearest ${nearest.hash_seq}, distance ${nearest.distance.toFixed(6)})` };
    }

    const update = db.prepare(`UPDATE content_vectors SET embed_fingerprint = ? WHERE model = ? AND embed_fingerprint = ''`).run(fingerprint, model);
    return { checked: true, adopted: update.changes, reason: `sample ${expectedHashSeq} matched current fingerprint at distance ${nearest.distance.toFixed(6)}` };
  });
}
export function getIndexHealth(db: Database, model: string = DEFAULT_EMBED_MODEL): IndexHealthInfo {
const needsEmbedding = getHashesNeedingEmbedding(db, undefined, model);
const totalDocs = (db.prepare(`SELECT COUNT(*) as count FROM documents WHERE active = 1`).get() as { count: number }).count;
@ -3353,21 +3482,21 @@ async function getEmbedding(text: string, model: string, isQuery: boolean, sessi
* Returns hash, document body, and a sample path for display purposes.
*/
export function getHashesForEmbedding(db: Database, model: string = DEFAULT_EMBED_MODEL): { hash: string; body: string; path: string }[] {
  // Only vectors stored under the current fingerprint count as up to date, so
  // documents embedded with older settings are returned as pending again.
  const fingerprint = getEmbeddingFingerprint(model);
  // Prepared inside the retry wrapper so a missing legacy column is added and
  // the statement is prepared again against the repaired schema.
  return withLazyContentVectorMigration(db, () => db.prepare(`
    SELECT d.hash, c.doc as body, MIN(d.path) as path
    FROM documents d
    JOIN content c ON d.hash = c.hash
    LEFT JOIN (
      SELECT hash, model, COUNT(*) AS chunk_count, MAX(total_chunks) AS expected_chunks
      FROM content_vectors
      WHERE model = ? AND embed_fingerprint = ?
      GROUP BY hash, model, embed_fingerprint
    ) v ON d.hash = v.hash
    WHERE d.active = 1
      AND (v.hash IS NULL OR v.chunk_count < v.expected_chunks)
    GROUP BY d.hash
  `).all(model, fingerprint) as { hash: string; body: string; path: string }[]);
}
/**
@ -3453,19 +3582,22 @@ export function insertEmbedding(
embedding: Float32Array,
model: string,
embeddedAt: string,
totalChunks: number = 1
totalChunks: number = 1,
fingerprint: string = getEmbeddingFingerprint(model)
): void {
const hashSeq = `${hash}_${seq}`;
// Insert content_vectors first — crash-safe ordering (see getHashesForEmbedding)
const insertContentVectorStmt = db.prepare(`INSERT OR REPLACE INTO content_vectors (hash, seq, pos, model, total_chunks, embedded_at) VALUES (?, ?, ?, ?, ?, ?)`);
insertContentVectorStmt.run(hash, seq, pos, model, totalChunks, embeddedAt);
withLazyContentVectorMigration(db, () => {
// Insert content_vectors first — crash-safe ordering (see getHashesForEmbedding)
const insertContentVectorStmt = db.prepare(`INSERT OR REPLACE INTO content_vectors (hash, seq, pos, model, embed_fingerprint, total_chunks, embedded_at) VALUES (?, ?, ?, ?, ?, ?, ?)`);
insertContentVectorStmt.run(hash, seq, pos, model, fingerprint, totalChunks, embeddedAt);
// vec0 virtual tables don't support OR REPLACE — use DELETE + INSERT
const deleteVecStmt = db.prepare(`DELETE FROM vectors_vec WHERE hash_seq = ?`);
const insertVecStmt = db.prepare(`INSERT INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`);
deleteVecStmt.run(hashSeq);
insertVecStmt.run(hashSeq, embedding);
// vec0 virtual tables don't support OR REPLACE — use DELETE + INSERT
const deleteVecStmt = db.prepare(`DELETE FROM vectors_vec WHERE hash_seq = ?`);
const insertVecStmt = db.prepare(`INSERT INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`);
deleteVecStmt.run(hashSeq);
insertVecStmt.run(hashSeq, embedding);
});
}
function removeIncompleteEmbeddings(db: Database, expectedChunksByHash: Map<string, number>, model: string): number {

View File

@ -14,6 +14,7 @@ import { fileURLToPath } from "url";
import { spawn } from "child_process";
import { setTimeout as sleep } from "timers/promises";
import { buildEditorUri, termLink, resolveEmbedModelForCli } from "../src/cli/qmd.ts";
import { openDatabase } from "../src/db.ts";
import { DEFAULT_EMBED_MODEL_URI } from "../src/llm.ts";
// Test fixtures directory and database path
@ -465,6 +466,32 @@ describe("CLI Status Command", () => {
await runQmd(["collection", "add", "."]);
});
test("qmd doctor reports core index health checks", async () => {
  // Run the doctor subcommand and make sure it exits cleanly.
  const result = await runQmd(["doctor"]);
  expect(result.exitCode).toBe(0);

  // The banner and every core check label must appear in the output.
  const expectedSections = [
    "QMD Doctor",
    "SQLite runtime",
    "sqlite-vec",
    "embedding freshness",
    "embedding fingerprints",
    "content hash sample",
  ];
  for (const section of expectedSections) {
    expect(result.stdout).toContain(section);
  }
});
// Seeds a vector row whose fingerprint cannot equal the current one
// ("stale1" is not a 6-character hex string) and checks doctor prints it.
test("qmd doctor flags mixed embedding fingerprints", async () => {
// Write directly into the test index before invoking the CLI.
const db = openDatabase(testDbPath);
const doc = db.prepare(`SELECT hash FROM documents WHERE active = 1 LIMIT 1`).get() as { hash: string };
db.prepare(`
INSERT OR REPLACE INTO content_vectors (hash, seq, pos, model, embed_fingerprint, total_chunks, embedded_at)
VALUES (?, 0, 0, ?, 'stale1', 1, ?)
`).run(doc.hash, resolveEmbedModelForCli(), new Date().toISOString());
// Close this handle before the CLI runs against the same file.
db.close();
const { stdout, exitCode } = await runQmd(["doctor"]);
// Doctor is expected to exit 0 even when it reports a problem.
expect(exitCode).toBe(0);
expect(stdout).toContain("embedding fingerprints");
// Non-current fingerprints are printed by their stored value.
expect(stdout).toContain("stale1");
});
test("shows index status", async () => {
const { stdout, exitCode } = await runQmd(["status"]);
expect(exitCode).toBe(0);

View File

@ -80,6 +80,7 @@ function initTestDatabase(db: Database): void {
seq INTEGER NOT NULL DEFAULT 0,
pos INTEGER NOT NULL DEFAULT 0,
model TEXT NOT NULL,
embed_fingerprint TEXT NOT NULL DEFAULT '',
embedded_at TEXT NOT NULL,
PRIMARY KEY (hash, seq)
)
@ -186,7 +187,7 @@ function seedTestData(db: Database): void {
for (let i = 0; i < 768; i++) embedding[i] = Math.random();
for (const doc of docs.slice(0, 4)) { // Skip large file for embeddings
db.prepare(`INSERT INTO content_vectors (hash, seq, pos, model, embedded_at) VALUES (?, 0, 0, ?, ?)`).run(doc.hash, DEFAULT_EMBED_MODEL, now);
db.prepare(`INSERT INTO content_vectors (hash, seq, pos, model, embed_fingerprint, embedded_at) VALUES (?, 0, 0, ?, ?, ?)`).run(doc.hash, DEFAULT_EMBED_MODEL, getEmbeddingFingerprint(DEFAULT_EMBED_MODEL), now);
db.prepare(`INSERT INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`).run(`${doc.hash}_0`, embedding);
}
}
@ -211,6 +212,7 @@ import {
findDocuments,
getStatus,
DEFAULT_EMBED_MODEL,
getEmbeddingFingerprint,
DEFAULT_QUERY_MODEL,
DEFAULT_RERANK_MODEL,
DEFAULT_MULTI_GET_MAX_BYTES,

View File

@ -26,6 +26,7 @@ import {
extractTitle,
formatQueryForEmbedding,
formatDocForEmbedding,
getEmbeddingFingerprint,
chunkDocument,
chunkDocumentByTokens,
chunkDocumentAsync,
@ -311,19 +312,74 @@ describe("Store Creation", () => {
// Check tables exist
const tables = store.db.prepare(`
SELECT name FROM sqlite_master WHERE type='table' ORDER BY name
SELECT name FROM sqlite_master
WHERE type='table'
ORDER BY name
`).all() as { name: string }[];
const tableNames = tables.map(t => t.name);
expect(tableNames).toContain("documents");
expect(tableNames).toContain("documents_fts");
expect(tableNames).toContain("content_vectors");
expect(tableNames).toContain("content");
expect(tableNames).toContain("llm_cache");
// Note: path_contexts table removed in favor of YAML-based context storage
await cleanupTestDb(store);
});
// Builds a database with the pre-fingerprint content_vectors schema and checks
// that opening a store leaves it untouched until an embedding-health query runs.
test("createStore defers content_vectors embed_fingerprint migration until embedding health needs it", async () => {
const dbPath = join(testDir, `legacy-${Date.now()}-${Math.random().toString(36).slice(2)}.sqlite`);
const model = "hf:test/embed-model.gguf";
// Create the legacy schema by hand: content_vectors has no embed_fingerprint column.
const legacyDb = openDatabase(dbPath);
legacyDb.exec(`
CREATE TABLE content (
hash TEXT PRIMARY KEY,
doc TEXT NOT NULL,
created_at TEXT NOT NULL
);
CREATE TABLE documents (
id INTEGER PRIMARY KEY AUTOINCREMENT,
collection TEXT NOT NULL,
path TEXT NOT NULL,
title TEXT,
hash TEXT NOT NULL,
created_at TEXT NOT NULL,
modified_at TEXT NOT NULL,
active INTEGER NOT NULL DEFAULT 1,
FOREIGN KEY (hash) REFERENCES content(hash) ON DELETE CASCADE,
UNIQUE(collection, path)
);
CREATE TABLE content_vectors (
hash TEXT NOT NULL,
seq INTEGER NOT NULL DEFAULT 0,
pos INTEGER NOT NULL DEFAULT 0,
model TEXT NOT NULL,
total_chunks INTEGER NOT NULL DEFAULT 1,
embedded_at TEXT NOT NULL,
PRIMARY KEY (hash, seq)
)
`);
// Seed one active document with one already-embedded chunk.
const now = new Date().toISOString();
legacyDb.prepare(`INSERT INTO content (hash, doc, created_at) VALUES (?, ?, ?)`).run("hash1", "# Legacy\nbody", now);
legacyDb.prepare(`INSERT INTO documents (collection, path, title, hash, created_at, modified_at, active) VALUES (?, ?, ?, ?, ?, ?, 1)`).run("test", "legacy.md", "Legacy", "hash1", now, now);
legacyDb.prepare(`INSERT INTO content_vectors (hash, seq, pos, model, total_chunks, embedded_at) VALUES (?, ?, ?, ?, ?, ?)`).run("hash1", 0, 0, model, 1, now);
legacyDb.close();
// Opening the store must not add the column on its own.
const store = createStore(dbPath);
let columns = store.db.prepare(`PRAGMA table_info(content_vectors)`).all() as { name: string }[];
expect(columns.map(col => col.name)).not.toContain("embed_fingerprint");
// The health query triggers the lazy column repair; the legacy row then has an
// empty fingerprint, so its document is reported as needing embedding.
expect(store.getHashesNeedingEmbedding(model)).toBe(1);
columns = store.db.prepare(`PRAGMA table_info(content_vectors)`).all() as { name: string }[];
const migratedRow = store.db.prepare(`SELECT embed_fingerprint FROM content_vectors WHERE hash = ?`).get("hash1") as { embed_fingerprint: string };
expect(columns.map(col => col.name)).toContain("embed_fingerprint");
// Existing rows receive the column default (empty string).
expect(migratedRow.embed_fingerprint).toBe("");
await cleanupTestDb(store);
});
test("createStore sets WAL journal mode", async () => {
const store = await createTestStore();
const result = store.db.prepare("PRAGMA journal_mode").get() as { journal_mode: string };
@ -2301,6 +2357,23 @@ describe("Index Status", () => {
await cleanupTestDb(store);
});
// Inserts a vector under a fingerprint that cannot match the current one and
// checks the document is still counted as needing embedding.
test("embedding health treats stale fingerprints as needing re-embedding", async () => {
const store = await createTestStore();
const collectionName = await createTestCollection();
const model = "hf:test/embed-model.gguf";
const now = new Date().toISOString();
// Stub LLM exposing only the model name (presumably what the store reads to pick the model — confirm).
store.llm = { embedModelName: model } as any;
// 3-dimensional vector table to match the 3-element embedding inserted below.
store.ensureVecTable(3);
await insertTestDocument(store.db, collectionName, { name: "doc1", hash: "hash1" });
// "stale1" is passed as the explicit fingerprint argument.
store.insertEmbedding("hash1", 0, 0, new Float32Array([1, 2, 3]), model, now, 1, "stale1");
// The real fingerprint is always 6 lowercase hex characters, so it can never equal "stale1".
expect(getEmbeddingFingerprint(model)).toMatch(/^[a-f0-9]{6}$/);
expect(store.getHashesNeedingEmbedding()).toBe(1);
await cleanupTestDb(store);
});
test("getIndexHealth returns health info", async () => {
const store = await createTestStore();
const collectionName = await createTestCollection();