feat: add qmd doctor vector diagnostics
This commit is contained in:
parent
ddbd6bd8be
commit
ac6b154f0c
@ -5,9 +5,12 @@
|
||||
### Changes
|
||||
|
||||
- Agent skills: add `qmd skills list|get|path` to serve version-matched runtime skill instructions from the installed CLI, and make `qmd skill install` write a stable discovery stub so installed agent skills do not go stale after QMD upgrades.
|
||||
- CLI: add `qmd doctor` for index/runtime diagnostics, including SQLite/sqlite-vec versions, embedding fingerprint freshness, mixed-fingerprint detection, safe legacy fingerprint adoption, and content-hash sampling.
|
||||
|
||||
### Fixes
|
||||
|
||||
- Embedding: fingerprint vector metadata using the active embedding model and formatting/chunking parameters so stale vectors are treated as pending after search semantics change. Legacy `content_vectors` columns are migrated lazily on first vector-health/write use to preserve fast QMD startup.
|
||||
|
||||
- Skill: expand the packaged QMD skill with retrieval-first workflows, structured query examples, wiki/source collection guidance, and safe fallbacks when model-backed search is unavailable.
|
||||
- Tests: make `bun run test` execute the local unit suite under both Node/Vitest and Bun (`test:node` + `test:bun`) so runtime-specific regressions are caught before CI.
|
||||
- Model config: centralize embedding/rerank/generation model resolution so `qmd embed`, `status`, `query`, `vsearch`, `pull`, SDK vector search, and `bench` use the same active `.qmd/index.yaml` model hints and environment fallbacks.
|
||||
|
||||
105
src/cli/qmd.ts
105
src/cli/qmd.ts
@ -1,4 +1,4 @@
|
||||
import { openDatabase } from "../db.js";
|
||||
import { isBun, openDatabase } from "../db.js";
|
||||
import type { Database } from "../db.js";
|
||||
import fastGlob from "fast-glob";
|
||||
import { execSync, spawn as nodeSpawn } from "child_process";
|
||||
@ -31,6 +31,7 @@ import {
|
||||
hashContent,
|
||||
extractTitle,
|
||||
formatDocForEmbedding,
|
||||
getEmbeddingFingerprint,
|
||||
chunkDocumentByTokens,
|
||||
clearCache,
|
||||
getCacheKey,
|
||||
@ -74,6 +75,7 @@ import {
|
||||
getDefaultDbPath,
|
||||
reindexCollection,
|
||||
generateEmbeddings,
|
||||
maybeAdoptLegacyEmbeddingFingerprint,
|
||||
syncConfigToDb,
|
||||
type ReindexResult,
|
||||
type ChunkStrategy,
|
||||
@ -3228,10 +3230,103 @@ function showHelp(): void {
|
||||
console.log(`Index: ${getDbPath()}`);
|
||||
}
|
||||
|
||||
async function showVersion(): Promise<void> {
|
||||
/**
 * Print a single `qmd doctor` result line.
 *
 * @param label   Short name of the check.
 * @param ok      `true` prints a green check mark, `false` a yellow warning sign.
 * @param details Text shown after the label.
 */
function doctorCheck(label: string, ok: boolean, details: string): void {
  let mark: string;
  if (ok) {
    mark = `${c.green}✓${c.reset}`;
  } else {
    mark = `${c.yellow}⚠${c.reset}`;
  }
  console.log(`${mark} ${label}: ${details}`);
}
|
||||
|
||||
/**
 * Run the `qmd doctor` diagnostics and print one line per check.
 *
 * Checks, in order: SQLite runtime version, the better-sqlite3 dependency
 * declared in package.json, sqlite-vec version, legacy fingerprint adoption,
 * embedding freshness, the per-model fingerprint breakdown of
 * `content_vectors`, and a re-hash of one random active document.
 *
 * Every check is isolated: a failure is printed as a warning line and the
 * remaining checks still run. The database is closed on every exit path.
 *
 * Note: the legacy-adoption step is not read-only — when its sample matches it
 * updates `embed_fingerprint` on legacy rows.
 */
async function showDoctor(): Promise<void> {
  const storeInstance = getStore();
  const describeError = (error: unknown): string =>
    error instanceof Error ? error.message : String(error);

  try {
    const db = storeInstance.db;
    const pkg = readPackageJson();
    const embedModel = resolveEmbedModelForCli();
    const fingerprint = getEmbeddingFingerprint(embedModel);

    console.log(`${c.bold}QMD Doctor${c.reset}\n`);
    console.log(`Index: ${getDbPath()}`);
    console.log(`Runtime: ${isBun ? "bun:sqlite" : "better-sqlite3"}`);

    try {
      const row = db.prepare(`SELECT sqlite_version() AS version`).get() as { version: string };
      doctorCheck("SQLite runtime", true, row.version);
    } catch (error) {
      doctorCheck("SQLite runtime", false, describeError(error));
    }

    // This is the dependency range declared in package.json, not the version
    // of the module actually loaded at runtime.
    const betterSqliteVersion = pkg.dependencies?.["better-sqlite3"] ?? pkg.devDependencies?.["better-sqlite3"] ?? "not declared";
    doctorCheck("better_sqlite version", true, String(betterSqliteVersion));

    try {
      const row = db.prepare(`SELECT vec_version() AS version`).get() as { version: string };
      doctorCheck("sqlite-vec", true, row.version);
    } catch (error) {
      doctorCheck("sqlite-vec", false, describeError(error));
    }

    try {
      const adoption = await maybeAdoptLegacyEmbeddingFingerprint(storeInstance, embedModel);
      // Stay silent when there was nothing to examine.
      if (adoption.checked || adoption.adopted > 0) {
        const adopted = adoption.adopted > 0;
        doctorCheck(
          "legacy fingerprint adoption",
          adopted,
          adopted ? `adopted ${adoption.adopted} legacy chunks; ${adoption.reason}` : adoption.reason,
        );
      }
    } catch (error) {
      doctorCheck("legacy fingerprint adoption", false, describeError(error));
    }

    try {
      const pending = getHashesNeedingEmbedding(db, undefined, embedModel);
      doctorCheck(
        "embedding freshness",
        pending === 0,
        pending === 0 ? "all active documents match current fingerprint" : `${pending} active documents need embedding`,
      );
    } catch (error) {
      doctorCheck("embedding fingerprints", false, describeError(error));
    }

    try {
      const rows = db.prepare(`
        SELECT model, embed_fingerprint AS fingerprint, COUNT(DISTINCT hash) AS docs, COUNT(*) AS chunks
        FROM content_vectors
        GROUP BY model, embed_fingerprint
        ORDER BY chunks DESC, model, embed_fingerprint
      `).all() as { model: string; fingerprint: string; docs: number; chunks: number }[];
      // Healthy when there are no vectors yet, or every group carries the
      // current fingerprint (equivalent to "exactly one fingerprint and it is
      // the current one").
      const ok = rows.every(row => row.fingerprint === fingerprint);
      const details = rows.length === 0
        ? `none yet; current ${fingerprint}`
        : rows.map(row => {
            const label = row.fingerprint === fingerprint ? "current" : (row.fingerprint || "legacy");
            return `${row.model}:${label} ${row.docs} docs/${row.chunks} chunks`;
          }).join("; ");
      doctorCheck("embedding fingerprints", ok, details);
    } catch (error) {
      doctorCheck("embedding fingerprints", false, describeError(error));
    }

    // Guarded like every other check so a query or hashing failure is reported
    // as a warning instead of aborting the command.
    try {
      const sample = db.prepare(`
        SELECT c.hash, c.doc
        FROM documents d
        JOIN content c ON c.hash = d.hash
        WHERE d.active = 1
        ORDER BY random()
        LIMIT 1
      `).get() as { hash: string; doc: string } | undefined;
      if (sample) {
        const rehashed = await hashContent(sample.doc);
        const matches = rehashed === sample.hash;
        doctorCheck("content hash sample", matches, `${sample.hash.slice(0, 12)} ${matches ? "matches" : `!= ${rehashed.slice(0, 12)}`}`);
      } else {
        doctorCheck("content hash sample", true, "no active documents indexed");
      }
    } catch (error) {
      doctorCheck("content hash sample", false, describeError(error));
    }
  } finally {
    // Release the database even if something above threw unexpectedly.
    closeDb();
  }
}
|
||||
|
||||
/**
 * Load and parse the `package.json` located two directories above this module.
 *
 * @returns The parsed manifest. It is untyped (`any`) because callers read
 *          arbitrary fields such as `version` and `dependencies`.
 * @throws If the file cannot be read or does not contain valid JSON.
 */
function readPackageJson(): any {
  const scriptDir = dirname(fileURLToPath(import.meta.url));
  const pkgPath = resolve(scriptDir, "..", "..", "package.json");
  // Read and parse exactly once; the previous extra parse was bound to an
  // unused local.
  return JSON.parse(readFileSync(pkgPath, "utf-8"));
}
|
||||
|
||||
async function showVersion(): Promise<void> {
|
||||
const scriptDir = dirname(fileURLToPath(import.meta.url));
|
||||
const pkg = readPackageJson();
|
||||
|
||||
let commit = "";
|
||||
try {
|
||||
@ -3539,6 +3634,10 @@ if (isMain) {
|
||||
await showStatus();
|
||||
break;
|
||||
|
||||
case "doctor":
|
||||
await showDoctor();
|
||||
break;
|
||||
|
||||
case "update":
|
||||
await updateCollections();
|
||||
break;
|
||||
|
||||
266
src/store.ts
266
src/store.ts
@ -50,6 +50,10 @@ export const DEFAULT_MULTI_GET_MAX_BYTES = 10 * 1024; // 10KB
|
||||
export const DEFAULT_EMBED_MAX_DOCS_PER_BATCH = 64;
|
||||
export const DEFAULT_EMBED_MAX_BATCH_BYTES = 64 * 1024 * 1024; // 64MB
|
||||
|
||||
const EMBED_FINGERPRINT_PROBE_QUERY = "__qmd_embedding_query_probe__";
|
||||
const EMBED_FINGERPRINT_PROBE_TITLE = "__qmd_embedding_title_probe__";
|
||||
const EMBED_FINGERPRINT_PROBE_DOC = "__qmd_embedding_document_probe__";
|
||||
|
||||
// Chunking: 900 tokens per chunk with 15% overlap
|
||||
// Increased from 800 to accommodate smart chunking finding natural break points
|
||||
export const CHUNK_SIZE_TOKENS = 900;
|
||||
@ -61,6 +65,17 @@ export const CHUNK_OVERLAP_CHARS = CHUNK_OVERLAP_TOKENS * 4; // 540 chars
|
||||
export const CHUNK_WINDOW_TOKENS = 200;
|
||||
export const CHUNK_WINDOW_CHARS = CHUNK_WINDOW_TOKENS * 4; // 800 chars
|
||||
|
||||
/**
 * Derive a short fingerprint for the current embedding configuration.
 *
 * The fingerprint is the first six hex characters of a SHA-256 over the model
 * name, the formatted query and document probe strings, and the chunk
 * size/overlap token settings, joined with newlines.
 *
 * @param model Embedding model identifier; defaults to `DEFAULT_EMBED_MODEL`.
 * @returns A six-character lowercase hex string.
 */
export function getEmbeddingFingerprint(model: string = DEFAULT_EMBED_MODEL): string {
  const queryProbe = formatQueryForEmbedding(EMBED_FINGERPRINT_PROBE_QUERY, model);
  const docProbe = formatDocForEmbedding(EMBED_FINGERPRINT_PROBE_DOC, EMBED_FINGERPRINT_PROBE_TITLE, model);

  const parts: string[] = [];
  parts.push(`model:${model}`);
  parts.push(`query:${queryProbe}`);
  parts.push(`doc:${docProbe}`);
  parts.push(`chunk_tokens:${CHUNK_SIZE_TOKENS}`);
  parts.push(`chunk_overlap_tokens:${CHUNK_OVERLAP_TOKENS}`);

  const hasher = createHash("sha256");
  hasher.update(parts.join("\n"));
  const hex = hasher.digest("hex");
  return hex.slice(0, 6);
}
|
||||
|
||||
/**
|
||||
* Get the LlamaCpp instance for a store — prefers the store's own instance,
|
||||
* falls back to the global singleton.
|
||||
@ -861,28 +876,20 @@ function initializeDatabase(db: Database): void {
|
||||
)
|
||||
`);
|
||||
|
||||
// Content vectors
|
||||
const cvInfo = db.prepare(`PRAGMA table_info(content_vectors)`).all() as { name: string }[];
|
||||
const hasSeqColumn = cvInfo.some(col => col.name === 'seq');
|
||||
if (cvInfo.length > 0 && !hasSeqColumn) {
|
||||
db.exec(`DROP TABLE IF EXISTS content_vectors`);
|
||||
db.exec(`DROP TABLE IF EXISTS vectors_vec`);
|
||||
}
|
||||
// Content vectors. Avoid PRAGMA schema probes during startup; legacy vector
|
||||
// columns are repaired lazily when a vector/embedding query first needs them.
|
||||
db.exec(`
|
||||
CREATE TABLE IF NOT EXISTS content_vectors (
|
||||
hash TEXT NOT NULL,
|
||||
seq INTEGER NOT NULL DEFAULT 0,
|
||||
pos INTEGER NOT NULL DEFAULT 0,
|
||||
model TEXT NOT NULL,
|
||||
embed_fingerprint TEXT NOT NULL DEFAULT '',
|
||||
total_chunks INTEGER NOT NULL DEFAULT 1,
|
||||
embedded_at TEXT NOT NULL,
|
||||
PRIMARY KEY (hash, seq)
|
||||
)
|
||||
`);
|
||||
const cvInfoAfterCreate = db.prepare(`PRAGMA table_info(content_vectors)`).all() as { name: string }[];
|
||||
if (!cvInfoAfterCreate.some(col => col.name === 'total_chunks')) {
|
||||
db.exec(`ALTER TABLE content_vectors ADD COLUMN total_chunks INTEGER NOT NULL DEFAULT 1`);
|
||||
}
|
||||
|
||||
// Store collections — makes the DB self-contained (no external config needed)
|
||||
db.exec(`
|
||||
@ -1237,7 +1244,7 @@ export type Store = {
|
||||
// Vector/embedding operations
|
||||
getHashesForEmbedding: () => { hash: string; body: string; path: string }[];
|
||||
clearAllEmbeddings: () => void;
|
||||
insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string, totalChunks?: number) => void;
|
||||
insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string, totalChunks?: number, fingerprint?: string) => void;
|
||||
};
|
||||
|
||||
// =============================================================================
|
||||
@ -1428,31 +1435,77 @@ function resolveEmbedOptions(options?: EmbedOptions): Required<Pick<EmbedOptions
|
||||
};
|
||||
}
|
||||
|
||||
function contentVectorExpectedChunksExpr(db: Database): string {
|
||||
const columns = db.prepare(`PRAGMA table_info(content_vectors)`).all() as { name: string }[];
|
||||
return columns.some(col => col.name === 'total_chunks') ? 'MAX(total_chunks)' : '1';
|
||||
/**
 * Map a SQLite error to the `content_vectors` column whose absence caused it.
 *
 * Recognises both the read-side message ("no such column: X") and the
 * write-side message ("has no column named X"). `embed_fingerprint` is checked
 * before `total_chunks`.
 *
 * @param error Anything thrown by a database call.
 * @returns The missing column name, or `null` when the error is unrelated.
 */
function contentVectorSchemaRepairFor(error: unknown): "embed_fingerprint" | "total_chunks" | null {
  const text = error instanceof Error ? error.message : String(error);
  const repairable = ["embed_fingerprint", "total_chunks"] as const;
  for (const name of repairable) {
    const missingOnRead = text.includes(`no such column: ${name}`);
    const missingOnWrite = text.includes(`has no column named ${name}`);
    if (missingOnRead || missingOnWrite) {
      return name;
    }
  }
  return null;
}
|
||||
|
||||
/**
 * Add one missing column to `content_vectors`.
 *
 * A "duplicate column name" failure is ignored, since it means the column was
 * added between the failed statement and this ALTER. Any other error is
 * rethrown.
 *
 * @param db     Database handle.
 * @param column Which column to add.
 */
function repairContentVectorColumn(db: Database, column: "embed_fingerprint" | "total_chunks"): void {
  const ddl = column === "embed_fingerprint"
    ? `ALTER TABLE content_vectors ADD COLUMN embed_fingerprint TEXT NOT NULL DEFAULT ''`
    : `ALTER TABLE content_vectors ADD COLUMN total_chunks INTEGER NOT NULL DEFAULT 1`;

  try {
    db.exec(ddl);
  } catch (error) {
    const text = error instanceof Error ? error.message : String(error);
    if (text.includes("duplicate column name")) {
      return;
    }
    throw error;
  }
}
|
||||
|
||||
/**
 * Run `operation`, adding missing `content_vectors` columns on demand.
 *
 * When the operation fails because a known column is missing, that column is
 * added and the operation is retried. Each column is repaired at most once per
 * call; an unrelated error, or the same missing-column error after a repair,
 * is rethrown.
 *
 * @param db        Database handle used for the repair.
 * @param operation Synchronous work to run (and possibly re-run).
 * @returns Whatever `operation` returns.
 */
function withLazyContentVectorMigration<T>(db: Database, operation: () => T): T {
  const alreadyRepaired = new Set<string>();
  for (;;) {
    try {
      return operation();
    } catch (error) {
      const missing = contentVectorSchemaRepairFor(error);
      if (missing === null || alreadyRepaired.has(missing)) {
        throw error;
      }
      repairContentVectorColumn(db, missing);
      alreadyRepaired.add(missing);
    }
  }
}
|
||||
|
||||
/**
 * List active documents that still need embedding for `model` under the
 * current embedding fingerprint.
 *
 * A document is pending when it has no `content_vectors` rows for this
 * model and fingerprint, or fewer rows than the recorded `total_chunks`.
 * The stale pre-fingerprint query (which returned early and left this path
 * unreachable) has been removed.
 *
 * @param db         Database handle.
 * @param collection Optional collection name to restrict the result to.
 * @param model      Embedding model; defaults to `DEFAULT_EMBED_MODEL`.
 * @returns One row per content hash (hash, smallest path, byte length), ordered by path.
 */
function getPendingEmbeddingDocs(db: Database, collection?: string, model: string = DEFAULT_EMBED_MODEL): PendingEmbeddingDoc[] {
  const collectionFilter = collection ? `AND d.collection = ?` : ``;
  const fingerprint = getEmbeddingFingerprint(model);
  // Legacy tables may lack embed_fingerprint/total_chunks; the wrapper adds
  // them on first failure and retries.
  return withLazyContentVectorMigration(db, () => {
    const stmt = db.prepare(`
      SELECT d.hash, MIN(d.path) as path, length(CAST(c.doc AS BLOB)) as bytes
      FROM documents d
      JOIN content c ON d.hash = c.hash
      LEFT JOIN (
        SELECT hash, model, COUNT(*) AS chunk_count, MAX(total_chunks) AS expected_chunks
        FROM content_vectors
        WHERE model = ? AND embed_fingerprint = ?
        GROUP BY hash, model, embed_fingerprint
      ) v ON d.hash = v.hash
      WHERE d.active = 1
      AND (v.hash IS NULL OR v.chunk_count < v.expected_chunks)
      ${collectionFilter}
      GROUP BY d.hash
      ORDER BY MIN(d.path)
    `);
    return (collection ? stmt.all(model, fingerprint, collection) : stmt.all(model, fingerprint)) as PendingEmbeddingDoc[];
  });
}
|
||||
|
||||
function buildEmbeddingBatches(
|
||||
@ -1515,6 +1568,7 @@ export async function generateEmbeddings(
|
||||
const db = store.db;
|
||||
const llm = getLlm(store);
|
||||
const model = options?.model ?? llm.embedModelName ?? DEFAULT_EMBED_MODEL;
|
||||
const fingerprint = getEmbeddingFingerprint(model);
|
||||
const now = new Date().toISOString();
|
||||
const { maxDocsPerBatch, maxBatchBytes } = resolveEmbedOptions(options);
|
||||
const encoder = new TextEncoder();
|
||||
@ -1633,7 +1687,7 @@ export async function generateEmbeddings(
|
||||
const chunk = chunkBatch[i]!;
|
||||
const embedding = embeddings[i];
|
||||
if (embedding) {
|
||||
insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embedding.embedding), model, now, expectedChunksByHash.get(chunk.hash) ?? 1);
|
||||
insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embedding.embedding), model, now, expectedChunksByHash.get(chunk.hash) ?? 1, fingerprint);
|
||||
chunksEmbedded++;
|
||||
} else {
|
||||
errors++;
|
||||
@ -1652,7 +1706,7 @@ export async function generateEmbeddings(
|
||||
const text = formatDocForEmbedding(chunk.text, chunk.title, embedModelUri);
|
||||
const result = await session.embed(text, { model });
|
||||
if (result) {
|
||||
insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), model, now, expectedChunksByHash.get(chunk.hash) ?? 1);
|
||||
insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), model, now, expectedChunksByHash.get(chunk.hash) ?? 1, fingerprint);
|
||||
chunksEmbedded++;
|
||||
} else {
|
||||
errors++;
|
||||
@ -1778,7 +1832,7 @@ export function createStore(dbPath?: string): Store {
|
||||
// Vector/embedding operations
|
||||
getHashesForEmbedding: () => getHashesForEmbedding(db),
|
||||
clearAllEmbeddings: () => clearAllEmbeddings(db),
|
||||
insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string, totalChunks?: number) => insertEmbedding(db, hash, seq, pos, embedding, model, embeddedAt, totalChunks),
|
||||
insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string, totalChunks?: number, fingerprint?: string) => insertEmbedding(db, hash, seq, pos, embedding, model, embeddedAt, totalChunks, fingerprint),
|
||||
};
|
||||
|
||||
return store;
|
||||
@ -1979,22 +2033,24 @@ export type IndexStatus = {
|
||||
|
||||
/**
 * Count active documents that still need embedding for `model` under the
 * current embedding fingerprint.
 *
 * A document counts when it has no `content_vectors` rows for this model and
 * fingerprint, or fewer rows than the recorded `total_chunks`. The stale
 * pre-fingerprint query (which returned early and left this path unreachable)
 * has been removed.
 *
 * @param db         Database handle.
 * @param collection Optional collection name to restrict the count to.
 * @param model      Embedding model; defaults to `DEFAULT_EMBED_MODEL`.
 * @returns Number of distinct content hashes needing embedding.
 */
export function getHashesNeedingEmbedding(db: Database, collection?: string, model: string = DEFAULT_EMBED_MODEL): number {
  const collectionFilter = collection ? `AND d.collection = ?` : ``;
  const fingerprint = getEmbeddingFingerprint(model);
  // Legacy tables may lack embed_fingerprint/total_chunks; the wrapper adds
  // them on first failure and retries.
  return withLazyContentVectorMigration(db, () => {
    const stmt = db.prepare(`
      SELECT COUNT(DISTINCT d.hash) as count
      FROM documents d
      LEFT JOIN (
        SELECT hash, model, COUNT(*) AS chunk_count, MAX(total_chunks) AS expected_chunks
        FROM content_vectors
        WHERE model = ? AND embed_fingerprint = ?
        GROUP BY hash, model, embed_fingerprint
      ) v ON d.hash = v.hash
      WHERE d.active = 1
      AND (v.hash IS NULL OR v.chunk_count < v.expected_chunks)
      ${collectionFilter}
    `);
    const result = (collection ? stmt.get(model, fingerprint, collection) : stmt.get(model, fingerprint)) as { count: number };
    return result.count;
  });
}
|
||||
|
||||
export type IndexHealthInfo = {
|
||||
@ -2003,6 +2059,79 @@ export type IndexHealthInfo = {
|
||||
daysStale: number | null;
|
||||
};
|
||||
|
||||
/** Outcome of an attempt to stamp legacy embeddings with the current fingerprint. */
export type LegacyFingerprintAdoptionResult = {
  /** `true` once a legacy sample was actually examined; `false` when the attempt stopped before that. */
  checked: boolean;
  /** Number of `content_vectors` rows whose fingerprint was updated (0 when nothing was adopted). */
  adopted: number;
  /** Human-readable explanation of the outcome. */
  reason: string;
};
|
||||
|
||||
/**
 * Try to stamp legacy embeddings (rows whose `embed_fingerprint` is empty)
 * with the current fingerprint for `model`, without re-embedding them all.
 *
 * One legacy chunk belonging to an active document is re-chunked and
 * re-embedded with the current settings. If its nearest neighbour in
 * `vectors_vec` is that same chunk within a small distance threshold, every
 * legacy row for the model is updated to the current fingerprint. Otherwise
 * nothing is changed.
 *
 * @param store Store providing the database and LLM.
 * @param model Embedding model; defaults to `DEFAULT_EMBED_MODEL`.
 * @returns Whether a sample was examined, how many rows were updated, and why.
 */
export async function maybeAdoptLegacyEmbeddingFingerprint(store: Store, model: string = DEFAULT_EMBED_MODEL): Promise<LegacyFingerprintAdoptionResult> {
  const db = store.db;
  const fingerprint = getEmbeddingFingerprint(model);

  // Also adds embed_fingerprint to a legacy table that lacks it.
  const legacyCount = withLazyContentVectorMigration(db, () => {
    const row = db.prepare(`SELECT COUNT(DISTINCT hash) AS count FROM content_vectors WHERE model = ? AND embed_fingerprint = ''`).get(model) as { count: number };
    return row.count;
  });
  if (legacyCount === 0) {
    return { checked: false, adopted: 0, reason: "no legacy empty-fingerprint embeddings" };
  }

  // Only the columns actually used are selected. The earlier query also read
  // cv.pos and cv.total_chunks, which were never used and made this statement
  // fail with "no such column" on legacy tables that lack total_chunks.
  // (hash, seq) is the primary key, so the grouping is unchanged.
  const sample = db.prepare(`
    SELECT cv.hash, cv.seq, c.doc AS body, MIN(d.path) AS path
    FROM content_vectors cv
    JOIN documents d ON d.hash = cv.hash AND d.active = 1
    JOIN content c ON c.hash = cv.hash
    WHERE cv.model = ? AND cv.embed_fingerprint = ''
    GROUP BY cv.hash, cv.seq, c.doc
    ORDER BY cv.hash, cv.seq
    LIMIT 1
  `).get(model) as { hash: string; seq: number; body: string; path: string } | undefined;

  if (!sample) {
    return { checked: false, adopted: 0, reason: `${legacyCount} legacy docs have no active sample` };
  }

  const tableExists = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
  if (!tableExists) {
    return { checked: false, adopted: 0, reason: "vectors_vec table is missing" };
  }

  const expectedHashSeq = `${sample.hash}_${sample.seq}`;
  const title = extractTitle(sample.body, sample.path);
  const llm = getLlm(store);

  return await withLLMSessionForLlm(llm, async (session) => {
    // Re-chunk with current settings and pick the chunk at the stored index.
    const chunks = await chunkDocumentByTokens(sample.body, undefined, undefined, undefined, sample.path, undefined, session.signal);
    const chunk = chunks[sample.seq];
    if (!chunk) {
      return { checked: true, adopted: 0, reason: `sample chunk ${expectedHashSeq} no longer exists` };
    }

    const result = await session.embed(formatDocForEmbedding(chunk.text, title, model), { model });
    if (!result) {
      return { checked: true, adopted: 0, reason: "failed to embed legacy sample" };
    }

    const nearest = db.prepare(`
      SELECT hash_seq, distance
      FROM vectors_vec
      WHERE embedding MATCH ? AND k = 1
    `).get(new Float32Array(result.embedding)) as { hash_seq: string; distance: number } | undefined;

    if (!nearest) {
      return { checked: true, adopted: 0, reason: "legacy sample vector not found" };
    }

    // The stored vector must be the same chunk and near-identical to the
    // fresh embedding for the legacy rows to count as current.
    const threshold = 0.0001;
    if (nearest.hash_seq !== expectedHashSeq || nearest.distance > threshold) {
      return { checked: true, adopted: 0, reason: `legacy sample differs from current fingerprint (nearest ${nearest.hash_seq}, distance ${nearest.distance.toFixed(6)})` };
    }

    const update = db.prepare(`UPDATE content_vectors SET embed_fingerprint = ? WHERE model = ? AND embed_fingerprint = ''`).run(fingerprint, model);
    return { checked: true, adopted: update.changes, reason: `sample ${expectedHashSeq} matched current fingerprint at distance ${nearest.distance.toFixed(6)}` };
  });
}
|
||||
|
||||
export function getIndexHealth(db: Database, model: string = DEFAULT_EMBED_MODEL): IndexHealthInfo {
|
||||
const needsEmbedding = getHashesNeedingEmbedding(db, undefined, model);
|
||||
const totalDocs = (db.prepare(`SELECT COUNT(*) as count FROM documents WHERE active = 1`).get() as { count: number }).count;
|
||||
@ -3353,21 +3482,21 @@ async function getEmbedding(text: string, model: string, isQuery: boolean, sessi
|
||||
* Returns hash, document body, and a sample path for display purposes.
|
||||
*/
|
||||
/**
 * Fetch active documents that still need embedding for `model` under the
 * current embedding fingerprint.
 *
 * A document qualifies when it has no `content_vectors` rows for this model
 * and fingerprint, or fewer rows than the recorded `total_chunks`. The stale
 * pre-fingerprint lines interleaved into the query have been removed.
 *
 * @param db    Database handle.
 * @param model Embedding model; defaults to `DEFAULT_EMBED_MODEL`.
 * @returns One row per content hash with the document body and its smallest path.
 */
export function getHashesForEmbedding(db: Database, model: string = DEFAULT_EMBED_MODEL): { hash: string; body: string; path: string }[] {
  const fingerprint = getEmbeddingFingerprint(model);
  // Legacy tables may lack embed_fingerprint/total_chunks; the wrapper adds
  // them on first failure and retries.
  return withLazyContentVectorMigration(db, () => db.prepare(`
    SELECT d.hash, c.doc as body, MIN(d.path) as path
    FROM documents d
    JOIN content c ON d.hash = c.hash
    LEFT JOIN (
      SELECT hash, model, COUNT(*) AS chunk_count, MAX(total_chunks) AS expected_chunks
      FROM content_vectors
      WHERE model = ? AND embed_fingerprint = ?
      GROUP BY hash, model, embed_fingerprint
    ) v ON d.hash = v.hash
    WHERE d.active = 1
    AND (v.hash IS NULL OR v.chunk_count < v.expected_chunks)
    GROUP BY d.hash
  `).all(model, fingerprint) as { hash: string; body: string; path: string }[]);
}
|
||||
|
||||
/**
|
||||
@ -3453,19 +3582,22 @@ export function insertEmbedding(
|
||||
embedding: Float32Array,
|
||||
model: string,
|
||||
embeddedAt: string,
|
||||
totalChunks: number = 1
|
||||
totalChunks: number = 1,
|
||||
fingerprint: string = getEmbeddingFingerprint(model)
|
||||
): void {
|
||||
const hashSeq = `${hash}_${seq}`;
|
||||
|
||||
// Insert content_vectors first — crash-safe ordering (see getHashesForEmbedding)
|
||||
const insertContentVectorStmt = db.prepare(`INSERT OR REPLACE INTO content_vectors (hash, seq, pos, model, total_chunks, embedded_at) VALUES (?, ?, ?, ?, ?, ?)`);
|
||||
insertContentVectorStmt.run(hash, seq, pos, model, totalChunks, embeddedAt);
|
||||
withLazyContentVectorMigration(db, () => {
|
||||
// Insert content_vectors first — crash-safe ordering (see getHashesForEmbedding)
|
||||
const insertContentVectorStmt = db.prepare(`INSERT OR REPLACE INTO content_vectors (hash, seq, pos, model, embed_fingerprint, total_chunks, embedded_at) VALUES (?, ?, ?, ?, ?, ?, ?)`);
|
||||
insertContentVectorStmt.run(hash, seq, pos, model, fingerprint, totalChunks, embeddedAt);
|
||||
|
||||
// vec0 virtual tables don't support OR REPLACE — use DELETE + INSERT
|
||||
const deleteVecStmt = db.prepare(`DELETE FROM vectors_vec WHERE hash_seq = ?`);
|
||||
const insertVecStmt = db.prepare(`INSERT INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`);
|
||||
deleteVecStmt.run(hashSeq);
|
||||
insertVecStmt.run(hashSeq, embedding);
|
||||
// vec0 virtual tables don't support OR REPLACE — use DELETE + INSERT
|
||||
const deleteVecStmt = db.prepare(`DELETE FROM vectors_vec WHERE hash_seq = ?`);
|
||||
const insertVecStmt = db.prepare(`INSERT INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`);
|
||||
deleteVecStmt.run(hashSeq);
|
||||
insertVecStmt.run(hashSeq, embedding);
|
||||
});
|
||||
}
|
||||
|
||||
function removeIncompleteEmbeddings(db: Database, expectedChunksByHash: Map<string, number>, model: string): number {
|
||||
|
||||
@ -14,6 +14,7 @@ import { fileURLToPath } from "url";
|
||||
import { spawn } from "child_process";
|
||||
import { setTimeout as sleep } from "timers/promises";
|
||||
import { buildEditorUri, termLink, resolveEmbedModelForCli } from "../src/cli/qmd.ts";
|
||||
import { openDatabase } from "../src/db.ts";
|
||||
import { DEFAULT_EMBED_MODEL_URI } from "../src/llm.ts";
|
||||
|
||||
// Test fixtures directory and database path
|
||||
@ -465,6 +466,32 @@ describe("CLI Status Command", () => {
|
||||
await runQmd(["collection", "add", "."]);
|
||||
});
|
||||
|
||||
test("qmd doctor reports core index health checks", async () => {
|
||||
const { stdout, exitCode } = await runQmd(["doctor"]);
|
||||
expect(exitCode).toBe(0);
|
||||
expect(stdout).toContain("QMD Doctor");
|
||||
expect(stdout).toContain("SQLite runtime");
|
||||
expect(stdout).toContain("sqlite-vec");
|
||||
expect(stdout).toContain("embedding freshness");
|
||||
expect(stdout).toContain("embedding fingerprints");
|
||||
expect(stdout).toContain("content hash sample");
|
||||
});
|
||||
|
||||
test("qmd doctor flags mixed embedding fingerprints", async () => {
|
||||
const db = openDatabase(testDbPath);
|
||||
const doc = db.prepare(`SELECT hash FROM documents WHERE active = 1 LIMIT 1`).get() as { hash: string };
|
||||
db.prepare(`
|
||||
INSERT OR REPLACE INTO content_vectors (hash, seq, pos, model, embed_fingerprint, total_chunks, embedded_at)
|
||||
VALUES (?, 0, 0, ?, 'stale1', 1, ?)
|
||||
`).run(doc.hash, resolveEmbedModelForCli(), new Date().toISOString());
|
||||
db.close();
|
||||
|
||||
const { stdout, exitCode } = await runQmd(["doctor"]);
|
||||
expect(exitCode).toBe(0);
|
||||
expect(stdout).toContain("embedding fingerprints");
|
||||
expect(stdout).toContain("stale1");
|
||||
});
|
||||
|
||||
test("shows index status", async () => {
|
||||
const { stdout, exitCode } = await runQmd(["status"]);
|
||||
expect(exitCode).toBe(0);
|
||||
|
||||
@ -80,6 +80,7 @@ function initTestDatabase(db: Database): void {
|
||||
seq INTEGER NOT NULL DEFAULT 0,
|
||||
pos INTEGER NOT NULL DEFAULT 0,
|
||||
model TEXT NOT NULL,
|
||||
embed_fingerprint TEXT NOT NULL DEFAULT '',
|
||||
embedded_at TEXT NOT NULL,
|
||||
PRIMARY KEY (hash, seq)
|
||||
)
|
||||
@ -186,7 +187,7 @@ function seedTestData(db: Database): void {
|
||||
for (let i = 0; i < 768; i++) embedding[i] = Math.random();
|
||||
|
||||
for (const doc of docs.slice(0, 4)) { // Skip large file for embeddings
|
||||
db.prepare(`INSERT INTO content_vectors (hash, seq, pos, model, embedded_at) VALUES (?, 0, 0, ?, ?)`).run(doc.hash, DEFAULT_EMBED_MODEL, now);
|
||||
db.prepare(`INSERT INTO content_vectors (hash, seq, pos, model, embed_fingerprint, embedded_at) VALUES (?, 0, 0, ?, ?, ?)`).run(doc.hash, DEFAULT_EMBED_MODEL, getEmbeddingFingerprint(DEFAULT_EMBED_MODEL), now);
|
||||
db.prepare(`INSERT INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`).run(`${doc.hash}_0`, embedding);
|
||||
}
|
||||
}
|
||||
@ -211,6 +212,7 @@ import {
|
||||
findDocuments,
|
||||
getStatus,
|
||||
DEFAULT_EMBED_MODEL,
|
||||
getEmbeddingFingerprint,
|
||||
DEFAULT_QUERY_MODEL,
|
||||
DEFAULT_RERANK_MODEL,
|
||||
DEFAULT_MULTI_GET_MAX_BYTES,
|
||||
|
||||
@ -26,6 +26,7 @@ import {
|
||||
extractTitle,
|
||||
formatQueryForEmbedding,
|
||||
formatDocForEmbedding,
|
||||
getEmbeddingFingerprint,
|
||||
chunkDocument,
|
||||
chunkDocumentByTokens,
|
||||
chunkDocumentAsync,
|
||||
@ -311,19 +312,74 @@ describe("Store Creation", () => {
|
||||
|
||||
// Check tables exist
|
||||
const tables = store.db.prepare(`
|
||||
SELECT name FROM sqlite_master WHERE type='table' ORDER BY name
|
||||
SELECT name FROM sqlite_master
|
||||
WHERE type='table'
|
||||
ORDER BY name
|
||||
`).all() as { name: string }[];
|
||||
|
||||
const tableNames = tables.map(t => t.name);
|
||||
expect(tableNames).toContain("documents");
|
||||
expect(tableNames).toContain("documents_fts");
|
||||
expect(tableNames).toContain("content_vectors");
|
||||
expect(tableNames).toContain("content");
|
||||
expect(tableNames).toContain("llm_cache");
|
||||
// Note: path_contexts table removed in favor of YAML-based context storage
|
||||
|
||||
await cleanupTestDb(store);
|
||||
});
|
||||
|
||||
test("createStore defers content_vectors embed_fingerprint migration until embedding health needs it", async () => {
|
||||
const dbPath = join(testDir, `legacy-${Date.now()}-${Math.random().toString(36).slice(2)}.sqlite`);
|
||||
const model = "hf:test/embed-model.gguf";
|
||||
const legacyDb = openDatabase(dbPath);
|
||||
legacyDb.exec(`
|
||||
CREATE TABLE content (
|
||||
hash TEXT PRIMARY KEY,
|
||||
doc TEXT NOT NULL,
|
||||
created_at TEXT NOT NULL
|
||||
);
|
||||
CREATE TABLE documents (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
collection TEXT NOT NULL,
|
||||
path TEXT NOT NULL,
|
||||
title TEXT,
|
||||
hash TEXT NOT NULL,
|
||||
created_at TEXT NOT NULL,
|
||||
modified_at TEXT NOT NULL,
|
||||
active INTEGER NOT NULL DEFAULT 1,
|
||||
FOREIGN KEY (hash) REFERENCES content(hash) ON DELETE CASCADE,
|
||||
UNIQUE(collection, path)
|
||||
);
|
||||
CREATE TABLE content_vectors (
|
||||
hash TEXT NOT NULL,
|
||||
seq INTEGER NOT NULL DEFAULT 0,
|
||||
pos INTEGER NOT NULL DEFAULT 0,
|
||||
model TEXT NOT NULL,
|
||||
total_chunks INTEGER NOT NULL DEFAULT 1,
|
||||
embedded_at TEXT NOT NULL,
|
||||
PRIMARY KEY (hash, seq)
|
||||
)
|
||||
`);
|
||||
const now = new Date().toISOString();
|
||||
legacyDb.prepare(`INSERT INTO content (hash, doc, created_at) VALUES (?, ?, ?)`).run("hash1", "# Legacy\nbody", now);
|
||||
legacyDb.prepare(`INSERT INTO documents (collection, path, title, hash, created_at, modified_at, active) VALUES (?, ?, ?, ?, ?, ?, 1)`).run("test", "legacy.md", "Legacy", "hash1", now, now);
|
||||
legacyDb.prepare(`INSERT INTO content_vectors (hash, seq, pos, model, total_chunks, embedded_at) VALUES (?, ?, ?, ?, ?, ?)`).run("hash1", 0, 0, model, 1, now);
|
||||
legacyDb.close();
|
||||
|
||||
const store = createStore(dbPath);
|
||||
let columns = store.db.prepare(`PRAGMA table_info(content_vectors)`).all() as { name: string }[];
|
||||
expect(columns.map(col => col.name)).not.toContain("embed_fingerprint");
|
||||
|
||||
expect(store.getHashesNeedingEmbedding(model)).toBe(1);
|
||||
|
||||
columns = store.db.prepare(`PRAGMA table_info(content_vectors)`).all() as { name: string }[];
|
||||
const migratedRow = store.db.prepare(`SELECT embed_fingerprint FROM content_vectors WHERE hash = ?`).get("hash1") as { embed_fingerprint: string };
|
||||
expect(columns.map(col => col.name)).toContain("embed_fingerprint");
|
||||
expect(migratedRow.embed_fingerprint).toBe("");
|
||||
|
||||
await cleanupTestDb(store);
|
||||
});
|
||||
|
||||
test("createStore sets WAL journal mode", async () => {
|
||||
const store = await createTestStore();
|
||||
const result = store.db.prepare("PRAGMA journal_mode").get() as { journal_mode: string };
|
||||
@ -2301,6 +2357,23 @@ describe("Index Status", () => {
|
||||
await cleanupTestDb(store);
|
||||
});
|
||||
|
||||
test("embedding health treats stale fingerprints as needing re-embedding", async () => {
|
||||
const store = await createTestStore();
|
||||
const collectionName = await createTestCollection();
|
||||
const model = "hf:test/embed-model.gguf";
|
||||
const now = new Date().toISOString();
|
||||
|
||||
store.llm = { embedModelName: model } as any;
|
||||
store.ensureVecTable(3);
|
||||
await insertTestDocument(store.db, collectionName, { name: "doc1", hash: "hash1" });
|
||||
store.insertEmbedding("hash1", 0, 0, new Float32Array([1, 2, 3]), model, now, 1, "stale1");
|
||||
|
||||
expect(getEmbeddingFingerprint(model)).toMatch(/^[a-f0-9]{6}$/);
|
||||
expect(store.getHashesNeedingEmbedding()).toBe(1);
|
||||
|
||||
await cleanupTestDb(store);
|
||||
});
|
||||
|
||||
test("getIndexHealth returns health info", async () => {
|
||||
const store = await createTestStore();
|
||||
const collectionName = await createTestCollection();
|
||||
|
||||
Loading…
Reference in New Issue
Block a user