fix(embed): honor collection filter

This commit is contained in:
Tobi Lütke 2026-05-09 18:12:37 +00:00
parent d045a8bab6
commit 5b9f472849
No known key found for this signature in database
5 changed files with 185 additions and 18 deletions

View File

@ -4,6 +4,11 @@
### Fixes
- Embedding: `qmd embed -c <collection>` now scopes pending-doc selection
to the requested collection instead of embedding global pending work.
Scoped `--force` clears only collection-owned vectors, preserves shared
hashes referenced by sibling collections, and drops `vectors_vec` only
when the scoped clear empties all vectors.
- GPU: respect explicit `QMD_LLAMA_GPU=metal|vulkan|cuda` backend overrides instead of always using auto GPU selection. #529
- Fix: preserve original filename case in `handelize()`. The previous
`.toLowerCase()` call made indexed paths unreachable on case-sensitive

View File

@ -1684,7 +1684,7 @@ function parseChunkStrategy(value: unknown): ChunkStrategy | undefined {
async function vectorIndex(
model: string = DEFAULT_EMBED_MODEL_URI,
force: boolean = false,
batchOptions?: { maxDocsPerBatch?: number; maxBatchBytes?: number; chunkStrategy?: ChunkStrategy },
batchOptions?: { maxDocsPerBatch?: number; maxBatchBytes?: number; chunkStrategy?: ChunkStrategy; collection?: string },
): Promise<void> {
const storeInstance = getStore();
const db = storeInstance.db;
@ -1694,7 +1694,7 @@ async function vectorIndex(
}
// Check if there's work to do before starting
const hashesToEmbed = getHashesNeedingEmbedding(db);
const hashesToEmbed = getHashesNeedingEmbedding(db, batchOptions?.collection);
if (hashesToEmbed === 0 && !force) {
console.log(`${c.green}✓ All content hashes already have embeddings.${c.reset}`);
closeDb();
@ -1715,6 +1715,7 @@ async function vectorIndex(
const result = await generateEmbeddings(storeInstance, {
force,
model,
collection: batchOptions?.collection,
maxDocsPerBatch: batchOptions?.maxDocsPerBatch,
maxBatchBytes: batchOptions?.maxBatchBytes,
chunkStrategy: batchOptions?.chunkStrategy,
@ -2727,7 +2728,7 @@ function showHelp(): void {
console.log("Maintenance:");
console.log(" qmd status - View index + collection health");
console.log(" qmd update [--pull] - Re-index collections (optionally git pull first)");
console.log(" qmd embed [-f] - Generate/refresh vector embeddings");
console.log(" qmd embed [-f] [-c <name>] - Generate/refresh vector embeddings");
console.log(" --max-docs-per-batch <n> - Cap docs loaded into memory per embedding batch");
console.log(" --max-batch-mb <n> - Cap UTF-8 MB loaded into memory per embedding batch");
console.log(" qmd cleanup - Clear caches, vacuum DB");
@ -3120,10 +3121,17 @@ if (isMain) {
const maxDocsPerBatch = parseEmbedBatchOption("maxDocsPerBatch", cli.values["max-docs-per-batch"]);
const maxBatchMb = parseEmbedBatchOption("maxBatchBytes", cli.values["max-batch-mb"]);
const embedChunkStrategy = parseChunkStrategy(cli.values["chunk-strategy"]);
// Validate -c against configured collections before dispatching, so a
// typo errors with "Collection not found: X" instead of silently
// reporting success because no pending docs match a nonexistent name.
// embed operates on a single collection; only the first value is used.
const embedValidatedCollections = resolveCollectionFilter(cli.opts.collection, false);
const embedCollection = embedValidatedCollections[0];
await vectorIndex(DEFAULT_EMBED_MODEL_URI, !!cli.values.force, {
maxDocsPerBatch,
maxBatchBytes: maxBatchMb === undefined ? undefined : maxBatchMb * 1024 * 1024,
chunkStrategy: embedChunkStrategy,
collection: embedCollection,
});
} catch (error) {
console.error(error instanceof Error ? error.message : String(error));

View File

@ -290,6 +290,8 @@ export interface QMDStore {
embed(options?: {
force?: boolean;
model?: string;
/** Restrict embedding to documents in one collection. */
collection?: string;
maxDocsPerBatch?: number;
maxBatchBytes?: number;
chunkStrategy?: ChunkStrategy;
@ -516,6 +518,7 @@ export async function createStore(options: StoreOptions): Promise<QMDStore> {
return generateEmbeddings(internal, {
force: embedOpts?.force,
model: embedOpts?.model,
collection: embedOpts?.collection,
maxDocsPerBatch: embedOpts?.maxDocsPerBatch,
maxBatchBytes: embedOpts?.maxBatchBytes,
chunkStrategy: embedOpts?.chunkStrategy,

View File

@ -1374,6 +1374,11 @@ export type EmbedResult = {
export type EmbedOptions = {
force?: boolean;
model?: string;
/**
* Restrict embedding to documents in a single collection.
* When omitted, all pending documents across every collection are embedded.
*/
collection?: string;
maxDocsPerBatch?: number;
maxBatchBytes?: number;
chunkStrategy?: ChunkStrategy;
@ -1415,16 +1420,18 @@ function resolveEmbedOptions(options?: EmbedOptions): Required<Pick<EmbedOptions
};
}
/**
 * List the content hashes that still need embeddings.
 *
 * Selects active documents whose hash has no content_vectors row with seq = 0,
 * grouped by hash so each distinct body appears once. Each row carries the
 * hash, the smallest path among the matching documents, and the byte length
 * of the stored body; rows are ordered by that path.
 *
 * @param db - Database handle the query is prepared on.
 * @param collection - Optional collection name. When truthy, only documents
 *   in that collection are considered; the name is bound as a statement
 *   parameter rather than interpolated into the SQL. An empty string is
 *   treated the same as omitting the filter.
 * @returns The pending rows cast to PendingEmbeddingDoc[].
 */
function getPendingEmbeddingDocs(db: Database): PendingEmbeddingDoc[] {
return db.prepare(`
function getPendingEmbeddingDocs(db: Database, collection?: string): PendingEmbeddingDoc[] {
const collectionFilter = collection ? `AND d.collection = ?` : ``;
const stmt = db.prepare(`
SELECT d.hash, MIN(d.path) as path, length(CAST(c.doc AS BLOB)) as bytes
FROM documents d
JOIN content c ON d.hash = c.hash
LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
WHERE d.active = 1 AND v.hash IS NULL
WHERE d.active = 1 AND v.hash IS NULL ${collectionFilter}
GROUP BY d.hash
ORDER BY MIN(d.path)
`).all() as PendingEmbeddingDoc[];
`);
return (collection ? stmt.all(collection) : stmt.all()) as PendingEmbeddingDoc[];
}
function buildEmbeddingBatches(
@ -1491,10 +1498,10 @@ export async function generateEmbeddings(
const encoder = new TextEncoder();
if (options?.force) {
clearAllEmbeddings(db);
clearAllEmbeddings(db, options?.collection);
}
const docsToEmbed = getPendingEmbeddingDocs(db);
const docsToEmbed = getPendingEmbeddingDocs(db, options?.collection);
if (docsToEmbed.length === 0) {
return { docsProcessed: 0, chunksEmbedded: 0, errors: 0, durationMs: 0 };
@ -1942,13 +1949,15 @@ export type IndexStatus = {
// Index health
// =============================================================================
/**
 * Count the distinct content hashes that still need embeddings.
 *
 * A hash is counted when at least one active document references it and no
 * content_vectors row with seq = 0 exists for it.
 *
 * @param db - Database handle the query is prepared on.
 * @param collection - Optional collection name. When truthy, only documents
 *   in that collection are counted; the name is bound as a statement
 *   parameter. An empty string is treated the same as omitting the filter.
 * @returns The number of distinct pending hashes.
 */
export function getHashesNeedingEmbedding(db: Database): number {
const result = db.prepare(`
export function getHashesNeedingEmbedding(db: Database, collection?: string): number {
const collectionFilter = collection ? `AND d.collection = ?` : ``;
const stmt = db.prepare(`
SELECT COUNT(DISTINCT d.hash) as count
FROM documents d
LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
WHERE d.active = 1 AND v.hash IS NULL
`).get() as { count: number };
WHERE d.active = 1 AND v.hash IS NULL ${collectionFilter}
`);
const result = (collection ? stmt.get(collection) : stmt.get()) as { count: number };
return result.count;
}
@ -3315,12 +3324,68 @@ export function getHashesForEmbedding(db: Database): { hash: string; body: strin
}
/**
* Clear all embeddings from the database (force re-index).
* Deletes all rows from content_vectors and drops the vectors_vec table.
* Clear embeddings for the whole index, or just for one collection.
*
* When `collection` is omitted the entire content_vectors table is emptied and
* the vectors_vec virtual table is dropped (it is recreated with the right
* dimensions on the next embed run).
*
* When `collection` is provided, only vectors whose hash is referenced
* exclusively by active documents in that collection are removed. Hashes
* shared with active documents in other collections are left in place so
* vector search keeps working there (content_vectors is keyed globally by
* content hash; identical document bodies across collections share a row).
* vectors_vec is preserved so other collections keep working unless the scoped
* clear empties content_vectors entirely, in which case it is dropped so the
* next embed can recreate the table with the current dimensions.
*/
export function clearAllEmbeddings(db: Database): void {
db.exec(`DELETE FROM content_vectors`);
db.exec(`DROP TABLE IF EXISTS vectors_vec`);
export function clearAllEmbeddings(db: Database, collection?: string): void {
  // Unscoped clear: remove every stored vector and drop the vector table.
  if (!collection) {
    db.exec(`DELETE FROM content_vectors`);
    db.exec(`DROP TABLE IF EXISTS vectors_vec`);
    return;
  }

  // Hashes referenced by an active document in this collection and by no
  // active document in any other collection. Takes one bound parameter: the
  // collection name.
  const exclusiveHashesQuery = `
    SELECT DISTINCT d.hash
    FROM documents d
    WHERE d.collection = ? AND d.active = 1
      AND NOT EXISTS (
        SELECT 1 FROM documents d2
        WHERE d2.hash = d.hash
          AND d2.active = 1
          AND d2.collection != d.collection
      )
  `;

  // Run the scoped clear as one atomic unit. Without this, every statement
  // below autocommits on its own: a failure part-way would leave
  // content_vectors rows whose vectors_vec entries are already gone, and each
  // per-row delete would pay for its own commit. A SAVEPOINT is used instead
  // of BEGIN so this also works when the caller already holds a transaction.
  const savepoint = `clear_collection_embeddings`;
  db.exec(`SAVEPOINT ${savepoint}`);
  try {
    const vecTableExists = db
      .prepare(`SELECT 1 FROM sqlite_master WHERE type='table' AND name='vectors_vec'`)
      .get();

    if (vecTableExists) {
      // vectors_vec rows are addressed by "<hash>_<seq>", so resolve the
      // (hash, seq) pairs before the content_vectors rows are deleted.
      const hashSeqRows = db.prepare(`
        SELECT cv.hash, cv.seq
        FROM content_vectors cv
        WHERE cv.hash IN (${exclusiveHashesQuery})
      `).all(collection) as { hash: string; seq: number }[];

      const delVec = db.prepare(`DELETE FROM vectors_vec WHERE hash_seq = ?`);
      for (const row of hashSeqRows) {
        delVec.run(`${row.hash}_${row.seq}`);
      }
    }

    db.prepare(`
      DELETE FROM content_vectors
      WHERE hash IN (${exclusiveHashesQuery})
    `).run(collection);

    // If the scoped clear emptied the index entirely, drop the vector table
    // as the unscoped path does.
    const remaining = db
      .prepare(`SELECT COUNT(*) AS n FROM content_vectors`)
      .get() as { n: number };
    if (remaining.n === 0) {
      db.exec(`DROP TABLE IF EXISTS vectors_vec`);
    }

    db.exec(`RELEASE ${savepoint}`);
  } catch (error) {
    try {
      db.exec(`ROLLBACK TO ${savepoint}`);
      db.exec(`RELEASE ${savepoint}`);
    } catch {
      // A failed rollback is secondary; the original error is the one to surface.
    }
    throw error;
  }
}
/**

View File

@ -982,6 +982,92 @@ describe("embed", () => {
}
});
// Embedding with a collection filter must leave sibling collections untouched:
// only the three "docs" bodies get vectors, "notes" stays at zero.
test("store.embed scopes pending documents to the requested collection", async () => {
  type CollectionVectorCount = { collection: string; count: number };

  const scopedStore = await createStore({
    dbPath: freshDbPath(),
    config: {
      collections: {
        docs: { path: docsDir, pattern: "**/*.md" },
        notes: { path: notesDir, pattern: "**/*.md" },
      },
    },
  });

  // Swap in the fake embedder and tokenizer so no real model is loaded.
  const embedLlm = createFakeEmbedLlm();
  setDefaultLlamaCpp(createFakeTokenizer() as any);
  scopedStore.internal.llm = embedLlm as any;

  try {
    await scopedStore.update();
    const { docsProcessed, chunksEmbedded } = await scopedStore.embed({ collection: "docs" });

    // Count, per collection, how many distinct hashes have a first-chunk vector.
    const countsStatement = scopedStore.internal.db.prepare(`
      SELECT d.collection, COUNT(DISTINCT v.hash) AS count
      FROM documents d
      LEFT JOIN content_vectors v ON v.hash = d.hash AND v.seq = 0
      WHERE d.active = 1
      GROUP BY d.collection
      ORDER BY d.collection
    `);
    const perCollection = countsStatement.all() as CollectionVectorCount[];

    expect(docsProcessed).toBe(3);
    expect(chunksEmbedded).toBe(3);
    expect(perCollection).toEqual([
      { collection: "docs", count: 3 },
      { collection: "notes", count: 0 },
    ]);
  } finally {
    setDefaultLlamaCpp(null);
    await scopedStore.close();
  }
});
// A scoped --force must re-embed the requested collection without wiping the
// vectors that belong to its sibling.
test("store.embed with force only clears the requested collection", async () => {
  type CollectionVectorCount = { collection: string; count: number };

  const forcedStore = await createStore({
    dbPath: freshDbPath(),
    config: {
      collections: {
        docs: { path: docsDir, pattern: "**/*.md" },
        notes: { path: notesDir, pattern: "**/*.md" },
      },
    },
  });

  // Swap in the fake embedder and tokenizer so no real model is loaded.
  const embedLlm = createFakeEmbedLlm();
  setDefaultLlamaCpp(createFakeTokenizer() as any);
  forcedStore.internal.llm = embedLlm as any;

  // Per collection: number of distinct hashes that have a first-chunk vector.
  function countVectorsByCollection(): CollectionVectorCount[] {
    const statement = forcedStore.internal.db.prepare(`
      SELECT d.collection, COUNT(DISTINCT v.hash) AS count
      FROM documents d
      LEFT JOIN content_vectors v ON v.hash = d.hash AND v.seq = 0
      WHERE d.active = 1
      GROUP BY d.collection
      ORDER BY d.collection
    `);
    return statement.all() as CollectionVectorCount[];
  }

  try {
    await forcedStore.update();

    // Baseline: everything embedded across both collections.
    await forcedStore.embed();
    expect(countVectorsByCollection()).toEqual([
      { collection: "docs", count: 3 },
      { collection: "notes", count: 3 },
    ]);

    // Forced, scoped run: docs are rebuilt, notes keep their vectors.
    const { docsProcessed, chunksEmbedded } = await forcedStore.embed({
      force: true,
      collection: "docs",
    });
    expect(docsProcessed).toBe(3);
    expect(chunksEmbedded).toBe(3);
    expect(countVectorsByCollection()).toEqual([
      { collection: "docs", count: 3 },
      { collection: "notes", count: 3 },
    ]);
  } finally {
    setDefaultLlamaCpp(null);
    await forcedStore.close();
  }
});
test("store.embed rejects invalid batch limits", async () => {
const store = await createStore({
dbPath: freshDbPath(),