fix(embed): honor collection filter
This commit is contained in:
parent
d045a8bab6
commit
5b9f472849
@ -4,6 +4,11 @@
|
||||
|
||||
### Fixes
|
||||
|
||||
- Embedding: `qmd embed -c <collection>` now scopes pending-doc selection
  to the requested collection instead of embedding global pending work.
  Scoped `--force` clears only collection-owned vectors, preserves shared
  hashes referenced by sibling collections, and drops `vectors_vec` only
  when the scoped clear empties all vectors.
|
||||
- GPU: respect explicit `QMD_LLAMA_GPU=metal|vulkan|cuda` backend overrides instead of always using auto GPU selection. #529
|
||||
- Fix: preserve original filename case in `handelize()`. The previous
|
||||
`.toLowerCase()` call made indexed paths unreachable on case-sensitive
|
||||
|
||||
@ -1684,7 +1684,7 @@ function parseChunkStrategy(value: unknown): ChunkStrategy | undefined {
|
||||
async function vectorIndex(
|
||||
model: string = DEFAULT_EMBED_MODEL_URI,
|
||||
force: boolean = false,
|
||||
batchOptions?: { maxDocsPerBatch?: number; maxBatchBytes?: number; chunkStrategy?: ChunkStrategy },
|
||||
batchOptions?: { maxDocsPerBatch?: number; maxBatchBytes?: number; chunkStrategy?: ChunkStrategy; collection?: string },
|
||||
): Promise<void> {
|
||||
const storeInstance = getStore();
|
||||
const db = storeInstance.db;
|
||||
@ -1694,7 +1694,7 @@ async function vectorIndex(
|
||||
}
|
||||
|
||||
// Check if there's work to do before starting
|
||||
const hashesToEmbed = getHashesNeedingEmbedding(db);
|
||||
const hashesToEmbed = getHashesNeedingEmbedding(db, batchOptions?.collection);
|
||||
if (hashesToEmbed === 0 && !force) {
|
||||
console.log(`${c.green}✓ All content hashes already have embeddings.${c.reset}`);
|
||||
closeDb();
|
||||
@ -1715,6 +1715,7 @@ async function vectorIndex(
|
||||
const result = await generateEmbeddings(storeInstance, {
|
||||
force,
|
||||
model,
|
||||
collection: batchOptions?.collection,
|
||||
maxDocsPerBatch: batchOptions?.maxDocsPerBatch,
|
||||
maxBatchBytes: batchOptions?.maxBatchBytes,
|
||||
chunkStrategy: batchOptions?.chunkStrategy,
|
||||
@ -2727,7 +2728,7 @@ function showHelp(): void {
|
||||
console.log("Maintenance:");
|
||||
console.log(" qmd status - View index + collection health");
|
||||
console.log(" qmd update [--pull] - Re-index collections (optionally git pull first)");
|
||||
console.log(" qmd embed [-f] - Generate/refresh vector embeddings");
|
||||
console.log(" qmd embed [-f] [-c <name>] - Generate/refresh vector embeddings");
|
||||
console.log(" --max-docs-per-batch <n> - Cap docs loaded into memory per embedding batch");
|
||||
console.log(" --max-batch-mb <n> - Cap UTF-8 MB loaded into memory per embedding batch");
|
||||
console.log(" qmd cleanup - Clear caches, vacuum DB");
|
||||
@ -3120,10 +3121,17 @@ if (isMain) {
|
||||
const maxDocsPerBatch = parseEmbedBatchOption("maxDocsPerBatch", cli.values["max-docs-per-batch"]);
|
||||
const maxBatchMb = parseEmbedBatchOption("maxBatchBytes", cli.values["max-batch-mb"]);
|
||||
const embedChunkStrategy = parseChunkStrategy(cli.values["chunk-strategy"]);
|
||||
// Validate -c against configured collections before dispatching, so a
|
||||
// typo errors with "Collection not found: X" instead of silently
|
||||
// reporting success because no pending docs match a nonexistent name.
|
||||
// embed operates on a single collection; only the first value is used.
|
||||
const embedValidatedCollections = resolveCollectionFilter(cli.opts.collection, false);
|
||||
const embedCollection = embedValidatedCollections[0];
|
||||
await vectorIndex(DEFAULT_EMBED_MODEL_URI, !!cli.values.force, {
|
||||
maxDocsPerBatch,
|
||||
maxBatchBytes: maxBatchMb === undefined ? undefined : maxBatchMb * 1024 * 1024,
|
||||
chunkStrategy: embedChunkStrategy,
|
||||
collection: embedCollection,
|
||||
});
|
||||
} catch (error) {
|
||||
console.error(error instanceof Error ? error.message : String(error));
|
||||
|
||||
@ -290,6 +290,8 @@ export interface QMDStore {
|
||||
embed(options?: {
|
||||
force?: boolean;
|
||||
model?: string;
|
||||
/** Restrict embedding to documents in one collection. */
|
||||
collection?: string;
|
||||
maxDocsPerBatch?: number;
|
||||
maxBatchBytes?: number;
|
||||
chunkStrategy?: ChunkStrategy;
|
||||
@ -516,6 +518,7 @@ export async function createStore(options: StoreOptions): Promise<QMDStore> {
|
||||
return generateEmbeddings(internal, {
|
||||
force: embedOpts?.force,
|
||||
model: embedOpts?.model,
|
||||
collection: embedOpts?.collection,
|
||||
maxDocsPerBatch: embedOpts?.maxDocsPerBatch,
|
||||
maxBatchBytes: embedOpts?.maxBatchBytes,
|
||||
chunkStrategy: embedOpts?.chunkStrategy,
|
||||
|
||||
95
src/store.ts
95
src/store.ts
@ -1374,6 +1374,11 @@ export type EmbedResult = {
|
||||
export type EmbedOptions = {
|
||||
force?: boolean;
|
||||
model?: string;
|
||||
/**
|
||||
* Restrict embedding to documents in a single collection.
|
||||
* When omitted, all pending documents across every collection are embedded.
|
||||
*/
|
||||
collection?: string;
|
||||
maxDocsPerBatch?: number;
|
||||
maxBatchBytes?: number;
|
||||
chunkStrategy?: ChunkStrategy;
|
||||
@ -1415,16 +1420,18 @@ function resolveEmbedOptions(options?: EmbedOptions): Required<Pick<EmbedOptions
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * List active documents whose content hash has no embedding yet.
 *
 * A hash is treated as pending when content_vectors has no row for it with
 * seq = 0. Rows are grouped by hash, so a body stored under several paths is
 * returned once, labelled with the smallest matching path; results are
 * ordered by that path.
 *
 * @param db - Index database handle.
 * @param collection - When non-empty, only documents in this collection are
 *   considered. When omitted (or empty), documents from every collection are
 *   considered.
 * @returns One entry per pending hash: the hash, its representative path and
 *   the byte length of the stored document body.
 */
function getPendingEmbeddingDocs(db: Database, collection?: string): PendingEmbeddingDoc[] {
  // The filter is a fixed SQL fragment; the collection name itself is always
  // passed as a bound parameter, never spliced into the statement text.
  const collectionFilter = collection ? `AND d.collection = ?` : ``;
  const stmt = db.prepare(`
    SELECT d.hash, MIN(d.path) as path, length(CAST(c.doc AS BLOB)) as bytes
    FROM documents d
    JOIN content c ON d.hash = c.hash
    LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
    WHERE d.active = 1 AND v.hash IS NULL ${collectionFilter}
    GROUP BY d.hash
    ORDER BY MIN(d.path)
  `);
  // Bind a parameter only when the statement actually contains a placeholder.
  return (collection ? stmt.all(collection) : stmt.all()) as PendingEmbeddingDoc[];
}
|
||||
|
||||
function buildEmbeddingBatches(
|
||||
@ -1491,10 +1498,10 @@ export async function generateEmbeddings(
|
||||
const encoder = new TextEncoder();
|
||||
|
||||
if (options?.force) {
|
||||
clearAllEmbeddings(db);
|
||||
clearAllEmbeddings(db, options?.collection);
|
||||
}
|
||||
|
||||
const docsToEmbed = getPendingEmbeddingDocs(db);
|
||||
const docsToEmbed = getPendingEmbeddingDocs(db, options?.collection);
|
||||
|
||||
if (docsToEmbed.length === 0) {
|
||||
return { docsProcessed: 0, chunksEmbedded: 0, errors: 0, durationMs: 0 };
|
||||
@ -1942,13 +1949,15 @@ export type IndexStatus = {
|
||||
// Index health
|
||||
// =============================================================================
|
||||
|
||||
/**
 * Count distinct content hashes of active documents that have no embedding.
 *
 * A hash is treated as missing an embedding when content_vectors has no row
 * for it with seq = 0.
 *
 * @param db - Index database handle.
 * @param collection - When non-empty, only documents in this collection are
 *   counted. When omitted (or empty), every collection is counted.
 * @returns Number of distinct pending hashes.
 */
export function getHashesNeedingEmbedding(db: Database, collection?: string): number {
  // Fixed SQL fragment; the collection name is supplied as a bound parameter.
  const collectionFilter = collection ? `AND d.collection = ?` : ``;
  const stmt = db.prepare(`
    SELECT COUNT(DISTINCT d.hash) as count
    FROM documents d
    LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
    WHERE d.active = 1 AND v.hash IS NULL ${collectionFilter}
  `);
  // Bind a parameter only when the statement actually contains a placeholder.
  const result = (collection ? stmt.get(collection) : stmt.get()) as { count: number };
  return result.count;
}
|
||||
|
||||
@ -3315,12 +3324,68 @@ export function getHashesForEmbedding(db: Database): { hash: string; body: strin
|
||||
}
|
||||
|
||||
/**
 * Clear embeddings for the whole index, or just for one collection.
 *
 * When `collection` is omitted (or empty) the entire content_vectors table is
 * emptied and the vectors_vec virtual table is dropped (it is recreated with
 * the right dimensions on the next embed run).
 *
 * When `collection` is provided, only vectors whose hash is referenced
 * exclusively by active documents in that collection are removed. Hashes
 * shared with active documents in other collections are left in place so
 * vector search keeps working there (content_vectors is keyed globally by
 * content hash; identical document bodies across collections share a row).
 * vectors_vec is preserved so other collections keep working unless the scoped
 * clear empties content_vectors entirely, in which case it is dropped so the
 * next embed can recreate the table with the current dimensions.
 *
 * NOTE(review): the scoped path issues several statements and does not open a
 * transaction itself; confirm whether callers need it to be atomic.
 *
 * @param db - Index database handle.
 * @param collection - Collection whose exclusively-owned vectors are cleared.
 */
export function clearAllEmbeddings(db: Database, collection?: string): void {
  if (!collection) {
    db.exec(`DELETE FROM content_vectors`);
    db.exec(`DROP TABLE IF EXISTS vectors_vec`);
    return;
  }

  // Hashes of active documents in the target collection that no active
  // document in any other collection also references. The single `?` is the
  // collection name.
  const exclusiveHashesQuery = `
    SELECT DISTINCT d.hash
    FROM documents d
    WHERE d.collection = ? AND d.active = 1
      AND NOT EXISTS (
        SELECT 1 FROM documents d2
        WHERE d2.hash = d.hash
          AND d2.active = 1
          AND d2.collection != d.collection
      )
  `;

  // vectors_vec may not exist yet (or may have been dropped by an earlier
  // clear), so check before touching it.
  const vecTableExists = db
    .prepare(`SELECT 1 FROM sqlite_master WHERE type='table' AND name='vectors_vec'`)
    .get();

  if (vecTableExists) {
    // Collect the (hash, seq) pairs first: they are read from content_vectors,
    // which is purged below.
    const hashSeqRows = db.prepare(`
      SELECT cv.hash, cv.seq
      FROM content_vectors cv
      WHERE cv.hash IN (${exclusiveHashesQuery})
    `).all(collection) as { hash: string; seq: number }[];

    // vectors_vec rows are addressed by a "<hash>_<seq>" key.
    const delVec = db.prepare(`DELETE FROM vectors_vec WHERE hash_seq = ?`);
    for (const row of hashSeqRows) {
      delVec.run(`${row.hash}_${row.seq}`);
    }
  }

  db.prepare(`
    DELETE FROM content_vectors
    WHERE hash IN (${exclusiveHashesQuery})
  `).run(collection);

  // If nothing is left anywhere, drop vectors_vec as the unscoped clear does.
  const remaining = db
    .prepare(`SELECT COUNT(*) AS n FROM content_vectors`)
    .get() as { n: number };
  if (remaining.n === 0) {
    db.exec(`DROP TABLE IF EXISTS vectors_vec`);
  }
}
|
||||
|
||||
/**
|
||||
|
||||
@ -982,6 +982,92 @@ describe("embed", () => {
|
||||
}
|
||||
});
|
||||
|
||||
// Embedding with a `collection` option must leave other collections untouched.
test("store.embed scopes pending documents to the requested collection", async () => {
  const store = await createStore({
    dbPath: freshDbPath(),
    config: {
      collections: {
        docs: { path: docsDir, pattern: "**/*.md" },
        notes: { path: notesDir, pattern: "**/*.md" },
      },
    },
  });

  // Swap in fakes so no real model is loaded.
  const fakeLlm = createFakeEmbedLlm();
  setDefaultLlamaCpp(createFakeTokenizer() as any);
  store.internal.llm = fakeLlm as any;

  try {
    await store.update();
    const result = await store.embed({ collection: "docs" });

    // Per collection: how many active documents' hashes have a seq-0 vector.
    const vectorCounts = store.internal.db.prepare(`
      SELECT d.collection, COUNT(DISTINCT v.hash) AS count
      FROM documents d
      LEFT JOIN content_vectors v ON v.hash = d.hash AND v.seq = 0
      WHERE d.active = 1
      GROUP BY d.collection
      ORDER BY d.collection
    `).all() as Array<{ collection: string; count: number }>;

    expect(result.docsProcessed).toBe(3);
    expect(result.chunksEmbedded).toBe(3);
    // Only `docs` gained vectors; `notes` is still entirely pending.
    expect(vectorCounts).toEqual([
      { collection: "docs", count: 3 },
      { collection: "notes", count: 0 },
    ]);
  } finally {
    setDefaultLlamaCpp(null);
    await store.close();
  }
});
|
||||
|
||||
// A scoped `force` re-embed must not discard vectors owned by other collections.
test("store.embed with force only clears the requested collection", async () => {
  const store = await createStore({
    dbPath: freshDbPath(),
    config: {
      collections: {
        docs: { path: docsDir, pattern: "**/*.md" },
        notes: { path: notesDir, pattern: "**/*.md" },
      },
    },
  });

  // Swap in fakes so no real model is loaded.
  const fakeLlm = createFakeEmbedLlm();
  setDefaultLlamaCpp(createFakeTokenizer() as any);
  store.internal.llm = fakeLlm as any;

  // Per collection: how many active documents' hashes have a seq-0 vector.
  const vectorCounts = () => store.internal.db.prepare(`
    SELECT d.collection, COUNT(DISTINCT v.hash) AS count
    FROM documents d
    LEFT JOIN content_vectors v ON v.hash = d.hash AND v.seq = 0
    WHERE d.active = 1
    GROUP BY d.collection
    ORDER BY d.collection
  `).all() as Array<{ collection: string; count: number }>;

  try {
    await store.update();

    // Baseline: embed everything, so both collections are fully covered.
    await store.embed();
    expect(vectorCounts()).toEqual([
      { collection: "docs", count: 3 },
      { collection: "notes", count: 3 },
    ]);

    const result = await store.embed({ force: true, collection: "docs" });

    // Exactly the `docs` documents were re-embedded...
    expect(result.docsProcessed).toBe(3);
    expect(result.chunksEmbedded).toBe(3);
    // ...and `notes` still has all of its vectors afterwards.
    expect(vectorCounts()).toEqual([
      { collection: "docs", count: 3 },
      { collection: "notes", count: 3 },
    ]);
  } finally {
    setDefaultLlamaCpp(null);
    await store.close();
  }
});
|
||||
|
||||
test("store.embed rejects invalid batch limits", async () => {
|
||||
const store = await createStore({
|
||||
dbPath: freshDbPath(),
|
||||
|
||||
Loading…
Reference in New Issue
Block a user