fix(embed): honor collection filter

2026-05-09 18:12:37 +00:00 · 2026-05-09 18:12:37 +00:00 · 5b9f472849
commit 5b9f472849
parent d045a8bab6
5 changed files with 185 additions and 18 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -4,6 +4,11 @@

 ### Fixes

+- Embedding: `qmd embed -c <collection>` now embeds only that collection's
+  pending documents instead of pending documents from every collection.
+  With `--force`, only vectors owned exclusively by the collection are
+  cleared; hashes shared with other collections keep their vectors, and
+  `vectors_vec` is dropped only when no vectors remain at all.
 - GPU: respect explicit `QMD_LLAMA_GPU=metal|vulkan|cuda` backend overrides instead of always using auto GPU selection. #529
 - Fix: preserve original filename case in `handelize()`. The previous
  `.toLowerCase()` call made indexed paths unreachable on case-sensitive
--- a/src/cli/qmd.ts
+++ b/src/cli/qmd.ts
@ -1684,7 +1684,7 @@ function parseChunkStrategy(value: unknown): ChunkStrategy | undefined {
 async function vectorIndex(
  model: string = DEFAULT_EMBED_MODEL_URI,
  force: boolean = false,
-  batchOptions?: { maxDocsPerBatch?: number; maxBatchBytes?: number; chunkStrategy?: ChunkStrategy },
+  batchOptions?: { maxDocsPerBatch?: number; maxBatchBytes?: number; chunkStrategy?: ChunkStrategy; collection?: string },
 ): Promise<void> {
  const storeInstance = getStore();
  const db = storeInstance.db;
@ -1694,7 +1694,7 @@ async function vectorIndex(
  }

  // Check if there's work to do before starting
-  const hashesToEmbed = getHashesNeedingEmbedding(db);
+  const hashesToEmbed = getHashesNeedingEmbedding(db, batchOptions?.collection);
  if (hashesToEmbed === 0 && !force) {
    console.log(`${c.green}✓ All content hashes already have embeddings.${c.reset}`);
    closeDb();
@ -1715,6 +1715,7 @@ async function vectorIndex(
  const result = await generateEmbeddings(storeInstance, {
    force,
    model,
+    collection: batchOptions?.collection,
    maxDocsPerBatch: batchOptions?.maxDocsPerBatch,
    maxBatchBytes: batchOptions?.maxBatchBytes,
    chunkStrategy: batchOptions?.chunkStrategy,
@ -2727,7 +2728,7 @@ function showHelp(): void {
  console.log("Maintenance:");
  console.log("  qmd status                    - View index + collection health");
  console.log("  qmd update [--pull]           - Re-index collections (optionally git pull first)");
-  console.log("  qmd embed [-f]                - Generate/refresh vector embeddings");
+  console.log("  qmd embed [-f] [-c <name>]    - Generate/refresh vector embeddings");
  console.log("    --max-docs-per-batch <n>    - Cap docs loaded into memory per embedding batch");
  console.log("    --max-batch-mb <n>          - Cap UTF-8 MB loaded into memory per embedding batch");
  console.log("  qmd cleanup                   - Clear caches, vacuum DB");
@ -3120,10 +3121,17 @@ if (isMain) {
        const maxDocsPerBatch = parseEmbedBatchOption("maxDocsPerBatch", cli.values["max-docs-per-batch"]);
        const maxBatchMb = parseEmbedBatchOption("maxBatchBytes", cli.values["max-batch-mb"]);
        const embedChunkStrategy = parseChunkStrategy(cli.values["chunk-strategy"]);
+        // Validate -c against configured collections before dispatching, so a
+        // typo errors with "Collection not found: X" instead of silently
+        // reporting success because no pending docs match a nonexistent name.
+        // embed operates on a single collection; only the first value is used.
+        const embedValidatedCollections = resolveCollectionFilter(cli.opts.collection, false);
+        const embedCollection = embedValidatedCollections[0];
        await vectorIndex(DEFAULT_EMBED_MODEL_URI, !!cli.values.force, {
          maxDocsPerBatch,
          maxBatchBytes: maxBatchMb === undefined ? undefined : maxBatchMb * 1024 * 1024,
          chunkStrategy: embedChunkStrategy,
+          collection: embedCollection,
        });
      } catch (error) {
        console.error(error instanceof Error ? error.message : String(error));
--- a/src/index.ts
+++ b/src/index.ts
@ -290,6 +290,8 @@ export interface QMDStore {
  embed(options?: {
    force?: boolean;
    model?: string;
+    /** Restrict embedding to documents in one collection. */
+    collection?: string;
    maxDocsPerBatch?: number;
    maxBatchBytes?: number;
    chunkStrategy?: ChunkStrategy;
@ -516,6 +518,7 @@ export async function createStore(options: StoreOptions): Promise<QMDStore> {
      return generateEmbeddings(internal, {
        force: embedOpts?.force,
        model: embedOpts?.model,
+        collection: embedOpts?.collection,
        maxDocsPerBatch: embedOpts?.maxDocsPerBatch,
        maxBatchBytes: embedOpts?.maxBatchBytes,
        chunkStrategy: embedOpts?.chunkStrategy,
--- a/src/store.ts
+++ b/src/store.ts
@ -1374,6 +1374,11 @@ export type EmbedResult = {
 export type EmbedOptions = {
  force?: boolean;
  model?: string;
+  /**
+   * Restrict embedding to documents in a single collection.
+   * When omitted, all pending documents across every collection are embedded.
+   */
+  collection?: string;
  maxDocsPerBatch?: number;
  maxBatchBytes?: number;
  chunkStrategy?: ChunkStrategy;
@ -1415,16 +1420,18 @@ function resolveEmbedOptions(options?: EmbedOptions): Required<Pick<EmbedOptions
  };
 }

-function getPendingEmbeddingDocs(db: Database): PendingEmbeddingDoc[] {
-  return db.prepare(`
+function getPendingEmbeddingDocs(db: Database, collection?: string): PendingEmbeddingDoc[] {
+  const collectionFilter = collection ? `AND d.collection = ?` : ``;
+  const stmt = db.prepare(`
    SELECT d.hash, MIN(d.path) as path, length(CAST(c.doc AS BLOB)) as bytes
    FROM documents d
    JOIN content c ON d.hash = c.hash
    LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
-    WHERE d.active = 1 AND v.hash IS NULL
+    WHERE d.active = 1 AND v.hash IS NULL ${collectionFilter}
    GROUP BY d.hash
    ORDER BY MIN(d.path)
-  `).all() as PendingEmbeddingDoc[];
+  `);
+  return (collection ? stmt.all(collection) : stmt.all()) as PendingEmbeddingDoc[];
 }

 function buildEmbeddingBatches(
@ -1491,10 +1498,10 @@ export async function generateEmbeddings(
  const encoder = new TextEncoder();

  if (options?.force) {
-    clearAllEmbeddings(db);
+    clearAllEmbeddings(db, options?.collection);
  }

-  const docsToEmbed = getPendingEmbeddingDocs(db);
+  const docsToEmbed = getPendingEmbeddingDocs(db, options?.collection);

  if (docsToEmbed.length === 0) {
    return { docsProcessed: 0, chunksEmbedded: 0, errors: 0, durationMs: 0 };
@ -1942,13 +1949,15 @@ export type IndexStatus = {
 // Index health
 // =============================================================================

-export function getHashesNeedingEmbedding(db: Database): number {
-  const result = db.prepare(`
+export function getHashesNeedingEmbedding(db: Database, collection?: string): number {
+  const collectionFilter = collection ? `AND d.collection = ?` : ``;
+  const stmt = db.prepare(`
    SELECT COUNT(DISTINCT d.hash) as count
    FROM documents d
    LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
-    WHERE d.active = 1 AND v.hash IS NULL
-  `).get() as { count: number };
+    WHERE d.active = 1 AND v.hash IS NULL ${collectionFilter}
+  `);
+  const result = (collection ? stmt.get(collection) : stmt.get()) as { count: number };
  return result.count;
 }

@ -3315,12 +3324,68 @@ export function getHashesForEmbedding(db: Database): { hash: string; body: strin
 }

 /**
- * Clear all embeddings from the database (force re-index).
- * Deletes all rows from content_vectors and drops the vectors_vec table.
+ * Clear embeddings for the whole index, or just for one collection.
+ *
+ * When `collection` is omitted the entire content_vectors table is emptied and
+ * the vectors_vec virtual table is dropped (it is recreated with the right
+ * dimensions on the next embed run).
+ *
+ * When `collection` is provided, only vectors whose hash is referenced
+ * exclusively by active documents in that collection are removed. Hashes
+ * shared with active documents in other collections are left in place so
+ * vector search keeps working there (content_vectors is keyed globally by
+ * content hash; identical document bodies across collections share a row).
+ * Shared hashes therefore stay embedded and a scoped force does not redo them.
+ * vectors_vec is kept unless the scoped clear empties content_vectors, in which
+ * case it is dropped so the next embed recreates it with current dimensions.
 */
-export function clearAllEmbeddings(db: Database): void {
-  db.exec(`DELETE FROM content_vectors`);
-  db.exec(`DROP TABLE IF EXISTS vectors_vec`);
+export function clearAllEmbeddings(db: Database, collection?: string): void {
+  if (!collection) {
+    db.exec(`DELETE FROM content_vectors`);
+    db.exec(`DROP TABLE IF EXISTS vectors_vec`);
+    return;
+  }
+
+  const exclusiveHashesQuery = `
+    SELECT DISTINCT d.hash
+    FROM documents d
+    WHERE d.collection = ? AND d.active = 1
+      AND NOT EXISTS (
+        SELECT 1 FROM documents d2
+        WHERE d2.hash = d.hash
+          AND d2.active = 1
+          AND d2.collection != d.collection
+      )
+  `;
+
+  const vecTableExists = db
+    .prepare(`SELECT 1 FROM sqlite_master WHERE type='table' AND name='vectors_vec'`)
+    .get();
+
+  if (vecTableExists) {
+    const hashSeqRows = db.prepare(`
+      SELECT cv.hash, cv.seq
+      FROM content_vectors cv
+      WHERE cv.hash IN (${exclusiveHashesQuery})
+    `).all(collection) as { hash: string; seq: number }[];
+
+    const delVec = db.prepare(`DELETE FROM vectors_vec WHERE hash_seq = ?`);
+    for (const row of hashSeqRows) {
+      delVec.run(`${row.hash}_${row.seq}`);
+    }
+  }
+
+  db.prepare(`
+    DELETE FROM content_vectors
+    WHERE hash IN (${exclusiveHashesQuery})
+  `).run(collection);
+
+  const remaining = db
+    .prepare(`SELECT COUNT(*) AS n FROM content_vectors`)
+    .get() as { n: number };
+  if (remaining.n === 0) {
+    db.exec(`DROP TABLE IF EXISTS vectors_vec`);
+  }
 }

 /**
--- a/test/sdk.test.ts
+++ b/test/sdk.test.ts
@ -982,6 +982,92 @@ describe("embed", () => {
    }
  });

+  test("store.embed scopes pending documents to the requested collection", async () => {
+    const store = await createStore({
+      dbPath: freshDbPath(),
+      config: {
+        collections: {
+          docs: { path: docsDir, pattern: "**/*.md" },
+          notes: { path: notesDir, pattern: "**/*.md" },
+        },
+      },
+    });
+
+    const fakeLlm = createFakeEmbedLlm();
+    setDefaultLlamaCpp(createFakeTokenizer() as any);
+    store.internal.llm = fakeLlm as any;
+
+    try {
+      await store.update();
+      const result = await store.embed({ collection: "docs" });
+
+      const vectorCounts = store.internal.db.prepare(`
+        SELECT d.collection, COUNT(DISTINCT v.hash) AS count
+        FROM documents d
+        LEFT JOIN content_vectors v ON v.hash = d.hash AND v.seq = 0
+        WHERE d.active = 1
+        GROUP BY d.collection
+        ORDER BY d.collection
+      `).all() as Array<{ collection: string; count: number }>;
+
+      expect(result.docsProcessed).toBe(3);
+      expect(result.chunksEmbedded).toBe(3);
+      expect(vectorCounts).toEqual([
+        { collection: "docs", count: 3 },
+        { collection: "notes", count: 0 },
+      ]);
+    } finally {
+      setDefaultLlamaCpp(null);
+      await store.close();
+    }
+  });
+
+  test("store.embed with force only clears the requested collection", async () => {
+    const store = await createStore({
+      dbPath: freshDbPath(),
+      config: {
+        collections: {
+          docs: { path: docsDir, pattern: "**/*.md" },
+          notes: { path: notesDir, pattern: "**/*.md" },
+        },
+      },
+    });
+
+    const fakeLlm = createFakeEmbedLlm();
+    setDefaultLlamaCpp(createFakeTokenizer() as any);
+    store.internal.llm = fakeLlm as any;
+
+    const vectorCounts = () => store.internal.db.prepare(`
+      SELECT d.collection, COUNT(DISTINCT v.hash) AS count
+      FROM documents d
+      LEFT JOIN content_vectors v ON v.hash = d.hash AND v.seq = 0
+      WHERE d.active = 1
+      GROUP BY d.collection
+      ORDER BY d.collection
+    `).all() as Array<{ collection: string; count: number }>;
+
+    try {
+      await store.update();
+      await store.embed();
+      expect(vectorCounts()).toEqual([
+        { collection: "docs", count: 3 },
+        { collection: "notes", count: 3 },
+      ]);
+
+      const result = await store.embed({ force: true, collection: "docs" });
+
+      expect(result.docsProcessed).toBe(3);
+      expect(result.chunksEmbedded).toBe(3);
+      expect(vectorCounts()).toEqual([
+        { collection: "docs", count: 3 },
+        { collection: "notes", count: 3 },
+      ]);
+    } finally {
+      setDefaultLlamaCpp(null);
+      await store.close();
+    }
+  });
+
  test("store.embed rejects invalid batch limits", async () => {
    const store = await createStore({
      dbPath: freshDbPath(),