From 92aaded36e4aaf76f753dceb9e12dccae6b4704a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tobi=20L=C3=BCtke?= Date: Sat, 9 May 2026 17:53:44 +0000 Subject: [PATCH] fix(store): preserve inactive docs during orphan cleanup --- CHANGELOG.md | 3 ++ src/store.ts | 6 ++-- test/store.test.ts | 71 +++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 77 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fee012c..4931d92 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,9 @@ - CLI: lazy-load `node-llama-cpp` so lightweight commands such as `qmd status` do not import native ML dependencies or trigger llama.cpp builds on ARM/no-GPU machines. #491 +- Store: keep content rows referenced by inactive documents during orphan + cleanup so `qmd update` preserves soft-deleted tombstones for removed + files. #585 ## [2.1.0] - 2026-04-05 diff --git a/src/store.ts b/src/store.ts index 71dc887..f5dd47a 100644 --- a/src/store.ts +++ b/src/store.ts @@ -2032,13 +2032,15 @@ export function deleteInactiveDocuments(db: Database): number { } /** - * Remove orphaned content hashes that are not referenced by any active document. + * Remove orphaned content hashes that are not referenced by any document. + * Inactive documents are soft-deleted tombstones, so their content rows must + * remain referenced until deleteInactiveDocuments() hard-deletes them. * Returns the number of orphaned content hashes deleted. */ export function cleanupOrphanedContent(db: Database): number { const result = db.prepare(` DELETE FROM content - WHERE hash NOT IN (SELECT DISTINCT hash FROM documents WHERE active = 1) + WHERE hash NOT IN (SELECT DISTINCT hash FROM documents) `).run(); return result.changes; } diff --git a/test/store.test.ts b/test/store.test.ts index a172064..2ed0b06 100644 --- a/test/store.test.ts +++ b/test/store.test.ts @@ -9,7 +9,7 @@ import { describe, test, expect, beforeAll, afterAll, beforeEach, afterEach, vi } from "vitest"; import { openDatabase, loadSqliteVec } from "../src/db.js"; import type { Database } from "../src/db.js"; -import { unlink, mkdtemp, rmdir, writeFile } from "node:fs/promises"; +import { unlink, mkdtemp, rmdir, writeFile, rm } from "node:fs/promises"; import { tmpdir } from "node:os"; import { join } from "node:path"; import YAML from "yaml"; @@ -51,6 +51,7 @@ import { insertContent, insertDocument, generateEmbeddings, + reindexCollection, type Store, type DocumentResult, type SearchResult, @@ -2313,6 +2314,33 @@ describe("Vector Table", () => { await cleanupTestDb(store); }); + + test("insertEmbedding is idempotent for an existing vec0 hash_seq (#598)", async () => { + const store = await createTestStore(); + store.ensureVecTable(2); + + const hash = "existinghashseq"; + const first = new Float32Array([0.1, 0.2]); + const second = new Float32Array([0.3, 0.4]); + const now = new Date().toISOString(); + + store.db.prepare(`INSERT INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`).run(`${hash}_0`, first); + + // Reproduces sqlite-vec's broken conflict handling: vec0 does not honor OR REPLACE. + expect(() => { + store.db.prepare(`INSERT OR REPLACE INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`).run(`${hash}_0`, second); + }).toThrow(/UNIQUE constraint failed/i); + + // QMD must therefore use DELETE + INSERT when upserting the vector row. + expect(() => store.insertEmbedding(hash, 0, 0, second, "test-model", now)).not.toThrow(); + + const vectorCount = store.db.prepare(`SELECT COUNT(*) AS count FROM vectors_vec WHERE hash_seq = ?`).get(`${hash}_0`) as { count: number }; + const metadataCount = store.db.prepare(`SELECT COUNT(*) AS count FROM content_vectors WHERE hash = ? AND seq = 0`).get(hash) as { count: number }; + expect(vectorCount.count).toBe(1); + expect(metadataCount.count).toBe(1); + + await cleanupTestDb(store); + }); }); // ============================================================================= @@ -2320,6 +2348,47 @@ describe("Vector Table", () => { // ============================================================================= describe("Integration", () => { + test("reindexCollection soft-deletes removed files and preserves inactive content (#585)", async () => { + const store = await createTestStore(); + const collectionDir = await mkdtemp(join(testDir, "orphan-regression-")); + const collectionName = "orphan-regression"; + + try { + for (let i = 1; i <= 5; i++) { + await writeFile(join(collectionDir, `doc-${i}.md`), `# Doc ${i}\n\nUnique body ${i}`); + } + + await createTestCollection({ pwd: collectionDir, glob: "**/*.md", name: collectionName }); + + const initial = await reindexCollection(store, collectionDir, "**/*.md", collectionName); + expect(initial.indexed).toBe(5); + expect(initial.removed).toBe(0); + + await rm(join(collectionDir, "doc-3.md")); + await rm(join(collectionDir, "doc-4.md")); + await rm(join(collectionDir, "doc-5.md")); + + const afterDelete = await reindexCollection(store, collectionDir, "**/*.md", collectionName); + expect(afterDelete.removed).toBe(3); + + const counts = store.db.prepare(` + SELECT + SUM(CASE WHEN active = 1 THEN 1 ELSE 0 END) AS active, + SUM(CASE WHEN active = 0 THEN 1 ELSE 0 END) AS inactive, + COUNT(*) AS total + FROM documents + WHERE collection = ? + `).get(collectionName) as { active: number; inactive: number; total: number }; + const contentCount = store.db.prepare(`SELECT COUNT(*) AS count FROM content`).get() as { count: number }; + + expect(counts).toEqual({ active: 2, inactive: 3, total: 5 }); + expect(contentCount.count).toBe(5); + } finally { + await rm(collectionDir, { recursive: true, force: true }); + await cleanupTestDb(store); + } + }); + test("full document lifecycle: create, search, retrieve", async () => { const store = await createTestStore(); const collectionName = await createTestCollection({ pwd: "/test/notes", glob: "**/*.md" });