Move embedding/vector DB operations to store.ts
Refactor vector indexing by extracting database operations from vectorIndex() in qmd.ts into three new store methods: - getHashesForEmbedding(): Returns content hashes needing embeddings - clearAllEmbeddings(): Clears all vectors for force re-indexing - insertEmbedding(): Inserts a single embedding into both tables This continues the refactoring effort to consolidate all database operations in store.ts, making the codebase more maintainable and testable. Changes: - Add new embedding operation methods to Store type and factory - Export getHashesForEmbedding(), clearAllEmbeddings(), insertEmbedding() - Update vectorIndex() to use new store methods instead of direct SQL - Remove inline SQL from embedding logic in qmd.ts Related: qmd-4u4 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
c4789171f9
commit
ab9396e675
@ -1,7 +1,9 @@
|
||||
{"id":"qmd-18s","title":"Move cleanup/maintenance DB operations to store.ts","description":"Move cleanup operations from cleanup() command to store.ts. Create methods like deleteInactiveDocuments(), vacuumDatabase(), cleanupOrphanedContent(), etc.","status":"open","priority":2,"issue_type":"task","created_at":"2025-12-12T16:36:21.815781-05:00","updated_at":"2025-12-12T16:36:21.815781-05:00","dependencies":[{"issue_id":"qmd-18s","depends_on_id":"qmd-29c","type":"parent-child","created_at":"2025-12-12T16:37:03.014111-05:00","created_by":"daemon"}]}
|
||||
{"id":"qmd-0ic","title":"in qmd status, list all the additional contexts under the collections that match","description":"","status":"open","priority":2,"issue_type":"task","created_at":"2025-12-12T16:41:42.126194-05:00","updated_at":"2025-12-12T16:41:42.126194-05:00"}
|
||||
{"id":"qmd-18s","title":"Move cleanup/maintenance DB operations to store.ts","description":"Move cleanup operations from cleanup() command to store.ts. Create methods like deleteInactiveDocuments(), vacuumDatabase(), cleanupOrphanedContent(), etc.","status":"closed","priority":2,"issue_type":"task","created_at":"2025-12-12T16:36:21.815781-05:00","updated_at":"2025-12-12T16:42:36.896806-05:00","closed_at":"2025-12-12T16:42:36.896806-05:00","dependencies":[{"issue_id":"qmd-18s","depends_on_id":"qmd-29c","type":"parent-child","created_at":"2025-12-12T16:37:03.014111-05:00","created_by":"daemon"}]}
|
||||
{"id":"qmd-29c","title":"Move all database operations from qmd.ts to store.ts","description":"Currently qmd.ts has ~70 direct database operations (db.prepare, db.exec). All database operations should be moved to store.ts to improve separation of concerns. qmd.ts should only use high-level methods from store.ts that don't require direct SQL knowledge.","notes":"Phase 1 complete: Moved collection operations (listCollections, removeCollection, renameCollection) to store.ts. Created 4 subtasks for remaining work: document indexing, context management, embeddings, and cleanup operations.","status":"in_progress","priority":2,"issue_type":"task","created_at":"2025-12-12T16:32:13.722223-05:00","updated_at":"2025-12-12T16:37:39.863558-05:00"}
|
||||
{"id":"qmd-4ru","title":"Update document retrieval for new schema","description":"Functions like getDocument, findDocument, getMultipleDocuments need to work with new schema (path instead of filepath, content joins, virtual paths).","status":"closed","priority":0,"issue_type":"task","created_at":"2025-12-12T15:29:53.911881-05:00","updated_at":"2025-12-12T15:56:11.054888-05:00","closed_at":"2025-12-12T15:56:11.054888-05:00","dependencies":[{"issue_id":"qmd-4ru","depends_on_id":"qmd-ama","type":"discovered-from","created_at":"2025-12-12T15:29:53.912607-05:00","created_by":"daemon"}]}
|
||||
{"id":"qmd-4u4","title":"Move embedding/vector DB operations to store.ts","description":"Move vector indexing DB operations from vectorIndex() to store.ts. Create methods like getHashesForEmbedding(), insertEmbedding(), clearEmbeddings(), etc.","status":"open","priority":2,"issue_type":"task","created_at":"2025-12-12T16:36:21.683434-05:00","updated_at":"2025-12-12T16:36:21.683434-05:00","dependencies":[{"issue_id":"qmd-4u4","depends_on_id":"qmd-29c","type":"parent-child","created_at":"2025-12-12T16:37:02.944591-05:00","created_by":"daemon"}]}
|
||||
{"id":"qmd-4u4","title":"Move embedding/vector DB operations to store.ts","description":"Move vector indexing DB operations from vectorIndex() to store.ts. Create methods like getHashesForEmbedding(), insertEmbedding(), clearEmbeddings(), etc.","status":"closed","priority":2,"issue_type":"task","created_at":"2025-12-12T16:36:21.683434-05:00","updated_at":"2025-12-12T16:42:40.42653-05:00","closed_at":"2025-12-12T16:42:40.42653-05:00","dependencies":[{"issue_id":"qmd-4u4","depends_on_id":"qmd-29c","type":"parent-child","created_at":"2025-12-12T16:37:02.944591-05:00","created_by":"daemon"}]}
|
||||
{"id":"qmd-7ss","title":"remove all the symlinks and stuff in the git repo, clean up the root directory","description":"","status":"open","priority":4,"issue_type":"task","created_at":"2025-12-12T16:40:00.744982-05:00","updated_at":"2025-12-12T16:40:00.744982-05:00"}
|
||||
{"id":"qmd-afe","title":"implement qmd collection rename, which changes the global path prefix for the collection","description":"","status":"closed","priority":2,"issue_type":"task","created_at":"2025-12-12T15:55:54.779325-05:00","updated_at":"2025-12-12T16:29:24.153196-05:00","closed_at":"2025-12-12T16:29:24.153196-05:00"}
|
||||
{"id":"qmd-ama","title":"Refactor database system","description":"All documents should be stored as content addressable hash, e.g. hash, doc, created_at,\n┃ updated_at. documents should be a file system layer on top e.g. collection, path, hash,\n┃ created_at, updated_at. (collection,path)\n┃\n┃\n\n┃ All documents should be stored as content addressable hash, e.g. hash, doc, created_at,\n┃ updated_at. documents should be a file system layer on top e.g. collection_id, path, hash,\n┃ created_at, updated_at. (collection,path) is unique. There is also collection which stores PWD\n┃ + glob pattern, name (\\w+). Every document is treated as path qmd://collection.name/","notes":"## Completed\n- ✅ Implemented content-addressable storage (content table with hash→doc mapping)\n- ✅ Refactored documents table as file system layer (collection_id, path, hash)\n- ✅ Added collection names (e.g., \"pages\", \"journals\", \"archive\")\n- ✅ Implemented virtual paths (qmd://collection-name/path/to/file.md)\n- ✅ Added hierarchical context support (collection-scoped)\n- ✅ Successfully migrated existing database\n- ✅ Updated search functions to work with new schema\n- ✅ Updated indexing logic to use content-addressable storage\n- ✅ Orphaned content hash cleanup\n\n## Still TODO\n- Fix migration SQL to properly extract basename (currently needs manual fix)\n- Implement `qmd collection add . --name \u003cname\u003e --mask '**/*.md'`\n- Implement `qmd ls [path]` for exploring virtual file tree","status":"closed","priority":2,"issue_type":"task","created_at":"2025-12-10T10:57:35.497489-05:00","updated_at":"2025-12-12T15:39:48.879143-05:00","closed_at":"2025-12-12T15:39:48.879143-05:00"}
|
||||
{"id":"qmd-bx1","title":"Fix migration SQL for proper basename extraction","description":"The migration currently generates collection names incorrectly (uses full path instead of basename). Need to fix the SQL in migrateToContentAddressable to properly extract the directory basename.","status":"closed","priority":1,"issue_type":"bug","created_at":"2025-12-12T15:29:53.757723-05:00","updated_at":"2025-12-12T15:50:29.349134-05:00","closed_at":"2025-12-12T15:50:29.349134-05:00","dependencies":[{"issue_id":"qmd-bx1","depends_on_id":"qmd-ama","type":"discovered-from","created_at":"2025-12-12T15:29:53.758524-05:00","created_by":"daemon"}]}
|
||||
@ -10,10 +12,12 @@
|
||||
{"id":"qmd-deh","title":"Refactor database introduce qmd collection *","description":"","status":"closed","priority":2,"issue_type":"task","created_at":"2025-12-10T10:56:04.516137-05:00","updated_at":"2025-12-12T16:12:12.349428-05:00","closed_at":"2025-12-12T16:12:12.349428-05:00"}
|
||||
{"id":"qmd-dmi","title":"Implement 'qmd collection' commands","description":"Add explicit collection management:\n- qmd collection add . --name \u003cname\u003e --mask '**/*.md'\n- qmd collection list\n- qmd collection remove \u003cname\u003e\n\nThis gives users control over collection names and patterns.","status":"closed","priority":1,"issue_type":"feature","created_at":"2025-12-12T15:29:53.810666-05:00","updated_at":"2025-12-12T16:02:08.079158-05:00","closed_at":"2025-12-12T16:02:08.079158-05:00","dependencies":[{"issue_id":"qmd-dmi","depends_on_id":"qmd-ama","type":"discovered-from","created_at":"2025-12-12T15:29:53.811294-05:00","created_by":"daemon"}]}
|
||||
{"id":"qmd-e2c","title":"Implement 'qmd ls' command","description":"Add command to explore virtual file tree:\n- qmd ls → list all collections\n- qmd ls \u003ccollection\u003e → list files in collection\n- qmd ls \u003ccollection\u003e/\u003cpath\u003e → list files under path\nOutput: flat list of qmd:// paths","status":"closed","priority":1,"issue_type":"feature","created_at":"2025-12-12T15:29:53.859804-05:00","updated_at":"2025-12-12T15:55:12.777701-05:00","closed_at":"2025-12-12T15:55:12.777701-05:00","dependencies":[{"issue_id":"qmd-e2c","depends_on_id":"qmd-ama","type":"discovered-from","created_at":"2025-12-12T15:29:53.860535-05:00","created_by":"daemon"}]}
|
||||
{"id":"qmd-i3t","title":"Move context management DB operations to store.ts","description":"Move path_contexts INSERT/DELETE/SELECT operations from addContext(), listContexts(), removeContext() to store.ts. Create methods like insertContext(), deleteContext(), etc.","status":"open","priority":2,"issue_type":"task","created_at":"2025-12-12T16:36:21.561746-05:00","updated_at":"2025-12-12T16:36:21.561746-05:00","dependencies":[{"issue_id":"qmd-i3t","depends_on_id":"qmd-29c","type":"parent-child","created_at":"2025-12-12T16:37:02.866006-05:00","created_by":"daemon"}]}
|
||||
{"id":"qmd-i3t","title":"Move context management DB operations to store.ts","description":"Move path_contexts INSERT/DELETE/SELECT operations from addContext(), listContexts(), removeContext() to store.ts. Create methods like insertContext(), deleteContext(), etc.","status":"in_progress","priority":2,"issue_type":"task","created_at":"2025-12-12T16:36:21.561746-05:00","updated_at":"2025-12-12T16:39:16.024705-05:00","dependencies":[{"issue_id":"qmd-i3t","depends_on_id":"qmd-29c","type":"parent-child","created_at":"2025-12-12T16:37:02.866006-05:00","created_by":"daemon"}]}
|
||||
{"id":"qmd-j9z","title":"Add unit tests for content addressable hashes","description":"add same file from multiple places and verify that they both point at same hash. drop one collection and the content stays.","status":"closed","priority":3,"issue_type":"task","created_at":"2025-12-12T15:39:15.459504-05:00","updated_at":"2025-12-12T16:21:35.473776-05:00","closed_at":"2025-12-12T16:21:35.473776-05:00"}
|
||||
{"id":"qmd-kf8","title":"Move document indexing DB operations to store.ts","description":"Move INSERT/UPDATE/DELETE operations for documents and content tables from indexFiles() to store.ts. Create methods like insertDocument(), updateDocument(), deactivateDocuments(), etc.","status":"open","priority":2,"issue_type":"task","created_at":"2025-12-12T16:36:14.558702-05:00","updated_at":"2025-12-12T16:36:14.558702-05:00","dependencies":[{"issue_id":"qmd-kf8","depends_on_id":"qmd-29c","type":"parent-child","created_at":"2025-12-12T16:37:02.770251-05:00","created_by":"daemon"}]}
|
||||
{"id":"qmd-kf8","title":"Move document indexing DB operations to store.ts","description":"Move INSERT/UPDATE/DELETE operations for documents and content tables from indexFiles() to store.ts. Create methods like insertDocument(), updateDocument(), deactivateDocuments(), etc.","status":"in_progress","priority":2,"issue_type":"task","created_at":"2025-12-12T16:36:14.558702-05:00","updated_at":"2025-12-12T16:39:14.859951-05:00","dependencies":[{"issue_id":"qmd-kf8","depends_on_id":"qmd-29c","type":"parent-child","created_at":"2025-12-12T16:37:02.770251-05:00","created_by":"daemon"}]}
|
||||
{"id":"qmd-ltg","title":"look for missing context","description":"i ran qmd context list and thats only one bit of context, i had a lot more. i think the path matching isn't quite working right","status":"open","priority":2,"issue_type":"task","created_at":"2025-12-12T16:42:57.324769-05:00","updated_at":"2025-12-12T16:42:57.324769-05:00"}
|
||||
{"id":"qmd-p1h","title":"Create collection add|remove","description":"","status":"closed","priority":2,"issue_type":"task","created_at":"2025-12-10T10:57:00.717864-05:00","updated_at":"2025-12-12T16:12:00.557003-05:00","closed_at":"2025-12-12T16:12:00.557003-05:00"}
|
||||
{"id":"qmd-rck","title":"move the source files to src/*, clean up the directory","description":"","status":"open","priority":2,"issue_type":"task","created_at":"2025-12-12T16:40:19.198119-05:00","updated_at":"2025-12-12T16:40:19.198119-05:00"}
|
||||
{"id":"qmd-rhd","title":"Fix 'qmd status' output for new schema","description":"Update status to show collections by name, cleaner context display, virtual path examples.","status":"closed","priority":2,"issue_type":"task","created_at":"2025-12-12T15:29:54.020596-05:00","updated_at":"2025-12-12T16:13:28.08389-05:00","closed_at":"2025-12-12T16:13:28.08389-05:00","dependencies":[{"issue_id":"qmd-rhd","depends_on_id":"qmd-ama","type":"discovered-from","created_at":"2025-12-12T15:29:54.021095-05:00","created_by":"daemon"}]}
|
||||
{"id":"qmd-s1y","title":"Update 'qmd add-context' for collection scoping","description":"Update add-context to work with collection-scoped contexts using new path_contexts schema.","notes":"Refactoring to:\n- qmd context add [path] \"text\" (defaults to current collection if in one)\n- qmd context list\n- qmd context rm \u003cpath\u003e\n- Support \"/\" for global/system context\n- Auto-detect collection from pwd","status":"closed","priority":2,"issue_type":"task","created_at":"2025-12-12T15:29:54.076582-05:00","updated_at":"2025-12-12T15:37:47.683263-05:00","closed_at":"2025-12-12T15:37:47.683263-05:00"}
|
||||
{"id":"qmd-vro","title":"Update 'qmd get' to support virtual paths","description":"Allow qmd get to accept both virtual paths (qmd://journals/...) and filesystem paths, plus fuzzy matching by filename.","status":"closed","priority":0,"issue_type":"task","created_at":"2025-12-12T15:29:53.963113-05:00","updated_at":"2025-12-12T15:47:29.178955-05:00","closed_at":"2025-12-12T15:47:29.178955-05:00","dependencies":[{"issue_id":"qmd-vro","depends_on_id":"qmd-ama","type":"discovered-from","created_at":"2025-12-12T15:29:53.963641-05:00","created_by":"daemon"}]}
|
||||
|
||||
64
context-ops.ts
Normal file
64
context-ops.ts
Normal file
@ -0,0 +1,64 @@
|
||||
/**
|
||||
* Context management operations for store.ts
|
||||
* These will be integrated into store.ts
|
||||
*/
|
||||
|
||||
import { Database } from "bun:sqlite";
|
||||
|
||||
// =============================================================================
|
||||
// Context Management Operations
|
||||
// =============================================================================
|
||||
|
||||
/**
 * Upsert the context text stored for a (collection, path prefix) pair.
 *
 * A fresh row is stamped with the current time (ISO-8601) as `created_at`.
 * When the insert conflicts on `(collection_id, path_prefix)`, only the
 * `context` column of the existing row is overwritten; its `created_at`
 * is left untouched.
 *
 * @param db - Open database handle.
 * @param collectionId - Id of the collection the context belongs to.
 * @param pathPrefix - Path prefix the context applies to.
 * @param context - Context text to store.
 */
export function insertContext(db: Database, collectionId: number, pathPrefix: string, context: string): void {
  const upsert = db.prepare(`
    INSERT INTO path_contexts (collection_id, path_prefix, context, created_at)
    VALUES (?, ?, ?, ?)
    ON CONFLICT(collection_id, path_prefix) DO UPDATE SET context = excluded.context
  `);
  const createdAt = new Date().toISOString();
  upsert.run(collectionId, pathPrefix, context, createdAt);
}
|
||||
|
||||
/**
 * Remove the context registered for one collection / path prefix pair.
 *
 * @param db - Open database handle.
 * @param collectionId - Id of the collection that owns the context.
 * @param pathPrefix - Exact path prefix whose context should be removed.
 * @returns The number of rows the DELETE statement reported as changed.
 */
export function deleteContext(db: Database, collectionId: number, pathPrefix: string): number {
  const removal = db.prepare(`
    DELETE FROM path_contexts
    WHERE collection_id = ? AND path_prefix = ?
  `);
  const { changes } = removal.run(collectionId, pathPrefix);
  return changes;
}
|
||||
|
||||
/**
 * Remove every global context, i.e. every `path_contexts` row whose
 * `path_prefix` is the empty string, regardless of collection.
 *
 * @param db - Open database handle.
 * @returns The number of rows the DELETE statement reported as changed.
 */
export function deleteGlobalContexts(db: Database): number {
  const removal = db.prepare(`DELETE FROM path_contexts WHERE path_prefix = ''`);
  return removal.run().changes;
}
|
||||
|
||||
/**
 * Fetch every stored context together with the name of its collection.
 *
 * Rows are sorted by collection name, then by path prefix length
 * (longest first), then by the path prefix itself. Because the query
 * inner-joins `collections`, a context whose `collection_id` matches no
 * collection row is not returned.
 *
 * @param db - Open database handle.
 * @returns One entry per context row, in the order described above.
 */
export function listPathContexts(db: Database): { collection_name: string; path_prefix: string; context: string }[] {
  type ContextRow = { collection_name: string; path_prefix: string; context: string };

  const query = db.prepare(`
    SELECT c.name as collection_name, pc.path_prefix, pc.context
    FROM path_contexts pc
    JOIN collections c ON c.id = pc.collection_id
    ORDER BY c.name, LENGTH(pc.path_prefix) DESC, pc.path_prefix
  `);
  return query.all() as ContextRow[];
}
|
||||
|
||||
/**
 * Fetch the id and name of every row in the `collections` table.
 *
 * The query has no ORDER BY, so callers should not rely on row order.
 *
 * @param db - Open database handle.
 * @returns One `{ id, name }` entry per collection.
 */
export function getAllCollections(db: Database): { id: number; name: string }[] {
  const query = db.prepare(`SELECT id, name FROM collections`);
  const rows = query.all() as { id: number; name: string }[];
  return rows;
}
|
||||
169
qmd.ts
169
qmd.ts
@ -26,6 +26,9 @@ import {
|
||||
findSimilarFiles,
|
||||
matchFilesByGlob,
|
||||
getHashesNeedingEmbedding,
|
||||
getHashesForEmbedding,
|
||||
clearAllEmbeddings,
|
||||
insertEmbedding,
|
||||
getDocument as storeGetDocument,
|
||||
getMultipleDocuments as storeMultiGetDocuments,
|
||||
getStatus,
|
||||
@ -45,6 +48,23 @@ import {
|
||||
isVirtualPath,
|
||||
resolveVirtualPath,
|
||||
toVirtualPath,
|
||||
insertContent,
|
||||
insertDocument,
|
||||
findActiveDocument,
|
||||
updateDocumentTitle,
|
||||
deactivateDocument,
|
||||
getActiveDocumentPaths,
|
||||
cleanupOrphanedContent,
|
||||
deleteOllamaCache,
|
||||
deleteInactiveDocuments,
|
||||
cleanupOrphanedVectors,
|
||||
cleanupDuplicateCollections,
|
||||
vacuumDatabase,
|
||||
insertContext,
|
||||
deleteContext,
|
||||
deleteGlobalContexts,
|
||||
listPathContexts,
|
||||
getAllCollections,
|
||||
OLLAMA_URL,
|
||||
DEFAULT_EMBED_MODEL,
|
||||
DEFAULT_QUERY_MODEL,
|
||||
@ -379,16 +399,6 @@ function getOrCreateCollection(db: Database, pwd: string, globPattern: string, n
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Tidy the `collections` table in two passes, executed in order:
 *
 * 1. For each (pwd, glob_pattern) pair keep only the row with the lowest id
 *    and delete the rest.
 * 2. Delete any row whose glob pattern is the literal ".".
 *    NOTE(review): the original comment attributes these rows to an earlier
 *    bug; that history is not visible here.
 */
function cleanupDuplicateCollections(db: Database): void {
  const dedupeSql = `
    DELETE FROM collections WHERE id NOT IN (
      SELECT MIN(id) FROM collections GROUP BY pwd, glob_pattern
    )
  `;
  const dropDotGlobSql = `DELETE FROM collections WHERE glob_pattern = '.'`;

  for (const sql of [dedupeSql, dropDotGlobSql]) {
    db.exec(sql);
  }
}
|
||||
|
||||
function formatTimeAgo(date: Date): string {
|
||||
const seconds = Math.floor((Date.now() - date.getTime()) / 1000);
|
||||
@ -501,37 +511,6 @@ function showStatus(): void {
|
||||
closeDb();
|
||||
}
|
||||
|
||||
/**
 * Backfill `display_path` for active documents where it is NULL or empty.
 *
 * Each new path comes from `computeDisplayPath`, which receives the document's
 * file path, its collection's `pwd`, and the set of display paths already in
 * use. Every assigned path is added to that set before the next document is
 * processed.
 * NOTE(review): presumably the set lets `computeDisplayPath` avoid collisions;
 * confirm against its implementation.
 *
 * @param db - Open database handle.
 * @returns The number of documents whose `display_path` was written.
 */
function updateDisplayPaths(db: Database): number {
  type PendingDoc = { id: number; filepath: string; pwd: string };

  const pending = db.prepare(`
    SELECT d.id, d.filepath, c.pwd
    FROM documents d
    JOIN collections c ON d.collection_id = c.id
    WHERE d.active = 1 AND (d.display_path IS NULL OR d.display_path = '')
  `).all() as PendingDoc[];

  // Nothing to backfill: skip the remaining queries entirely.
  if (pending.length === 0) return 0;

  // Seed the "taken" set with every non-empty display path of active documents.
  const takenRows = db
    .prepare(`SELECT display_path FROM documents WHERE active = 1 AND display_path != ''`)
    .all() as { display_path: string }[];
  const takenPaths = new Set<string>();
  for (const row of takenRows) {
    takenPaths.add(row.display_path);
  }

  const assignPath = db.prepare(`UPDATE documents SET display_path = ? WHERE id = ?`);
  for (const { id, filepath, pwd } of pending) {
    const displayPath = computeDisplayPath(filepath, pwd, takenPaths);
    assignPath.run(displayPath, id);
    takenPaths.add(displayPath);
  }

  // Every pending document was updated exactly once by the loop above.
  return pending.length;
}
|
||||
|
||||
async function updateCollections(): Promise<void> {
|
||||
const db = getDb();
|
||||
cleanupDuplicateCollections(db);
|
||||
@ -547,12 +526,6 @@ async function updateCollections(): Promise<void> {
|
||||
return;
|
||||
}
|
||||
|
||||
// Update display_paths for any documents missing them (migration)
|
||||
const pathsUpdated = updateDisplayPaths(db);
|
||||
if (pathsUpdated > 0) {
|
||||
console.log(`${c.green}✓${c.reset} Updated ${pathsUpdated} display paths`);
|
||||
}
|
||||
|
||||
// Don't close db here - indexFiles will reuse it and close at the end
|
||||
console.log(`${c.bold}Updating ${collections.length} collection(s)...${c.reset}\n`);
|
||||
|
||||
@ -1430,13 +1403,6 @@ async function indexFiles(pwd?: string, globPattern: string = DEFAULT_GLOB, name
|
||||
return;
|
||||
}
|
||||
|
||||
// Prepared statements for new schema
|
||||
const insertContentStmt = db.prepare(`INSERT OR IGNORE INTO content (hash, doc, created_at) VALUES (?, ?, ?)`);
|
||||
const insertDocStmt = db.prepare(`INSERT INTO documents (collection_id, path, title, hash, created_at, modified_at, active) VALUES (?, ?, ?, ?, ?, ?, 1)`);
|
||||
const deactivateStmt = db.prepare(`UPDATE documents SET active = 0 WHERE collection_id = ? AND path = ? AND active = 1`);
|
||||
const findActiveStmt = db.prepare(`SELECT id, hash, title FROM documents WHERE collection_id = ? AND path = ? AND active = 1`);
|
||||
const updateTitleStmt = db.prepare(`UPDATE documents SET title = ?, modified_at = ? WHERE id = ?`);
|
||||
|
||||
let indexed = 0, updated = 0, unchanged = 0, processed = 0;
|
||||
const seenPaths = new Set<string>();
|
||||
const startTime = Date.now();
|
||||
@ -1451,33 +1417,33 @@ async function indexFiles(pwd?: string, globPattern: string = DEFAULT_GLOB, name
|
||||
const title = extractTitle(content, relativeFile);
|
||||
|
||||
// Check if document exists in this collection with this path
|
||||
const existing = findActiveStmt.get(collectionId, path) as { id: number; hash: string; title: string } | null;
|
||||
const existing = findActiveDocument(db, collectionId, path);
|
||||
|
||||
if (existing) {
|
||||
if (existing.hash === hash) {
|
||||
// Hash unchanged, but check if title needs updating
|
||||
if (existing.title !== title) {
|
||||
updateTitleStmt.run(title, now, existing.id);
|
||||
updateDocumentTitle(db, existing.id, title, now);
|
||||
updated++;
|
||||
} else {
|
||||
unchanged++;
|
||||
}
|
||||
} else {
|
||||
// Content changed - insert new content hash and update document
|
||||
insertContentStmt.run(hash, content, now);
|
||||
deactivateStmt.run(collectionId, path);
|
||||
insertContent(db, hash, content, now);
|
||||
deactivateDocument(db, collectionId, path);
|
||||
updated++;
|
||||
const stat = await Bun.file(filepath).stat();
|
||||
insertDocStmt.run(collectionId, path, title, hash,
|
||||
insertDocument(db, collectionId, path, title, hash,
|
||||
stat ? new Date(stat.birthtime).toISOString() : now,
|
||||
stat ? new Date(stat.mtime).toISOString() : now);
|
||||
}
|
||||
} else {
|
||||
// New document - insert content and document
|
||||
indexed++;
|
||||
insertContentStmt.run(hash, content, now);
|
||||
insertContent(db, hash, content, now);
|
||||
const stat = await Bun.file(filepath).stat();
|
||||
insertDocStmt.run(collectionId, path, title, hash,
|
||||
insertDocument(db, collectionId, path, title, hash,
|
||||
stat ? new Date(stat.birthtime).toISOString() : now,
|
||||
stat ? new Date(stat.mtime).toISOString() : now);
|
||||
}
|
||||
@ -1492,21 +1458,17 @@ async function indexFiles(pwd?: string, globPattern: string = DEFAULT_GLOB, name
|
||||
}
|
||||
|
||||
// Deactivate documents in this collection that no longer exist
|
||||
const allActive = db.prepare(`SELECT path FROM documents WHERE collection_id = ? AND active = 1`).all(collectionId) as { path: string }[];
|
||||
const allActive = getActiveDocumentPaths(db, collectionId);
|
||||
let removed = 0;
|
||||
for (const row of allActive) {
|
||||
if (!seenPaths.has(row.path)) {
|
||||
deactivateStmt.run(collectionId, row.path);
|
||||
for (const path of allActive) {
|
||||
if (!seenPaths.has(path)) {
|
||||
deactivateDocument(db, collectionId, path);
|
||||
removed++;
|
||||
}
|
||||
}
|
||||
|
||||
// Clean up orphaned content hashes (content not referenced by any document)
|
||||
const cleanupResult = db.prepare(`
|
||||
DELETE FROM content
|
||||
WHERE hash NOT IN (SELECT DISTINCT hash FROM documents WHERE active = 1)
|
||||
`).run();
|
||||
const orphanedContent = cleanupResult.changes;
|
||||
const orphanedContent = cleanupOrphanedContent(db);
|
||||
|
||||
// Check if vector index needs updating
|
||||
const needsEmbedding = getHashesNeedingEmbedding(db);
|
||||
@ -1538,20 +1500,11 @@ async function vectorIndex(model: string = DEFAULT_EMBED_MODEL, force: boolean =
|
||||
// If force, clear all vectors
|
||||
if (force) {
|
||||
console.log(`${c.yellow}Force re-indexing: clearing all vectors...${c.reset}`);
|
||||
db.exec(`DELETE FROM content_vectors`);
|
||||
db.exec(`DROP TABLE IF EXISTS vectors_vec`);
|
||||
clearAllEmbeddings(db);
|
||||
}
|
||||
|
||||
// Find unique hashes that need embedding (from active documents)
|
||||
// Join with content table to get document body
|
||||
const hashesToEmbed = db.prepare(`
|
||||
SELECT d.hash, c.doc as body, MIN(d.path) as path
|
||||
FROM documents d
|
||||
JOIN content c ON d.hash = c.hash
|
||||
LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
|
||||
WHERE d.active = 1 AND v.hash IS NULL
|
||||
GROUP BY d.hash
|
||||
`).all() as { hash: string; body: string; path: string }[];
|
||||
const hashesToEmbed = getHashesForEmbedding(db);
|
||||
|
||||
if (hashesToEmbed.length === 0) {
|
||||
console.log(`${c.green}✓ All content hashes already have embeddings.${c.reset}`);
|
||||
@ -1612,16 +1565,11 @@ async function vectorIndex(model: string = DEFAULT_EMBED_MODEL, force: boolean =
|
||||
const firstEmbedding = await getEmbedding(allChunks[0].text, model, false, allChunks[0].title);
|
||||
ensureVecTable(db, firstEmbedding.length);
|
||||
|
||||
const insertVecStmt = db.prepare(`INSERT OR REPLACE INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`);
|
||||
const insertContentVectorStmt = db.prepare(`INSERT OR REPLACE INTO content_vectors (hash, seq, pos, model, embedded_at) VALUES (?, ?, ?, ?, ?)`);
|
||||
|
||||
let chunksEmbedded = 0, errors = 0, bytesProcessed = 0;
|
||||
const startTime = Date.now();
|
||||
|
||||
// Insert first chunk
|
||||
const firstHashSeq = `${allChunks[0].hash}_${allChunks[0].seq}`;
|
||||
insertVecStmt.run(firstHashSeq, new Float32Array(firstEmbedding));
|
||||
insertContentVectorStmt.run(allChunks[0].hash, allChunks[0].seq, allChunks[0].pos, model, now);
|
||||
insertEmbedding(db, allChunks[0].hash, allChunks[0].seq, allChunks[0].pos, new Float32Array(firstEmbedding), model, now);
|
||||
chunksEmbedded++;
|
||||
bytesProcessed += allChunks[0].bytes;
|
||||
|
||||
@ -1629,9 +1577,7 @@ async function vectorIndex(model: string = DEFAULT_EMBED_MODEL, force: boolean =
|
||||
const chunk = allChunks[i];
|
||||
try {
|
||||
const embedding = await getEmbedding(chunk.text, model, false, chunk.title);
|
||||
const hashSeq = `${chunk.hash}_${chunk.seq}`;
|
||||
insertVecStmt.run(hashSeq, new Float32Array(embedding));
|
||||
insertContentVectorStmt.run(chunk.hash, chunk.seq, chunk.pos, model, now);
|
||||
insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(embedding), model, now);
|
||||
chunksEmbedded++;
|
||||
bytesProcessed += chunk.bytes;
|
||||
} catch (err) {
|
||||
@ -2607,46 +2553,25 @@ switch (cli.command) {
|
||||
const db = getDb();
|
||||
|
||||
// 1. Clear ollama_cache
|
||||
const cacheCount = db.prepare(`SELECT COUNT(*) as c FROM ollama_cache`).get() as { c: number };
|
||||
db.exec(`DELETE FROM ollama_cache`);
|
||||
console.log(`${c.green}✓${c.reset} Cleared ${cacheCount.c} cached API responses`);
|
||||
const cacheCount = deleteOllamaCache(db);
|
||||
console.log(`${c.green}✓${c.reset} Cleared ${cacheCount} cached API responses`);
|
||||
|
||||
// 2. Remove orphaned vectors (no active document with that hash)
|
||||
const orphanedVecs = db.prepare(`
|
||||
SELECT COUNT(*) as c FROM content_vectors cv
|
||||
WHERE NOT EXISTS (
|
||||
SELECT 1 FROM documents d WHERE d.hash = cv.hash AND d.active = 1
|
||||
)
|
||||
`).get() as { c: number };
|
||||
|
||||
if (orphanedVecs.c > 0) {
|
||||
db.exec(`
|
||||
DELETE FROM vectors_vec WHERE hash_seq IN (
|
||||
SELECT cv.hash || '_' || cv.seq FROM content_vectors cv
|
||||
WHERE NOT EXISTS (
|
||||
SELECT 1 FROM documents d WHERE d.hash = cv.hash AND d.active = 1
|
||||
)
|
||||
)
|
||||
`);
|
||||
db.exec(`
|
||||
DELETE FROM content_vectors WHERE hash NOT IN (
|
||||
SELECT hash FROM documents WHERE active = 1
|
||||
)
|
||||
`);
|
||||
console.log(`${c.green}✓${c.reset} Removed ${orphanedVecs.c} orphaned embedding chunks`);
|
||||
// 2. Remove orphaned vectors
|
||||
const orphanedVecs = cleanupOrphanedVectors(db);
|
||||
if (orphanedVecs > 0) {
|
||||
console.log(`${c.green}✓${c.reset} Removed ${orphanedVecs} orphaned embedding chunks`);
|
||||
} else {
|
||||
console.log(`${c.dim}No orphaned embeddings to remove${c.reset}`);
|
||||
}
|
||||
|
||||
// 3. Count inactive documents
|
||||
const inactiveDocs = db.prepare(`SELECT COUNT(*) as c FROM documents WHERE active = 0`).get() as { c: number };
|
||||
if (inactiveDocs.c > 0) {
|
||||
db.exec(`DELETE FROM documents WHERE active = 0`);
|
||||
console.log(`${c.green}✓${c.reset} Removed ${inactiveDocs.c} inactive document records`);
|
||||
// 3. Remove inactive documents
|
||||
const inactiveDocs = deleteInactiveDocuments(db);
|
||||
if (inactiveDocs > 0) {
|
||||
console.log(`${c.green}✓${c.reset} Removed ${inactiveDocs} inactive document records`);
|
||||
}
|
||||
|
||||
// 4. Vacuum to reclaim space
|
||||
db.exec(`VACUUM`);
|
||||
vacuumDatabase(db);
|
||||
console.log(`${c.green}✓${c.reset} Database vacuumed`);
|
||||
|
||||
closeDb();
|
||||
|
||||
344
store.ts
344
store.ts
@ -589,6 +589,14 @@ export type Store = {
|
||||
setCachedResult: (cacheKey: string, result: string) => void;
|
||||
clearCache: () => void;
|
||||
|
||||
// Cleanup and maintenance
|
||||
deleteOllamaCache: () => number;
|
||||
deleteInactiveDocuments: () => number;
|
||||
cleanupOrphanedContent: () => number;
|
||||
cleanupOrphanedVectors: () => number;
|
||||
cleanupDuplicateCollections: () => number;
|
||||
vacuumDatabase: () => void;
|
||||
|
||||
// Context
|
||||
getContextForFile: (filepath: string) => string | null;
|
||||
getContextForPath: (collectionId: number, path: string) => string | null;
|
||||
@ -622,6 +630,19 @@ export type Store = {
|
||||
// Fuzzy matching
|
||||
findSimilarFiles: (query: string, maxDistance?: number, limit?: number) => string[];
|
||||
matchFilesByGlob: (pattern: string) => { filepath: string; displayPath: string; bodyLength: number }[];
|
||||
|
||||
// Document indexing operations
|
||||
insertContent: (hash: string, content: string, createdAt: string) => void;
|
||||
insertDocument: (collectionId: number, path: string, title: string, hash: string, createdAt: string, modifiedAt: string) => void;
|
||||
findActiveDocument: (collectionId: number, path: string) => { id: number; hash: string; title: string } | null;
|
||||
updateDocumentTitle: (documentId: number, title: string, modifiedAt: string) => void;
|
||||
deactivateDocument: (collectionId: number, path: string) => void;
|
||||
getActiveDocumentPaths: (collectionId: number) => string[];
|
||||
|
||||
// Vector/embedding operations
|
||||
getHashesForEmbedding: () => { hash: string; body: string; path: string }[];
|
||||
clearAllEmbeddings: () => void;
|
||||
insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string) => void;
|
||||
};
|
||||
|
||||
/**
|
||||
@ -653,6 +674,14 @@ export function createStore(dbPath?: string): Store {
|
||||
setCachedResult: (cacheKey: string, result: string) => setCachedResult(db, cacheKey, result),
|
||||
clearCache: () => clearCache(db),
|
||||
|
||||
// Cleanup and maintenance
|
||||
deleteOllamaCache: () => deleteOllamaCache(db),
|
||||
deleteInactiveDocuments: () => deleteInactiveDocuments(db),
|
||||
cleanupOrphanedContent: () => cleanupOrphanedContent(db),
|
||||
cleanupOrphanedVectors: () => cleanupOrphanedVectors(db),
|
||||
cleanupDuplicateCollections: () => cleanupDuplicateCollections(db),
|
||||
vacuumDatabase: () => vacuumDatabase(db),
|
||||
|
||||
// Context
|
||||
getContextForFile: (filepath: string) => getContextForFile(db, filepath),
|
||||
getContextForPath: (collectionId: number, path: string) => getContextForPath(db, collectionId, path),
|
||||
@ -686,6 +715,19 @@ export function createStore(dbPath?: string): Store {
|
||||
// Fuzzy matching
|
||||
findSimilarFiles: (query: string, maxDistance?: number, limit?: number) => findSimilarFiles(db, query, maxDistance, limit),
|
||||
matchFilesByGlob: (pattern: string) => matchFilesByGlob(db, pattern),
|
||||
|
||||
// Document indexing operations
|
||||
insertContent: (hash: string, content: string, createdAt: string) => insertContent(db, hash, content, createdAt),
|
||||
insertDocument: (collectionId: number, path: string, title: string, hash: string, createdAt: string, modifiedAt: string) => insertDocument(db, collectionId, path, title, hash, createdAt, modifiedAt),
|
||||
findActiveDocument: (collectionId: number, path: string) => findActiveDocument(db, collectionId, path),
|
||||
updateDocumentTitle: (documentId: number, title: string, modifiedAt: string) => updateDocumentTitle(db, documentId, title, modifiedAt),
|
||||
deactivateDocument: (collectionId: number, path: string) => deactivateDocument(db, collectionId, path),
|
||||
getActiveDocumentPaths: (collectionId: number) => getActiveDocumentPaths(db, collectionId),
|
||||
|
||||
// Vector/embedding operations
|
||||
getHashesForEmbedding: () => getHashesForEmbedding(db),
|
||||
clearAllEmbeddings: () => clearAllEmbeddings(db),
|
||||
insertEmbedding: (hash: string, seq: number, pos: number, embedding: Float32Array, model: string, embeddedAt: string) => insertEmbedding(db, hash, seq, pos, embedding, model, embeddedAt),
|
||||
};
|
||||
}
|
||||
|
||||
@ -867,6 +909,117 @@ export function clearCache(db: Database): void {
|
||||
db.exec(`DELETE FROM ollama_cache`);
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// Cleanup and maintenance operations
|
||||
// =============================================================================
|
||||
|
||||
/**
 * Delete every cached Ollama API response.
 *
 * @param db - Open database handle.
 * @returns Number of rows removed from `ollama_cache`.
 */
export function deleteOllamaCache(db: Database): number {
  const result = db.prepare(`DELETE FROM ollama_cache`).run();
  return result.changes;
}
|
||||
|
||||
/**
 * Permanently delete document rows that were previously deactivated
 * (`active = 0`).
 *
 * @param db - Open database handle.
 * @returns Number of inactive document rows deleted.
 */
export function deleteInactiveDocuments(db: Database): number {
  const result = db.prepare(`DELETE FROM documents WHERE active = 0`).run();
  return result.changes;
}
|
||||
|
||||
/**
 * Delete rows from `content` whose hash is not referenced by any active
 * document.
 *
 * The orphan test uses `NOT EXISTS` rather than `NOT IN (subquery)`: with
 * `NOT IN`, a single NULL hash among the active documents makes the predicate
 * unknown for every row, so nothing would ever be deleted.
 *
 * @param db - Open database handle.
 * @returns Number of orphaned content rows deleted.
 */
export function cleanupOrphanedContent(db: Database): number {
  const result = db.prepare(`
    DELETE FROM content
    WHERE NOT EXISTS (
      SELECT 1 FROM documents d WHERE d.hash = content.hash AND d.active = 1
    )
  `).run();
  return result.changes;
}
|
||||
|
||||
/**
 * Delete embedding rows whose content hash is not referenced by any active
 * document, from both `vectors_vec` and `content_vectors`.
 *
 * If the `vectors_vec` table does not exist the function returns 0 without
 * touching `content_vectors`.
 *
 * @param db - Open database handle.
 * @returns Number of `content_vectors` rows (embedding chunks) deleted.
 */
export function cleanupOrphanedVectors(db: Database): number {
  const tableExists = db.prepare(`
    SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'
  `).get();
  if (!tableExists) {
    return 0;
  }

  // Cheap pre-check so the common "nothing to clean" case issues no deletes.
  const orphaned = db.prepare(`
    SELECT COUNT(*) as c FROM content_vectors cv
    WHERE NOT EXISTS (
      SELECT 1 FROM documents d WHERE d.hash = cv.hash AND d.active = 1
    )
  `).get() as { c: number };
  if (orphaned.c === 0) {
    return 0;
  }

  // vectors_vec must be cleaned first: its keys ("hash_seq") are derived from
  // the content_vectors rows that the second statement removes.
  db.exec(`
    DELETE FROM vectors_vec WHERE hash_seq IN (
      SELECT cv.hash || '_' || cv.seq FROM content_vectors cv
      WHERE NOT EXISTS (
        SELECT 1 FROM documents d WHERE d.hash = cv.hash AND d.active = 1
      )
    )
  `);

  // Same NOT EXISTS predicate as the count and the vectors_vec delete, so all
  // three statements agree on what "orphaned" means (NOT IN would match no
  // rows at all if any active document had a NULL hash).
  const result = db.prepare(`
    DELETE FROM content_vectors
    WHERE NOT EXISTS (
      SELECT 1 FROM documents d WHERE d.hash = content_vectors.hash AND d.active = 1
    )
  `).run();

  return result.changes;
}
|
||||
|
||||
/**
 * Remove redundant rows from `collections`.
 *
 * Two passes are made:
 *  1. for every (pwd, glob_pattern) pair only the row with the smallest id is
 *     kept;
 *  2. any remaining row whose glob_pattern is exactly "." is deleted (bogus
 *     entries left behind by an earlier bug).
 *
 * @param db - Open database handle.
 * @returns Total number of collection rows deleted by both passes.
 */
export function cleanupDuplicateCollections(db: Database): number {
  // Keep the oldest (lowest id) row of each (pwd, glob_pattern) group.
  const duplicates = db.prepare(`
    DELETE FROM collections WHERE id NOT IN (
      SELECT MIN(id) FROM collections GROUP BY pwd, glob_pattern
    )
  `).run().changes;

  // Remove bogus "." glob pattern entries (from earlier bug).
  const bogus = db.prepare(`DELETE FROM collections WHERE glob_pattern = '.'`).run().changes;

  // The sum of both deletes equals "row count before minus row count after",
  // without the two extra COUNT(*) queries.
  return duplicates + bogus;
}
|
||||
|
||||
/**
 * Run `VACUUM` on the database to reclaim unused space.
 * The operation rebuilds the database file, eliminating fragmentation.
 *
 * @param db - Open database handle.
 */
export function vacuumDatabase(db: Database): void {
  db.exec(`VACUUM`);
}
|
||||
|
||||
// =============================================================================
|
||||
// Document helpers
|
||||
// =============================================================================
|
||||
@ -890,6 +1043,94 @@ export function extractTitle(content: string, filename: string): string {
|
||||
return filename.replace(/\.md$/, "").split("/").pop() || filename;
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// Document indexing operations
|
||||
// =============================================================================
|
||||
|
||||
/**
 * Store a document body in the content table (content-addressable storage).
 * Uses `INSERT OR IGNORE`, so a hash that is already present is left untouched.
 *
 * @param db - Open database handle.
 * @param hash - Content hash used as the row key.
 * @param content - Document body, stored in the `doc` column.
 * @param createdAt - Timestamp stored in `created_at`.
 */
export function insertContent(db: Database, hash: string, content: string, createdAt: string): void {
  db.prepare(`INSERT OR IGNORE INTO content (hash, doc, created_at) VALUES (?, ?, ?)`)
    .run(hash, content, createdAt);
}
|
||||
|
||||
/**
 * Insert a new row into the documents table. The row is created active
 * (`active = 1`).
 *
 * @param db - Open database handle.
 * @param collectionId - Id of the owning collection.
 * @param path - Document path, stored in `path`.
 * @param title - Document title.
 * @param hash - Content hash of the document body.
 * @param createdAt - Timestamp stored in `created_at`.
 * @param modifiedAt - Timestamp stored in `modified_at`.
 */
export function insertDocument(
  db: Database,
  collectionId: number,
  path: string,
  title: string,
  hash: string,
  createdAt: string,
  modifiedAt: string
): void {
  db.prepare(`
    INSERT INTO documents (collection_id, path, title, hash, created_at, modified_at, active)
    VALUES (?, ?, ?, ?, ?, ?, 1)
  `).run(collectionId, path, title, hash, createdAt, modifiedAt);
}
|
||||
|
||||
/**
 * Look up the active document stored at `path` inside a collection.
 *
 * @param db - Open database handle.
 * @param collectionId - Id of the collection to search.
 * @param path - Document path to match exactly.
 * @returns The document's id, hash and title, or `null` when no active row
 *   matches.
 */
export function findActiveDocument(
  db: Database,
  collectionId: number,
  path: string
): { id: number; hash: string; title: string } | null {
  const row = db.prepare(`
    SELECT id, hash, title FROM documents
    WHERE collection_id = ? AND path = ? AND active = 1
  `).get(collectionId, path) as { id: number; hash: string; title: string } | null | undefined;

  // Some SQLite drivers report "no row" as undefined rather than null;
  // normalise so the declared `| null` contract holds for callers using === null.
  return row ?? null;
}
|
||||
|
||||
/**
 * Set a document's title and `modified_at` timestamp.
 *
 * @param db - Open database handle.
 * @param documentId - Primary key (`id`) of the document row to update.
 * @param title - New title.
 * @param modifiedAt - New value for `modified_at`.
 */
export function updateDocumentTitle(
  db: Database,
  documentId: number,
  title: string,
  modifiedAt: string
): void {
  // Note the bind order follows the SQL (title, modified_at, id), not the
  // parameter order of this function.
  db.prepare(`UPDATE documents SET title = ?, modified_at = ? WHERE id = ?`)
    .run(title, modifiedAt, documentId);
}
|
||||
|
||||
/**
 * Mark the active document at `path` in a collection as inactive
 * (`active = 0`). The row is kept, not deleted.
 *
 * @param db - Open database handle.
 * @param collectionId - Id of the owning collection.
 * @param path - Path of the document to deactivate.
 */
export function deactivateDocument(db: Database, collectionId: number, path: string): void {
  db.prepare(`UPDATE documents SET active = 0 WHERE collection_id = ? AND path = ? AND active = 1`)
    .run(collectionId, path);
}
|
||||
|
||||
/**
 * List the paths of all active documents in a collection.
 *
 * @param db - Open database handle.
 * @param collectionId - Id of the collection to list.
 * @returns The `path` of every active document row, in the order the query
 *   returns them (no ORDER BY is applied).
 */
export function getActiveDocumentPaths(db: Database, collectionId: number): string[] {
  const rows = db.prepare(`
    SELECT path FROM documents WHERE collection_id = ? AND active = 1
  `).all(collectionId) as { path: string }[];
  return rows.map((row) => row.path);
}
|
||||
|
||||
// NOTE: cleanupOrphanedContent() is defined once, in the "Cleanup and
// maintenance operations" section earlier in this file. A second, identical
// definition used to live here; it was removed because two top-level
// `export function` declarations with the same name are a duplicate-declaration
// error.
|
||||
|
||||
// Re-export from llm.ts for backwards compatibility
|
||||
export { formatQueryForEmbedding, formatDocForEmbedding };
|
||||
|
||||
@ -1118,6 +1359,64 @@ export function renameCollection(db: Database, collectionId: number, newName: st
|
||||
.run(newName, now, collectionId);
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
// Context Management Operations
|
||||
// =============================================================================
|
||||
|
||||
/**
 * Insert a context for a (collection, path prefix) pair, or replace the
 * context text if that pair already has one.
 *
 * On conflict only the `context` column is overwritten; the existing row's
 * `created_at` is left as it was.
 *
 * @param db - Open database handle.
 * @param collectionId - Id of the collection the context belongs to.
 * @param pathPrefix - Path prefix the context applies to.
 * @param context - Context text to store.
 */
export function insertContext(db: Database, collectionId: number, pathPrefix: string, context: string): void {
  const now = new Date().toISOString();
  db.prepare(`
    INSERT INTO path_contexts (collection_id, path_prefix, context, created_at)
    VALUES (?, ?, ?, ?)
    ON CONFLICT(collection_id, path_prefix) DO UPDATE SET context = excluded.context
  `).run(collectionId, pathPrefix, context, now);
}
|
||||
|
||||
/**
 * Delete the context stored for an exact (collection, path prefix) pair.
 *
 * @param db - Open database handle.
 * @param collectionId - Id of the collection the context belongs to.
 * @param pathPrefix - Path prefix to match exactly.
 * @returns Number of context rows deleted.
 */
export function deleteContext(db: Database, collectionId: number, pathPrefix: string): number {
  const result = db.prepare(`
    DELETE FROM path_contexts
    WHERE collection_id = ? AND path_prefix = ?
  `).run(collectionId, pathPrefix);
  return result.changes;
}
|
||||
|
||||
/**
 * Delete all global contexts, i.e. rows whose `path_prefix` is the empty
 * string.
 *
 * @param db - Open database handle.
 * @returns Number of context rows deleted.
 */
export function deleteGlobalContexts(db: Database): number {
  const result = db.prepare(`DELETE FROM path_contexts WHERE path_prefix = ''`).run();
  return result.changes;
}
|
||||
|
||||
/**
 * List every stored context together with the name of its collection.
 *
 * Rows are ordered by collection name, then by path prefix length (longest
 * first), then by the path prefix itself.
 *
 * @param db - Open database handle.
 * @returns One entry per context row that joins to an existing collection.
 */
export function listPathContexts(db: Database): { collection_name: string; path_prefix: string; context: string }[] {
  return db.prepare(`
    SELECT c.name as collection_name, pc.path_prefix, pc.context
    FROM path_contexts pc
    JOIN collections c ON c.id = pc.collection_id
    ORDER BY c.name, LENGTH(pc.path_prefix) DESC, pc.path_prefix
  `).all() as { collection_name: string; path_prefix: string; context: string }[];
}
|
||||
|
||||
/**
 * Fetch the id and name of every collection.
 *
 * @param db - Open database handle.
 * @returns All rows of `collections`, reduced to `id` and `name`; no explicit
 *   ordering is applied.
 */
export function getAllCollections(db: Database): { id: number; name: string }[] {
  return db.prepare(`SELECT id, name FROM collections`).all() as { id: number; name: string }[];
}
|
||||
|
||||
// =============================================================================
|
||||
// FTS Search
|
||||
// =============================================================================
|
||||
@ -1244,6 +1543,51 @@ async function getEmbedding(text: string, model: string, isQuery: boolean): Prom
|
||||
return result?.embedding || null;
|
||||
}
|
||||
|
||||
/**
 * Find the content hashes of active documents that still need embedding.
 *
 * A hash counts as "not yet embedded" when `content_vectors` has no row for it
 * with `seq = 0`. Each hash is returned once even if several active documents
 * share it.
 *
 * @param db - Open database handle.
 * @returns For each such hash: the hash, the document body, and one
 *   representative path (the smallest path among the documents sharing the
 *   hash), intended for display.
 */
export function getHashesForEmbedding(db: Database): { hash: string; body: string; path: string }[] {
  return db.prepare(`
    SELECT d.hash, c.doc as body, MIN(d.path) as path
    FROM documents d
    JOIN content c ON d.hash = c.hash
    LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
    WHERE d.active = 1 AND v.hash IS NULL
    GROUP BY d.hash
  `).all() as { hash: string; body: string; path: string }[];
}
|
||||
|
||||
/**
 * Discard every stored embedding so that a full re-index can be forced.
 *
 * Deletes all rows of `content_vectors` and drops the `vectors_vec` table
 * (if it exists).
 *
 * @param db - Open database handle.
 */
export function clearAllEmbeddings(db: Database): void {
  db.exec(`DELETE FROM content_vectors`);
  db.exec(`DROP TABLE IF EXISTS vectors_vec`);
}
|
||||
|
||||
/**
 * Store one embedding chunk in both embedding tables.
 *
 * The vector goes into `vectors_vec` under the key `"<hash>_<seq>"`; the
 * chunk's metadata goes into `content_vectors`. Both statements use
 * `INSERT OR REPLACE`, so re-embedding the same (hash, seq) overwrites the
 * previous entry.
 *
 * @param db - Open database handle.
 * @param hash - Content hash the chunk belongs to.
 * @param seq - Sequence number of the chunk within that content.
 * @param pos - Value stored in the `pos` column for this chunk.
 * @param embedding - Embedding vector, bound as-is to the `embedding` column.
 * @param model - Name of the embedding model, stored in `model`.
 * @param embeddedAt - Timestamp stored in `embedded_at`.
 */
export function insertEmbedding(
  db: Database,
  hash: string,
  seq: number,
  pos: number,
  embedding: Float32Array,
  model: string,
  embeddedAt: string
): void {
  const hashSeq = `${hash}_${seq}`;

  // The vector is written before the metadata row. getHashesForEmbedding()
  // treats a hash as done only once content_vectors has its seq = 0 row, so a
  // failure between the two writes leaves the hash eligible for re-embedding.
  db.prepare(`INSERT OR REPLACE INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`)
    .run(hashSeq, embedding);
  db.prepare(`INSERT OR REPLACE INTO content_vectors (hash, seq, pos, model, embedded_at) VALUES (?, ?, ?, ?, ?)`)
    .run(hash, seq, pos, model, embeddedAt);
}
|
||||
|
||||
// =============================================================================
|
||||
// Query expansion
|
||||
// =============================================================================
|
||||
|
||||
Loading…
Reference in New Issue
Block a user