fix(search): support CJK FTS queries

This commit is contained in:
Tobi Lütke 2026-05-09 18:12:37 +00:00
parent 3d991b2a47
commit d045a8bab6
No known key found for this signature in database
2 changed files with 184 additions and 24 deletions

View File

@ -733,6 +733,73 @@ export function verifySqliteVecLoaded(db: Database): void {
let _sqliteVecAvailable: boolean | null = null;
// Matches a single character whose Unicode script is Han, Hiragana, Katakana or Hangul.
const CJK_CHAR_PATTERN = /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Hangul}]/u;
// Matches every maximal run of such characters (global, so `replace` visits all runs).
const CJK_RUN_PATTERN = /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Hangul}]+/gu;
// Value stored under `fts_cjk_normalized_version` once the FTS index holds normalized text.
const FTS_CJK_NORMALIZED_VERSION = "1";
/**
 * Prepare text for the FTS5 `unicode61` tokenizer, which does not split CJK
 * text into searchable words.
 *
 * Each run of Han/Hiragana/Katakana/Hangul characters is rewritten with a
 * space between every character and a space on both sides of the run, so that
 * every CJK character becomes its own token. Text outside those runs is
 * returned unchanged.
 *
 * @param text - Raw text destined for the index, or a raw query fragment.
 * @returns The text with every CJK run spaced out character by character.
 */
export function normalizeCjkForFTS(text: string): string {
  const spaceOutRun = (run: string): string => {
    // Spreading a string iterates by code point, so astral characters stay whole.
    const characters = [...run];
    return ` ${characters.join(' ')} `;
  };
  return text.replace(CJK_RUN_PATTERN, spaceOutRun);
}
/**
 * Report whether `text` contains at least one character matched by
 * `CJK_CHAR_PATTERN`.
 *
 * @param text - The string to inspect.
 * @returns `true` when a matching character is found anywhere in `text`.
 */
function containsCjk(text: string): boolean {
  const firstMatch = CJK_CHAR_PATTERN.exec(text);
  return firstMatch !== null;
}
/**
 * Turn a raw phrase into a space-separated list of sanitized FTS5 terms.
 *
 * The phrase is first CJK-normalized, so each CJK character becomes its own
 * whitespace-delimited piece. Every piece is then passed through
 * `sanitizeFTS5Term`, and pieces that sanitize to an empty value are dropped.
 *
 * @param phrase - Phrase text without the surrounding quotes.
 * @returns The surviving terms joined by single spaces; empty when none survive.
 */
function sanitizeFTS5Phrase(phrase: string): string {
  const keptTerms: string[] = [];
  for (const piece of normalizeCjkForFTS(phrase).split(/\s+/)) {
    const cleaned = sanitizeFTS5Term(piece);
    if (cleaned) keptTerms.push(cleaned);
  }
  return keptTerms.join(' ');
}
/**
 * One-time migration that re-indexes every active document with
 * CJK-normalized text.
 *
 * Does nothing when `store_config` already records the current
 * `FTS_CJK_NORMALIZED_VERSION`. Otherwise it empties `documents_fts`
 * (recreating the virtual table if the delete throws), inserts a normalized
 * row for each active document that has matching content, and finally stores
 * the version marker.
 *
 * @param db - Open database handle whose `documents`, `content`,
 *   `documents_fts` and `store_config` tables are read and written here.
 */
function rebuildFTSForCjkNormalization(db: Database): void {
  const marker = db
    .prepare(`SELECT value FROM store_config WHERE key = 'fts_cjk_normalized_version'`)
    .get() as { value?: string } | undefined;
  const alreadyNormalized = marker?.value === FTS_CJK_NORMALIZED_VERSION;
  if (alreadyNormalized) return;

  try {
    db.exec(`DELETE FROM documents_fts WHERE rowid >= 0`);
  } catch {
    // Bulk deletes can be rejected by older or corrupt FTS5 shadow tables even
    // while reads keep working. documents_fts is purely derived from
    // documents/content, so dropping and recreating it loses nothing.
    db.exec(`DROP TABLE IF EXISTS documents_fts`);
    db.exec(`
CREATE VIRTUAL TABLE documents_fts USING fts5(
filepath, title, body,
tokenize='porter unicode61'
)
`);
  }

  type ActiveDocument = { id: number; collection: string; path: string; title: string; body: string };
  const activeDocuments = db.prepare(`
SELECT d.id, d.collection, d.path, d.title, content.doc as body
FROM documents d
JOIN content ON content.hash = d.hash
WHERE d.active = 1
`).all() as ActiveDocument[];

  const insertEntry = db.prepare(`INSERT INTO documents_fts(rowid, filepath, title, body) VALUES (?, ?, ?, ?)`);
  // All inserts share one transaction so the index is repopulated in a single commit.
  const reindexAll = db.transaction(() => {
    activeDocuments.forEach(({ id, collection, path, title, body }) => {
      const filepath = normalizeCjkForFTS(`${collection}/${path}`);
      insertEntry.run(id, filepath, normalizeCjkForFTS(title), normalizeCjkForFTS(body));
    });
  });
  reindexAll();

  // The marker is written last, so an interrupted rebuild is retried on the next call.
  const saveMarker = db.prepare(`
INSERT OR REPLACE INTO store_config(key, value)
VALUES ('fts_cjk_normalized_version', ?)
`);
  saveMarker.run(FTS_CJK_NORMALIZED_VERSION);
}
function initializeDatabase(db: Database): void {
try {
loadSqliteVec(db);
@ -838,9 +905,12 @@ function initializeDatabase(db: Database): void {
)
`);
// Triggers to keep FTS in sync
// Triggers keep FTS in sync for callers that write directly to documents.
// Production indexing paths rebuild entries in TypeScript so CJK text can be
// normalized before it reaches the unicode61 tokenizer.
db.exec(`DROP TRIGGER IF EXISTS documents_ai`);
db.exec(`
CREATE TRIGGER IF NOT EXISTS documents_ai AFTER INSERT ON documents
CREATE TRIGGER documents_ai AFTER INSERT ON documents
WHEN new.active = 1
BEGIN
INSERT INTO documents_fts(rowid, filepath, title, body)
@ -853,14 +923,16 @@ function initializeDatabase(db: Database): void {
END
`);
db.exec(`DROP TRIGGER IF EXISTS documents_ad`);
db.exec(`
CREATE TRIGGER IF NOT EXISTS documents_ad AFTER DELETE ON documents BEGIN
CREATE TRIGGER documents_ad AFTER DELETE ON documents BEGIN
DELETE FROM documents_fts WHERE rowid = old.id;
END
`);
db.exec(`DROP TRIGGER IF EXISTS documents_au`);
db.exec(`
CREATE TRIGGER IF NOT EXISTS documents_au AFTER UPDATE ON documents
CREATE TRIGGER documents_au AFTER UPDATE ON documents
BEGIN
-- Delete from FTS if no longer active
DELETE FROM documents_fts WHERE rowid = old.id AND new.active = 0;
@ -875,6 +947,8 @@ function initializeDatabase(db: Database): void {
WHERE new.active = 1;
END
`);
rebuildFTSForCjkNormalization(db);
}
// =============================================================================
@ -2077,6 +2151,28 @@ export function insertContent(db: Database, hash: string, content: string, creat
.run(hash, content, createdAt);
}
/**
 * Replace the FTS row of a single document with a CJK-normalized copy.
 *
 * The document is looked up together with its content. Its existing
 * `documents_fts` row is always removed; a fresh row is inserted only when the
 * document is active and has a matching `content` row, so inactive or
 * content-less documents end up without an FTS entry.
 *
 * The lookup, delete and insert run inside one transaction. Without it the
 * DELETE and INSERT are separate autocommit statements whenever the caller
 * holds no transaction, so a failing INSERT would leave an active document
 * permanently missing from the index.
 *
 * NOTE(review): this relies on `db.transaction` nesting as a savepoint when the
 * caller already holds a transaction (documented for better-sqlite3 and
 * bun:sqlite) — confirm for the driver in use.
 *
 * @param db - Open database handle.
 * @param documentId - `documents.id` of the row to re-index; it is also the
 *   FTS rowid.
 */
function rebuildDocumentFTS(db: Database, documentId: number): void {
  const replaceEntry = db.transaction(() => {
    const row = db.prepare(`
SELECT d.id, d.collection, d.path, d.title, content.doc as body
FROM documents d
JOIN content ON content.hash = d.hash
WHERE d.id = ? AND d.active = 1
`).get(documentId) as { id: number; collection: string; path: string; title: string; body: string } | undefined;

    // Always drop the stale entry, even when nothing will replace it.
    db.prepare(`DELETE FROM documents_fts WHERE rowid = ?`).run(documentId);
    if (!row) return;

    db.prepare(`
INSERT INTO documents_fts(rowid, filepath, title, body)
VALUES (?, ?, ?, ?)
`).run(
      row.id,
      normalizeCjkForFTS(`${row.collection}/${row.path}`),
      normalizeCjkForFTS(row.title),
      normalizeCjkForFTS(row.body)
    );
  });
  replaceEntry();
}
/**
* Insert a new document into the documents table.
*/
@ -2098,6 +2194,9 @@ export function insertDocument(
modified_at = excluded.modified_at,
active = 1
`).run(collectionName, path, title, hash, createdAt, modifiedAt);
const row = db.prepare(`SELECT id FROM documents WHERE collection = ? AND path = ?`).get(collectionName, path) as { id: number } | undefined;
if (row) rebuildDocumentFTS(db, row.id);
}
/**
@ -2148,15 +2247,7 @@ export function findOrMigrateLegacyDocument(
if (result.changes === 0) return false;
// FTS5 does not reliably update via the documents_au trigger's
// INSERT OR REPLACE. Manually rebuild the FTS entry.
db.prepare(`DELETE FROM documents_fts WHERE rowid = ?`).run(legacy.id);
db.prepare(`
INSERT INTO documents_fts(rowid, filepath, title, body)
SELECT id, collection || '/' || path, title,
(SELECT doc FROM content WHERE hash = documents.hash)
FROM documents WHERE id = ?
`).run(legacy.id);
rebuildDocumentFTS(db, legacy.id);
return true;
});
@ -2177,6 +2268,7 @@ export function updateDocumentTitle(
): void {
db.prepare(`UPDATE documents SET title = ?, modified_at = ? WHERE id = ?`)
.run(title, modifiedAt, documentId);
rebuildDocumentFTS(db, documentId);
}
/**
@ -2192,6 +2284,7 @@ export function updateDocument(
): void {
db.prepare(`UPDATE documents SET title = ?, hash = ?, modified_at = ? WHERE id = ?`)
.run(title, hash, modifiedAt, documentId);
rebuildDocumentFTS(db, documentId);
}
/**
@ -2940,7 +3033,7 @@ function buildFTS5Query(query: string): string | null {
const phrase = s.slice(start, i).trim();
i++; // skip closing quote
if (phrase.length > 0) {
const sanitized = phrase.split(/\s+/).map(t => sanitizeFTS5Term(t)).filter(t => t).join(' ');
const sanitized = sanitizeFTS5Phrase(phrase);
if (sanitized) {
const ftsPhrase = `"${sanitized}"`; // Exact phrase, no prefix match
if (negated) {
@ -2968,6 +3061,16 @@ function buildFTS5Query(query: string): string | null {
positive.push(ftsPhrase);
}
}
} else if (containsCjk(term)) {
const sanitized = sanitizeFTS5Phrase(term);
if (sanitized) {
const ftsPhrase = `"${sanitized}"`; // CJK phrase over character tokens
if (negated) {
negative.push(ftsPhrase);
} else {
positive.push(ftsPhrase);
}
}
} else {
const sanitized = sanitizeFTS5Term(term);
if (sanitized) {

View File

@ -48,6 +48,8 @@ import {
syncConfigToDb,
STRONG_SIGNAL_MIN_SCORE,
STRONG_SIGNAL_MIN_GAP,
insertContent,
insertDocument,
generateEmbeddings,
type Store,
type DocumentResult,
@ -156,18 +158,18 @@ async function insertTestDocument(
const hash = opts.hash || await hashContent(body);
// Insert content (with OR IGNORE for deduplication)
db.prepare(`
INSERT OR IGNORE INTO content (hash, doc, created_at)
VALUES (?, ?, ?)
`).run(hash, body, now);
insertContent(db, hash, body, now);
// Insert document
const result = db.prepare(`
INSERT INTO documents (collection, path, title, hash, created_at, modified_at, active)
VALUES (?, ?, ?, ?, ?, ?, ?)
`).run(collectionName, path, title, hash, now, now, active);
insertDocument(db, collectionName, path, title, hash, now, now);
const row = db.prepare(`
SELECT id FROM documents WHERE collection = ? AND path = ?
`).get(collectionName, path) as { id: number } | undefined;
return Number(result.lastInsertRowid);
if (active === 0 && row) {
db.prepare(`UPDATE documents SET active = 0 WHERE id = ?`).run(row.id);
}
return row?.id ?? 0;
}
/** Sync YAML config file to SQLite store_collections in the current test store */
@ -1250,6 +1252,61 @@ describe("FTS Search", () => {
await cleanupTestDb(store);
});
// Indexes one Chinese, one Japanese and one Korean document, then checks that
// pure-CJK queries and a mixed Latin/CJK query each return the expected file.
test("searchFTS finds CJK documents by exact and mixed queries", async () => {
  const store = await createTestStore();
  const collectionName = await createTestCollection();

  // Display paths of the first 10 FTS hits for `query`.
  const hitPaths = (query: string) => store.searchFTS(query, 10).map(hit => hit.displayPath);

  const zhPath = `${collectionName}/cjk/zh.md`;
  const jaPath = `${collectionName}/cjk/ja.md`;
  const koPath = `${collectionName}/cjk/ko.md`;

  await insertTestDocument(store.db, collectionName, {
    name: "zh",
    title: "中文检索说明",
    body: "这里介绍 vector 数据库和关键词检索。",
    displayPath: "cjk/zh.md",
  });
  await insertTestDocument(store.db, collectionName, {
    name: "ja",
    title: "日本語検索メモ",
    body: "この文書は検索品質とトークン化について説明します。",
    displayPath: "cjk/ja.md",
  });
  await insertTestDocument(store.db, collectionName, {
    name: "ko",
    title: "한국어 검색 노트",
    body: "이 문서는 검색 품질과 토큰화 문제를 설명합니다.",
    displayPath: "cjk/ko.md",
  });

  expect(hitPaths("关键词检索")).toContain(zhPath);
  expect(hitPaths("検索品質")).toContain(jaPath);
  expect(hitPaths("검색 품질")).toContain(koPath);
  // Latin and CJK terms in the same query must both match.
  expect(hitPaths("vector 关键词")).toContain(zhPath);

  await cleanupTestDb(store);
});
// Indexes an English and a Chinese document side by side and checks that an
// English query returns the English file but not the Chinese one.
test("searchFTS keeps English behavior while indexing CJK text", async () => {
  const store = await createTestStore();
  const collectionName = await createTestCollection();

  await insertTestDocument(store.db, collectionName, {
    name: "english",
    title: "Vector Search Notes",
    body: "The quick brown fox explains vector search and BM25 ranking.",
    displayPath: "english.md",
  });
  await insertTestDocument(store.db, collectionName, {
    name: "zh",
    title: "中文检索说明",
    body: "这里介绍向量数据库和关键词检索。",
    displayPath: "zh.md",
  });

  const foxPaths = store.searchFTS("quick fox", 10).map(hit => hit.displayPath);
  expect(foxPaths).toContain(`${collectionName}/english.md`);
  expect(foxPaths).not.toContain(`${collectionName}/zh.md`);

  await cleanupTestDb(store);
});
test("searchFTS handles special characters in query", async () => {
const store = await createTestStore();
const collectionName = await createTestCollection();