diff --git a/src/store.ts b/src/store.ts index aa33ce6..14faa42 100644 --- a/src/store.ts +++ b/src/store.ts @@ -2687,20 +2687,46 @@ function sanitizeFTS5Term(term: string): string { return term.replace(/[^\p{L}\p{N}']/gu, '').toLowerCase(); } +/** + * Check if a token is a hyphenated compound word (e.g., multi-agent, DEC-0054, gpt-4). + * Returns true if the token contains internal hyphens between word/digit characters. + */ +function isHyphenatedToken(token: string): boolean { + return /^[\p{L}\p{N}][\p{L}\p{N}'-]*-[\p{L}\p{N}][\p{L}\p{N}'-]*$/u.test(token); +} + +/** + * Sanitize a hyphenated term into an FTS5 phrase by splitting on hyphens + * and sanitizing each part. Returns the parts joined by spaces for use + * inside FTS5 quotes: "multi agent" matches "multi-agent" in porter tokenizer. + */ +function sanitizeHyphenatedTerm(term: string): string { + return term.split('-').map(t => sanitizeFTS5Term(t)).filter(t => t).join(' '); +} + /** * Parse lex query syntax into FTS5 query. * * Supports: * - Quoted phrases: "exact phrase" → "exact phrase" (exact match) * - Negation: -term or -"phrase" → uses FTS5 NOT operator + * - Hyphenated tokens: multi-agent, DEC-0054, gpt-4 → treated as phrases * - Plain terms: term → "term"* (prefix match) * * FTS5 NOT is a binary operator: `term1 NOT term2` means "match term1 but not term2". * So `-term` only works when there are also positive terms. * + * Hyphen disambiguation: `-sports` at a word boundary is negation, but `multi-agent` + * (where `-` is between word characters) is treated as a hyphenated phrase. + * When a leading `-` is followed by what looks like a hyphenated compound word + * (e.g., `-multi-agent`), the entire token is treated as a negated phrase. + * * Examples: * performance -sports → "performance"* NOT "sports"* * "machine learning" → "machine learning" + * multi-agent memory → "multi agent" AND "memory"* + * DEC-0054 → "dec 0054" + * -multi-agent → NOT "multi agent" */ function buildFTS5Query(query: string): string | null { const positive: string[] = []; @@ -2742,13 +2768,27 @@ function buildFTS5Query(query: string): string | null { while (i < s.length && !/[\s"]/.test(s[i]!)) i++; const term = s.slice(start, i); - const sanitized = sanitizeFTS5Term(term); - if (sanitized) { - const ftsTerm = `"${sanitized}"*`; // Prefix match - if (negated) { - negative.push(ftsTerm); - } else { - positive.push(ftsTerm); + // Handle hyphenated tokens: multi-agent, DEC-0054, gpt-4 + // These get split into phrase queries so FTS5 porter tokenizer matches them. + if (isHyphenatedToken(term)) { + const sanitized = sanitizeHyphenatedTerm(term); + if (sanitized) { + const ftsPhrase = `"${sanitized}"`; // Phrase match (no prefix) + if (negated) { + negative.push(ftsPhrase); + } else { + positive.push(ftsPhrase); + } + } + } else { + const sanitized = sanitizeFTS5Term(term); + if (sanitized) { + const ftsTerm = `"${sanitized}"*`; // Prefix match + if (negated) { + negative.push(ftsTerm); + } else { + positive.push(ftsTerm); + } } } } diff --git a/test/structured-search.test.ts b/test/structured-search.test.ts index 5c4e97f..d704210 100644 --- a/test/structured-search.test.ts +++ b/test/structured-search.test.ts @@ -399,6 +399,14 @@ describe("buildFTS5Query (lex parser)", () => { return term.replace(/[^\p{L}\p{N}']/gu, '').toLowerCase(); } + function isHyphenatedToken(token: string): boolean { + return /^[\p{L}\p{N}][\p{L}\p{N}'-]*-[\p{L}\p{N}][\p{L}\p{N}'-]*$/u.test(token); + } + + function sanitizeHyphenatedTerm(term: string): string { + return term.split('-').map(t => sanitizeFTS5Term(t)).filter(t => t).join(' '); + } + function buildFTS5Query(query: string): string | null { const positive: string[] = []; const negative: string[] = []; @@ -424,8 +432,14 @@ describe("buildFTS5Query (lex parser)", () => { const start = i; while (i < s.length && !/[\s"]/.test(s[i]!)) i++; const term = s.slice(start, i); - const sanitized = sanitizeFTS5Term(term); - if (sanitized) (negated ? negative : positive).push(`"${sanitized}"*`); + + if (isHyphenatedToken(term)) { + const sanitized = sanitizeHyphenatedTerm(term); + if (sanitized) (negated ? negative : positive).push(`"${sanitized}"`); + } else { + const sanitized = sanitizeFTS5Term(term); + if (sanitized) (negated ? negative : positive).push(`"${sanitized}"*`); + } } } @@ -488,4 +502,37 @@ describe("buildFTS5Query (lex parser)", () => { test("special chars in terms stripped", () => { expect(buildFTS5Query("hello!world")).toBe('"helloworld"*'); }); + + // Hyphenated token tests + test("hyphenated term → phrase match", () => { + expect(buildFTS5Query("multi-agent")).toBe('"multi agent"'); + }); + + test("hyphenated identifier → phrase match", () => { + expect(buildFTS5Query("DEC-0054")).toBe('"dec 0054"'); + }); + + test("hyphenated model name → phrase match", () => { + expect(buildFTS5Query("gpt-4")).toBe('"gpt 4"'); + }); + + test("multi-hyphen term → phrase match", () => { + expect(buildFTS5Query("foo-bar-baz")).toBe('"foo bar baz"'); + }); + + test("hyphenated term mixed with plain terms", () => { + expect(buildFTS5Query("multi-agent memory")).toBe('"multi agent" AND "memory"*'); + }); + + test("negation still works alongside hyphenated terms", () => { + expect(buildFTS5Query("multi-agent -sports")).toBe('"multi agent" NOT "sports"*'); + }); + + test("negated hyphenated term", () => { + expect(buildFTS5Query("performance -multi-agent")).toBe('"performance"* NOT "multi agent"'); + }); + + test("plain negation still works (not confused with hyphen)", () => { + expect(buildFTS5Query("performance -sports")).toBe('"performance"* NOT "sports"*'); + }); });