fix: handle hyphenated tokens in FTS5 lex queries
Hyphenated terms like multi-agent, DEC-0054, gpt-4 were being stripped
of hyphens and concatenated (e.g., "multiagent") which missed matches.
Now they're split into FTS5 phrase queries ("multi agent") so the porter
tokenizer matches them correctly.
This commit is contained in:
parent
2b8f329d7e
commit
7b9bd01226
54
src/store.ts
54
src/store.ts
@ -2654,20 +2654,46 @@ function sanitizeFTS5Term(term: string): string {
|
||||
return term.replace(/[^\p{L}\p{N}']/gu, '').toLowerCase();
|
||||
}
|
||||
|
||||
/**
 * Check if a token is a hyphenated compound word (e.g., multi-agent, DEC-0054, gpt-4).
 *
 * Punctuation attached to either end of the token (a trailing comma,
 * surrounding parentheses, a question mark, ...) is ignored for the purpose
 * of this check, so `multi-agent,` and `(multi-agent)` are still recognised.
 * Without that, such tokens would take the plain-term path and have their
 * hyphen stripped and the parts concatenated.
 *
 * After trimming, the token must start with a letter/digit, consist only of
 * letters, digits, apostrophes and hyphens, and contain at least one hyphen
 * that is immediately followed by a letter/digit. A lone leading or trailing
 * hyphen (`-sports`, `foo-`) therefore never qualifies.
 *
 * @param token - Raw query token.
 * @returns `true` when the token should be handled as a hyphenated phrase.
 */
function isHyphenatedToken(token: string): boolean {
  // Drop edge punctuation; sanitizeFTS5Term-style stripping removes it later anyway.
  const core = token.replace(/^[^\p{L}\p{N}]+|[^\p{L}\p{N}]+$/gu, '');
  return /^[\p{L}\p{N}][\p{L}\p{N}'-]*-[\p{L}\p{N}][\p{L}\p{N}'-]*$/u.test(core);
}
|
||||
|
||||
/**
 * Turn a hyphenated token into the inner text of an FTS5 phrase.
 *
 * The token is cut at every hyphen, each piece is passed through
 * sanitizeFTS5Term, pieces that come back empty are discarded, and the
 * remaining pieces are joined with single spaces. The result is meant to be
 * placed inside FTS5 double quotes, e.g. multi-agent becomes multi agent,
 * which is then queried as "multi agent".
 *
 * @param term - Raw hyphenated token.
 * @returns Space-separated sanitized parts, or an empty string if none survive.
 */
function sanitizeHyphenatedTerm(term: string): string {
  const kept: string[] = [];
  for (const piece of term.split('-')) {
    const cleaned = sanitizeFTS5Term(piece);
    if (cleaned) {
      kept.push(cleaned);
    }
  }
  return kept.join(' ');
}
|
||||
|
||||
/**
|
||||
* Parse lex query syntax into FTS5 query.
|
||||
*
|
||||
* Supports:
|
||||
* - Quoted phrases: "exact phrase" → "exact phrase" (exact match)
|
||||
* - Negation: -term or -"phrase" → uses FTS5 NOT operator
|
||||
* - Hyphenated tokens: multi-agent, DEC-0054, gpt-4 → treated as phrases
|
||||
* - Plain terms: term → "term"* (prefix match)
|
||||
*
|
||||
* FTS5 NOT is a binary operator: `term1 NOT term2` means "match term1 but not term2".
|
||||
* So `-term` only works when there are also positive terms.
|
||||
*
|
||||
* Hyphen disambiguation: `-sports` at a word boundary is negation, but `multi-agent`
|
||||
* (where `-` is between word characters) is treated as a hyphenated phrase.
|
||||
* When a leading `-` is followed by what looks like a hyphenated compound word
|
||||
* (e.g., `-multi-agent`), the entire token is treated as a negated phrase.
|
||||
*
|
||||
* Examples:
|
||||
* performance -sports → "performance"* NOT "sports"*
|
||||
* "machine learning" → "machine learning"
|
||||
* multi-agent memory → "multi agent" AND "memory"*
|
||||
* DEC-0054 → "dec 0054"
|
||||
* performance -multi-agent → "performance"* NOT "multi agent"
|
||||
*/
|
||||
function buildFTS5Query(query: string): string | null {
|
||||
const positive: string[] = [];
|
||||
@ -2709,13 +2735,27 @@ function buildFTS5Query(query: string): string | null {
|
||||
while (i < s.length && !/[\s"]/.test(s[i]!)) i++;
|
||||
const term = s.slice(start, i);
|
||||
|
||||
const sanitized = sanitizeFTS5Term(term);
|
||||
if (sanitized) {
|
||||
const ftsTerm = `"${sanitized}"*`; // Prefix match
|
||||
if (negated) {
|
||||
negative.push(ftsTerm);
|
||||
} else {
|
||||
positive.push(ftsTerm);
|
||||
// Handle hyphenated tokens: multi-agent, DEC-0054, gpt-4
|
||||
// These get split into phrase queries so FTS5 porter tokenizer matches them.
|
||||
if (isHyphenatedToken(term)) {
|
||||
const sanitized = sanitizeHyphenatedTerm(term);
|
||||
if (sanitized) {
|
||||
const ftsPhrase = `"${sanitized}"`; // Phrase match (no prefix)
|
||||
if (negated) {
|
||||
negative.push(ftsPhrase);
|
||||
} else {
|
||||
positive.push(ftsPhrase);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
const sanitized = sanitizeFTS5Term(term);
|
||||
if (sanitized) {
|
||||
const ftsTerm = `"${sanitized}"*`; // Prefix match
|
||||
if (negated) {
|
||||
negative.push(ftsTerm);
|
||||
} else {
|
||||
positive.push(ftsTerm);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -399,6 +399,14 @@ describe("buildFTS5Query (lex parser)", () => {
|
||||
return term.replace(/[^\p{L}\p{N}']/gu, '').toLowerCase();
|
||||
}
|
||||
|
||||
/**
 * Test-local copy of the hyphenated-token check.
 *
 * Returns true when the token, after trimming any non-letter/digit
 * characters from both ends, starts with a letter/digit, contains only
 * letters, digits, apostrophes and hyphens, and has at least one hyphen
 * immediately followed by a letter/digit. Trimming the edges means inputs
 * such as `multi-agent,` or `(multi-agent)` are still detected rather than
 * being treated as plain terms.
 *
 * @param token - Raw query token.
 * @returns `true` when the token should be handled as a hyphenated phrase.
 */
function isHyphenatedToken(token: string): boolean {
  // Edge punctuation is irrelevant to whether the token is a hyphenated compound.
  const core = token.replace(/^[^\p{L}\p{N}]+|[^\p{L}\p{N}]+$/gu, '');
  return /^[\p{L}\p{N}][\p{L}\p{N}'-]*-[\p{L}\p{N}][\p{L}\p{N}'-]*$/u.test(core);
}
|
||||
|
||||
/**
 * Test-local copy: split a token on hyphens, sanitize each piece with
 * sanitizeFTS5Term, drop pieces that end up empty, and join the rest with
 * single spaces.
 *
 * @param term - Raw hyphenated token.
 * @returns Space-separated sanitized parts, or an empty string if none survive.
 */
function sanitizeHyphenatedTerm(term: string): string {
  const kept: string[] = [];
  for (const piece of term.split('-')) {
    const cleaned = sanitizeFTS5Term(piece);
    if (cleaned) {
      kept.push(cleaned);
    }
  }
  return kept.join(' ');
}
|
||||
|
||||
function buildFTS5Query(query: string): string | null {
|
||||
const positive: string[] = [];
|
||||
const negative: string[] = [];
|
||||
@ -424,8 +432,14 @@ describe("buildFTS5Query (lex parser)", () => {
|
||||
const start = i;
|
||||
while (i < s.length && !/[\s"]/.test(s[i]!)) i++;
|
||||
const term = s.slice(start, i);
|
||||
const sanitized = sanitizeFTS5Term(term);
|
||||
if (sanitized) (negated ? negative : positive).push(`"${sanitized}"*`);
|
||||
|
||||
if (isHyphenatedToken(term)) {
|
||||
const sanitized = sanitizeHyphenatedTerm(term);
|
||||
if (sanitized) (negated ? negative : positive).push(`"${sanitized}"`);
|
||||
} else {
|
||||
const sanitized = sanitizeFTS5Term(term);
|
||||
if (sanitized) (negated ? negative : positive).push(`"${sanitized}"*`);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -488,4 +502,37 @@ describe("buildFTS5Query (lex parser)", () => {
|
||||
test("special chars in terms stripped", () => {
|
||||
expect(buildFTS5Query("hello!world")).toBe('"helloworld"*');
|
||||
});
|
||||
|
||||
// Hyphenated token tests
|
||||
test("hyphenated term → phrase match", () => {
|
||||
expect(buildFTS5Query("multi-agent")).toBe('"multi agent"');
|
||||
});
|
||||
|
||||
test("hyphenated identifier → phrase match", () => {
|
||||
expect(buildFTS5Query("DEC-0054")).toBe('"dec 0054"');
|
||||
});
|
||||
|
||||
test("hyphenated model name → phrase match", () => {
|
||||
expect(buildFTS5Query("gpt-4")).toBe('"gpt 4"');
|
||||
});
|
||||
|
||||
test("multi-hyphen term → phrase match", () => {
|
||||
expect(buildFTS5Query("foo-bar-baz")).toBe('"foo bar baz"');
|
||||
});
|
||||
|
||||
test("hyphenated term mixed with plain terms", () => {
|
||||
expect(buildFTS5Query("multi-agent memory")).toBe('"multi agent" AND "memory"*');
|
||||
});
|
||||
|
||||
test("negation still works alongside hyphenated terms", () => {
|
||||
expect(buildFTS5Query("multi-agent -sports")).toBe('"multi agent" NOT "sports"*');
|
||||
});
|
||||
|
||||
test("negated hyphenated term", () => {
|
||||
expect(buildFTS5Query("performance -multi-agent")).toBe('"performance"* NOT "multi agent"');
|
||||
});
|
||||
|
||||
test("plain negation still works (not confused with hyphen)", () => {
|
||||
expect(buildFTS5Query("performance -sports")).toBe('"performance"* NOT "sports"*');
|
||||
});
|
||||
});
|
||||
|
||||
Loading…
Reference in New Issue
Block a user