Merge pull request #463 from goldsr09/fix/hyphenated-lex-queries

Fix hyphenated tokens in FTS5 lex queries
This commit is contained in:
Tobias Lütke 2026-03-28 19:58:22 -04:00 committed by GitHub
commit dd27f499c7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 96 additions and 9 deletions

View File

@ -2687,20 +2687,46 @@ function sanitizeFTS5Term(term: string): string {
return term.replace(/[^\p{L}\p{N}']/gu, '').toLowerCase();
}
/**
 * Decide whether a token has the shape of a hyphenated compound
 * (for example multi-agent, DEC-0054 or gpt-4).
 *
 * The token qualifies when all of the following hold:
 * - its first character is a letter or digit (so a leading `-` never qualifies),
 * - it contains at least one hyphen immediately followed by a letter or digit,
 * - every character is a letter, digit, apostrophe or hyphen.
 *
 * @param token - Candidate token to classify.
 * @returns true when the token matches the hyphenated-compound shape.
 */
function isHyphenatedToken(token: string): boolean {
  const compoundShape = /^[\p{L}\p{N}][\p{L}\p{N}'-]*-[\p{L}\p{N}][\p{L}\p{N}'-]*$/u;
  return compoundShape.test(token);
}
/**
 * Turn a hyphenated term into the body of an FTS5 phrase.
 *
 * The term is cut at every hyphen, each piece is run through
 * sanitizeFTS5Term, pieces that come back empty are dropped, and the rest
 * are joined with single spaces. The result is meant to be placed inside
 * FTS5 double quotes: "multi agent" matches "multi-agent" in porter tokenizer.
 *
 * @param term - Hyphenated token such as multi-agent or DEC-0054.
 * @returns The sanitized pieces separated by spaces, or an empty string
 *          when no piece survives sanitizing.
 */
function sanitizeHyphenatedTerm(term: string): string {
  const pieces: string[] = [];
  for (const rawPiece of term.split('-')) {
    const cleaned = sanitizeFTS5Term(rawPiece);
    // Skip pieces that sanitize down to nothing (e.g. between doubled hyphens).
    if (cleaned) pieces.push(cleaned);
  }
  return pieces.join(' ');
}
/**
* Parse lex query syntax into FTS5 query.
*
* Supports:
* - Quoted phrases: "exact phrase" → "exact phrase" (exact match)
* - Negation: -term or -"phrase" → uses FTS5 NOT operator
* - Hyphenated tokens: multi-agent, DEC-0054, gpt-4 → treated as phrases
* - Plain terms: term → "term"* (prefix match)
*
* FTS5 NOT is a binary operator: `term1 NOT term2` means "match term1 but not term2".
* So `-term` only works when there are also positive terms.
*
* Hyphen disambiguation: `-sports` at a word boundary is negation, but `multi-agent`
* (where `-` is between word characters) is treated as a hyphenated phrase.
* When a leading `-` is followed by what looks like a hyphenated compound word
* (e.g., `-multi-agent`), the entire token is treated as a negated phrase.
*
* Examples:
* performance -sports → "performance"* NOT "sports"*
* "machine learning" → "machine learning"
* multi-agent memory → "multi agent" AND "memory"*
* DEC-0054 → "dec 0054"
* -multi-agent → NOT "multi agent"
*/
function buildFTS5Query(query: string): string | null {
const positive: string[] = [];
@ -2742,13 +2768,27 @@ function buildFTS5Query(query: string): string | null {
while (i < s.length && !/[\s"]/.test(s[i]!)) i++;
const term = s.slice(start, i);
const sanitized = sanitizeFTS5Term(term);
if (sanitized) {
const ftsTerm = `"${sanitized}"*`; // Prefix match
if (negated) {
negative.push(ftsTerm);
} else {
positive.push(ftsTerm);
// Handle hyphenated tokens: multi-agent, DEC-0054, gpt-4
// These get split into phrase queries so FTS5 porter tokenizer matches them.
if (isHyphenatedToken(term)) {
const sanitized = sanitizeHyphenatedTerm(term);
if (sanitized) {
const ftsPhrase = `"${sanitized}"`; // Phrase match (no prefix)
if (negated) {
negative.push(ftsPhrase);
} else {
positive.push(ftsPhrase);
}
}
} else {
const sanitized = sanitizeFTS5Term(term);
if (sanitized) {
const ftsTerm = `"${sanitized}"*`; // Prefix match
if (negated) {
negative.push(ftsTerm);
} else {
positive.push(ftsTerm);
}
}
}
}

View File

@ -399,6 +399,14 @@ describe("buildFTS5Query (lex parser)", () => {
return term.replace(/[^\p{L}\p{N}']/gu, '').toLowerCase();
}
/**
 * Test-local copy of the hyphenated-compound check.
 *
 * Accepts tokens that start with a letter or digit, consist only of letters,
 * digits, apostrophes and hyphens, and contain at least one hyphen that is
 * directly followed by a letter or digit.
 */
function isHyphenatedToken(token: string): boolean {
  const hyphenatedPattern = /^[\p{L}\p{N}][\p{L}\p{N}'-]*-[\p{L}\p{N}][\p{L}\p{N}'-]*$/u;
  const found = hyphenatedPattern.exec(token);
  return found !== null;
}
/**
 * Test-local copy of the hyphenated-term sanitizer.
 *
 * Splits the term on hyphens, sanitizes each segment with sanitizeFTS5Term,
 * drops segments that end up empty, and joins what is left with spaces.
 */
function sanitizeHyphenatedTerm(term: string): string {
  const segments = term.split('-');
  const sanitizedSegments = segments.map(segment => sanitizeFTS5Term(segment));
  const nonEmptySegments = sanitizedSegments.filter(segment => segment);
  return nonEmptySegments.join(' ');
}
function buildFTS5Query(query: string): string | null {
const positive: string[] = [];
const negative: string[] = [];
@ -424,8 +432,14 @@ describe("buildFTS5Query (lex parser)", () => {
const start = i;
while (i < s.length && !/[\s"]/.test(s[i]!)) i++;
const term = s.slice(start, i);
const sanitized = sanitizeFTS5Term(term);
if (sanitized) (negated ? negative : positive).push(`"${sanitized}"*`);
if (isHyphenatedToken(term)) {
const sanitized = sanitizeHyphenatedTerm(term);
if (sanitized) (negated ? negative : positive).push(`"${sanitized}"`);
} else {
const sanitized = sanitizeFTS5Term(term);
if (sanitized) (negated ? negative : positive).push(`"${sanitized}"*`);
}
}
}
@ -488,4 +502,37 @@ describe("buildFTS5Query (lex parser)", () => {
test("special chars in terms stripped", () => {
expect(buildFTS5Query("hello!world")).toBe('"helloworld"*');
});
// Hyphenated token tests
// Each case pins the exact FTS5 query string produced for a hyphenated input:
// the hyphens become spaces, the text is lowercased, and the result is a
// quoted phrase WITHOUT the trailing `*` that plain terms receive.
test("hyphenated term → phrase match", () => {
expect(buildFTS5Query("multi-agent")).toBe('"multi agent"');
});
// Upper-case letters and digits: the expected output is lowercased.
test("hyphenated identifier → phrase match", () => {
expect(buildFTS5Query("DEC-0054")).toBe('"dec 0054"');
});
// A single-digit part after the hyphen is kept as its own phrase word.
test("hyphenated model name → phrase match", () => {
expect(buildFTS5Query("gpt-4")).toBe('"gpt 4"');
});
// More than one hyphen: every part ends up in the same phrase.
test("multi-hyphen term → phrase match", () => {
expect(buildFTS5Query("foo-bar-baz")).toBe('"foo bar baz"');
});
// A hyphenated phrase and a plain prefix term are combined with AND.
test("hyphenated term mixed with plain terms", () => {
expect(buildFTS5Query("multi-agent memory")).toBe('"multi agent" AND "memory"*');
});
// A leading `-` on a plain word is still negation (NOT) next to a hyphenated term.
test("negation still works alongside hyphenated terms", () => {
expect(buildFTS5Query("multi-agent -sports")).toBe('"multi agent" NOT "sports"*');
});
// A leading `-` before a hyphenated compound negates the whole phrase.
test("negated hyphenated term", () => {
expect(buildFTS5Query("performance -multi-agent")).toBe('"performance"* NOT "multi agent"');
});
// Regression guard: plain negation keeps its prefix-match form.
test("plain negation still works (not confused with hyphen)", () => {
expect(buildFTS5Query("performance -sports")).toBe('"performance"* NOT "sports"*');
});
});