Merge pull request #463 from goldsr09/fix/hyphenated-lex-queries
Fix hyphenated tokens in FTS5 lex queries
This commit is contained in:
commit
dd27f499c7
54
src/store.ts
54
src/store.ts
@ -2687,20 +2687,46 @@ function sanitizeFTS5Term(term: string): string {
|
||||
return term.replace(/[^\p{L}\p{N}']/gu, '').toLowerCase();
|
||||
}
|
||||
|
||||
/**
 * Report whether a token is a hyphenated compound such as `multi-agent`,
 * `DEC-0054`, or `gpt-4`.
 *
 * A token qualifies when all of the following hold:
 * - it starts with a Unicode letter or digit (so `-sports` is rejected and can
 *   be handled as negation by the caller);
 * - it contains at least one hyphen immediately followed by a letter or digit;
 * - every character is a letter, digit, apostrophe, or hyphen.
 *
 * @param token - A single query token, without any leading negation marker.
 * @returns `true` when the token has the hyphenated-compound shape.
 */
function isHyphenatedToken(token: string): boolean {
  return /^[\p{L}\p{N}][\p{L}\p{N}'-]*-[\p{L}\p{N}][\p{L}\p{N}'-]*$/u.test(token);
}
|
||||
|
||||
/**
 * Turn a hyphenated token into the inner text of an FTS5 phrase.
 *
 * The term is split on `-`, each piece is passed through `sanitizeFTS5Term`,
 * pieces that sanitize to the empty string are dropped, and the remaining
 * pieces are joined with single spaces. The caller is expected to wrap the
 * result in double quotes, e.g. `multi-agent` yields `multi agent`, which is
 * queried as the phrase `"multi agent"`.
 *
 * @param term - A hyphenated token (see `isHyphenatedToken`).
 * @returns The space-joined sanitized parts, or `''` when no part survives.
 */
function sanitizeHyphenatedTerm(term: string): string {
  const parts: string[] = [];
  for (const piece of term.split('-')) {
    const cleaned = sanitizeFTS5Term(piece);
    // Skip pieces that were pure punctuation so the phrase has no empty slots.
    if (cleaned) parts.push(cleaned);
  }
  return parts.join(' ');
}
|
||||
|
||||
/**
|
||||
* Parse lex query syntax into FTS5 query.
|
||||
*
|
||||
* Supports:
|
||||
* - Quoted phrases: "exact phrase" → "exact phrase" (exact match)
|
||||
* - Negation: -term or -"phrase" → uses FTS5 NOT operator
|
||||
* - Hyphenated tokens: multi-agent, DEC-0054, gpt-4 → treated as phrases
|
||||
* - Plain terms: term → "term"* (prefix match)
|
||||
*
|
||||
* FTS5 NOT is a binary operator: `term1 NOT term2` means "match term1 but not term2".
|
||||
* So `-term` only works when there are also positive terms.
|
||||
*
|
||||
* Hyphen disambiguation: `-sports` at a word boundary is negation, but `multi-agent`
|
||||
* (where `-` is between word characters) is treated as a hyphenated phrase.
|
||||
* When a leading `-` is followed by what looks like a hyphenated compound word
|
||||
* (e.g., `-multi-agent`), the entire token is treated as a negated phrase.
|
||||
*
|
||||
* Examples:
|
||||
* performance -sports → "performance"* NOT "sports"*
|
||||
* "machine learning" → "machine learning"
|
||||
* multi-agent memory → "multi agent" AND "memory"*
|
||||
* DEC-0054 → "dec 0054"
|
||||
* performance -multi-agent → "performance"* NOT "multi agent"
|
||||
*/
|
||||
function buildFTS5Query(query: string): string | null {
|
||||
const positive: string[] = [];
|
||||
@ -2742,13 +2768,27 @@ function buildFTS5Query(query: string): string | null {
|
||||
while (i < s.length && !/[\s"]/.test(s[i]!)) i++;
|
||||
const term = s.slice(start, i);
|
||||
|
||||
const sanitized = sanitizeFTS5Term(term);
|
||||
if (sanitized) {
|
||||
const ftsTerm = `"${sanitized}"*`; // Prefix match
|
||||
if (negated) {
|
||||
negative.push(ftsTerm);
|
||||
} else {
|
||||
positive.push(ftsTerm);
|
||||
// Handle hyphenated tokens: multi-agent, DEC-0054, gpt-4
|
||||
// These get split into phrase queries so FTS5 porter tokenizer matches them.
|
||||
if (isHyphenatedToken(term)) {
|
||||
const sanitized = sanitizeHyphenatedTerm(term);
|
||||
if (sanitized) {
|
||||
const ftsPhrase = `"${sanitized}"`; // Phrase match (no prefix)
|
||||
if (negated) {
|
||||
negative.push(ftsPhrase);
|
||||
} else {
|
||||
positive.push(ftsPhrase);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
const sanitized = sanitizeFTS5Term(term);
|
||||
if (sanitized) {
|
||||
const ftsTerm = `"${sanitized}"*`; // Prefix match
|
||||
if (negated) {
|
||||
negative.push(ftsTerm);
|
||||
} else {
|
||||
positive.push(ftsTerm);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -399,6 +399,14 @@ describe("buildFTS5Query (lex parser)", () => {
|
||||
return term.replace(/[^\p{L}\p{N}']/gu, '').toLowerCase();
|
||||
}
|
||||
|
||||
/**
 * Test-local copy of the hyphenated-token check.
 *
 * Returns `true` when the token starts with a letter or digit, consists only
 * of letters, digits, apostrophes, and hyphens, and has at least one hyphen
 * directly followed by a letter or digit (e.g. `multi-agent`, `gpt-4`).
 *
 * @param token - A single query token, without any leading negation marker.
 * @returns Whether the token looks like a hyphenated compound.
 */
function isHyphenatedToken(token: string): boolean {
  return /^[\p{L}\p{N}][\p{L}\p{N}'-]*-[\p{L}\p{N}][\p{L}\p{N}'-]*$/u.test(token);
}
|
||||
|
||||
/**
 * Test-local copy of the hyphenated-term sanitizer.
 *
 * Splits the term on `-`, sanitizes each piece with `sanitizeFTS5Term`,
 * discards pieces that become empty, and joins the rest with single spaces
 * so the result can be placed inside an FTS5 quoted phrase.
 *
 * @param term - A hyphenated token.
 * @returns The space-joined sanitized parts, or `''` when no part survives.
 */
function sanitizeHyphenatedTerm(term: string): string {
  const parts: string[] = [];
  for (const piece of term.split('-')) {
    const cleaned = sanitizeFTS5Term(piece);
    if (cleaned) parts.push(cleaned);
  }
  return parts.join(' ');
}
|
||||
|
||||
function buildFTS5Query(query: string): string | null {
|
||||
const positive: string[] = [];
|
||||
const negative: string[] = [];
|
||||
@ -424,8 +432,14 @@ describe("buildFTS5Query (lex parser)", () => {
|
||||
const start = i;
|
||||
while (i < s.length && !/[\s"]/.test(s[i]!)) i++;
|
||||
const term = s.slice(start, i);
|
||||
const sanitized = sanitizeFTS5Term(term);
|
||||
if (sanitized) (negated ? negative : positive).push(`"${sanitized}"*`);
|
||||
|
||||
if (isHyphenatedToken(term)) {
|
||||
const sanitized = sanitizeHyphenatedTerm(term);
|
||||
if (sanitized) (negated ? negative : positive).push(`"${sanitized}"`);
|
||||
} else {
|
||||
const sanitized = sanitizeFTS5Term(term);
|
||||
if (sanitized) (negated ? negative : positive).push(`"${sanitized}"*`);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -488,4 +502,37 @@ describe("buildFTS5Query (lex parser)", () => {
|
||||
test("special chars in terms stripped", () => {
|
||||
expect(buildFTS5Query("hello!world")).toBe('"helloworld"*');
|
||||
});
|
||||
|
||||
// Hyphenated token tests
|
||||
test("hyphenated term → phrase match", () => {
|
||||
expect(buildFTS5Query("multi-agent")).toBe('"multi agent"');
|
||||
});
|
||||
|
||||
test("hyphenated identifier → phrase match", () => {
|
||||
expect(buildFTS5Query("DEC-0054")).toBe('"dec 0054"');
|
||||
});
|
||||
|
||||
test("hyphenated model name → phrase match", () => {
|
||||
expect(buildFTS5Query("gpt-4")).toBe('"gpt 4"');
|
||||
});
|
||||
|
||||
test("multi-hyphen term → phrase match", () => {
|
||||
expect(buildFTS5Query("foo-bar-baz")).toBe('"foo bar baz"');
|
||||
});
|
||||
|
||||
test("hyphenated term mixed with plain terms", () => {
|
||||
expect(buildFTS5Query("multi-agent memory")).toBe('"multi agent" AND "memory"*');
|
||||
});
|
||||
|
||||
test("negation still works alongside hyphenated terms", () => {
|
||||
expect(buildFTS5Query("multi-agent -sports")).toBe('"multi agent" NOT "sports"*');
|
||||
});
|
||||
|
||||
test("negated hyphenated term", () => {
|
||||
expect(buildFTS5Query("performance -multi-agent")).toBe('"performance"* NOT "multi agent"');
|
||||
});
|
||||
|
||||
test("plain negation still works (not confused with hyphen)", () => {
|
||||
expect(buildFTS5Query("performance -sports")).toBe('"performance"* NOT "sports"*');
|
||||
});
|
||||
});
|
||||
|
||||
Loading…
Reference in New Issue
Block a user