fix(search): split dotted tokens in FTS5 so version strings like 2026.4.10 match (#563)

fix(http): return qmd:// URIs from REST /query endpoint to match CLI output (#576)
This commit is contained in:
Tobias Lütke 2026-05-31 23:15:37 +00:00
parent 5323277086
commit f9d414c931
5 changed files with 115 additions and 1 deletions

View File

@ -2,6 +2,21 @@
## [Unreleased]
### Fixed
- FTS5 search now correctly matches dotted version strings like `2026.4.10`. The
`porter unicode61` tokenizer splits on dots (storing `2026`, `4`, `10` as
separate tokens), but the query sanitizer was stripping dots and producing
`2026410` which never matched. Dotted terms are now split and ANDed together
so version-string searches work as expected (#563).
- HTTP REST endpoints `/query` and `/search` now return `qmd://collection/path`
URIs in the `file` field, matching the output format used by the CLI and MCP
resource URIs. Previously the raw `displayPath` (`collection/path`) was
returned without the scheme prefix (#576).
- The embed session `maxDuration` is now env-configurable via
`QMD_EMBED_MAX_DURATION_MS` (default: 30 min). Large-corpus embeddings that
were aborted by the previously hardcoded 30-minute ceiling can now raise it (#673).
## [2.5.3] - 2026-05-28
### Features

View File

@ -731,7 +731,7 @@ export async function startMcpHttpServer(
const { line, snippet } = extractSnippet(r.body, String(primaryQuery), 300, r.bestChunkPos, r.bestChunk.length, typeof params.intent === "string" ? params.intent : undefined);
return {
docid: `#${r.docid}`,
file: r.displayPath,
file: `qmd://${encodeQmdPath(r.displayPath)}`,
title: r.title,
score: Math.round(r.score * 100) / 100,
context: r.context,

View File

@ -3249,6 +3249,27 @@ function sanitizeHyphenatedTerm(term: string): string {
return term.split('-').map(t => sanitizeFTS5Term(t)).filter(t => t).join(' ');
}
/**
 * Decide whether a token is a dot-separated compound such as a version string
 * ("2026.4.10", "3.14.0", "v1.2.3").
 *
 * A token qualifies when splitting it on "." gives two or more segments and
 * every segment is non-empty and consists only of Unicode letters, Unicode
 * digits or underscores. Tokens with a leading dot, a trailing dot or two
 * consecutive dots are rejected, as is any token that contains no dot.
 *
 * @param token - Raw query token to inspect.
 * @returns `true` when the token is a well-formed dotted sequence.
 */
function isDottedToken(token: string): boolean {
  const segments = token.split('.');

  // A token without any dot splits into a single segment.
  if (segments.length < 2) {
    return false;
  }

  const segmentPattern = /^[\p{L}\p{N}_]+$/u;
  for (const segment of segments) {
    // Empty segments come from leading, trailing or doubled dots.
    if (segment.length === 0 || !segmentPattern.test(segment)) {
      return false;
    }
  }
  return true;
}
/**
 * Turn a dotted term into an FTS5 expression that requires every dot-separated
 * part to be present.
 *
 * Each part is passed through `sanitizeFTS5Term`; parts that come back empty are
 * dropped, and the survivors are quoted, given a prefix-match `*`, and joined
 * with ` AND `.
 *
 * Example: `2026.4.10` → `"2026"* AND "4"* AND "10"*`
 *
 * The AND form mirrors how the tokenizer indexes dotted strings: as separate
 * tokens, one per part.
 *
 * @param term - Dotted query term, e.g. a version string.
 * @returns The AND-joined FTS5 expression, or an empty string when no part
 *          survives sanitization.
 */
function sanitizeDottedTerm(term: string): string {
  const clauses: string[] = [];
  for (const segment of term.split('.')) {
    const cleaned = sanitizeFTS5Term(segment);
    if (cleaned) {
      clauses.push(`"${cleaned}"*`);
    }
  }
  return clauses.join(' AND ');
}
/**
* Parse lex query syntax into FTS5 query.
*
@ -3325,6 +3346,24 @@ function buildFTS5Query(query: string): string | null {
positive.push(ftsPhrase);
}
}
} else if (isDottedToken(term)) {
// Handle dotted version strings: 2026.4.10, 3.14.0, v1.2.3
// The porter tokenizer splits on dots, so the index has individual tokens.
// We AND all parts together so the query matches documents containing all parts.
const sanitized = sanitizeDottedTerm(term);
if (sanitized) {
// sanitizeDottedTerm already wraps each part in quotes with prefix match
if (negated) {
// Wrap multi-token AND expression in parens for NOT negation
negative.push(`(${sanitized})`);
} else {
// Flatten individual AND'd terms into the positive list so they combine
// correctly with other terms (avoids double-wrapping in outer AND).
for (const part of sanitized.split(' AND ')) {
positive.push(part.trim());
}
}
}
} else if (containsCjk(term)) {
const sanitized = sanitizeFTS5Phrase(term);
if (sanitized) {

View File

@ -889,6 +889,33 @@ describe("MCP Server", () => {
expect(typeof col.documents).toBe("number");
}
});
test("REST /query and /search file field uses qmd:// URI prefix (#576)", () => {
  // Regression test for #576: the HTTP REST endpoint returned r.displayPath
  // (e.g. "docs/readme.md") rather than "qmd://docs/readme.md", whereas the CLI
  // and MCP resource URIs always carry the qmd:// scheme.
  // This test does not call the REST handler; it rebuilds the same response
  // mapping locally (per-segment URI encoding plus the "qmd://" prefix).
  const toQmdUri = (displayPath: string): string => {
    const encodedSegments = displayPath.split('/').map(segment => encodeURIComponent(segment));
    return `qmd://${encodedSegments.join('/')}`;
  };

  const hits = searchFTS(testDb, "readme", 5);
  expect(hits.length).toBeGreaterThan(0);

  // Shape each hit the way the fixed REST handler does.
  const payload = hits.map(hit => ({
    docid: `#${hit.docid}`,
    file: toQmdUri(hit.displayPath),
    title: hit.title,
    score: Math.round(hit.score * 100) / 100,
  }));

  // Every file field must carry the scheme prefix.
  for (const { file } of payload) {
    expect(file).toMatch(/^qmd:\/\//);
  }

  // Spot-check the readme entry.
  const readmeEntry = payload.find(entry => entry.file.includes("readme"));
  expect(readmeEntry).toBeDefined();
  expect(readmeEntry!.file).toBe("qmd://docs/readme.md");
});
});
});

View File

@ -1604,6 +1604,39 @@ describe("FTS Search", () => {
await cleanupTestDb(store);
});
test("searchFTS matches dotted version strings like 2026.4.10 (#563)", async () => {
  // Regression test for #563: the porter unicode61 tokenizer splits on dots, so
  // the index holds "2026", "4" and "10" as separate tokens. Before the fix,
  // sanitizeFTS5Term stripped the dots and produced "2026410", which matched nothing.
  const store = await createTestStore();
  const collectionName = await createTestCollection();
  const releaseNotesPath = `${collectionName}/test/release-notes.md`;

  // Document that contains the version string.
  await insertTestDocument(store.db, collectionName, {
    name: "release-notes",
    title: "Release Notes",
    body: "## Release 2026.4.10\n\nThis version introduces new features and bug fixes.",
    displayPath: "test/release-notes.md",
  });

  // Control document that does NOT contain the version string.
  await insertTestDocument(store.db, collectionName, {
    name: "other-doc",
    title: "Other Document",
    body: "Unrelated content about gardening and cooking.",
    displayPath: "test/other.md",
  });

  // Run a query and keep only the display paths of the hits.
  const matchedPaths = (query: string) => store.searchFTS(query, 10).map(r => r.displayPath);

  // The full version string must find the release notes.
  const fullVersionPaths = matchedPaths("2026.4.10");
  expect(fullVersionPaths.length).toBeGreaterThan(0);
  expect(fullVersionPaths).toContain(releaseNotesPath);

  // A partial version should also work.
  const partialVersionPaths = matchedPaths("2026.4");
  expect(partialVersionPaths).toContain(releaseNotesPath);

  await cleanupTestDb(store);
});
});
// =============================================================================