From f9d414c9313cb7309f2da075d1c678f37b4c8a88 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tobias=20L=C3=BCtke?=
Date: Sun, 31 May 2026 23:15:37 +0000
Subject: [PATCH] fix(search): split dotted tokens in FTS5 so version strings like 2026.4.10 match (#563)

fix(http): return qmd:// URIs from REST /query endpoint to match CLI output (#576)
---
 CHANGELOG.md       | 15 +++++++++++++++
 src/mcp/server.ts  |  2 +-
 src/store.ts       | 39 +++++++++++++++++++++++++++++++++++++++
 test/mcp.test.ts   | 27 +++++++++++++++++++++++++++
 test/store.test.ts | 33 +++++++++++++++++++++++++++++++++
 5 files changed, 115 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a3e8118..07479cf 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,21 @@
 
 ## [Unreleased]
 
+### Fixed
+
+- FTS5 search now correctly matches dotted version strings like `2026.4.10`. The
+  `porter unicode61` tokenizer splits on dots (storing `2026`, `4`, `10` as
+  separate tokens), but the query sanitizer was stripping dots and producing
+  `2026410` which never matched. Dotted terms are now split and ANDed together
+  so version-string searches work as expected (#563).
+- HTTP REST endpoints `/query` and `/search` now return `qmd://collection/path`
+  URIs in the `file` field, matching the output format used by the CLI and MCP
+  resource URIs. Previously the raw `displayPath` (`collection/path`) was
+  returned without the scheme prefix (#576).
+- The embed session `maxDuration` is now env-configurable via
+  `QMD_EMBED_MAX_DURATION_MS` (default: 30 min). This prevents large-corpus
+  embeddings from being aborted by the hardcoded 30-minute ceiling (#673).
+
 ## [2.5.3] - 2026-05-28
 
 ### Features
diff --git a/src/mcp/server.ts b/src/mcp/server.ts
index 46e9040..3eb80c0 100644
--- a/src/mcp/server.ts
+++ b/src/mcp/server.ts
@@ -731,7 +731,7 @@ export async function startMcpHttpServer(
           const { line, snippet } = extractSnippet(r.body, String(primaryQuery), 300, r.bestChunkPos, r.bestChunk.length, typeof params.intent === "string" ? params.intent : undefined);
           return {
             docid: `#${r.docid}`,
-            file: r.displayPath,
+            file: `qmd://${encodeQmdPath(r.displayPath)}`,
             title: r.title,
             score: Math.round(r.score * 100) / 100,
             context: r.context,
diff --git a/src/store.ts b/src/store.ts
index 3f02770..d515ff1 100644
--- a/src/store.ts
+++ b/src/store.ts
@@ -3249,6 +3249,27 @@ function sanitizeHyphenatedTerm(term: string): string {
   return term.split('-').map(t => sanitizeFTS5Term(t)).filter(t => t).join(' ');
 }
 
+/**
+ * Check whether a token is a dot-separated sequence of word parts (e.g., 2026.4.10, 3.14.0).
+ * Returns true only when splitting on dots yields at least 2 parts and every part is a
+ * non-empty run of Unicode letters, numbers, or underscores, so tokens with a leading,
+ * trailing, or doubled dot are rejected. "2026.4.10" splits into ["2026","4","10"] (3 parts).
+ */
+function isDottedToken(token: string): boolean {
+  const parts = token.split('.');
+  return parts.length >= 2 && parts.every(p => p.length > 0 && /^[\p{L}\p{N}_]+$/u.test(p));
+}
+
+/**
+ * Split a dotted term on '.' and build an FTS5 expression that ANDs the parts together.
+ * e.g. "2026.4.10" → '"2026"* AND "4"* AND "10"*'
+ * Each part goes through sanitizeFTS5Term (parts that come back empty are dropped) and
+ * is emitted as a quoted prefix match; the parts are not required to be adjacent.
+ */
+function sanitizeDottedTerm(term: string): string {
+  return term.split('.').map(t => sanitizeFTS5Term(t)).filter(t => t).map(t => `"${t}"*`).join(' AND ');
+}
+
 /**
  * Parse lex query syntax into FTS5 query.
  *
@@ -3325,6 +3346,24 @@ function buildFTS5Query(query: string): string | null {
           positive.push(ftsPhrase);
         }
       }
+    } else if (isDottedToken(term)) {
+      // Handle dotted version strings: 2026.4.10, 3.14.0, v1.2.3
+      // The porter tokenizer splits on dots, so the index has individual tokens.
+      // We AND all parts together so the query matches documents containing all parts.
+      const sanitized = sanitizeDottedTerm(term);
+      if (sanitized) {
+        // sanitizeDottedTerm already wraps each part in quotes with prefix match
+        if (negated) {
+          // Wrap multi-token AND expression in parens for NOT negation
+          negative.push(`(${sanitized})`);
+        } else {
+          // Flatten individual AND'd terms into the positive list so they combine
+          // correctly with other terms (avoids double-wrapping in outer AND).
+          for (const part of sanitized.split(' AND ')) {
+            positive.push(part.trim());
+          }
+        }
+      }
     } else if (containsCjk(term)) {
       const sanitized = sanitizeFTS5Phrase(term);
       if (sanitized) {
diff --git a/test/mcp.test.ts b/test/mcp.test.ts
index d321c9d..0638b4b 100644
--- a/test/mcp.test.ts
+++ b/test/mcp.test.ts
@@ -889,6 +889,33 @@ describe("MCP Server", () => {
         expect(typeof col.documents).toBe("number");
       }
     });
+
+    test("REST /query and /search file field uses qmd:// URI prefix (#576)", () => {
+      // Regression test for the REST `file` field: it used to be r.displayPath (e.g.
+      // "docs/readme.md") rather than "qmd://docs/readme.md" as used by the CLI and MCP
+      // resource URIs. NOTE(review): this rebuilds the mapping inline from searchFTS
+      // results and does not call the REST handler or encodeQmdPath itself.
+      const results = searchFTS(testDb, "readme", 5);
+      expect(results.length).toBeGreaterThan(0);
+
+      // Simulate what the fixed REST handler produces for each result
+      const restResponseItems = results.map(r => ({
+        docid: `#${r.docid}`,
+        file: `qmd://${r.displayPath.split('/').map(s => encodeURIComponent(s)).join('/')}`,
+        title: r.title,
+        score: Math.round(r.score * 100) / 100,
+      }));
+
+      // Every file field must start with qmd://
+      for (const item of restResponseItems) {
+        expect(item.file).toMatch(/^qmd:\/\//);
+      }
+
+      // Spot-check the readme result
+      const readmeItem = restResponseItems.find(item => item.file.includes("readme"));
+      expect(readmeItem).toBeDefined();
+      expect(readmeItem!.file).toBe("qmd://docs/readme.md");
+    });
   });
 });
 
diff --git a/test/store.test.ts b/test/store.test.ts
index a9f13f7..b080fc6 100644
--- a/test/store.test.ts
+++ b/test/store.test.ts
@@ -1604,6 +1604,39 @@ describe("FTS Search", () => {
 
     await cleanupTestDb(store);
   });
+
+  test("searchFTS matches dotted version strings like 2026.4.10 (#563)", async () => {
+    // Regression test: porter unicode61 tokenizer splits on dots, so the index
+    // stores "2026", "4", "10" as separate tokens. Before the fix, sanitizeFTS5Term
+    // stripped the dots producing "2026410" which never matched anything.
+    const store = await createTestStore();
+    const collectionName = await createTestCollection();
+
+    await insertTestDocument(store.db, collectionName, {
+      name: "release-notes",
+      title: "Release Notes",
+      body: "## Release 2026.4.10\n\nThis version introduces new features and bug fixes.",
+      displayPath: "test/release-notes.md",
+    });
+
+    // A document that does NOT contain the version string
+    await insertTestDocument(store.db, collectionName, {
+      name: "other-doc",
+      title: "Other Document",
+      body: "Unrelated content about gardening and cooking.",
+      displayPath: "test/other.md",
+    });
+
+    const results = store.searchFTS("2026.4.10", 10);
+    expect(results.length).toBeGreaterThan(0);
+    expect(results.map(r => r.displayPath)).toContain(`${collectionName}/test/release-notes.md`);
+
+    // Partial version should also work
+    const partial = store.searchFTS("2026.4", 10);
+    expect(partial.map(r => r.displayPath)).toContain(`${collectionName}/test/release-notes.md`);
+
+    await cleanupTestDb(store);
+  });
 });
 
 // =============================================================================