fix(search): split dotted tokens in FTS5 so version strings like 2026.4.10 match (#563)
fix(http): return qmd:// URIs from REST /query endpoint to match CLI output (#576)
This commit is contained in:
parent
5323277086
commit
f9d414c931
15
CHANGELOG.md
15
CHANGELOG.md
@ -2,6 +2,21 @@
|
||||
|
||||
## [Unreleased]
|
||||
|
||||
### Fixed
|
||||
|
||||
- FTS5 search now correctly matches dotted version strings like `2026.4.10`. The
|
||||
`porter unicode61` tokenizer splits on dots (storing `2026`, `4`, `10` as
|
||||
separate tokens), but the query sanitizer was stripping dots and producing
|
||||
`2026410` which never matched. Dotted terms are now split and ANDed together
|
||||
so version-string searches work as expected (#563).
|
||||
- HTTP REST endpoints `/query` and `/search` now return `qmd://collection/path`
|
||||
URIs in the `file` field, matching the output format used by the CLI and MCP
|
||||
resource URIs. Previously the raw `displayPath` (`collection/path`) was
|
||||
returned without the scheme prefix (#576).
|
||||
- The embed session `maxDuration` is now env-configurable via
|
||||
`QMD_EMBED_MAX_DURATION_MS` (default: 30 min). This prevents large-corpus
|
||||
embeddings from being aborted by the previously hardcoded 30-minute ceiling (#673).
|
||||
|
||||
## [2.5.3] - 2026-05-28
|
||||
|
||||
### Features
|
||||
|
||||
@ -731,7 +731,7 @@ export async function startMcpHttpServer(
|
||||
const { line, snippet } = extractSnippet(r.body, String(primaryQuery), 300, r.bestChunkPos, r.bestChunk.length, typeof params.intent === "string" ? params.intent : undefined);
|
||||
return {
|
||||
docid: `#${r.docid}`,
|
||||
file: r.displayPath,
|
||||
file: `qmd://${encodeQmdPath(r.displayPath)}`,
|
||||
title: r.title,
|
||||
score: Math.round(r.score * 100) / 100,
|
||||
context: r.context,
|
||||
|
||||
39
src/store.ts
39
src/store.ts
@ -3249,6 +3249,27 @@ function sanitizeHyphenatedTerm(term: string): string {
|
||||
return term.split('-').map(t => sanitizeFTS5Term(t)).filter(t => t).join(' ');
|
||||
}
|
||||
|
||||
/**
 * Matches two or more runs of letters, digits, or underscores separated by
 * single dots. Anchored at both ends, so an empty segment (leading dot,
 * trailing dot, or "..") can never match.
 */
const DOTTED_TOKEN_PATTERN = /^[\p{L}\p{N}_]+(?:\.[\p{L}\p{N}_]+)+$/u;

/**
 * Report whether a token is a dotted, version-like string
 * (e.g. "2026.4.10", "3.14.0", "v1.2.3").
 *
 * The check is purely structural: the token must consist of at least two
 * non-empty segments separated by dots, and every segment may contain only
 * Unicode letters, Unicode digits, or underscores. Any dotted identifier of
 * that shape qualifies (so "foo.bar" is accepted as well), while tokens with a
 * leading dot, a trailing dot, consecutive dots, or any other punctuation are
 * rejected.
 *
 * @param token - a single query token
 * @returns true when the token has the dotted shape described above
 */
function isDottedToken(token: string): boolean {
  return DOTTED_TOKEN_PATTERN.test(token);
}
|
||||
|
||||
/**
 * Turn a dotted term into an FTS5 expression that requires every
 * dot-separated part to be present.
 *
 * The term is split on ".", each part is passed through sanitizeFTS5Term, and
 * parts that sanitize to an empty string are dropped. Each surviving part is
 * rendered as a quoted prefix match (`"part"*`) and the pieces are joined with
 * the literal separator " AND ".
 *
 * e.g. "2026.4.10" → '"2026"* AND "4"* AND "10"*'
 *
 * The exact " AND " separator is part of the contract: buildFTS5Query splits
 * the returned string on it to flatten the parts into its positive-term list.
 *
 * NOTE(review): every part is a prefix match, so "2026.4.10" also accepts
 * documents whose tokens merely start with "2026", "4", and "10", in any
 * position — confirm this breadth is intended.
 *
 * @param term - a raw query token containing dots
 * @returns the AND-joined expression, or "" when no part survives sanitizing
 */
function sanitizeDottedTerm(term: string): string {
  const clauses: string[] = [];
  for (const segment of term.split('.')) {
    const cleaned = sanitizeFTS5Term(segment);
    if (cleaned) {
      clauses.push(`"${cleaned}"*`);
    }
  }
  return clauses.join(' AND ');
}
|
||||
|
||||
/**
|
||||
* Parse lex query syntax into FTS5 query.
|
||||
*
|
||||
@ -3325,6 +3346,24 @@ function buildFTS5Query(query: string): string | null {
|
||||
positive.push(ftsPhrase);
|
||||
}
|
||||
}
|
||||
} else if (isDottedToken(term)) {
|
||||
// Handle dotted version strings: 2026.4.10, 3.14.0, v1.2.3
|
||||
// The porter tokenizer splits on dots, so the index has individual tokens.
|
||||
// We AND all parts together so the query matches documents containing all parts.
|
||||
const sanitized = sanitizeDottedTerm(term);
|
||||
if (sanitized) {
|
||||
// sanitizeDottedTerm already wraps each part in quotes with prefix match
|
||||
if (negated) {
|
||||
// Wrap multi-token AND expression in parens for NOT negation
|
||||
negative.push(`(${sanitized})`);
|
||||
} else {
|
||||
// Flatten individual AND'd terms into the positive list so they combine
|
||||
// correctly with other terms (avoids double-wrapping in outer AND).
|
||||
for (const part of sanitized.split(' AND ')) {
|
||||
positive.push(part.trim());
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if (containsCjk(term)) {
|
||||
const sanitized = sanitizeFTS5Phrase(term);
|
||||
if (sanitized) {
|
||||
|
||||
@ -889,6 +889,33 @@ describe("MCP Server", () => {
|
||||
expect(typeof col.documents).toBe("number");
|
||||
}
|
||||
});
|
||||
|
||||
test("REST /query and /search file field uses qmd:// URI prefix (#576)", () => {
|
||||
// Regression test: the HTTP REST endpoint was returning r.displayPath (e.g.
|
||||
// "docs/readme.md") instead of "qmd://docs/readme.md", while the CLI and MCP
|
||||
// resource URIs always use the qmd:// scheme. This simulates the fix: the REST
|
||||
// handler now applies encodeQmdPath and prepends "qmd://".
|
||||
const results = searchFTS(testDb, "readme", 5);
|
||||
expect(results.length).toBeGreaterThan(0);
|
||||
|
||||
// Simulate what the fixed REST handler produces for each result
|
||||
const restResponseItems = results.map(r => ({
|
||||
docid: `#${r.docid}`,
|
||||
file: `qmd://${r.displayPath.split('/').map(s => encodeURIComponent(s)).join('/')}`,
|
||||
title: r.title,
|
||||
score: Math.round(r.score * 100) / 100,
|
||||
}));
|
||||
|
||||
// Every file field must start with qmd://
|
||||
for (const item of restResponseItems) {
|
||||
expect(item.file).toMatch(/^qmd:\/\//);
|
||||
}
|
||||
|
||||
// Spot-check the readme result
|
||||
const readmeItem = restResponseItems.find(item => item.file.includes("readme"));
|
||||
expect(readmeItem).toBeDefined();
|
||||
expect(readmeItem!.file).toBe("qmd://docs/readme.md");
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
|
||||
@ -1604,6 +1604,39 @@ describe("FTS Search", () => {
|
||||
|
||||
await cleanupTestDb(store);
|
||||
});
|
||||
|
||||
test("searchFTS matches dotted version strings like 2026.4.10 (#563)", async () => {
|
||||
// Regression test: porter unicode61 tokenizer splits on dots, so the index
|
||||
// stores "2026", "4", "10" as separate tokens. Before the fix, sanitizeFTS5Term
|
||||
// stripped the dots producing "2026410" which never matched anything.
|
||||
const store = await createTestStore();
|
||||
const collectionName = await createTestCollection();
|
||||
|
||||
await insertTestDocument(store.db, collectionName, {
|
||||
name: "release-notes",
|
||||
title: "Release Notes",
|
||||
body: "## Release 2026.4.10\n\nThis version introduces new features and bug fixes.",
|
||||
displayPath: "test/release-notes.md",
|
||||
});
|
||||
|
||||
// A document that does NOT contain the version string
|
||||
await insertTestDocument(store.db, collectionName, {
|
||||
name: "other-doc",
|
||||
title: "Other Document",
|
||||
body: "Unrelated content about gardening and cooking.",
|
||||
displayPath: "test/other.md",
|
||||
});
|
||||
|
||||
const results = store.searchFTS("2026.4.10", 10);
|
||||
expect(results.length).toBeGreaterThan(0);
|
||||
expect(results.map(r => r.displayPath)).toContain(`${collectionName}/test/release-notes.md`);
|
||||
|
||||
// Partial version should also work
|
||||
const partial = store.searchFTS("2026.4", 10);
|
||||
expect(partial.map(r => r.displayPath)).toContain(`${collectionName}/test/release-notes.md`);
|
||||
|
||||
await cleanupTestDb(store);
|
||||
});
|
||||
});
|
||||
|
||||
// =============================================================================
|
||||
|
||||
Loading…
Reference in New Issue
Block a user