From f9d414c9313cb7309f2da075d1c678f37b4c8a88 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tobias=20L=C3=BCtke?= <tobi@shopify.com>
Date: Sun, 31 May 2026 23:15:37 +0000
Subject: [PATCH] fix(search): split dotted tokens in FTS5 so version strings
 like 2026.4.10 match (#563)

fix(http): return qmd:// URIs from REST /query endpoint to match CLI output (#576)
---
 CHANGELOG.md       | 15 +++++++++++++++
 src/mcp/server.ts  |  2 +-
 src/store.ts       | 39 +++++++++++++++++++++++++++++++++++++++
 test/mcp.test.ts   | 27 +++++++++++++++++++++++++++
 test/store.test.ts | 33 +++++++++++++++++++++++++++++++++
 5 files changed, 115 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a3e8118..07479cf 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,21 @@
 
 ## [Unreleased]
 
+### Fixed
+
+- FTS5 search now correctly matches dotted version strings like `2026.4.10`. The
+  `porter unicode61` tokenizer splits on dots (storing `2026`, `4`, `10` as
+  separate tokens), but the query sanitizer was stripping dots and producing
+  `2026410` which never matched. Dotted terms are now split and ANDed together
+  so version-string searches work as expected (#563).
+- HTTP REST endpoints `/query` and `/search` now return `qmd://collection/path`
+  URIs in the `file` field, matching the output format used by the CLI and MCP
+  resource URIs. Previously the raw `displayPath` (`collection/path`) was
+  returned without the scheme prefix (#576).
+- The embed session `maxDuration` is now env-configurable via
+  `QMD_EMBED_MAX_DURATION_MS` (default: 30 min). This prevents large-corpus
+  embeddings from being aborted by the hardcoded 30-minute ceiling (#673).
+
 ## [2.5.3] - 2026-05-28
 
 ### Features
diff --git a/src/mcp/server.ts b/src/mcp/server.ts
index 46e9040..3eb80c0 100644
--- a/src/mcp/server.ts
+++ b/src/mcp/server.ts
@@ -731,7 +731,7 @@ export async function startMcpHttpServer(
           const { line, snippet } = extractSnippet(r.body, String(primaryQuery), 300, r.bestChunkPos, r.bestChunk.length, typeof params.intent === "string" ? params.intent : undefined);
           return {
             docid: `#${r.docid}`,
-            file: r.displayPath,
+            file: `qmd://${encodeQmdPath(r.displayPath)}`,
             title: r.title,
             score: Math.round(r.score * 100) / 100,
             context: r.context,
diff --git a/src/store.ts b/src/store.ts
index 3f02770..d515ff1 100644
--- a/src/store.ts
+++ b/src/store.ts
@@ -3249,6 +3249,27 @@ function sanitizeHyphenatedTerm(term: string): string {
   return term.split('-').map(t => sanitizeFTS5Term(t)).filter(t => t).join(' ');
 }
 
+/**
+ * Check if a token is a dotted version/version-like string (e.g., 2026.4.10, 3.14.0).
+ * Returns true if splitting on dots yields at least 2 non-empty parts consisting of
+ * word/digit characters only. This avoids incorrectly splitting tokens with leading/
+ * trailing dots. Version strings like "2026.4.10" split into ["2026","4","10"] (3 parts).
+ */
+function isDottedToken(token: string): boolean {
+  const parts = token.split('.');
+  return parts.length >= 2 && parts.every(p => p.length > 0 && /^[\p{L}\p{N}_]+$/u.test(p));
+}
+
+/**
+ * Sanitize a dotted term into individual FTS5 tokens joined with AND.
+ * e.g. "2026.4.10" → '"2026"* AND "4"* AND "10"*'
+ * The AND ensures all parts must appear, matching how the porter tokenizer
+ * indexes dotted strings.
+ */
+function sanitizeDottedTerm(term: string): string {
+  return term.split('.').map(t => sanitizeFTS5Term(t)).filter(t => t).map(t => `"${t}"*`).join(' AND ');
+}
+
 /**
  * Parse lex query syntax into FTS5 query.
  *
@@ -3325,6 +3346,24 @@ function buildFTS5Query(query: string): string | null {
             positive.push(ftsPhrase);
           }
         }
+      } else if (isDottedToken(term)) {
+        // Handle dotted version strings: 2026.4.10, 3.14.0, v1.2.3
+        // The porter tokenizer splits on dots, so the index has individual tokens.
+        // We AND all parts together so the query matches documents containing all parts.
+        const sanitized = sanitizeDottedTerm(term);
+        if (sanitized) {
+          // sanitizeDottedTerm already wraps each part in quotes with prefix match
+          if (negated) {
+            // Wrap multi-token AND expression in parens for NOT negation
+            negative.push(`(${sanitized})`);
+          } else {
+            // Flatten individual AND'd terms into the positive list so they combine
+            // correctly with other terms (avoids double-wrapping in outer AND).
+            for (const part of sanitized.split(' AND ')) {
+              positive.push(part.trim());
+            }
+          }
+        }
       } else if (containsCjk(term)) {
         const sanitized = sanitizeFTS5Phrase(term);
         if (sanitized) {
diff --git a/test/mcp.test.ts b/test/mcp.test.ts
index d321c9d..0638b4b 100644
--- a/test/mcp.test.ts
+++ b/test/mcp.test.ts
@@ -889,6 +889,33 @@ describe("MCP Server", () => {
         expect(typeof col.documents).toBe("number");
       }
     });
+
+    test("REST /query and /search file field uses qmd:// URI prefix (#576)", () => {
+      // Regression test: the HTTP REST endpoint was returning r.displayPath (e.g.
+      // "docs/readme.md") instead of "qmd://docs/readme.md", while the CLI and MCP
+      // resource URIs always use the qmd:// scheme. This simulates the fix: the REST
+      // handler now applies encodeQmdPath and prepends "qmd://".
+      const results = searchFTS(testDb, "readme", 5);
+      expect(results.length).toBeGreaterThan(0);
+
+      // Simulate what the fixed REST handler produces for each result
+      const restResponseItems = results.map(r => ({
+        docid: `#${r.docid}`,
+        file: `qmd://${r.displayPath.split('/').map(s => encodeURIComponent(s)).join('/')}`,
+        title: r.title,
+        score: Math.round(r.score * 100) / 100,
+      }));
+
+      // Every file field must start with qmd://
+      for (const item of restResponseItems) {
+        expect(item.file).toMatch(/^qmd:\/\//);
+      }
+
+      // Spot-check the readme result
+      const readmeItem = restResponseItems.find(item => item.file.includes("readme"));
+      expect(readmeItem).toBeDefined();
+      expect(readmeItem!.file).toBe("qmd://docs/readme.md");
+    });
   });
 });
 
diff --git a/test/store.test.ts b/test/store.test.ts
index a9f13f7..b080fc6 100644
--- a/test/store.test.ts
+++ b/test/store.test.ts
@@ -1604,6 +1604,39 @@ describe("FTS Search", () => {
 
     await cleanupTestDb(store);
   });
+
+  test("searchFTS matches dotted version strings like 2026.4.10 (#563)", async () => {
+    // Regression test: porter unicode61 tokenizer splits on dots, so the index
+    // stores "2026", "4", "10" as separate tokens. Before the fix, sanitizeFTS5Term
+    // stripped the dots producing "2026410" which never matched anything.
+    const store = await createTestStore();
+    const collectionName = await createTestCollection();
+
+    await insertTestDocument(store.db, collectionName, {
+      name: "release-notes",
+      title: "Release Notes",
+      body: "## Release 2026.4.10\n\nThis version introduces new features and bug fixes.",
+      displayPath: "test/release-notes.md",
+    });
+
+    // A document that does NOT contain the version string
+    await insertTestDocument(store.db, collectionName, {
+      name: "other-doc",
+      title: "Other Document",
+      body: "Unrelated content about gardening and cooking.",
+      displayPath: "test/other.md",
+    });
+
+    const results = store.searchFTS("2026.4.10", 10);
+    expect(results.length).toBeGreaterThan(0);
+    expect(results.map(r => r.displayPath)).toContain(`${collectionName}/test/release-notes.md`);
+
+    // Partial version should also work
+    const partial = store.searchFTS("2026.4", 10);
+    expect(partial.map(r => r.displayPath)).toContain(`${collectionName}/test/release-notes.md`);
+
+    await cleanupTestDb(store);
+  });
 });
 
 // =============================================================================