From 070147d8ab313a20ae35b02bca388c02a82d1b92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tobias=20L=C3=BCtke?= Date: Mon, 1 Jun 2026 20:20:50 +0000 Subject: [PATCH] fix: store literal filesystem paths, drop handelize() at index time MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Filenames with special characters (#, &, spaces, [], (), etc.) now round-trip correctly through index → search → get → full-path. Root cause: reindexCollection() called handelize() on the relative path before storing it in documents.path, turning '# Meeting - 234232 3432 __ 5.md' → 'Meeting-234232-3432-5.md' This broke all downstream operations that needed to reconstruct the real filesystem path from the DB record. Changes: - Remove handelize() from reindexCollection() in store.ts (index time) - Remove handelize() from update command path in cli/qmd.ts - findOrMigrateLegacyDocument now tries both raw path and handalized variant so existing indexes auto-migrate on next qmd update - resolveVirtualPath, toVirtualPath, detectCollectionFromPath all work correctly once the DB stores literal paths Tests (test/path-fidelity.test.ts — 10/10): - Store level: DB contains literal paths, not handalized slugs - toVirtualPath returns non-null for crazy-named files - (1) search --json file field shows literal path - (2) get --full-path resolves to a real on-disk path - (3) get finds the document - (3b) subdir file with crazy name also works - (4) ls shows literal paths - (5) search docid can be fetched back - Normal filenames still work (regression) - Migration: qmd update on handalized index rewrites paths to literal --- CHANGELOG.md | 8 + src/cli/qmd.ts | 3 +- src/store.ts | 29 ++- test/path-fidelity.test.ts | 409 +++++++++++++++++++++++++++++++++++++ 4 files changed, 446 insertions(+), 3 deletions(-) create mode 100644 test/path-fidelity.test.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 07479cf..b367f40 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 
+4,14 @@ ### Fixed +- Filesystem paths with special characters (`#`, `&`, spaces, `[]`, `()`, etc.) + now round-trip correctly through index → search → get. Previously + `reindexCollection` called `handelize()` on relative paths before storing + them, turning `# Meeting - 234232 3432 __ 5.md` into + `Meeting-234232-3432-5.md` and making `qmd get <path>`, + `qmd get --full-path`, and `qmd ls` return dead or garbled paths. Paths are + now stored verbatim. Existing indexes auto-migrate on the next `qmd update`. + - FTS5 search now correctly matches dotted version strings like `2026.4.10`. The `porter unicode61` tokenizer splits on dots (storing `2026`, `4`, `10` as separate tokens), but the query sanitizer was stripping dots and producing diff --git a/src/cli/qmd.ts b/src/cli/qmd.ts index aff0af7..105506d 100755 --- a/src/cli/qmd.ts +++ b/src/cli/qmd.ts @@ -1824,7 +1824,8 @@ async function indexFiles(pwd?: string, globPattern: string = DEFAULT_GLOB, coll for (const relativeFile of files) { const filepath = getRealPath(resolve(resolvedPwd, relativeFile)); - const path = handelize(relativeFile); // Normalize path for token-friendliness + // Store the literal relative path — handelize() is NOT applied at index time. + const path = relativeFile.replace(/\\/g, '/'); seenPaths.add(path); let content: string; diff --git a/src/store.ts b/src/store.ts index d515ff1..99e36b8 100644 --- a/src/store.ts +++ b/src/store.ts @@ -1306,7 +1306,10 @@ export async function reindexCollection( for (const relativeFile of files) { const filepath = getRealPath(resolve(collectionPath, relativeFile)); - const path = handelize(relativeFile); + // Store the literal relative path so the filesystem path can always be + // reconstructed as: resolve(collection.path, storedPath). + // handelize() is NOT applied at index time — it is display-only.
+ const path = normalizePathSeparators(relativeFile); seenPaths.add(path); let content: string; @@ -2493,12 +2496,34 @@ export function findOrMigrateLegacyDocument( const existing = findActiveDocument(db, collectionName, path); if (existing) return existing; - const legacy = db.prepare(` + // Case-insensitive match (legacy normalization: e.g. "README.md" → "readme.md"). + const legacyCase = db.prepare(` SELECT id, hash, title FROM documents WHERE collection = ? AND path COLLATE NOCASE = ? AND active = 1 ORDER BY id LIMIT 1 `).get(collectionName, path) as { id: number; hash: string; title: string } | undefined; + + // Handalized-path match: existing DBs indexed with handelize() stored slugged paths + // like "Budget-Revenue-Q4-2024.md" for a raw path like "Budget & Revenue (Q4) [2024].md". + // Try matching the handalized form of the incoming raw path against the DB so that + // qmd update on an old index can rename the row to the literal path. + let legacyHandalized: { id: number; hash: string; title: string } | undefined; + try { + const handleized = handelize(path); + if (handleized !== path) { + legacyHandalized = db.prepare(` + SELECT id, hash, title FROM documents + WHERE collection = ? AND path = ? AND active = 1 + ORDER BY id + LIMIT 1 + `).get(collectionName, handleized) as { id: number; hash: string; title: string } | undefined; + } + } catch { + // handelize throws on invalid paths; just skip + } + + const legacy = legacyCase ?? legacyHandalized; if (!legacy) return null; // Wrap rename + FTS rebuild in a transaction for atomicity. diff --git a/test/path-fidelity.test.ts b/test/path-fidelity.test.ts new file mode 100644 index 0000000..7c4a26f --- /dev/null +++ b/test/path-fidelity.test.ts @@ -0,0 +1,409 @@ +/** + * Path Fidelity Tests + * + * Verifies that QMD stores literal filesystem paths (not handalized slugs) so + * that paths with special characters — spaces, #, &, @, [], (), etc. — round- + * trip correctly through index → search → get → full-path. 
+ * + * This covers the five breakage points found before the literal-path fix: + * 1. search --json `file` field shows handalized slug instead of real path + * 2. `qmd get --full-path` silently falls back (resolveVirtualPath built + * a non-existent path from the slug, existsSync returned false) + * 3. `qmd get <fs-path>` returns "Document not found" + * 4. `qmd ls` shows handalized slugs + * 5. `toVirtualPath(db, absPath)` returns null + * + * Also covers backward-compat migration: an index created with the old + * handalize-at-index-time code can be updated with `qmd update` and the paths + * are renamed to their literal forms in-place. + */ + +import { describe, test, expect, beforeAll, afterAll } from "vitest"; +import { mkdir, mkdtemp, rm, writeFile } from "fs/promises"; +import { existsSync } from "fs"; +import { tmpdir } from "os"; +import { join } from "path"; +import { spawn } from "child_process"; +import { fileURLToPath } from "url"; +import { dirname } from "path"; +import YAML from "yaml"; +import { openDatabase } from "../src/db.js"; +import type { Database } from "../src/db.js"; +import { + createStore, + toVirtualPath, + insertDocument, + insertContent, + hashContent, + handelize, + normalizePathSeparators, + syncConfigToDb, +} from "../src/store.js"; +import type { CollectionConfig } from "../src/collections.js"; + +const thisDir = dirname(fileURLToPath(import.meta.url)); +const projectRoot = join(thisDir, ".."); +const qmdScript = join(projectRoot, "src", "cli", "qmd.ts"); +const isBunRuntime = typeof (globalThis as { Bun?: unknown }).Bun !== "undefined"; +const tsxCli = join(projectRoot, "node_modules", "tsx", "dist", "cli.mjs"); + +async function runQmd( + args: string[], + opts: { cwd: string; dbPath: string; configDir: string; env?: Record<string, string> } +): Promise<{ stdout: string; stderr: string; exitCode: number }> { + const runner = isBunRuntime + ?
{ command: process.execPath, args: [qmdScript, ...args] } + : { command: process.execPath, args: [tsxCli, qmdScript, ...args] }; + + const proc = spawn(runner.command, runner.args, { + cwd: opts.cwd, + env: { + ...process.env, + INDEX_PATH: opts.dbPath, + QMD_CONFIG_DIR: opts.configDir, + PWD: opts.cwd, + QMD_DOCTOR_DEVICE_PROBE: "0", + ...(opts.env ?? {}), + }, + stdio: ["ignore", "pipe", "pipe"], + }); + + let stdout = ""; + let stderr = ""; + proc.stdout?.on("data", (c: Buffer) => { stdout += c.toString(); }); + proc.stderr?.on("data", (c: Buffer) => { stderr += c.toString(); }); + const exitCode = await new Promise<number>((res, rej) => { + proc.once("error", rej); + proc.on("close", (code) => res(code ?? 1)); + }); + return { stdout, stderr, exitCode }; +} + +// --------------------------------------------------------------------------- +// Test environment setup +// --------------------------------------------------------------------------- + +let testDir: string; + +// Files with names that previously broke due to handelize() at index time. +const crazyFiles: Array<{ name: string; content: string }> = [ + { + name: "# Meeting - 234232 3432 __ 5.md", + content: "# Meeting - 234232 3432 // 5\n\nSome meeting content with searchterm-alpha.\n", + }, + { + name: "Budget & Revenue (Q4) [2024].md", + content: "# Budget & Revenue Q4 2024\n\nFinancial overview searchterm-beta.\n", + }, + { + name: "normal-file.md", + content: "# Normal File\n\nPlain filename, should always work.\n", + }, +]; + +const crazySubFiles: Array<{ name: string; content: string }> = [ + { + name: "Notes #42 - foo@bar.md", + content: "# Notes #42\n\nSubdir file with searchterm-gamma.\n", + }, +]; + +beforeAll(async () => { + testDir = await mkdtemp(join(tmpdir(), "qmd-path-fidelity-")); +}); + +afterAll(async () => { + await rm(testDir, { recursive: true, force: true }); +}); + +// Helper: create a fresh isolated test environment with a corpus of crazy filenames.
+async function createCrazyCollection(prefix: string): Promise<{ + collectionDir: string; + dbPath: string; + configDir: string; +}> { + const envDir = join(testDir, prefix); + const collectionDir = join(envDir, "corpus"); + const dbPath = join(envDir, "test.sqlite"); + const configDir = join(envDir, "config"); + + await mkdir(collectionDir, { recursive: true }); + await mkdir(join(collectionDir, "subdir"), { recursive: true }); + await mkdir(configDir, { recursive: true }); + + for (const f of crazyFiles) { + await writeFile(join(collectionDir, f.name), f.content); + } + for (const f of crazySubFiles) { + await writeFile(join(collectionDir, "subdir", f.name), f.content); + } + + // Write empty YAML config — `collection add` will populate it + await writeFile(join(configDir, "index.yml"), "collections: {}\n"); + + return { collectionDir, dbPath, configDir }; +} + +// --------------------------------------------------------------------------- +// Unit tests: store-level path storage +// --------------------------------------------------------------------------- + +describe("Path fidelity — store level", () => { + test("reindexCollection stores literal relative paths, not handalized slugs", async () => { + const { collectionDir, dbPath, configDir } = await createCrazyCollection("store-unit"); + + // Run `collection add` to index + const add = await runQmd( + ["collection", "add", collectionDir, "--name", "crazytest"], + { cwd: collectionDir, dbPath, configDir } + ); + expect(add.exitCode, `collection add failed: ${add.stderr}`).toBe(0); + + // Inspect the DB directly + const db = openDatabase(dbPath); + const rows = db.prepare( + "SELECT path FROM documents WHERE active = 1 ORDER BY path" + ).all() as { path: string }[]; + db.close(); + + const paths = rows.map((r) => r.path); + + // Must contain literal filenames — not handalized slugs + expect(paths).toContain("# Meeting - 234232 3432 __ 5.md"); + expect(paths).toContain("Budget & Revenue (Q4) [2024].md"); + 
expect(paths).toContain("normal-file.md"); + expect(paths).toContain("subdir/Notes #42 - foo@bar.md"); + + // Must NOT contain handalized versions + expect(paths).not.toContain("Meeting-234232-3432-5.md"); + expect(paths).not.toContain("Budget-Revenue-Q4-2024.md"); + expect(paths).not.toContain("subdir/Notes-42-foo-bar.md"); + }); + + test("toVirtualPath returns non-null for crazy-named files", async () => { + const { collectionDir, dbPath, configDir } = await createCrazyCollection("store-to-virtual"); + const add = await runQmd( + ["collection", "add", collectionDir, "--name", "crazytest"], + { cwd: collectionDir, dbPath, configDir } + ); + expect(add.exitCode).toBe(0); + + const rawDb = openDatabase(dbPath); + const result = toVirtualPath(rawDb, join(collectionDir, "Budget & Revenue (Q4) [2024].md")); + rawDb.close(); + + expect(result).not.toBeNull(); + expect(result).toBe(`qmd://crazytest/Budget & Revenue (Q4) [2024].md`); + }); +}); + +// --------------------------------------------------------------------------- +// CLI integration tests — the five original breakage points +// --------------------------------------------------------------------------- + +describe("Path fidelity — CLI integration", () => { + let collectionDir: string; + let dbPath: string; + let configDir: string; + + // Index once for the whole describe block (read-only tests share it) + beforeAll(async () => { + ({ collectionDir, dbPath, configDir } = await createCrazyCollection("cli-shared")); + const add = await runQmd( + ["collection", "add", collectionDir, "--name", "crazytest"], + { cwd: collectionDir, dbPath, configDir } + ); + expect(add.exitCode, `collection add failed: ${add.stderr}`).toBe(0); + }); + + test("(1) search --json file field contains literal path, not handalized slug", async () => { + const { stdout, exitCode } = await runQmd( + ["search", "searchterm-alpha", "--json"], + { cwd: collectionDir, dbPath, configDir } + ); + expect(exitCode).toBe(0); + + const results = 
JSON.parse(stdout) as Array<{ file: string }>; + expect(results.length).toBeGreaterThan(0); + + const meetingResult = results.find((r) => r.file.includes("Meeting")); + expect(meetingResult).toBeDefined(); + // Must contain the literal filename fragment + expect(meetingResult!.file).toContain("# Meeting - 234232 3432 __ 5.md"); + // Must not contain the handalized version + expect(meetingResult!.file).not.toContain("Meeting-234232-3432-5.md"); + }); + + test("(2) get --full-path resolves to real filesystem path for crazy-named file", async () => { + const virtualPath = `qmd://crazytest/Budget & Revenue (Q4) [2024].md`; + const { stdout, exitCode } = await runQmd( + ["get", virtualPath, "--full-path"], + { cwd: collectionDir, dbPath, configDir } + ); + expect(exitCode, `get failed: ${stdout}`).toBe(0); + + const header = stdout.split("\n")[0]!; + // Should show a real filesystem path, not a qmd:// virtual path + expect(header).not.toMatch(/^qmd:\/\//); + // Should include the literal filename + expect(header).toContain("Budget & Revenue (Q4) [2024].md"); + // The resolved filesystem path should exist — strip the trailing docid (#abc123) + const fsPath = header.trim().replace(/\s+#[a-f0-9]{6}$/, ""); + // Path may be absolute or relative-to-collectionDir; resolve against collectionDir + const absPath = fsPath.startsWith("/") ? 
fsPath : join(collectionDir, fsPath.replace(/^\.\//, "")); + expect(existsSync(absPath), `resolved path does not exist: ${absPath}`).toBe(true); + }); + test("(3) get finds the document", async () => { + const fsPath = join(collectionDir, "Budget & Revenue (Q4) [2024].md"); + const { stdout, exitCode, stderr } = await runQmd( + ["get", fsPath], + { cwd: collectionDir, dbPath, configDir } + ); + expect(exitCode, `get by fs path failed: ${stderr}`).toBe(0); + // Header should contain the document identifier + expect(stdout).toContain("Budget & Revenue (Q4) [2024].md"); + }); + + test("(3b) get finds subdir file with crazy name", async () => { + const fsPath = join(collectionDir, "subdir", "Notes #42 - foo@bar.md"); + const { stdout, exitCode, stderr } = await runQmd( + ["get", fsPath], + { cwd: collectionDir, dbPath, configDir } + ); + expect(exitCode, `get subdir file failed: ${stderr}`).toBe(0); + expect(stdout).toContain("Notes #42 - foo@bar.md"); + }); + + test("(4) ls shows literal paths, not handalized slugs", async () => { + const { stdout, exitCode } = await runQmd( + ["ls", "crazytest"], + { cwd: collectionDir, dbPath, configDir } + ); + expect(exitCode).toBe(0); + + // Literal paths must appear + expect(stdout).toContain("# Meeting - 234232 3432 __ 5.md"); + expect(stdout).toContain("Budget & Revenue (Q4) [2024].md"); + expect(stdout).toContain("Notes #42 - foo@bar.md"); + + // Handalized slugs must NOT appear + expect(stdout).not.toContain("Meeting-234232-3432-5.md"); + expect(stdout).not.toContain("Budget-Revenue-Q4-2024.md"); + expect(stdout).not.toContain("Notes-42-foo-bar.md"); + }); + + test("(5) search --json returns docid that can be fetched back", async () => { + const { stdout: searchOut, exitCode: searchExit } = await runQmd( + ["search", "searchterm-beta", "--json"], + { cwd: collectionDir, dbPath, configDir } + ); + expect(searchExit).toBe(0); + + const results = JSON.parse(searchOut) as Array<{ docid: string; file: string }>; + 
expect(results.length).toBeGreaterThan(0); + + const hit = results[0]!; + expect(hit.docid).toMatch(/^#[a-f0-9]{6}$/); + + // Fetch by docid — must work + const { stdout: getOut, exitCode: getExit } = await runQmd( + ["get", hit.docid], + { cwd: collectionDir, dbPath, configDir } + ); + expect(getExit, `get by docid failed`).toBe(0); + expect(getOut).toContain("Budget & Revenue (Q4) [2024].md"); + }); + + test("normal filenames are still stored correctly (regression)", async () => { + const { stdout, exitCode } = await runQmd( + ["search", "Plain filename", "--json"], + { cwd: collectionDir, dbPath, configDir } + ); + expect(exitCode).toBe(0); + const results = JSON.parse(stdout) as Array<{ file: string }>; + const hit = results.find((r) => r.file.includes("normal-file")); + expect(hit).toBeDefined(); + expect(hit!.file).toContain("normal-file.md"); + }); +}); + +// --------------------------------------------------------------------------- +// Migration test: old handalized DB upgraded by `qmd update` +// --------------------------------------------------------------------------- + +describe("Path fidelity — migration from handalized index", () => { + test("qmd update migrates handalized paths to literal paths in existing index", async () => { + const { collectionDir, dbPath, configDir } = await createCrazyCollection("migration"); + + // Manually build an old-style DB using handalize() (simulates pre-fix index) + const store = createStore(dbPath); + const now = new Date().toISOString(); + // Write and sync a config that points at the collection so `qmd update` knows where it is + const migrationYaml = `collections:\n crazytest:\n path: "${collectionDir}"\n mask: "**/*.md"\n`; + await writeFile(join(configDir, "index.yml"), migrationYaml); + const config = YAML.parse(migrationYaml) as CollectionConfig; + syncConfigToDb(store.db, config); + + // Insert documents with handalized paths (old behavior) + for (const f of crazyFiles) { + const relPath = 
normalizePathSeparators(f.name); + const handleized = handelize(relPath); + const hash = await hashContent(f.content); + insertContent(store.db, hash, f.content, now); + insertDocument(store.db, "crazytest", handleized, `Title ${f.name}`, hash, now, now); + } + const subFile = crazySubFiles[0]!; + const subRel = `subdir/${subFile.name}`; + const subHandelized = handelize(subRel); + const subHash = await hashContent(subFile.content); + insertContent(store.db, subHash, subFile.content, now); + insertDocument(store.db, "crazytest", subHandelized, "Sub title", subHash, now, now); + store.close(); + + // Verify the old DB has handalized paths + const dbBefore = openDatabase(dbPath); + const pathsBefore = (dbBefore.prepare( + "SELECT path FROM documents WHERE active = 1 ORDER BY path" + ).all() as { path: string }[]).map((r) => r.path); + dbBefore.close(); + + expect(pathsBefore).toContain("Meeting-234232-3432-5.md"); + expect(pathsBefore).toContain("Budget-Revenue-Q4-2024.md"); + expect(pathsBefore).not.toContain("# Meeting - 234232 3432 __ 5.md"); + + // Run `qmd update` with the new code — should migrate paths in-place + const update = await runQmd( + ["update"], + { cwd: collectionDir, dbPath, configDir } + ); + expect(update.exitCode, `qmd update failed: ${update.stderr}`).toBe(0); + + // Verify the DB now has literal paths + const dbAfter = openDatabase(dbPath); + const pathsAfter = (dbAfter.prepare( + "SELECT path FROM documents WHERE active = 1 ORDER BY path" + ).all() as { path: string }[]).map((r) => r.path); + dbAfter.close(); + + expect(pathsAfter).toContain("# Meeting - 234232 3432 __ 5.md"); + expect(pathsAfter).toContain("Budget & Revenue (Q4) [2024].md"); + expect(pathsAfter).toContain("normal-file.md"); + expect(pathsAfter).toContain("subdir/Notes #42 - foo@bar.md"); + + // Handalized slugs must be gone + expect(pathsAfter).not.toContain("Meeting-234232-3432-5.md"); + expect(pathsAfter).not.toContain("Budget-Revenue-Q4-2024.md"); + + // Search must work 
after migration + const { stdout: searchOut, exitCode: searchExit } = await runQmd( + ["search", "searchterm-alpha", "--json"], + { cwd: collectionDir, dbPath, configDir } + ); + expect(searchExit).toBe(0); + const results = JSON.parse(searchOut) as Array<{ file: string }>; + expect(results.length).toBeGreaterThan(0); + const meetingResult = results.find((r) => r.file.includes("Meeting")); + expect(meetingResult).toBeDefined(); + expect(meetingResult!.file).toContain("# Meeting - 234232 3432 __ 5.md"); + }); +});