fix: store literal filesystem paths, drop handelize() at index time

Filenames with special characters (#, &, spaces, [], (), etc.) now
round-trip correctly through index → search → get → full-path.

Root cause: reindexCollection() called handelize() on the relative path
before storing it in documents.path, turning
  '# Meeting - 234232 3432 __ 5.md' → 'Meeting-234232-3432-5.md'
This broke all downstream operations that needed to reconstruct the
real filesystem path from the DB record.

Changes:
- Remove handelize() from reindexCollection() in store.ts (index time)
- Remove handelize() from update command path in cli/qmd.ts
- findOrMigrateLegacyDocument now tries both raw path and handalized
  variant so existing indexes auto-migrate on next qmd update
- resolveVirtualPath, toVirtualPath, detectCollectionFromPath all work
  correctly once the DB stores literal paths

Tests (test/path-fidelity.test.ts — 10/10):
- Store level: DB contains literal paths, not handalized slugs
- toVirtualPath returns non-null for crazy-named files
- (1) search --json file field shows literal path
- (2) get --full-path resolves to a real on-disk path
- (3) get <actual-fs-path> finds the document
- (3b) subdir file with crazy name also works
- (4) ls shows literal paths
- (5) search docid can be fetched back
- Normal filenames still work (regression)
- Migration: qmd update on handalized index rewrites paths to literal
This commit is contained in:
Tobias Lütke 2026-06-01 20:20:50 +00:00
parent f9d414c931
commit 070147d8ab
4 changed files with 446 additions and 3 deletions

View File

@ -4,6 +4,14 @@
### Fixed
- Filesystem paths with special characters (`#`, `&`, spaces, `[]`, `()`, etc.)
now round-trip correctly through index → search → get. Previously
`reindexCollection` called `handelize()` on relative paths before storing
them, turning `# Meeting - 234232 3432 __ 5.md` into
`Meeting-234232-3432-5.md` and making `qmd get <actual-path>`,
`qmd get --full-path`, and `qmd ls` return dead or garbled paths. Paths are
now stored verbatim. Existing indexes auto-migrate on the next `qmd update`.
- FTS5 search now correctly matches dotted version strings like `2026.4.10`. The
`porter unicode61` tokenizer splits on dots (storing `2026`, `4`, `10` as
separate tokens), but the query sanitizer was stripping dots and producing

View File

@ -1824,7 +1824,8 @@ async function indexFiles(pwd?: string, globPattern: string = DEFAULT_GLOB, coll
for (const relativeFile of files) {
const filepath = getRealPath(resolve(resolvedPwd, relativeFile));
const path = handelize(relativeFile); // Normalize path for token-friendliness
// Store the literal relative path — handelize() is NOT applied at index time.
const path = relativeFile.replace(/\\/g, '/');
seenPaths.add(path);
let content: string;

View File

@ -1306,7 +1306,10 @@ export async function reindexCollection(
for (const relativeFile of files) {
const filepath = getRealPath(resolve(collectionPath, relativeFile));
const path = handelize(relativeFile);
// Store the literal relative path so the filesystem path can always be
// reconstructed as: resolve(collection.path, storedPath).
// handelize() is NOT applied at index time — it is display-only.
const path = normalizePathSeparators(relativeFile);
seenPaths.add(path);
let content: string;
@ -2493,12 +2496,34 @@ export function findOrMigrateLegacyDocument(
const existing = findActiveDocument(db, collectionName, path);
if (existing) return existing;
const legacy = db.prepare(`
// Case-insensitive match (legacy normalization: e.g. "README.md" → "readme.md").
const legacyCase = db.prepare(`
SELECT id, hash, title FROM documents
WHERE collection = ? AND path COLLATE NOCASE = ? AND active = 1
ORDER BY id
LIMIT 1
`).get(collectionName, path) as { id: number; hash: string; title: string } | undefined;
// Handalized-path match: existing DBs indexed with handelize() stored slugged paths
// like "Budget-Revenue-Q4-2024.md" for a raw path like "Budget & Revenue (Q4) [2024].md".
// Try matching the handalized form of the incoming raw path against the DB so that
// qmd update on an old index can rename the row to the literal path.
let legacyHandalized: { id: number; hash: string; title: string } | undefined;
try {
const handleized = handelize(path);
if (handleized !== path) {
legacyHandalized = db.prepare(`
SELECT id, hash, title FROM documents
WHERE collection = ? AND path = ? AND active = 1
ORDER BY id
LIMIT 1
`).get(collectionName, handleized) as { id: number; hash: string; title: string } | undefined;
}
} catch {
// handelize throws on invalid paths; just skip
}
const legacy = legacyCase ?? legacyHandalized;
if (!legacy) return null;
// Wrap rename + FTS rebuild in a transaction for atomicity.

409
test/path-fidelity.test.ts Normal file
View File

@ -0,0 +1,409 @@
/**
* Path Fidelity Tests
*
* Verifies that QMD stores literal filesystem paths (not handalized slugs) so
* that paths with special characters (spaces, #, &, @, [], (), etc.) round-trip
* correctly through index → search → get → full-path.
*
* This covers the five breakage points found before the literal-path fix:
* 1. search --json `file` field shows handalized slug instead of real path
* 2. `qmd get --full-path` silently falls back (resolveVirtualPath built
* a non-existent path from the slug, existsSync returned false)
* 3. `qmd get <actual-fs-path>` returns "Document not found"
* 4. `qmd ls` shows handalized slugs
* 5. `toVirtualPath(db, absPath)` returns null
*
* Also covers backward-compat migration: an index created with the old
* handalize-at-index-time code can be updated with `qmd update` and the paths
* are renamed to their literal forms in-place.
*/
import { describe, test, expect, beforeAll, afterAll } from "vitest";
import { mkdir, mkdtemp, rm, writeFile } from "fs/promises";
import { existsSync } from "fs";
import { tmpdir } from "os";
import { join } from "path";
import { dirname } from "path";
import { isAbsolute } from "path";
import { spawn } from "child_process";
import { fileURLToPath } from "url";
import YAML from "yaml";
import { openDatabase } from "../src/db.js";
import type { Database } from "../src/db.js";
import {
  createStore,
  toVirtualPath,
  insertDocument,
  insertContent,
  hashContent,
  handelize,
  normalizePathSeparators,
  syncConfigToDb,
} from "../src/store.js";
import type { CollectionConfig } from "../src/collections.js";
// Directory containing this test file, derived from the module URL.
const thisDir = dirname(fileURLToPath(import.meta.url));
// Parent of the test directory; the script paths below are built from it.
const projectRoot = join(thisDir, "..");
// TypeScript entry point of the qmd CLI that runQmd() executes.
const qmdScript = join(projectRoot, "src", "cli", "qmd.ts");
// True when a global `Bun` object exists; runQmd() then runs the script
// directly instead of going through tsx.
const isBunRuntime = typeof (globalThis as { Bun?: unknown }).Bun !== "undefined";
// tsx CLI entry used by runQmd() to execute the script when not under Bun.
const tsxCli = join(projectRoot, "node_modules", "tsx", "dist", "cli.mjs");
/**
 * Runs the qmd CLI in a child process and collects its output.
 *
 * Under Bun the script is passed straight to `process.execPath`; otherwise it
 * is launched through the tsx CLI.
 *
 * @param args - Command-line arguments passed to qmd.
 * @param opts - `cwd` is the child's working directory (also exported as
 *   `PWD`), `dbPath` is exported as `INDEX_PATH`, `configDir` as
 *   `QMD_CONFIG_DIR`, and `env` holds extra variables that override all of the
 *   above.
 * @returns Captured stdout and stderr plus the exit code; a null exit code is
 *   reported as 1.
 * @throws Rejects if the child process emits an `error` event.
 */
async function runQmd(
  args: string[],
  opts: { cwd: string; dbPath: string; configDir: string; env?: Record<string, string> }
): Promise<{ stdout: string; stderr: string; exitCode: number }> {
  const runner = isBunRuntime
    ? { command: process.execPath, args: [qmdScript, ...args] }
    : { command: process.execPath, args: [tsxCli, qmdScript, ...args] };
  const proc = spawn(runner.command, runner.args, {
    cwd: opts.cwd,
    env: {
      ...process.env,
      INDEX_PATH: opts.dbPath,
      QMD_CONFIG_DIR: opts.configDir,
      PWD: opts.cwd,
      QMD_DOCTOR_DEVICE_PROBE: "0",
      ...(opts.env ?? {}),
    },
    stdio: ["ignore", "pipe", "pipe"],
  });

  // Let the streams decode UTF-8 themselves: converting each Buffer chunk
  // separately would corrupt a multi-byte character that is split across two
  // chunks, and these tests are specifically about unusual file names.
  proc.stdout?.setEncoding("utf8");
  proc.stderr?.setEncoding("utf8");

  let stdout = "";
  let stderr = "";
  proc.stdout?.on("data", (chunk: string) => { stdout += chunk; });
  proc.stderr?.on("data", (chunk: string) => { stderr += chunk; });

  const exitCode = await new Promise<number>((resolveExit, rejectExit) => {
    proc.once("error", rejectExit);
    // "close" fires after the stdio streams have ended, so the captured
    // output is complete by the time the promise resolves.
    proc.once("close", (code) => resolveExit(code ?? 1));
  });
  return { stdout, stderr, exitCode };
}
// ---------------------------------------------------------------------------
// Test environment setup
// ---------------------------------------------------------------------------
/** A fixture document: its on-disk file name and its Markdown content. */
type FixtureFile = { name: string; content: string };

// Root temporary directory shared by every test in this file.
let testDir: string;

// Top-level fixtures. The first two names contain characters that handelize()
// used to rewrite at index time; the last one is an ordinary name kept as a
// control.
const crazyFiles: FixtureFile[] = [
  {
    name: "# Meeting - 234232 3432 __ 5.md",
    content: "# Meeting - 234232 3432 // 5\n\nSome meeting content with searchterm-alpha.\n",
  },
  {
    name: "Budget & Revenue (Q4) [2024].md",
    content: "# Budget & Revenue Q4 2024\n\nFinancial overview searchterm-beta.\n",
  },
  {
    name: "normal-file.md",
    content: "# Normal File\n\nPlain filename, should always work.\n",
  },
];

// Fixtures written into the "subdir" folder of the collection.
const crazySubFiles: FixtureFile[] = [
  {
    name: "Notes #42 - foo@bar.md",
    content: "# Notes #42\n\nSubdir file with searchterm-gamma.\n",
  },
];
// Create one temporary root for the whole file; each test builds its own
// environment in a sub-directory of it.
beforeAll(async () => {
  testDir = await mkdtemp(join(tmpdir(), "qmd-path-fidelity-"));
});
// Remove the temporary root. Skipped when setup never assigned testDir, so a
// failed mkdtemp is not followed by a second error from rm() on an undefined
// path.
afterAll(async () => {
  if (!testDir) return;
  await rm(testDir, { recursive: true, force: true });
});
/**
 * Builds a fresh, isolated test environment under `testDir/<prefix>`.
 *
 * Creates a `corpus` directory (with a `subdir` child) populated from
 * `crazyFiles` and `crazySubFiles`, plus a `config` directory holding an
 * empty `index.yml`.
 *
 * @param prefix - Sub-directory name that keeps this environment separate
 *   from the others created in the same run.
 * @returns The corpus directory, the path to use for the SQLite database
 *   (the file itself is not created here), and the config directory.
 */
async function createCrazyCollection(prefix: string): Promise<{
  collectionDir: string;
  dbPath: string;
  configDir: string;
}> {
  const envDir = join(testDir, prefix);
  const collectionDir = join(envDir, "corpus");
  const configDir = join(envDir, "config");

  const directories = [collectionDir, join(collectionDir, "subdir"), configDir];
  for (const dir of directories) {
    await mkdir(dir, { recursive: true });
  }

  // Top-level fixtures first, then the ones that live in "subdir".
  const pending = [
    ...crazyFiles.map((f) => ({ target: join(collectionDir, f.name), content: f.content })),
    ...crazySubFiles.map((f) => ({ target: join(collectionDir, "subdir", f.name), content: f.content })),
  ];
  for (const { target, content } of pending) {
    await writeFile(target, content);
  }

  // Start from an empty YAML config — `collection add` will populate it.
  await writeFile(join(configDir, "index.yml"), "collections: {}\n");

  return { collectionDir, dbPath: join(envDir, "test.sqlite"), configDir };
}
// ---------------------------------------------------------------------------
// Unit tests: store-level path storage
// ---------------------------------------------------------------------------
// Store-level checks: index a corpus through the CLI, then inspect what was
// actually written to the database.
describe("Path fidelity — store level", () => {
  test("reindexCollection stores literal relative paths, not handalized slugs", async () => {
    const { collectionDir, dbPath, configDir } = await createCrazyCollection("store-unit");

    // Run `collection add` to index the corpus.
    const add = await runQmd(
      ["collection", "add", collectionDir, "--name", "crazytest"],
      { cwd: collectionDir, dbPath, configDir }
    );
    expect(add.exitCode, `collection add failed: ${add.stderr}`).toBe(0);

    // Inspect the DB directly. The handle is closed in `finally` so a failing
    // query or assertion does not leave the database file open.
    const db = openDatabase(dbPath);
    try {
      const rows = db.prepare(
        "SELECT path FROM documents WHERE active = 1 ORDER BY path"
      ).all() as { path: string }[];
      const paths = rows.map((r) => r.path);

      // Must contain literal filenames — not handalized slugs.
      expect(paths).toContain("# Meeting - 234232 3432 __ 5.md");
      expect(paths).toContain("Budget & Revenue (Q4) [2024].md");
      expect(paths).toContain("normal-file.md");
      expect(paths).toContain("subdir/Notes #42 - foo@bar.md");

      // Must NOT contain handalized versions.
      expect(paths).not.toContain("Meeting-234232-3432-5.md");
      expect(paths).not.toContain("Budget-Revenue-Q4-2024.md");
      expect(paths).not.toContain("subdir/Notes-42-foo-bar.md");
    } finally {
      db.close();
    }
  });

  test("toVirtualPath returns non-null for crazy-named files", async () => {
    const { collectionDir, dbPath, configDir } = await createCrazyCollection("store-to-virtual");

    const add = await runQmd(
      ["collection", "add", collectionDir, "--name", "crazytest"],
      { cwd: collectionDir, dbPath, configDir }
    );
    expect(add.exitCode, `collection add failed: ${add.stderr}`).toBe(0);

    // Close the handle even if toVirtualPath throws or an assertion fails.
    const rawDb = openDatabase(dbPath);
    try {
      const result = toVirtualPath(rawDb, join(collectionDir, "Budget & Revenue (Q4) [2024].md"));
      expect(result).not.toBeNull();
      expect(result).toBe(`qmd://crazytest/Budget & Revenue (Q4) [2024].md`);
    } finally {
      rawDb.close();
    }
  });
});
// ---------------------------------------------------------------------------
// CLI integration tests — the five original breakage points
// ---------------------------------------------------------------------------
// CLI-level checks for the five original breakage points. All tests in this
// block only read from one shared index that is built once in beforeAll.
describe("Path fidelity — CLI integration", () => {
  let collectionDir: string;
  let dbPath: string;
  let configDir: string;

  // Index once for the whole describe block (read-only tests share it).
  beforeAll(async () => {
    ({ collectionDir, dbPath, configDir } = await createCrazyCollection("cli-shared"));
    const add = await runQmd(
      ["collection", "add", collectionDir, "--name", "crazytest"],
      { cwd: collectionDir, dbPath, configDir }
    );
    expect(add.exitCode, `collection add failed: ${add.stderr}`).toBe(0);
  });

  test("(1) search --json file field contains literal path, not handalized slug", async () => {
    const { stdout, exitCode } = await runQmd(
      ["search", "searchterm-alpha", "--json"],
      { cwd: collectionDir, dbPath, configDir }
    );
    expect(exitCode).toBe(0);

    const results = JSON.parse(stdout) as Array<{ file: string }>;
    expect(results.length).toBeGreaterThan(0);

    const meetingResult = results.find((r) => r.file.includes("Meeting"));
    expect(meetingResult).toBeDefined();
    // Must contain the literal filename fragment.
    expect(meetingResult!.file).toContain("# Meeting - 234232 3432 __ 5.md");
    // Must not contain the handalized version.
    expect(meetingResult!.file).not.toContain("Meeting-234232-3432-5.md");
  });

  test("(2) get --full-path resolves to real filesystem path for crazy-named file", async () => {
    const virtualPath = `qmd://crazytest/Budget & Revenue (Q4) [2024].md`;
    const { stdout, exitCode } = await runQmd(
      ["get", virtualPath, "--full-path"],
      { cwd: collectionDir, dbPath, configDir }
    );
    expect(exitCode, `get failed: ${stdout}`).toBe(0);

    const header = stdout.split("\n")[0]!;
    // Should show a real filesystem path, not a qmd:// virtual path.
    expect(header).not.toMatch(/^qmd:\/\//);
    // Should include the literal filename.
    expect(header).toContain("Budget & Revenue (Q4) [2024].md");

    // The resolved filesystem path should exist — strip the trailing docid (#abc123).
    const fsPath = header.trim().replace(/\s+#[a-f0-9]{6}$/, "");
    // Path may be absolute or relative to collectionDir. isAbsolute() is used
    // rather than a leading-"/" check so drive-letter paths are recognised too.
    const absPath = isAbsolute(fsPath)
      ? fsPath
      : join(collectionDir, fsPath.replace(/^\.\//, ""));
    expect(existsSync(absPath), `resolved path does not exist: ${absPath}`).toBe(true);
  });

  test("(3) get <actual-fs-path> finds the document", async () => {
    const fsPath = join(collectionDir, "Budget & Revenue (Q4) [2024].md");
    const { stdout, exitCode, stderr } = await runQmd(
      ["get", fsPath],
      { cwd: collectionDir, dbPath, configDir }
    );
    expect(exitCode, `get by fs path failed: ${stderr}`).toBe(0);
    // Header should contain the document identifier.
    expect(stdout).toContain("Budget & Revenue (Q4) [2024].md");
  });

  test("(3b) get <actual-fs-path> finds subdir file with crazy name", async () => {
    const fsPath = join(collectionDir, "subdir", "Notes #42 - foo@bar.md");
    const { stdout, exitCode, stderr } = await runQmd(
      ["get", fsPath],
      { cwd: collectionDir, dbPath, configDir }
    );
    expect(exitCode, `get subdir file failed: ${stderr}`).toBe(0);
    expect(stdout).toContain("Notes #42 - foo@bar.md");
  });

  test("(4) ls shows literal paths, not handalized slugs", async () => {
    const { stdout, exitCode } = await runQmd(
      ["ls", "crazytest"],
      { cwd: collectionDir, dbPath, configDir }
    );
    expect(exitCode).toBe(0);

    // Literal paths must appear.
    expect(stdout).toContain("# Meeting - 234232 3432 __ 5.md");
    expect(stdout).toContain("Budget & Revenue (Q4) [2024].md");
    expect(stdout).toContain("Notes #42 - foo@bar.md");

    // Handalized slugs must NOT appear.
    expect(stdout).not.toContain("Meeting-234232-3432-5.md");
    expect(stdout).not.toContain("Budget-Revenue-Q4-2024.md");
    expect(stdout).not.toContain("Notes-42-foo-bar.md");
  });

  test("(5) search --json returns docid that can be fetched back", async () => {
    const { stdout: searchOut, exitCode: searchExit } = await runQmd(
      ["search", "searchterm-beta", "--json"],
      { cwd: collectionDir, dbPath, configDir }
    );
    expect(searchExit).toBe(0);

    const results = JSON.parse(searchOut) as Array<{ docid: string; file: string }>;
    expect(results.length).toBeGreaterThan(0);

    const hit = results[0]!;
    expect(hit.docid).toMatch(/^#[a-f0-9]{6}$/);

    // Fetch by docid — must work.
    const { stdout: getOut, exitCode: getExit } = await runQmd(
      ["get", hit.docid],
      { cwd: collectionDir, dbPath, configDir }
    );
    expect(getExit, `get by docid failed`).toBe(0);
    expect(getOut).toContain("Budget & Revenue (Q4) [2024].md");
  });

  test("normal filenames are still stored correctly (regression)", async () => {
    const { stdout, exitCode } = await runQmd(
      ["search", "Plain filename", "--json"],
      { cwd: collectionDir, dbPath, configDir }
    );
    expect(exitCode).toBe(0);

    const results = JSON.parse(stdout) as Array<{ file: string }>;
    const hit = results.find((r) => r.file.includes("normal-file"));
    expect(hit).toBeDefined();
    expect(hit!.file).toContain("normal-file.md");
  });
});
// ---------------------------------------------------------------------------
// Migration test: old handalized DB upgraded by `qmd update`
// ---------------------------------------------------------------------------
// Migration check: seed a database the way the pre-fix indexer did (paths run
// through handelize()), run `qmd update`, and confirm the paths become literal.
describe("Path fidelity — migration from handalized index", () => {
  // Reads the active document paths straight from the database, closing the
  // handle even when the query throws.
  function readActivePaths(dbFile: string): string[] {
    const db = openDatabase(dbFile);
    try {
      const rows = db.prepare(
        "SELECT path FROM documents WHERE active = 1 ORDER BY path"
      ).all() as { path: string }[];
      return rows.map((r) => r.path);
    } finally {
      db.close();
    }
  }

  test("qmd update migrates handalized paths to literal paths in existing index", async () => {
    const { collectionDir, dbPath, configDir } = await createCrazyCollection("migration");

    // Every document of the corpus, with the literal relative path it has on disk.
    const legacyDocs = [
      ...crazyFiles.map((f) => ({
        relPath: normalizePathSeparators(f.name),
        title: `Title ${f.name}`,
        content: f.content,
      })),
      ...crazySubFiles.map((f) => ({
        relPath: `subdir/${f.name}`,
        title: "Sub title",
        content: f.content,
      })),
    ];
    // Slugs that differ from the literal path; these must disappear after migration.
    const legacySlugs: string[] = [];

    // Manually build an old-style DB using handelize() (simulates pre-fix index).
    const store = createStore(dbPath);
    try {
      const now = new Date().toISOString();

      // Write and sync a config that points at the collection so `qmd update`
      // knows where it is. The YAML is produced by the library rather than by
      // string interpolation so backslashes or quotes in the temp path cannot
      // corrupt it.
      const migrationYaml = YAML.stringify({
        collections: { crazytest: { path: collectionDir, mask: "**/*.md" } },
      });
      await writeFile(join(configDir, "index.yml"), migrationYaml);
      const config = YAML.parse(migrationYaml) as CollectionConfig;
      syncConfigToDb(store.db, config);

      // Insert documents with handalized paths (old behavior).
      for (const doc of legacyDocs) {
        const slug = handelize(doc.relPath);
        if (slug !== doc.relPath) legacySlugs.push(slug);
        const hash = await hashContent(doc.content);
        insertContent(store.db, hash, doc.content, now);
        insertDocument(store.db, "crazytest", slug, doc.title, hash, now, now);
      }
    } finally {
      store.close();
    }

    // Verify the old DB has handalized paths.
    const pathsBefore = readActivePaths(dbPath);
    expect(pathsBefore).toContain("Meeting-234232-3432-5.md");
    expect(pathsBefore).toContain("Budget-Revenue-Q4-2024.md");
    expect(pathsBefore).not.toContain("# Meeting - 234232 3432 __ 5.md");

    // Run `qmd update` with the new code — should migrate paths in-place.
    const update = await runQmd(
      ["update"],
      { cwd: collectionDir, dbPath, configDir }
    );
    expect(update.exitCode, `qmd update failed: ${update.stderr}`).toBe(0);

    // Verify the DB now has literal paths.
    const pathsAfter = readActivePaths(dbPath);
    expect(pathsAfter).toContain("# Meeting - 234232 3432 __ 5.md");
    expect(pathsAfter).toContain("Budget & Revenue (Q4) [2024].md");
    expect(pathsAfter).toContain("normal-file.md");
    expect(pathsAfter).toContain("subdir/Notes #42 - foo@bar.md");

    // Handalized slugs must be gone, including the sub-directory one.
    expect(pathsAfter).not.toContain("Meeting-234232-3432-5.md");
    expect(pathsAfter).not.toContain("Budget-Revenue-Q4-2024.md");
    for (const slug of legacySlugs) {
      expect(pathsAfter).not.toContain(slug);
    }

    // Search must work after migration.
    const { stdout: searchOut, exitCode: searchExit } = await runQmd(
      ["search", "searchterm-alpha", "--json"],
      { cwd: collectionDir, dbPath, configDir }
    );
    expect(searchExit).toBe(0);

    const results = JSON.parse(searchOut) as Array<{ file: string }>;
    expect(results.length).toBeGreaterThan(0);

    const meetingResult = results.find((r) => r.file.includes("Meeting"));
    expect(meetingResult).toBeDefined();
    expect(meetingResult!.file).toContain("# Meeting - 234232 3432 __ 5.md");
  });
});