Merge pull request #546 from junmo-kim/fix/handelize-preserve-case
fix: preserve original case in handelize()
This commit is contained in:
commit
525b9970cd
@ -5,6 +5,10 @@
|
||||
### Fixes
|
||||
|
||||
- GPU: respect explicit `QMD_LLAMA_GPU=metal|vulkan|cuda` backend overrides instead of always using auto GPU selection. #529
|
||||
- Fix: preserve original filename case in `handelize()`. The previous
|
||||
`.toLowerCase()` call made indexed paths unreachable on case-sensitive
|
||||
filesystems (Linux). `qmd update` automatically migrates legacy
|
||||
lowercase paths without re-embedding.
|
||||
|
||||
## [2.1.0] - 2026-04-05
|
||||
|
||||
|
||||
@ -45,6 +45,7 @@ import {
|
||||
insertContent,
|
||||
insertDocument,
|
||||
findActiveDocument,
|
||||
findOrMigrateLegacyDocument,
|
||||
updateDocumentTitle,
|
||||
updateDocument,
|
||||
deactivateDocument,
|
||||
@ -1581,8 +1582,8 @@ async function indexFiles(pwd?: string, globPattern: string = DEFAULT_GLOB, coll
|
||||
const hash = await hashContent(content);
|
||||
const title = extractTitle(content, relativeFile);
|
||||
|
||||
// Check if document exists in this collection with this path
|
||||
const existing = findActiveDocument(db, collectionName, path);
|
||||
// Check if document exists (also migrates legacy lowercase paths)
|
||||
const existing = findOrMigrateLegacyDocument(db, collectionName, path);
|
||||
|
||||
if (existing) {
|
||||
if (existing.hash === hash) {
|
||||
|
||||
58
src/store.ts
58
src/store.ts
@ -1146,6 +1146,7 @@ export type Store = {
|
||||
insertContent: (hash: string, content: string, createdAt: string) => void;
|
||||
insertDocument: (collectionName: string, path: string, title: string, hash: string, createdAt: string, modifiedAt: string) => void;
|
||||
findActiveDocument: (collectionName: string, path: string) => { id: number; hash: string; title: string } | null;
|
||||
findOrMigrateLegacyDocument: (collectionName: string, path: string) => { id: number; hash: string; title: string } | null;
|
||||
updateDocumentTitle: (documentId: number, title: string, modifiedAt: string) => void;
|
||||
updateDocument: (documentId: number, title: string, hash: string, modifiedAt: string) => void;
|
||||
deactivateDocument: (collectionName: string, path: string) => void;
|
||||
@ -1236,7 +1237,7 @@ export async function reindexCollection(
|
||||
const hash = await hashContent(content);
|
||||
const title = extractTitle(content, relativeFile);
|
||||
|
||||
const existing = findActiveDocument(db, collectionName, path);
|
||||
const existing = findOrMigrateLegacyDocument(db, collectionName, path);
|
||||
|
||||
if (existing) {
|
||||
if (existing.hash === hash) {
|
||||
@ -1659,6 +1660,7 @@ export function createStore(dbPath?: string): Store {
|
||||
insertContent: (hash: string, content: string, createdAt: string) => insertContent(db, hash, content, createdAt),
|
||||
insertDocument: (collectionName: string, path: string, title: string, hash: string, createdAt: string, modifiedAt: string) => insertDocument(db, collectionName, path, title, hash, createdAt, modifiedAt),
|
||||
findActiveDocument: (collectionName: string, path: string) => findActiveDocument(db, collectionName, path),
|
||||
findOrMigrateLegacyDocument: (collectionName: string, path: string) => findOrMigrateLegacyDocument(db, collectionName, path),
|
||||
updateDocumentTitle: (documentId: number, title: string, modifiedAt: string) => updateDocumentTitle(db, documentId, title, modifiedAt),
|
||||
updateDocument: (documentId: number, title: string, hash: string, modifiedAt: string) => updateDocument(db, documentId, title, hash, modifiedAt),
|
||||
deactivateDocument: (collectionName: string, path: string) => deactivateDocument(db, collectionName, path),
|
||||
@ -1704,11 +1706,11 @@ export function getDocid(hash: string): string {
|
||||
/**
|
||||
* Handelize a filename to be more token-friendly.
|
||||
* - Convert triple underscore `___` to `/` (folder separator)
|
||||
* - Convert to lowercase
|
||||
* - Replace sequences of non-word chars (except /) with single dash
|
||||
* - Remove leading/trailing dashes from path segments
|
||||
* - Preserve folder structure (a/b/c/d.md stays structured)
|
||||
* - Preserve file extension
|
||||
* - Preserve original case (important for case-sensitive filesystems)
|
||||
*/
|
||||
/** Replace emoji/symbol codepoints with their hex representation (e.g. 🐘 → 1f418) */
|
||||
function emojiToHex(str: string): string {
|
||||
@ -1736,7 +1738,6 @@ export function handelize(path: string): string {
|
||||
|
||||
const result = path
|
||||
.replace(/___/g, '/') // Triple underscore becomes folder separator
|
||||
.toLowerCase()
|
||||
.split('/')
|
||||
.map((segment, idx, arr) => {
|
||||
const isLastSegment = idx === arr.length - 1;
|
||||
@ -2114,6 +2115,57 @@ export function findActiveDocument(
|
||||
return row ?? null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Find an active document by its exact path, falling back to the legacy
|
||||
* all-lowercase form of that path.
|
||||
* If only the legacy row exists, it is renamed in place to `path` and its
|
||||
* FTS entry is rebuilt, both inside one transaction. Embeddings are keyed
|
||||
* by content hash, so the rename is safe — no re-embedding required.
|
||||
*
|
||||
* @param db - Database handle used for the lookups and the rename.
|
||||
* @param collectionName - Collection the document belongs to.
|
||||
* @param path - Case-preserved path to look up.
|
||||
* @returns The document's id, hash and title, or null when no active
|
||||
* document exists under either path or when the rename changed no rows
|
||||
* (for example a UNIQUE conflict that OR IGNORE turned into a no-op).
|
||||
* @internal Used by reindexCollection and indexFiles during qmd update.
|
||||
*/
|
||||
export function findOrMigrateLegacyDocument(
|
||||
db: Database,
|
||||
collectionName: string,
|
||||
path: string
|
||||
): { id: number; hash: string; title: string } | null {
|
||||
// Fast path: the document is already stored under the exact path.
const existing = findActiveDocument(db, collectionName, path);
|
||||
if (existing) return existing;
|
||||
|
||||
// A path that equals its own lowercase form has no distinct legacy path.
const legacyPath = path.toLowerCase();
|
||||
if (legacyPath === path) return null;
|
||||
|
||||
const legacy = findActiveDocument(db, collectionName, legacyPath);
|
||||
if (!legacy) return null;
|
||||
|
||||
// Wrap rename + FTS rebuild in a transaction for atomicity.
|
||||
const migrate = db.transaction(() => {
|
||||
// Use OR IGNORE so a UNIQUE conflict (e.g. both "readme.md" and
|
||||
// "README.md" already exist) is a no-op rather than crashing.
|
||||
const result = db.prepare(
|
||||
`UPDATE OR IGNORE documents SET path = ? WHERE id = ? AND active = 1`
|
||||
).run(path, legacy.id);
|
||||
|
||||
// No row changed (conflict ignored, or the row is no longer active):
// leave the FTS index alone and report that nothing was migrated.
if (result.changes === 0) return false;
|
||||
|
||||
// FTS5 does not reliably update via the documents_au trigger's
|
||||
// INSERT OR REPLACE. Manually rebuild the FTS entry.
|
||||
db.prepare(`DELETE FROM documents_fts WHERE rowid = ?`).run(legacy.id);
|
||||
db.prepare(`
|
||||
INSERT INTO documents_fts(rowid, filepath, title, body)
|
||||
SELECT id, collection || '/' || path, title,
|
||||
(SELECT doc FROM content WHERE hash = documents.hash)
|
||||
FROM documents WHERE id = ?
|
||||
`).run(legacy.id);
|
||||
|
||||
return true;
|
||||
});
|
||||
|
||||
// The rename did not happen, so the caller sees "not found".
if (!migrate()) return null;
|
||||
|
||||
// Re-read under the new path so the caller gets the migrated row.
return findActiveDocument(db, collectionName, path);
|
||||
}
|
||||
|
||||
/**
|
||||
* Update the title and modified_at timestamp for a document.
|
||||
*/
|
||||
|
||||
@ -837,8 +837,8 @@ describe("CLI ls Command", () => {
|
||||
test("lists files in a collection", async () => {
|
||||
const { stdout, exitCode } = await runQmd(["ls", "fixtures"], { dbPath: localDbPath });
|
||||
expect(exitCode).toBe(0);
|
||||
// handelize converts to lowercase
|
||||
expect(stdout).toContain("qmd://fixtures/readme.md");
|
||||
// handelize preserves original case
|
||||
expect(stdout).toContain("qmd://fixtures/README.md");
|
||||
expect(stdout).toContain("qmd://fixtures/notes/meeting.md");
|
||||
});
|
||||
|
||||
@ -847,8 +847,8 @@ describe("CLI ls Command", () => {
|
||||
expect(exitCode).toBe(0);
|
||||
expect(stdout).toContain("qmd://fixtures/notes/meeting.md");
|
||||
expect(stdout).toContain("qmd://fixtures/notes/ideas.md");
|
||||
// Should not include files outside the prefix (handelize converts to lowercase)
|
||||
expect(stdout).not.toContain("qmd://fixtures/readme.md");
|
||||
// Should not include files outside the prefix (case preserved)
|
||||
expect(stdout).not.toContain("qmd://fixtures/README.md");
|
||||
});
|
||||
|
||||
test("lists files with virtual path", async () => {
|
||||
|
||||
@ -119,14 +119,14 @@ describe("cleanupOrphanedVectors", () => {
|
||||
// =============================================================================
|
||||
|
||||
describe("handelize", () => {
|
||||
test("converts to lowercase", () => {
|
||||
expect(handelize("README.md")).toBe("readme.md");
|
||||
expect(handelize("MyFile.MD")).toBe("myfile.md");
|
||||
test("preserves original case", () => {
|
||||
expect(handelize("README.md")).toBe("README.md");
|
||||
expect(handelize("MyFile.MD")).toBe("MyFile.MD");
|
||||
});
|
||||
|
||||
test("preserves folder structure", () => {
|
||||
expect(handelize("a/b/c/d.md")).toBe("a/b/c/d.md");
|
||||
expect(handelize("docs/api/README.md")).toBe("docs/api/readme.md");
|
||||
expect(handelize("docs/api/README.md")).toBe("docs/api/README.md");
|
||||
});
|
||||
|
||||
test("replaces non-word characters with dash", () => {
|
||||
@ -156,7 +156,7 @@ describe("handelize", () => {
|
||||
test("handles complex real-world meeting notes", () => {
|
||||
const complexName = "Money Movement Licensing Review - 2025/11/19 10:25 EST - Notes by Gemini.md";
|
||||
const result = handelize(complexName);
|
||||
expect(result).toBe("money-movement-licensing-review-2025-11-19-10-25-est-notes-by-gemini.md");
|
||||
expect(result).toBe("Money-Movement-Licensing-Review-2025-11-19-10-25-EST-Notes-by-Gemini.md");
|
||||
expect(result).not.toContain(" ");
|
||||
expect(result).not.toContain("/");
|
||||
expect(result).not.toContain(":");
|
||||
@ -164,7 +164,7 @@ describe("handelize", () => {
|
||||
|
||||
test("handles unicode characters", () => {
|
||||
expect(handelize("日本語.md")).toBe("日本語.md");
|
||||
expect(handelize("Зоны и проекты.md")).toBe("зоны-и-проекты.md");
|
||||
expect(handelize("Зоны и проекты.md")).toBe("Зоны-и-проекты.md");
|
||||
expect(handelize("café-notes.md")).toBe("café-notes.md");
|
||||
expect(handelize("naïve.md")).toBe("naïve.md");
|
||||
expect(handelize("日本語-notes.md")).toBe("日本語-notes.md");
|
||||
@ -186,13 +186,13 @@ describe("handelize", () => {
|
||||
test("handles dates and times in filenames", () => {
|
||||
expect(handelize("meeting-2025-01-15.md")).toBe("meeting-2025-01-15.md");
|
||||
expect(handelize("notes 2025/01/15.md")).toBe("notes-2025/01/15.md");
|
||||
expect(handelize("call_10:30_AM.md")).toBe("call-10-30-am.md");
|
||||
expect(handelize("call_10:30_AM.md")).toBe("call-10-30-AM.md");
|
||||
});
|
||||
|
||||
test("handles special project naming patterns", () => {
|
||||
expect(handelize("PROJECT_ABC_v2.0.md")).toBe("project-abc-v2-0.md");
|
||||
expect(handelize("[WIP] Feature Request.md")).toBe("wip-feature-request.md");
|
||||
expect(handelize("(DRAFT) Proposal v1.md")).toBe("draft-proposal-v1.md");
|
||||
expect(handelize("PROJECT_ABC_v2.0.md")).toBe("PROJECT-ABC-v2-0.md");
|
||||
expect(handelize("[WIP] Feature Request.md")).toBe("WIP-Feature-Request.md");
|
||||
expect(handelize("(DRAFT) Proposal v1.md")).toBe("DRAFT-Proposal-v1.md");
|
||||
});
|
||||
|
||||
test("handles symbol-only route filenames", () => {
|
||||
|
||||
@ -3053,6 +3053,75 @@ describe("Content-Addressable Storage", () => {
|
||||
|
||||
await cleanupTestDb(store);
|
||||
});
|
||||
|
||||
// Happy path: a row stored under the legacy lowercase path is renamed to the
// case-preserved path on lookup, and the FTS index follows the rename.
test("findOrMigrateLegacyDocument renames lowercase path to case-preserved", async () => {
|
||||
const store = await createTestStore();
|
||||
const collectionName = await createTestCollection();
|
||||
const now = new Date().toISOString();
|
||||
|
||||
const content = "# My Skill";
|
||||
const hash = await hashContent(content);
|
||||
store.insertContent(hash, content, now);
|
||||
// Simulate legacy index: path stored as lowercase
|
||||
store.insertDocument(collectionName, "skills/skill.md", "My Skill", hash, now, now);
|
||||
|
||||
// Migration: look up case-preserved path, expect rename
|
||||
const result = store.findOrMigrateLegacyDocument(collectionName, "skills/SKILL.md");
|
||||
expect(result).not.toBeNull();
|
||||
expect(result!.hash).toBe(hash);
|
||||
|
||||
// Old lowercase path should no longer be findable
|
||||
expect(store.findActiveDocument(collectionName, "skills/skill.md")).toBeNull();
|
||||
// New case-preserved path should be active
|
||||
const migrated = store.findActiveDocument(collectionName, "skills/SKILL.md");
|
||||
expect(migrated).not.toBeNull();
|
||||
expect(migrated!.hash).toBe(hash);
|
||||
|
||||
// FTS should reflect the new path (the migration deletes and re-inserts the
// documents_fts row itself rather than relying on the documents_au trigger)
|
||||
const ftsRow = store.db.prepare(
|
||||
`SELECT filepath FROM documents_fts WHERE rowid = ?`
|
||||
).get(result!.id) as { filepath: string } | undefined;
|
||||
expect(ftsRow).toBeDefined();
|
||||
expect(ftsRow!.filepath).toContain("SKILL.md");
|
||||
|
||||
await cleanupTestDb(store);
|
||||
});
|
||||
|
||||
// Exercises the early return taken when the requested path equals its own
// lowercase form, i.e. there is no distinct legacy path to fall back to.
test("findOrMigrateLegacyDocument returns null when path is already lowercase", async () => {
|
||||
const store = await createTestStore();
|
||||
const collectionName = await createTestCollection();
|
||||
|
||||
// No document is inserted, and "readme.md" is already all lowercase, so
// neither the exact lookup nor a legacy fallback can produce a row.
|
||||
const result = store.findOrMigrateLegacyDocument(collectionName, "readme.md");
|
||||
expect(result).toBeNull();
|
||||
|
||||
await cleanupTestDb(store);
|
||||
});
|
||||
|
||||
// When a row already exists under the exact (case-preserved) path, the lookup
// returns it immediately and must leave any legacy lowercase row untouched.
test("findOrMigrateLegacyDocument returns existing doc when canonical path already present", async () => {
|
||||
const store = await createTestStore();
|
||||
const collectionName = await createTestCollection();
|
||||
const now = new Date().toISOString();
|
||||
|
||||
const content = "# Content";
|
||||
const hash = await hashContent(content);
|
||||
store.insertContent(hash, content, now);
|
||||
// Both lowercase and case-preserved paths exist (edge case from prior partial migration)
|
||||
store.insertDocument(collectionName, "readme.md", "Readme", hash, now, now);
|
||||
store.insertDocument(collectionName, "README.md", "README", hash, now, now);
|
||||
|
||||
// Should return the canonical-path document directly (fast path)
|
||||
// The legacy "readme.md" row is untouched — no rename attempted.
|
||||
const result = store.findOrMigrateLegacyDocument(collectionName, "README.md");
|
||||
expect(result).not.toBeNull();
|
||||
// NOTE(review): both rows share the same hash, so this assertion alone does
// not prove which row was returned; comparing title or id would — confirm.
expect(result!.hash).toBe(hash);
|
||||
|
||||
// Both rows still exist (legacy row not migrated, not deactivated here)
|
||||
expect(store.findActiveDocument(collectionName, "readme.md")).not.toBeNull();
|
||||
expect(store.findActiveDocument(collectionName, "README.md")).not.toBeNull();
|
||||
|
||||
await cleanupTestDb(store);
|
||||
});
|
||||
});
|
||||
|
||||
// =============================================================================
|
||||
|
||||
Loading…
Reference in New Issue
Block a user