Add status command, fix collections, improve CLI output

- Rename 'collections' to 'status' with richer output:
  - Index size
  - Documents count and vector embedding status
  - Time since last update
  - Per-collection stats

- Fix `qmd add .` to use default glob pattern
- Fix duplicate collections with cleanup and INSERT OR IGNORE
- Improve update-all with colored progress output
- Fix 'qmd vector' → 'qmd embed' in help messages
- Implement weighted RRF (2x weight for original query)
- Simplify CLAUDE.md for project-specific instructions

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Tobi Lutke 2025-12-07 19:19:34 -05:00
parent 39193ea252
commit e963555ff8
No known key found for this signature in database
2 changed files with 119 additions and 137 deletions

128
CLAUDE.md
View File

@ -1,111 +1,29 @@
---
description: Use Bun instead of Node.js, npm, pnpm, or vite.
globs: "*.ts, *.tsx, *.html, *.css, *.js, *.jsx, package.json"
alwaysApply: false
---
# QMD - Quick Markdown Search
Default to using Bun instead of Node.js.
Use Bun instead of Node.js (`bun` not `node`, `bun install` not `npm install`).
- Use `bun <file>` instead of `node <file>` or `ts-node <file>`
- Use `bun test` instead of `jest` or `vitest`
- Use `bun build <file.html|file.ts|file.css>` instead of `webpack` or `esbuild`
- Use `bun install` instead of `npm install` or `yarn install` or `pnpm install`
- Use `bun run <script>` instead of `npm run <script>` or `yarn run <script>` or `pnpm run <script>`
- Bun automatically loads .env, so don't use dotenv.
## APIs
- `Bun.serve()` supports WebSockets, HTTPS, and routes. Don't use `express`.
- `bun:sqlite` for SQLite. Don't use `better-sqlite3`.
- `Bun.redis` for Redis. Don't use `ioredis`.
- `Bun.sql` for Postgres. Don't use `pg` or `postgres.js`.
- `WebSocket` is built-in. Don't use `ws`.
- Prefer `Bun.file` over `node:fs`'s readFile/writeFile
- Use Bun.$`ls` instead of `execa`.
## Testing
Use `bun test` to run tests.
```ts#index.test.ts
import { test, expect } from "bun:test";
test("hello world", () => {
expect(1).toBe(1);
});
```
## Frontend
Use HTML imports with `Bun.serve()`. Don't use `vite`. HTML imports fully support React, CSS, Tailwind.
Server:
```ts#index.ts
import index from "./index.html"
Bun.serve({
routes: {
"/": index,
"/api/users/:id": {
GET: (req) => {
return new Response(JSON.stringify({ id: req.params.id }));
},
},
},
// optional websocket support
websocket: {
open: (ws) => {
ws.send("Hello, world!");
},
message: (ws, message) => {
ws.send(message);
},
close: (ws) => {
// handle close
}
},
development: {
hmr: true,
console: true,
}
})
```
HTML files can import .tsx, .jsx or .js files directly and Bun's bundler will transpile & bundle automatically. `<link>` tags can point to stylesheets and Bun's CSS bundler will bundle.
```html#index.html
<html>
<body>
<h1>Hello, world!</h1>
<script type="module" src="./frontend.tsx"></script>
</body>
</html>
```
With the following `frontend.tsx`:
```tsx#frontend.tsx
import React from "react";
// import .css files directly and it works
import './index.css';
import { createRoot } from "react-dom/client";
const root = createRoot(document.body);
export default function Frontend() {
return <h1>Hello, world!</h1>;
}
root.render(<Frontend />);
```
Then run `index.ts`:
## Commands
```sh
bun --hot ./index.ts
qmd add . # Index markdown files in current directory
qmd status # Show index status and collections
qmd update-all # Re-index all collections
qmd embed # Generate vector embeddings (requires Ollama)
qmd search <query> # BM25 full-text search
qmd vsearch <query> # Vector similarity search
qmd query <query> # Hybrid search with reranking (best quality)
```
For more information, read the Bun API docs in `node_modules/bun-types/docs/**.md`.
## Development
```sh
bun qmd.ts <command> # Run from source
bun link # Install globally as 'qmd'
```
## Architecture
- SQLite FTS5 for full-text search (BM25)
- sqlite-vec for vector similarity search
- Ollama for embeddings (embeddinggemma) and reranking (qwen3-reranker)
- Reciprocal Rank Fusion (RRF) for combining results

128
qmd.ts
View File

@ -392,69 +392,133 @@ async function rerank(query: string, documents: { file: string; text: string }[]
/**
 * Returns the id of the `collections` row matching (pwd, globPattern),
 * inserting a row stamped with the current ISO timestamp when needed.
 *
 * NOTE(review): this span shows both the previous body (SELECT, early return,
 * plain INSERT, last_insert_rowid) and the replacement body (INSERT OR IGNORE
 * followed by SELECT); only one of them belongs to the final function.
 *
 * NOTE(review): INSERT OR IGNORE only suppresses the insert when a UNIQUE or
 * PRIMARY KEY constraint covers (pwd, glob_pattern). The table definition is
 * not visible here — confirm such a constraint exists, otherwise every call
 * adds a new row.
 *
 * @param db Open database handle.
 * @param pwd Directory the collection is rooted at.
 * @param globPattern Glob pattern stored with the collection.
 * @returns The `id` column of the matching collection row.
 */
function getOrCreateCollection(db: Database, pwd: string, globPattern: string): number {
const now = new Date().toISOString();
const existing = db.prepare(`SELECT id FROM collections WHERE pwd = ? AND glob_pattern = ?`).get(pwd, globPattern) as { id: number } | null;
if (existing) return existing.id;
db.prepare(`INSERT INTO collections (pwd, glob_pattern, created_at) VALUES (?, ?, ?)`).run(pwd, globPattern, now);
return (db.prepare(`SELECT last_insert_rowid() as id`).get() as { id: number }).id;
// Use INSERT OR IGNORE to handle race conditions, then SELECT
db.prepare(`INSERT OR IGNORE INTO collections (pwd, glob_pattern, created_at) VALUES (?, ?, ?)`).run(pwd, globPattern, now);
const existing = db.prepare(`SELECT id FROM collections WHERE pwd = ? AND glob_pattern = ?`).get(pwd, globPattern) as { id: number };
return existing.id;
}
function listCollections(): void {
/**
 * Prunes redundant rows from the `collections` table in two passes:
 * first every row that is not the lowest-id row of its
 * (pwd, glob_pattern) group, then every row whose pattern is a literal ".".
 *
 * NOTE(review): only `collections` rows are deleted here; rows in other
 * tables that reference a removed collection id are left untouched unless the
 * schema cascades — confirm.
 *
 * @param db Open database handle the DELETE statements run against.
 */
function cleanupDuplicateCollections(db: Database): void {
  // Keep one row per (pwd, glob_pattern): the one with the smallest id.
  const dropDuplicates = `
DELETE FROM collections WHERE id NOT IN (
SELECT MIN(id) FROM collections GROUP BY pwd, glob_pattern
)
`;
  // A "." pattern is a leftover from an earlier bug, never a real glob.
  const dropDotPatterns = `DELETE FROM collections WHERE glob_pattern = '.'`;

  for (const statement of [dropDuplicates, dropDotPatterns]) {
    db.exec(statement);
  }
}
/**
 * Formats the time elapsed since `date` as a short relative label using the
 * largest fitting unit: seconds ("5s ago"), minutes, hours, or days.
 *
 * Timestamps in the future (e.g. from clock skew) are clamped to "0s ago"
 * rather than printed as a negative age, and an invalid Date yields
 * "unknown" instead of "NaNd ago".
 *
 * @param date The moment to measure from.
 * @returns A label such as "42s ago", "3m ago", "7h ago" or "12d ago".
 */
function formatTimeAgo(date: Date): string {
  const elapsedMs = Date.now() - date.getTime();
  // getTime() is NaN for an invalid Date (e.g. built from a malformed string).
  if (Number.isNaN(elapsedMs)) return "unknown";

  // Clamp so a timestamp slightly ahead of the local clock never goes negative.
  const seconds = Math.max(0, Math.floor(elapsedMs / 1000));
  if (seconds < 60) return `${seconds}s ago`;

  const minutes = Math.floor(seconds / 60);
  if (minutes < 60) return `${minutes}m ago`;

  const hours = Math.floor(minutes / 60);
  if (hours < 24) return `${hours}h ago`;

  const days = Math.floor(hours / 24);
  return `${days}d ago`;
}
/**
 * Formats a byte count as a human-readable size using binary (1024-based)
 * units: whole bytes below 1 KB, otherwise one decimal place in KB, MB or GB.
 * GB is the largest unit, so bigger values stay in GB.
 *
 * The unit is chosen from the value as it will be printed, so a count just
 * under a unit boundary (e.g. 1,048,575 bytes) reads "1.0 MB" instead of
 * "1024.0 KB".
 *
 * @param bytes Size in bytes.
 * @returns A label such as "512 B", "1.5 KB", "20.3 MB" or "2.0 GB".
 */
function formatBytes(bytes: number): string {
  if (bytes < 1024) return `${bytes} B`;

  const units = ["KB", "MB", "GB"];
  let value = bytes / 1024;
  let unit = 0;
  // Promote while the one-decimal rendering would reach 1024 of the current
  // unit; comparing the rounded value covers the "1023.95+" edge.
  while (unit < units.length - 1 && Number(value.toFixed(1)) >= 1024) {
    value /= 1024;
    unit += 1;
  }
  return `${value.toFixed(1)} ${units[unit]}`;
}
/**
 * Prints an index status report to stdout: database path and file size,
 * active document count, embedded vector count, a pending-embedding hint,
 * time since the latest document modification, and a per-collection summary.
 * Duplicate-collection cleanup runs first, and the database handle is closed
 * before returning.
 *
 * NOTE(review): this span shows removed and added lines of the change
 * together (for example the two `active_count` SELECT lines, the two `.all()`
 * casts, and the old `for (const c of collections)` loop next to the new
 * `col` loop); only one of each pair belongs to the final function.
 */
function showStatus(): void {
const dbPath = getDbPath();
const db = getDb();
// Cleanup any duplicate collections
cleanupDuplicateCollections(db);
// Index size
let indexSize = 0;
try {
const stat = Bun.file(dbPath).size;
indexSize = stat;
} catch {}
// Collections info
const collections = db.prepare(`
SELECT c.id, c.pwd, c.glob_pattern, c.created_at,
COUNT(d.id) as doc_count,
SUM(CASE WHEN d.active = 1 THEN 1 ELSE 0 END) as active_count
SUM(CASE WHEN d.active = 1 THEN 1 ELSE 0 END) as active_count,
MAX(d.modified_at) as last_modified
FROM collections c
LEFT JOIN documents d ON d.collection_id = c.id
GROUP BY c.id
ORDER BY c.created_at DESC
`).all() as { id: number; pwd: string; glob_pattern: string; created_at: string; doc_count: number; active_count: number }[];
`).all() as { id: number; pwd: string; glob_pattern: string; created_at: string; doc_count: number; active_count: number; last_modified: string | null }[];
if (collections.length === 0) {
console.log("No collections found.");
db.close();
return;
// Overall stats
const totalDocs = db.prepare(`SELECT COUNT(*) as count FROM documents WHERE active = 1`).get() as { count: number };
const vectorCount = db.prepare(`SELECT COUNT(*) as count FROM content_vectors`).get() as { count: number };
const needsEmbedding = getHashesNeedingEmbedding(db);
// Most recent update across all collections
const mostRecent = db.prepare(`SELECT MAX(modified_at) as latest FROM documents WHERE active = 1`).get() as { latest: string | null };
console.log(`${c.bold}QMD Status${c.reset}\n`);
console.log(`Index: ${dbPath}`);
console.log(`Size: ${formatBytes(indexSize)}\n`);
console.log(`${c.bold}Documents${c.reset}`);
console.log(` Total: ${totalDocs.count} files indexed`);
console.log(` Vectors: ${vectorCount.count} embedded`);
if (needsEmbedding > 0) {
console.log(` ${c.yellow}Pending: ${needsEmbedding} need embedding${c.reset} (run 'qmd embed')`);
}
if (mostRecent.latest) {
const lastUpdate = new Date(mostRecent.latest);
console.log(` Updated: ${formatTimeAgo(lastUpdate)}`);
}
console.log("Collections:\n");
for (const c of collections) {
console.log(` ${c.pwd}`);
console.log(` Pattern: ${c.glob_pattern}`);
console.log(` Documents: ${c.active_count} active (${c.doc_count} total)`);
console.log(` Created: ${c.created_at}\n`);
if (collections.length > 0) {
console.log(`\n${c.bold}Collections${c.reset}`);
for (const col of collections) {
const lastMod = col.last_modified ? formatTimeAgo(new Date(col.last_modified)) : "never";
console.log(` ${c.cyan}${col.pwd}${c.reset}`);
// NOTE(review): as written, glob_pattern and active_count are printed with
// no separator between them — confirm that is intended.
console.log(` ${col.glob_pattern}${col.active_count} docs (updated ${lastMod})`);
}
} else {
console.log(`\n${c.dim}No collections. Run 'qmd add .' to index markdown files.${c.reset}`);
}
const hashCount = db.prepare(`SELECT COUNT(*) as count FROM content_vectors`).get() as { count: number };
console.log(`Vectors: ${hashCount.count} unique content hashes embedded`);
db.close();
}
/**
 * Re-indexes every stored collection in turn. Runs duplicate-collection
 * cleanup, reads all (id, pwd, glob_pattern) rows, closes the database
 * handle, then for each collection points `process.env.PWD` at the
 * collection directory, awaits `indexFiles` with its glob pattern, and puts
 * the previous `PWD` value back. Prints a hint and returns early when there
 * are no collections.
 *
 * NOTE(review): this span shows removed and added lines of the change
 * together (the old `for (const c of collections)` loop and plain messages
 * next to the new indexed `col` loop and colored messages); only one of each
 * pair belongs to the final function.
 */
async function updateAllCollections(): Promise<void> {
const db = getDb();
cleanupDuplicateCollections(db);
const collections = db.prepare(`SELECT id, pwd, glob_pattern FROM collections`).all() as { id: number; pwd: string; glob_pattern: string }[];
if (collections.length === 0) {
console.log("No collections found.");
console.log(`${c.dim}No collections found. Run 'qmd add .' to index markdown files.${c.reset}`);
db.close();
return;
}
db.close();
console.log(`Updating ${collections.length} collection(s)...\n`);
console.log(`${c.bold}Updating ${collections.length} collection(s)...${c.reset}\n`);
for (const c of collections) {
console.log(`\n--- ${c.pwd} (${c.glob_pattern}) ---`);
for (let i = 0; i < collections.length; i++) {
const col = collections[i];
console.log(`${c.cyan}[${i + 1}/${collections.length}]${c.reset} ${c.bold}${col.pwd}${c.reset}`);
console.log(`${c.dim} Pattern: ${col.glob_pattern}${c.reset}`);
// Temporarily set PWD for indexing
const originalPwd = process.env.PWD;
process.env.PWD = c.pwd;
await indexFiles(c.glob_pattern);
process.env.PWD = col.pwd;
await indexFiles(col.glob_pattern);
// NOTE(review): the restore is not in a try/finally, so PWD stays pointed at
// the collection if indexFiles rejects. Also, if PWD was originally unset,
// Node's documented process.env semantics would store the string "undefined"
// here — confirm how Bun behaves.
process.env.PWD = originalPwd;
console.log("");
}
console.log("\nAll collections updated.");
console.log(`${c.green}✓ All collections updated.${c.reset}`);
}
async function dropCollection(globPattern: string): Promise<void> {
@ -575,7 +639,7 @@ async function indexFiles(globPattern: string = DEFAULT_GLOB): Promise<void> {
console.log(`\nIndexed: ${indexed} new, ${updated} updated, ${unchanged} unchanged, ${removed} removed`);
if (needsEmbedding > 0) {
console.log(`\nRun 'qmd vector' to update embeddings (${needsEmbedding} unique hashes need vectors)`);
console.log(`\nRun 'qmd embed' to update embeddings (${needsEmbedding} unique hashes need vectors)`);
}
db.close();
@ -958,7 +1022,7 @@ async function vectorSearch(query: string, opts: OutputOptions, model: string =
const tableExists = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
if (!tableExists) {
console.error("Vector index not found. Run 'qmd vector' first to create embeddings.");
console.error("Vector index not found. Run 'qmd embed' first to create embeddings.");
db.close();
return;
}
@ -1171,7 +1235,7 @@ const args = parseGlobalOptions(rawArgs);
if (args.length === 0) {
console.log("Usage:");
console.log(" qmd add [--drop] [glob] - Add/update collection from $PWD (default: **/*.md)");
console.log(" qmd collections - List all collections");
console.log(" qmd status - Show index status and collections");
console.log(" qmd update-all - Re-index all collections");
console.log(" qmd embed [-f] - Create vector embeddings for all content");
console.log(" qmd search <query> - Full-text search (BM25)");
@ -1214,8 +1278,8 @@ if (cmd === "add") {
} else {
await indexFiles(globPattern);
}
} else if (cmd === "collections") {
listCollections();
} else if (cmd === "status") {
showStatus();
} else if (cmd === "update-all") {
await updateAllCollections();
} else if (cmd === "embed") {