diff --git a/README.md b/README.md index 7775856..411d787 100644 --- a/README.md +++ b/README.md @@ -9,9 +9,15 @@ QMD combines BM25 full-text search, vector semantic search, and LLM re-ranking ## Quick Start ```sh -# Install globally +# Install globally (Node or Bun) +npm install -g @tobilu/qmd +# or bun install -g @tobilu/qmd +# Or run directly +npx @tobilu/qmd ... +bunx @tobilu/qmd ... + # Create collections for your notes, docs, and meeting transcripts qmd collection add ~/notes --name notes qmd collection add ~/Documents/meetings --name meetings @@ -231,6 +237,7 @@ The `query` command uses **Reciprocal Rank Fusion (RRF)** with position-aware bl ### System Requirements +- **Node.js** >= 22 - **Bun** >= 1.0.0 - **macOS**: Homebrew SQLite (for extension support) ```sh @@ -252,18 +259,18 @@ Models are downloaded from HuggingFace and cached in `~/.cache/qmd/models/`. ## Installation ```sh +npm install -g @tobilu/qmd +# or bun install -g @tobilu/qmd ``` -Make sure `~/.bun/bin` is in your PATH. - ### Development ```sh git clone https://github.com/tobi/qmd cd qmd -bun install -bun link +npm install +npm link ``` ## Usage diff --git a/package.json b/package.json index 4b45420..f82c0c6 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@tobilu/qmd", - "version": "0.9.0", + "version": "0.9.9", "description": "Query Markup Documents - On-device hybrid search for markdown files with BM25, vector search, and LLM reranking", "type": "module", "bin": { @@ -15,15 +15,26 @@ "CHANGELOG.md" ], "scripts": { - "test": "bun test --preload ./src/test-preload.ts", - "qmd": "bun src/qmd.ts", - "index": "bun src/qmd.ts index", - "vector": "bun src/qmd.ts vector", - "search": "bun src/qmd.ts search", - "vsearch": "bun src/qmd.ts vsearch", - "rerank": "bun src/qmd.ts rerank", - "link": "bun link", - "inspector": "npx @modelcontextprotocol/inspector bun src/qmd.ts mcp", + "test": "vitest run", + "test:unit": "vitest run --reporter=verbose src/*.test.ts", + "test:models": "vitest run --reporter=verbose src/models/*.test.ts", + "test:integration": "vitest run --reporter=verbose src/integration/*.test.ts", + "test:unit:bun": "bun run vitest run --reporter=verbose --testTimeout=120000 src/*.test.ts", + "test:models:bun": "bun run vitest run --reporter=verbose --testTimeout=120000 src/models/*.test.ts", + "test:integration:bun": "bun run vitest run --reporter=verbose --testTimeout=120000 src/integration/*.test.ts", + "test:unit:node": "npx vitest run --reporter=verbose --testTimeout=120000 src/*.test.ts", + "test:models:node": "npx vitest run --reporter=verbose --testTimeout=120000 src/models/*.test.ts", + "test:integration:node": "npx vitest run --reporter=verbose --testTimeout=120000 src/integration/*.test.ts", + "test:ci:bun": "npm run test:unit:bun && npm run test:models:bun && npm run test:integration:bun", + "test:ci:node": "npm run test:unit:node && npm run test:models:node && npm run test:integration:node", + "test:ci": "npm run test:unit && npm run test:models && npm run test:integration", + "qmd": "tsx src/qmd.ts", + "index": "tsx src/qmd.ts index", + "vector": "tsx src/qmd.ts vector", + "search": "tsx src/qmd.ts search", + "vsearch": "tsx src/qmd.ts vsearch", + "rerank": "tsx src/qmd.ts rerank", + "inspector": "npx @modelcontextprotocol/inspector tsx src/qmd.ts mcp", "release": "./scripts/release.sh" }, "publishConfig": { @@ -39,7 +50,10 @@ }, "dependencies": { "@modelcontextprotocol/sdk": "^1.25.1", + "better-sqlite3": "^11.0.0", + "fast-glob": "^3.3.0", "node-llama-cpp": "^3.14.5", + "picomatch": "^4.0.0", "sqlite-vec": "^0.1.7-alpha.2", "yaml": "^2.8.2", "zod": "^4.2.1" @@ -51,13 +65,15 @@ "sqlite-vec-win32-x64": "^0.1.7-alpha.2" }, "devDependencies": { - "@types/bun": "latest" + "@types/better-sqlite3": "^7.6.0", + "tsx": "^4.0.0", + "vitest": "^3.0.0" }, "peerDependencies": { "typescript": "^5.9.3" }, "engines": { - "bun": ">=1.0.0" + "node": ">=22.0.0" }, "keywords": [ "markdown", diff --git a/src/bench-rerank.ts b/src/bench-rerank.ts new file mode 100644 index 0000000..aa26416 --- /dev/null +++ b/src/bench-rerank.ts @@ -0,0 +1,327 @@ +#!/usr/bin/env bun +/** + * QMD Reranker Benchmark + * + * Measures reranking performance across different configurations. + * Reports device, parallelism, memory, VRAM, and throughput. + * + * Usage: + * bun src/bench-rerank.ts # full benchmark + * bun src/bench-rerank.ts --quick # quick smoke test (10 docs, 1 iteration) + * bun src/bench-rerank.ts --docs 100 # custom doc count + */ + +import { + getLlama, + getLlamaGpuTypes, + resolveModelFile, + LlamaLogLevel, + type Llama, + type LlamaModel, +} from "node-llama-cpp"; +import { homedir } from "os"; +import { join } from "path"; +import { cpus } from "os"; + +// ============================================================================ +// Config +// ============================================================================ + +const RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf"; +const MODEL_CACHE = join(homedir(), ".cache", "qmd", "models"); +const CONTEXT_SIZE = 2048; + +const args = process.argv.slice(2); +const quick = args.includes("--quick"); +const docsIdx = args.indexOf("--docs"); +const DOC_COUNT = docsIdx >= 0 ? parseInt(args[docsIdx + 1]!) : (quick ? 10 : 40); +const ITERATIONS = quick ? 1 : 3; +const PARALLEL_CONFIGS = quick ? [1, 4] : [1, 2, 4, 8]; + +// ============================================================================ +// Test data — realistic-ish chunks of varying length +// ============================================================================ + +const QUERY = "How do AI agents work and what are their limitations?"; + +function generateDocs(n: number): string[] { + const templates = [ + "Artificial intelligence agents are software systems that perceive their environment and take actions to achieve goals. They use techniques like reinforcement learning, planning, and natural language processing to operate autonomously.", + "The transformer architecture, introduced in 2017, revolutionized natural language processing. Self-attention mechanisms allow models to weigh the importance of different parts of input sequences when generating outputs.", + "Machine learning models require careful evaluation to avoid overfitting. Cross-validation, holdout sets, and metrics like precision, recall, and F1 score help assess generalization performance.", + "Retrieval-augmented generation combines information retrieval with language models. Documents are embedded into vector spaces, retrieved based on query similarity, and used as context for generation.", + "Neural network training involves forward propagation, loss computation, and backpropagation. Optimizers like Adam and SGD adjust weights to minimize the loss function over training iterations.", + "Large language models exhibit emergent capabilities at scale, including few-shot learning, chain-of-thought reasoning, and instruction following. These properties were not explicitly trained for.", + "Embedding models convert text into dense vector representations that capture semantic meaning. Similar texts produce similar vectors, enabling efficient similarity search and clustering.", + "Autonomous agents face challenges including hallucination, lack of grounding, limited planning horizons, and difficulty with multi-step reasoning. Safety and alignment remain open research problems.", + "The attention mechanism computes query-key-value interactions to determine which parts of the input are most relevant. Multi-head attention allows the model to attend to different representation subspaces.", + "Fine-tuning adapts a pre-trained model to specific tasks using domain-specific data. Techniques like LoRA reduce the number of trainable parameters while maintaining performance.", + ]; + return Array.from({ length: n }, (_, i) => templates[i % templates.length]!); +} + +// ============================================================================ +// Helpers +// ============================================================================ + +function formatBytes(bytes: number): string { + if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)} KB`; + if (bytes < 1024 * 1024 * 1024) return `${(bytes / (1024 * 1024)).toFixed(1)} MB`; + return `${(bytes / (1024 * 1024 * 1024)).toFixed(2)} GB`; +} + +function getMemUsage(): { rss: number; heapUsed: number } { + const m = process.memoryUsage(); + return { rss: m.rss, heapUsed: m.heapUsed }; +} + +function median(arr: number[]): number { + const sorted = [...arr].sort((a, b) => a - b); + const mid = Math.floor(sorted.length / 2); + return sorted.length % 2 !== 0 ? sorted[mid]! : (sorted[mid - 1]! + sorted[mid]!) / 2; +} + +// ============================================================================ +// Benchmark runner +// ============================================================================ + +interface BenchResult { + parallelism: number; + contextSize: number; + flashAttention: boolean; + times: number[]; // ms per run + medianMs: number; + docsPerSec: number; + vramPerContext: number; // bytes + totalVram: number; // bytes + peakRss: number; // bytes +} + +async function benchmarkConfig( + model: LlamaModel, + llama: Llama, + docs: string[], + parallelism: number, + flash: boolean, +): Promise { + // Measure VRAM before + const vramBefore = llama.gpu ? await llama.getVramState() : null; + const rssBefore = getMemUsage().rss; + + // Create contexts. On CPU, split threads evenly across contexts. + const cpuThreads = !llama.gpu ? Math.floor(llama.cpuMathCores / parallelism) : 0; + const contexts = []; + for (let i = 0; i < parallelism; i++) { + try { + contexts.push(await model.createRankingContext({ + contextSize: CONTEXT_SIZE, + flashAttention: flash, + ...(cpuThreads > 0 ? { threads: cpuThreads } : {}), + })); + } catch { + if (contexts.length === 0) { + // Try without flash + contexts.push(await model.createRankingContext({ + contextSize: CONTEXT_SIZE, + ...(cpuThreads > 0 ? { threads: cpuThreads } : {}), + })); + } + break; + } + } + const actualParallelism = contexts.length; + + // Measure VRAM after context creation + const vramAfter = llama.gpu ? await llama.getVramState() : null; + const vramUsed = vramBefore && vramAfter ? vramAfter.used - vramBefore.used : 0; + const vramPerCtx = actualParallelism > 0 ? vramUsed / actualParallelism : 0; + + // Warm up + await contexts[0]!.rankAll(QUERY, docs.slice(0, 2)); + + // Benchmark iterations + const times: number[] = []; + let peakRss = getMemUsage().rss; + + for (let iter = 0; iter < ITERATIONS; iter++) { + const chunkSize = Math.ceil(docs.length / actualParallelism); + + const t0 = performance.now(); + const allScores = await Promise.all( + Array.from({ length: actualParallelism }, (_, i) => { + const chunk = docs.slice(i * chunkSize, (i + 1) * chunkSize); + return chunk.length > 0 ? contexts[i]!.rankAll(QUERY, chunk) : Promise.resolve([]); + }) + ); + const elapsed = performance.now() - t0; + times.push(elapsed); + + // Verify scores are valid + const flat = allScores.flat(); + if (flat.some(s => s < 0 || s > 1 || isNaN(s))) { + throw new Error("Invalid scores detected"); + } + + const currentRss = getMemUsage().rss; + if (currentRss > peakRss) peakRss = currentRss; + } + + // Cleanup + for (const ctx of contexts) await ctx.dispose(); + + const med = median(times); + return { + parallelism: actualParallelism, + contextSize: CONTEXT_SIZE, + flashAttention: flash, + times, + medianMs: med, + docsPerSec: (docs.length / med) * 1000, + vramPerContext: vramPerCtx, + totalVram: vramUsed, + peakRss, + }; +} + +// ============================================================================ +// Main +// ============================================================================ + +async function main() { + console.log("═══════════════════════════════════════════════════════════════"); + console.log(" QMD Reranker Benchmark"); + console.log("═══════════════════════════════════════════════════════════════\n"); + + // Detect GPU + const gpuTypes = await getLlamaGpuTypes(); + const preferred = (["cuda", "metal", "vulkan"] as const).find(g => gpuTypes.includes(g)); + + let llama: Llama; + let gpuLabel: string; + if (preferred) { + try { + llama = await getLlama({ gpu: preferred, logLevel: LlamaLogLevel.error }); + gpuLabel = `${preferred}`; + } catch { + llama = await getLlama({ gpu: false, logLevel: LlamaLogLevel.error }); + gpuLabel = "cpu (gpu init failed)"; + } + } else { + llama = await getLlama({ gpu: false, logLevel: LlamaLogLevel.error }); + gpuLabel = "cpu"; + } + + // System info + const cpuInfo = cpus(); + const cpuModel = cpuInfo[0]?.model || "unknown"; + const cpuCount = cpuInfo.length; + + console.log("System"); + console.log(` CPU: ${cpuModel}`); + console.log(` Cores: ${cpuCount} (${llama.cpuMathCores} math)`); + console.log(` Device: ${gpuLabel}`); + + if (llama.gpu) { + const gpuNames = await llama.getGpuDeviceNames(); + const counts = new Map(); + for (const name of gpuNames) counts.set(name, (counts.get(name) || 0) + 1); + const devStr = Array.from(counts.entries()) + .map(([name, n]) => n > 1 ? `${n}× ${name}` : name).join(", "); + console.log(` GPU: ${devStr}`); + const vram = await llama.getVramState(); + console.log(` VRAM: ${formatBytes(vram.total)} total, ${formatBytes(vram.free)} free`); + } + + console.log(` RAM: ${formatBytes(getMemUsage().rss)} RSS at start`); + + // Load model + console.log(`\nModel`); + console.log(` URI: ${RERANK_MODEL}`); + const modelPath = await resolveModelFile(RERANK_MODEL, MODEL_CACHE); + const vramPreModel = llama.gpu ? await llama.getVramState() : null; + const model = await llama.loadModel({ modelPath }); + const vramPostModel = llama.gpu ? await llama.getVramState() : null; + const modelVram = vramPreModel && vramPostModel ? vramPostModel.used - vramPreModel.used : 0; + console.log(` Params: ${model.trainContextSize} train ctx`); + if (modelVram > 0) console.log(` VRAM: ${formatBytes(modelVram)} (model weights)`); + + // Generate test docs + const docs = generateDocs(DOC_COUNT); + console.log(`\nBenchmark`); + console.log(` Documents: ${DOC_COUNT}`); + console.log(` Ctx size: ${CONTEXT_SIZE}`); + console.log(` Iterations:${ITERATIONS}`); + console.log(` Query: "${QUERY.slice(0, 50)}..."`); + + // Run benchmarks + const results: BenchResult[] = []; + + for (const p of PARALLEL_CONFIGS) { + if (!llama.gpu && p > 1) { + // CPU: only test if we have enough cores (at least 4 per context) + if (llama.cpuMathCores < p * 4) { + console.log(`\n [${p} ctx] skipped (need ${p * 4} cores, have ${llama.cpuMathCores})`); + continue; + } + } + + // Test with flash attention + process.stdout.write(`\n [${p} ctx, flash] running...`); + try { + const r = await benchmarkConfig(model, llama, docs, p, true); + results.push(r); + process.stdout.write(` ${r.medianMs.toFixed(0)}ms (${r.docsPerSec.toFixed(1)} docs/s)\n`); + } catch (e: any) { + process.stdout.write(` failed: ${e.message}\n`); + // Try without flash + process.stdout.write(` [${p} ctx, no flash] running...`); + try { + const r = await benchmarkConfig(model, llama, docs, p, false); + results.push(r); + process.stdout.write(` ${r.medianMs.toFixed(0)}ms (${r.docsPerSec.toFixed(1)} docs/s)\n`); + } catch (e2: any) { + process.stdout.write(` failed: ${e2.message}\n`); + } + } + } + + // Summary table + console.log("\n═══════════════════════════════════════════════════════════════"); + console.log(" Results"); + console.log("═══════════════════════════════════════════════════════════════\n"); + + const header = " Ctx Flash Median Docs/s VRAM/ctx Total VRAM Peak RSS"; + const sep = " ─── ───── ────── ────── ──────── ────────── ────────"; + console.log(header); + console.log(sep); + + const baseline = results[0]?.medianMs ?? 1; + for (const r of results) { + const speedup = baseline / r.medianMs; + const speedupStr = r === results[0] ? " " : `(${speedup.toFixed(1)}×)`; + console.log( + ` ${String(r.parallelism).padStart(3)} ` + + `${r.flashAttention ? " yes " : " no "} ` + + `${r.medianMs.toFixed(0).padStart(5)}ms ` + + `${r.docsPerSec.toFixed(1).padStart(6)} ` + + `${formatBytes(r.vramPerContext).padStart(8)} ` + + `${formatBytes(r.totalVram).padStart(10)} ` + + `${formatBytes(r.peakRss).padStart(8)} ` + + speedupStr + ); + } + + // Best config + if (results.length > 0) { + const best = results.reduce((a, b) => a.docsPerSec > b.docsPerSec ? a : b); + console.log(`\n Best: ${best.parallelism} contexts, flash=${best.flashAttention}`); + console.log(` ${best.medianMs.toFixed(0)}ms for ${DOC_COUNT} docs (${best.docsPerSec.toFixed(1)} docs/s)`); + if (best.totalVram > 0) console.log(` ${formatBytes(best.totalVram)} VRAM`); + } + + console.log(""); + await model.dispose(); + await llama.dispose(); +} + +main().catch(console.error); diff --git a/src/llm.test.ts b/src/llm.test.ts index cc62825..bc20824 100644 --- a/src/llm.test.ts +++ b/src/llm.test.ts @@ -221,10 +221,15 @@ describe.skipIf(!!process.env.CI)("LlamaCpp Integration", () => { const successCount = allResults.filter(r => r !== null).length; expect(successCount).toBe(10); - // THE KEY ASSERTION: Only 1 context should be created, not 5 - // Without the fix, contextCreateCount would be 5 (one per concurrent embedBatch call) - console.log(`Context creation count: ${contextCreateCount} (expected: 1)`); - expect(contextCreateCount).toBe(1); + // THE KEY ASSERTION: Contexts should be created once (by ensureEmbedContexts), + // not duplicated per concurrent embedBatch call. The exact count depends on + // available VRAM (computeParallelism), but should not be 5 (one per call). + // Without the fix, contextCreateCount would be 5× the intended count (one set per concurrent call). + // With the promise guard, contexts are created exactly once regardless of concurrent callers. + // The count depends on VRAM (computeParallelism), but should be ≤ 8 (the cap). + console.log(`Context creation count: ${contextCreateCount} (expected: ≤ 8, not 5× duplicated)`); + expect(contextCreateCount).toBeGreaterThanOrEqual(1); + expect(contextCreateCount).toBeLessThanOrEqual(8); await freshLlm.dispose(); }, 60000); diff --git a/src/llm.ts b/src/llm.ts index ab39c86..31c5004 100644 --- a/src/llm.ts +++ b/src/llm.ts @@ -6,6 +6,7 @@ import { getLlama, + getLlamaGpuTypes, resolveModelFile, LlamaChatSession, LlamaLogLevel, @@ -354,10 +355,10 @@ const DEFAULT_INACTIVITY_TIMEOUT_MS = 5 * 60 * 1000; export class LlamaCpp implements LLM { private llama: Llama | null = null; private embedModel: LlamaModel | null = null; - private embedContext: LlamaEmbeddingContext | null = null; + private embedContexts: LlamaEmbeddingContext[] = []; private generateModel: LlamaModel | null = null; private rerankModel: LlamaModel | null = null; - private rerankContext: Awaited> | null = null; + private rerankContexts: Awaited>[] = []; private embedModelUri: string; private generateModelUri: string; @@ -366,7 +367,6 @@ export class LlamaCpp implements LLM { // Ensure we don't load the same model/context concurrently (which can allocate duplicate VRAM). private embedModelLoadPromise: Promise | null = null; - private embedContextCreatePromise: Promise | null = null; private generateModelLoadPromise: Promise | null = null; private rerankModelLoadPromise: Promise | null = null; @@ -423,7 +423,7 @@ export class LlamaCpp implements LLM { * Check if any contexts are currently loaded (and therefore worth unloading on inactivity). */ private hasLoadedContexts(): boolean { - return !!(this.embedContext || this.rerankContext); + return !!(this.embedContexts.length > 0 || this.rerankContexts.length > 0); } /** @@ -445,14 +445,14 @@ export class LlamaCpp implements LLM { } // Dispose contexts first - if (this.embedContext) { - await this.embedContext.dispose(); - this.embedContext = null; + for (const ctx of this.embedContexts) { + await ctx.dispose(); } - if (this.rerankContext) { - await this.rerankContext.dispose(); - this.rerankContext = null; + this.embedContexts = []; + for (const ctx of this.rerankContexts) { + await ctx.dispose(); } + this.rerankContexts = []; // Optionally dispose models too (opt-in) if (this.disposeModelsOnInactivity) { @@ -491,7 +491,33 @@ export class LlamaCpp implements LLM { */ private async ensureLlama(): Promise { if (!this.llama) { - this.llama = await getLlama({ logLevel: LlamaLogLevel.error }); + // Detect available GPU types and use the best one. + // We can't rely on gpu:"auto" — it returns false even when CUDA is available + // (likely a binary/build config issue in node-llama-cpp). + const gpuTypes = await getLlamaGpuTypes(); + // Prefer CUDA > Metal > Vulkan > CPU + const preferred = (["cuda", "metal", "vulkan"] as const).find(g => gpuTypes.includes(g)); + + let llama: Llama; + if (preferred) { + try { + llama = await getLlama({ gpu: preferred, logLevel: LlamaLogLevel.error }); + } catch { + llama = await getLlama({ gpu: false, logLevel: LlamaLogLevel.error }); + process.stderr.write( + `QMD Warning: ${preferred} reported available but failed to initialize. Falling back to CPU.\n` + ); + } + } else { + llama = await getLlama({ gpu: false, logLevel: LlamaLogLevel.error }); + } + + if (!llama.gpu) { + process.stderr.write( + "QMD Warning: no GPU acceleration, running on CPU (slow). Run 'qmd status' for details.\n" + ); + } + this.llama = llama; } return this.llama; } @@ -535,34 +561,92 @@ export class LlamaCpp implements LLM { } /** - * Load embedding context (lazy). Context can be disposed and recreated without reloading the model. - * Uses promise guard to prevent concurrent context creation race condition. + * Compute how many parallel contexts to create. + * + * GPU: constrained by VRAM (25% of free, capped at 8). + * CPU: constrained by cores. Splitting threads across contexts enables + * true parallelism (each context runs on its own cores). Use at most + * half the math cores, with at least 4 threads per context. */ - private async ensureEmbedContext(): Promise { - if (!this.embedContext) { - // If context creation is already in progress, wait for it - if (this.embedContextCreatePromise) { - return await this.embedContextCreatePromise; - } - - // Start context creation and store promise so concurrent calls wait - this.embedContextCreatePromise = (async () => { - const model = await this.ensureEmbedModel(); - const context = await model.createEmbeddingContext(); - this.embedContext = context; - return context; - })(); + private async computeParallelism(perContextMB: number): Promise { + const llama = await this.ensureLlama(); + if (llama.gpu) { try { - const context = await this.embedContextCreatePromise; - this.touchActivity(); - return context; - } finally { - this.embedContextCreatePromise = null; + const vram = await llama.getVramState(); + const freeMB = vram.free / (1024 * 1024); + const maxByVram = Math.floor((freeMB * 0.25) / perContextMB); + return Math.max(1, Math.min(8, maxByVram)); + } catch { + return 2; } } - this.touchActivity(); - return this.embedContext; + + // CPU: split cores across contexts. At least 4 threads per context. + const cores = llama.cpuMathCores || 4; + const maxContexts = Math.floor(cores / 4); + return Math.max(1, Math.min(4, maxContexts)); + } + + /** + * Get the number of threads each context should use, given N parallel contexts. + * Splits available math cores evenly across contexts. + */ + private async threadsPerContext(parallelism: number): Promise { + const llama = await this.ensureLlama(); + if (llama.gpu) return 0; // GPU: let the library decide + const cores = llama.cpuMathCores || 4; + return Math.max(1, Math.floor(cores / parallelism)); + } + + /** + * Load embedding contexts (lazy). Creates multiple for parallel embedding. + * Uses promise guard to prevent concurrent context creation race condition. + */ + private embedContextsCreatePromise: Promise | null = null; + + private async ensureEmbedContexts(): Promise { + if (this.embedContexts.length > 0) { + this.touchActivity(); + return this.embedContexts; + } + + if (this.embedContextsCreatePromise) { + return await this.embedContextsCreatePromise; + } + + this.embedContextsCreatePromise = (async () => { + const model = await this.ensureEmbedModel(); + // Embed contexts are ~143 MB each (nomic-embed 2048 ctx) + const n = await this.computeParallelism(150); + const threads = await this.threadsPerContext(n); + for (let i = 0; i < n; i++) { + try { + this.embedContexts.push(await model.createEmbeddingContext({ + ...(threads > 0 ? { threads } : {}), + })); + } catch { + if (this.embedContexts.length === 0) throw new Error("Failed to create any embedding context"); + break; + } + } + this.touchActivity(); + return this.embedContexts; + })(); + + try { + return await this.embedContextsCreatePromise; + } finally { + this.embedContextsCreatePromise = null; + } + } + + /** + * Get a single embed context (for single-embed calls). Uses first from pool. + */ + private async ensureEmbedContext(): Promise { + const contexts = await this.ensureEmbedContexts(); + return contexts[0]!; } /** @@ -624,15 +708,50 @@ export class LlamaCpp implements LLM { } /** - * Load rerank context (lazy). Context can be disposed and recreated without reloading the model. + * Load rerank contexts (lazy). Creates multiple contexts for parallel ranking. + * Each context has its own sequence, so they can evaluate independently. + * + * Tuning choices: + * - contextSize 1024: reranking chunks are ~800 tokens max, 1024 is plenty + * - flashAttention: ~20% less VRAM per context (568 vs 711 MB) + * - Combined: drops from 11.6 GB (auto, no flash) to 568 MB per context (20×) */ - private async ensureRerankContext(): Promise>> { - if (!this.rerankContext) { + // Qwen3 reranker template adds ~200 tokens overhead (system prompt, tags, etc.) + // Chunks are max 800 tokens, so 800 + 200 + query ≈ 1100 tokens typical. + // Use 2048 for safety margin. Still 17× less than auto (40960). + private static readonly RERANK_CONTEXT_SIZE = 2048; + + private async ensureRerankContexts(): Promise>[]> { + if (this.rerankContexts.length === 0) { const model = await this.ensureRerankModel(); - this.rerankContext = await model.createRankingContext(); + // ~960 MB per context with flash attention at contextSize 2048 + const n = await this.computeParallelism(1000); + const threads = await this.threadsPerContext(n); + for (let i = 0; i < n; i++) { + try { + this.rerankContexts.push(await model.createRankingContext({ + contextSize: LlamaCpp.RERANK_CONTEXT_SIZE, + flashAttention: true, + ...(threads > 0 ? { threads } : {}), + })); + } catch { + if (this.rerankContexts.length === 0) { + // Flash attention might not be supported — retry without it + try { + this.rerankContexts.push(await model.createRankingContext({ + contextSize: LlamaCpp.RERANK_CONTEXT_SIZE, + ...(threads > 0 ? { threads } : {}), + })); + } catch { + throw new Error("Failed to create any rerank context"); + } + } + break; + } + } } this.touchActivity(); - return this.rerankContext; + return this.rerankContexts; } // ========================================================================== @@ -703,26 +822,51 @@ export class LlamaCpp implements LLM { if (texts.length === 0) return []; try { - const context = await this.ensureEmbedContext(); + const contexts = await this.ensureEmbedContexts(); + const n = contexts.length; - // node-llama-cpp handles batching internally when we make parallel requests - const embeddings = await Promise.all( - texts.map(async (text) => { + if (n === 1) { + // Single context: sequential (no point splitting) + const context = contexts[0]!; + const embeddings = []; + for (const text of texts) { try { const embedding = await context.getEmbeddingFor(text); - this.touchActivity(); // Keep-alive during slow batches - return { - embedding: Array.from(embedding.vector), - model: this.embedModelUri, - }; + this.touchActivity(); + embeddings.push({ embedding: Array.from(embedding.vector), model: this.embedModelUri }); } catch (err) { console.error("Embedding error for text:", err); - return null; + embeddings.push(null); } + } + return embeddings; + } + + // Multiple contexts: split texts across contexts for parallel evaluation + const chunkSize = Math.ceil(texts.length / n); + const chunks = Array.from({ length: n }, (_, i) => + texts.slice(i * chunkSize, (i + 1) * chunkSize) + ); + + const chunkResults = await Promise.all( + chunks.map(async (chunk, i) => { + const ctx = contexts[i]!; + const results: (EmbeddingResult | null)[] = []; + for (const text of chunk) { + try { + const embedding = await ctx.getEmbeddingFor(text); + this.touchActivity(); + results.push({ embedding: Array.from(embedding.vector), model: this.embedModelUri }); + } catch (err) { + console.error("Embedding error for text:", err); + results.push(null); + } + } + return results; }) ); - return embeddings; + return chunkResults.flat(); } catch (error) { console.error("Batch embedding error:", error); return texts.map(() => null); @@ -879,7 +1023,7 @@ export class LlamaCpp implements LLM { // Ping activity at start to keep models alive during this operation this.touchActivity(); - const context = await this.ensureRerankContext(); + const contexts = await this.ensureRerankContexts(); // Build a map from document text to original indices (for lookup after sorting) const textToDoc = new Map(); @@ -890,8 +1034,24 @@ export class LlamaCpp implements LLM { // Extract just the text for ranking const texts = documents.map((doc) => doc.text); - // Use the proper ranking API - returns [{document: string, score: number}] sorted by score - const ranked = await context.rankAndSort(query, texts); + // Split documents across contexts for parallel evaluation. + // Each context has its own sequence with a lock, so parallelism comes + // from multiple contexts evaluating different chunks simultaneously. + const n = contexts.length; + const chunkSize = Math.ceil(texts.length / n); + const chunks = Array.from({ length: n }, (_, i) => + texts.slice(i * chunkSize, (i + 1) * chunkSize) + ).filter(chunk => chunk.length > 0); + + const allScores = await Promise.all( + chunks.map((chunk, i) => contexts[i]!.rankAll(query, chunk)) + ); + + // Reassemble scores in original order and sort + const flatScores = allScores.flat(); + const ranked = texts + .map((text, i) => ({ document: text, score: flatScores[i]! })) + .sort((a, b) => b.score - a.score); // Map back to our result format using the text-to-doc map const results: RerankDocumentResult[] = ranked.map((item) => { @@ -909,6 +1069,35 @@ export class LlamaCpp implements LLM { }; } + /** + * Get device/GPU info for status display. + * Initializes llama if not already done. + */ + async getDeviceInfo(): Promise<{ + gpu: string | false; + gpuOffloading: boolean; + gpuDevices: string[]; + vram?: { total: number; used: number; free: number }; + cpuCores: number; + }> { + const llama = await this.ensureLlama(); + const gpuDevices = await llama.getGpuDeviceNames(); + let vram: { total: number; used: number; free: number } | undefined; + if (llama.gpu) { + try { + const state = await llama.getVramState(); + vram = { total: state.total, used: state.used, free: state.free }; + } catch { /* no vram info */ } + } + return { + gpu: llama.gpu, + gpuOffloading: llama.supportsGpuOffloading, + gpuDevices, + vram, + cpuCores: llama.cpuMathCores, + }; + } + async dispose(): Promise { // Prevent double-dispose if (this.disposed) { @@ -932,8 +1121,8 @@ export class LlamaCpp implements LLM { } // Clear references - this.embedContext = null; - this.rerankContext = null; + this.embedContexts = []; + this.rerankContexts = []; this.embedModel = null; this.generateModel = null; this.rerankModel = null; @@ -941,7 +1130,7 @@ export class LlamaCpp implements LLM { // Clear any in-flight load/create promises this.embedModelLoadPromise = null; - this.embedContextCreatePromise = null; + this.embedContextsCreatePromise = null; this.generateModelLoadPromise = null; this.rerankModelLoadPromise = null; } diff --git a/src/mcp.ts b/src/mcp.ts index 24fb44f..fa9ddfc 100644 --- a/src/mcp.ts +++ b/src/mcp.ts @@ -1,4 +1,3 @@ -#!/usr/bin/env bun /** * QMD MCP Server - Model Context Protocol server for QMD * @@ -8,6 +7,8 @@ * Follows MCP spec 2025-06-18 for proper response types. */ +import { createServer, type IncomingMessage, type ServerResponse } from "node:http"; +import { fileURLToPath } from "url"; import { McpServer, ResourceTemplate } from "@modelcontextprotocol/sdk/server/mcp.js"; import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js"; import { WebStandardStreamableHTTPServerTransport } @@ -147,7 +148,7 @@ function buildInstructions(store: Store): string { */ function createMcpServer(store: Store): McpServer { const server = new McpServer( - { name: "qmd", version: "1.0.0" }, + { name: "qmd", version: "0.9.9" }, { instructions: buildInstructions(store) }, ); @@ -237,9 +238,7 @@ function createMcpServer(store: Store): McpServer { }, }, async ({ query, limit, minScore, collection }) => { - // Note: Collection filtering is now done post-search since collections are managed in YAML - const results = store.searchFTS(query, limit || 10) - .filter(r => !collection || r.collectionName === collection); + const results = store.searchFTS(query, limit || 10, collection); const filtered: SearchResultItem[] = results .filter(r => r.score >= (minScore || 0)) .map(r => { @@ -541,7 +540,7 @@ export async function startMcpServer(): Promise { // ============================================================================= export type HttpServerHandle = { - httpServer: ReturnType; + httpServer: import("http").Server; port: number; stop: () => Promise; }; @@ -588,47 +587,79 @@ export async function startMcpHttpServer(port: number, options?: { quiet?: boole if (!quiet) console.error(msg); } - const httpServer = Bun.serve({ - port, - hostname: "localhost", - async fetch(req) { - const reqStart = Date.now(); - const pathname = new URL(req.url).pathname; + // Helper to collect request body + async function collectBody(req: IncomingMessage): Promise { + const chunks: Buffer[] = []; + for await (const chunk of req) chunks.push(chunk as Buffer); + return Buffer.concat(chunks).toString(); + } - if (pathname === "/health" && req.method === "GET") { - const res = Response.json({ - status: "ok", - uptime: Math.floor((Date.now() - startTime) / 1000), - }); + const httpServer = createServer(async (nodeReq: IncomingMessage, nodeRes: ServerResponse) => { + const reqStart = Date.now(); + const pathname = nodeReq.url || "/"; + + try { + if (pathname === "/health" && nodeReq.method === "GET") { + const body = JSON.stringify({ status: "ok", uptime: Math.floor((Date.now() - startTime) / 1000) }); + nodeRes.writeHead(200, { "Content-Type": "application/json" }); + nodeRes.end(body); log(`${ts()} GET /health (${Date.now() - reqStart}ms)`); - return res; + return; } - if (pathname === "/mcp" && req.method === "POST") { - const body = await req.json(); + if (pathname === "/mcp" && nodeReq.method === "POST") { + const rawBody = await collectBody(nodeReq); + const body = JSON.parse(rawBody); const label = describeRequest(body); - const res = await transport.handleRequest(req, { parsedBody: body }); + const url = `http://localhost:${port}${pathname}`; + const headers: Record = {}; + for (const [k, v] of Object.entries(nodeReq.headers)) { + if (typeof v === "string") headers[k] = v; + } + const request = new Request(url, { method: "POST", headers, body: rawBody }); + const response = await transport.handleRequest(request, { parsedBody: body }); + nodeRes.writeHead(response.status, Object.fromEntries(response.headers)); + nodeRes.end(Buffer.from(await response.arrayBuffer())); log(`${ts()} POST /mcp ${label} (${Date.now() - reqStart}ms)`); - return res; + return; } - // Pass other methods (GET, DELETE) to transport for protocol handling if (pathname === "/mcp") { - return transport.handleRequest(req); + const url = `http://localhost:${port}${pathname}`; + const headers: Record = {}; + for (const [k, v] of Object.entries(nodeReq.headers)) { + if (typeof v === "string") headers[k] = v; + } + const rawBody = nodeReq.method !== "GET" && nodeReq.method !== "HEAD" ? await collectBody(nodeReq) : undefined; + const request = new Request(url, { method: nodeReq.method || "GET", headers, ...(rawBody ? { body: rawBody } : {}) }); + const response = await transport.handleRequest(request); + nodeRes.writeHead(response.status, Object.fromEntries(response.headers)); + nodeRes.end(Buffer.from(await response.arrayBuffer())); + return; } - return new Response("Not Found", { status: 404 }); - }, + nodeRes.writeHead(404); + nodeRes.end("Not Found"); + } catch (err) { + console.error("HTTP handler error:", err); + nodeRes.writeHead(500); + nodeRes.end("Internal Server Error"); + } }); - const actualPort = httpServer.port; + await new Promise((resolve, reject) => { + httpServer.on("error", reject); + httpServer.listen(port, "localhost", () => resolve()); + }); + + const actualPort = (httpServer.address() as import("net").AddressInfo).port; let stopping = false; const stop = async () => { if (stopping) return; stopping = true; await transport.close(); - httpServer.stop(); + httpServer.close(); store.close(); await disposeDefaultLlamaCpp(); }; @@ -649,6 +680,6 @@ export async function startMcpHttpServer(port: number, options?: { quiet?: boole } // Run if this is the main module -if (import.meta.main) { +if (fileURLToPath(import.meta.url) === process.argv[1] || process.argv[1]?.endsWith("/mcp.ts")) { startMcpServer().catch(console.error); } diff --git a/src/qmd.ts b/src/qmd.ts index e76b231..fc6f96d 100755 --- a/src/qmd.ts +++ b/src/qmd.ts @@ -65,7 +65,7 @@ import { createStore, getDefaultDbPath, } from "./store.js"; -import { disposeDefaultLlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "./llm.js"; +import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "./llm.js"; import { formatSearchResults, formatDocuments, @@ -249,7 +249,7 @@ function formatBytes(bytes: number): string { return `${(bytes / (1024 * 1024 * 1024)).toFixed(1)} GB`; } -function showStatus(): void { +async function showStatus(): Promise { const dbPath = getDbPath(); const db = getDb(); @@ -362,6 +362,36 @@ function showStatus(): void { console.log(`\n${c.dim}No collections. Run 'qmd collection add .' to index markdown files.${c.reset}`); } + // Device / GPU info + try { + const llm = getDefaultLlamaCpp(); + const device = await llm.getDeviceInfo(); + console.log(`\n${c.bold}Device${c.reset}`); + if (device.gpu) { + console.log(` GPU: ${c.green}${device.gpu}${c.reset} (offloading: ${device.gpuOffloading ? 'yes' : 'no'})`); + if (device.gpuDevices.length > 0) { + // Deduplicate and count GPUs + const counts = new Map(); + for (const name of device.gpuDevices) { + counts.set(name, (counts.get(name) || 0) + 1); + } + const deviceStr = Array.from(counts.entries()) + .map(([name, count]) => count > 1 ? `${count}× ${name}` : name) + .join(', '); + console.log(` Devices: ${deviceStr}`); + } + if (device.vram) { + console.log(` VRAM: ${formatBytes(device.vram.free)} free / ${formatBytes(device.vram.total)} total`); + } + } else { + console.log(` GPU: ${c.yellow}none${c.reset} (running on CPU — models will be slow)`); + console.log(` ${c.dim}Tip: Install CUDA, Vulkan, or Metal support for GPU acceleration.${c.reset}`); + } + console.log(` CPU: ${device.cpuCores} math cores`); + } catch { + // Don't fail status if LLM init fails + } + closeDb(); } @@ -1871,8 +1901,7 @@ function search(query: string, opts: OutputOptions): void { // Use large limit for --all, otherwise fetch more than needed and let outputResults filter const fetchLimit = opts.all ? 100000 : Math.max(50, opts.limit * 2); - // searchFTS accepts collection name as number parameter for legacy reasons (will be fixed in store.ts) - const results = searchFTS(db, query, fetchLimit, collectionName as any); + const results = searchFTS(db, query, fetchLimit, collectionName); // Add context to results const resultsWithContext = results.map(r => ({ @@ -2348,7 +2377,7 @@ if (import.meta.main) { } case "status": - showStatus(); + await showStatus(); break; case "update": diff --git a/src/store.test.ts b/src/store.test.ts index 639e149..6d5f4a2 100644 --- a/src/store.test.ts +++ b/src/store.test.ts @@ -1219,8 +1219,8 @@ describe("FTS Search", () => { const allResults = store.searchFTS("searchable", 10); expect(allResults).toHaveLength(2); - // Filter by collection name (collectionId is now treated as collection name string) - const filtered = store.searchFTS("searchable", 10, collection1 as unknown as number); + // Filter by collection name + const filtered = store.searchFTS("searchable", 10, collection1); expect(filtered).toHaveLength(1); expect(filtered[0]!.displayPath).toBe(`${collection1}/doc1.md`); diff --git a/src/store.ts b/src/store.ts index ae9ff4a..1e04511 100644 --- a/src/store.ts +++ b/src/store.ts @@ -830,8 +830,8 @@ export type Store = { toVirtualPath: (absolutePath: string) => string | null; // Search - searchFTS: (query: string, limit?: number, collectionId?: number) => SearchResult[]; - searchVec: (query: string, model: string, limit?: number, collectionName?: string) => Promise; + searchFTS: (query: string, limit?: number, collectionName?: string) => SearchResult[]; + searchVec: (query: string, model: string, limit?: number, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[]) => Promise; // Query expansion & reranking expandQuery: (query: string, model?: string) => Promise; @@ -913,8 +913,8 @@ export function createStore(dbPath?: string): Store { toVirtualPath: (absolutePath: string) => toVirtualPath(db, absolutePath), // Search - searchFTS: (query: string, limit?: number, collectionId?: number) => searchFTS(db, query, limit, collectionId), - searchVec: (query: string, model: string, limit?: number, collectionName?: string) => searchVec(db, query, model, limit, collectionName), + searchFTS: (query: string, limit?: number, collectionName?: string) => searchFTS(db, query, limit, collectionName), + searchVec: (query: string, model: string, limit?: number, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[]) => searchVec(db, query, model, limit, collectionName, session, precomputedEmbedding), // Query expansion & reranking expandQuery: (query: string, model?: string) => expandQuery(query, model, db), @@ -2020,7 +2020,7 @@ function buildFTS5Query(query: string): string | null { return terms.map(t => `"${t}"*`).join(' AND '); } -export function searchFTS(db: Database, query: string, limit: number = 20, collectionId?: number): SearchResult[] { +export function searchFTS(db: Database, query: string, limit: number = 20, collectionName?: string): SearchResult[] { const ftsQuery = buildFTS5Query(query); if (!ftsQuery) return []; @@ -2039,12 +2039,9 @@ export function searchFTS(db: Database, query: string, limit: number = 20, colle `; const params: (string | number)[] = [ftsQuery]; - if (collectionId) { - // Note: collectionId is a legacy parameter that should be phased out - // Collections are now managed in YAML. For now, we interpret it as a collection name filter. - // This code path is likely unused as collection filtering should be done at CLI level. + if (collectionName) { sql += ` AND d.collection = ?`; - params.push(String(collectionId)); + params.push(String(collectionName)); } // bm25 lower is better; sort ascending. @@ -2080,11 +2077,11 @@ export function searchFTS(db: Database, query: string, limit: number = 20, colle // Vector Search // ============================================================================= -export async function searchVec(db: Database, query: string, model: string, limit: number = 20, collectionName?: string, session?: ILLMSession): Promise { +export async function searchVec(db: Database, query: string, model: string, limit: number = 20, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[]): Promise { const tableExists = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get(); if (!tableExists) return []; - const embedding = await getEmbedding(query, model, true, session); + const embedding = precomputedEmbedding ?? await getEmbedding(query, model, true, session); if (!embedding) return []; // IMPORTANT: We use a two-step query approach here because sqlite-vec virtual tables @@ -2848,8 +2845,8 @@ export async function hybridQuery( ).get(); // Step 1: BM25 probe — strong signal skips expensive LLM expansion - const initialFts = store.searchFTS(query, 20) - .filter(r => !collection || r.collectionName === collection); + // Pass collection directly into FTS query (filter at SQL level, not post-hoc) + const initialFts = store.searchFTS(query, 20, collection); const topScore = initialFts[0]?.score ?? 0; const secondScore = initialFts[1]?.score ?? 0; const hasStrongSignal = initialFts.length > 0 @@ -2875,26 +2872,15 @@ export async function hybridQuery( } // Step 3: Route searches by query type - // Original query → vector search (FTS already covered by probe in step 1). - // Vector searches run sequentially — node-llama-cpp's embed context - // hangs on concurrent embed() calls (known limitation). - if (hasVectors) { - const vecResults = await store.searchVec(query, DEFAULT_EMBED_MODEL, 20, collection); - if (vecResults.length > 0) { - for (const r of vecResults) docidMap.set(r.filepath, r.docid); - rankedLists.push(vecResults.map(r => ({ - file: r.filepath, displayPath: r.displayPath, - title: r.title, body: r.body || "", score: r.score, - }))); - } - } + // + // Strategy: run all FTS queries immediately (they're sync/instant), then + // batch-embed all vector queries in one embedBatch() call, then run + // sqlite-vec lookups with pre-computed embeddings. - // Expanded queries → route by type: lex→FTS only, vec/hyde→vector only. - // This restores the CLI's query-type-aware routing that was lost in the initial refactor. + // 3a: Run FTS for all lex expansions right away (no LLM needed) for (const q of expanded) { if (q.type === 'lex') { - const ftsResults = store.searchFTS(q.text, 20) - .filter(r => !collection || r.collectionName === collection); + const ftsResults = store.searchFTS(q.text, 20, collection); if (ftsResults.length > 0) { for (const r of ftsResults) docidMap.set(r.filepath, r.docid); rankedLists.push(ftsResults.map(r => ({ @@ -2902,17 +2888,40 @@ export async function hybridQuery( title: r.title, body: r.body || "", score: r.score, }))); } - } else { - // vec or hyde → vector search only - if (hasVectors) { - const vecResults = await store.searchVec(q.text, DEFAULT_EMBED_MODEL, 20, collection); - if (vecResults.length > 0) { - for (const r of vecResults) docidMap.set(r.filepath, r.docid); - rankedLists.push(vecResults.map(r => ({ - file: r.filepath, displayPath: r.displayPath, - title: r.title, body: r.body || "", score: r.score, - }))); - } + } + } + + // 3b: Collect all texts that need vector search (original query + vec/hyde expansions) + if (hasVectors) { + const vecQueries: { text: string; isOriginal: boolean }[] = [ + { text: query, isOriginal: true }, + ]; + for (const q of expanded) { + if (q.type === 'vec' || q.type === 'hyde') { + vecQueries.push({ text: q.text, isOriginal: false }); + } + } + + // Batch embed all vector queries in a single call + const llm = getDefaultLlamaCpp(); + const textsToEmbed = vecQueries.map(q => formatQueryForEmbedding(q.text)); + const embeddings = await llm.embedBatch(textsToEmbed); + + // Run sqlite-vec lookups with pre-computed embeddings + for (let i = 0; i < vecQueries.length; i++) { + const embedding = embeddings[i]?.embedding; + if (!embedding) continue; + + const vecResults = await store.searchVec( + vecQueries[i]!.text, DEFAULT_EMBED_MODEL, 20, collection, + undefined, embedding + ); + if (vecResults.length > 0) { + for (const r of vecResults) docidMap.set(r.filepath, r.docid); + rankedLists.push(vecResults.map(r => ({ + file: r.filepath, displayPath: r.displayPath, + title: r.title, body: r.body || "", score: r.score, + }))); } } }