Merge origin/nodejs: Node.js compat, perf improvements, vitest

Brings in Node.js compatibility (tsx, vitest), GPU auto-detection,
parallel embedding/reranking contexts, and flash attention support.
Preserves @tobilu/qmd package scope and publish config from main.
This commit is contained in:
Tobi Lutke 2026-02-15 16:52:30 -04:00
commit 4df5505bd6
No known key found for this signature in database
9 changed files with 768 additions and 155 deletions

View File

@ -9,9 +9,15 @@ QMD combines BM25 full-text search, vector semantic search, and LLM re-ranking
## Quick Start
```sh
# Install globally
# Install globally (Node or Bun)
npm install -g @tobilu/qmd
# or
bun install -g @tobilu/qmd
# Or run directly
npx @tobilu/qmd ...
bunx @tobilu/qmd ...
# Create collections for your notes, docs, and meeting transcripts
qmd collection add ~/notes --name notes
qmd collection add ~/Documents/meetings --name meetings
@ -231,6 +237,7 @@ The `query` command uses **Reciprocal Rank Fusion (RRF)** with position-aware bl
### System Requirements
- **Node.js** >= 22
- **Bun** >= 1.0.0
- **macOS**: Homebrew SQLite (for extension support)
```sh
@ -252,18 +259,18 @@ Models are downloaded from HuggingFace and cached in `~/.cache/qmd/models/`.
## Installation
```sh
npm install -g @tobilu/qmd
# or
bun install -g @tobilu/qmd
```
Make sure `~/.bun/bin` is in your PATH.
### Development
```sh
git clone https://github.com/tobi/qmd
cd qmd
bun install
bun link
npm install
npm link
```
## Usage

View File

@ -1,6 +1,6 @@
{
"name": "@tobilu/qmd",
"version": "0.9.0",
"version": "0.9.9",
"description": "Query Markup Documents - On-device hybrid search for markdown files with BM25, vector search, and LLM reranking",
"type": "module",
"bin": {
@ -15,15 +15,26 @@
"CHANGELOG.md"
],
"scripts": {
"test": "bun test --preload ./src/test-preload.ts",
"qmd": "bun src/qmd.ts",
"index": "bun src/qmd.ts index",
"vector": "bun src/qmd.ts vector",
"search": "bun src/qmd.ts search",
"vsearch": "bun src/qmd.ts vsearch",
"rerank": "bun src/qmd.ts rerank",
"link": "bun link",
"inspector": "npx @modelcontextprotocol/inspector bun src/qmd.ts mcp",
"test": "vitest run",
"test:unit": "vitest run --reporter=verbose src/*.test.ts",
"test:models": "vitest run --reporter=verbose src/models/*.test.ts",
"test:integration": "vitest run --reporter=verbose src/integration/*.test.ts",
"test:unit:bun": "bun run vitest run --reporter=verbose --testTimeout=120000 src/*.test.ts",
"test:models:bun": "bun run vitest run --reporter=verbose --testTimeout=120000 src/models/*.test.ts",
"test:integration:bun": "bun run vitest run --reporter=verbose --testTimeout=120000 src/integration/*.test.ts",
"test:unit:node": "npx vitest run --reporter=verbose --testTimeout=120000 src/*.test.ts",
"test:models:node": "npx vitest run --reporter=verbose --testTimeout=120000 src/models/*.test.ts",
"test:integration:node": "npx vitest run --reporter=verbose --testTimeout=120000 src/integration/*.test.ts",
"test:ci:bun": "npm run test:unit:bun && npm run test:models:bun && npm run test:integration:bun",
"test:ci:node": "npm run test:unit:node && npm run test:models:node && npm run test:integration:node",
"test:ci": "npm run test:unit && npm run test:models && npm run test:integration",
"qmd": "tsx src/qmd.ts",
"index": "tsx src/qmd.ts index",
"vector": "tsx src/qmd.ts vector",
"search": "tsx src/qmd.ts search",
"vsearch": "tsx src/qmd.ts vsearch",
"rerank": "tsx src/qmd.ts rerank",
"inspector": "npx @modelcontextprotocol/inspector tsx src/qmd.ts mcp",
"release": "./scripts/release.sh"
},
"publishConfig": {
@ -39,7 +50,10 @@
},
"dependencies": {
"@modelcontextprotocol/sdk": "^1.25.1",
"better-sqlite3": "^11.0.0",
"fast-glob": "^3.3.0",
"node-llama-cpp": "^3.14.5",
"picomatch": "^4.0.0",
"sqlite-vec": "^0.1.7-alpha.2",
"yaml": "^2.8.2",
"zod": "^4.2.1"
@ -51,13 +65,15 @@
"sqlite-vec-win32-x64": "^0.1.7-alpha.2"
},
"devDependencies": {
"@types/bun": "latest"
"@types/better-sqlite3": "^7.6.0",
"tsx": "^4.0.0",
"vitest": "^3.0.0"
},
"peerDependencies": {
"typescript": "^5.9.3"
},
"engines": {
"bun": ">=1.0.0"
"node": ">=22.0.0"
},
"keywords": [
"markdown",

327
src/bench-rerank.ts Normal file
View File

@ -0,0 +1,327 @@
#!/usr/bin/env bun
/**
* QMD Reranker Benchmark
*
* Measures reranking performance across different configurations.
* Reports device, parallelism, memory, VRAM, and throughput.
*
* Usage:
* bun src/bench-rerank.ts # full benchmark
* bun src/bench-rerank.ts --quick # quick smoke test (10 docs, 1 iteration)
* bun src/bench-rerank.ts --docs 100 # custom doc count
*/
import {
getLlama,
getLlamaGpuTypes,
resolveModelFile,
LlamaLogLevel,
type Llama,
type LlamaModel,
} from "node-llama-cpp";
import { homedir } from "os";
import { join } from "path";
import { cpus } from "os";
// ============================================================================
// Config
// ============================================================================
const RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
const MODEL_CACHE = join(homedir(), ".cache", "qmd", "models");
const CONTEXT_SIZE = 2048;
const args = process.argv.slice(2);
const quick = args.includes("--quick");
const docsIdx = args.indexOf("--docs");
const DOC_COUNT = docsIdx >= 0 ? parseInt(args[docsIdx + 1]!) : (quick ? 10 : 40);
const ITERATIONS = quick ? 1 : 3;
const PARALLEL_CONFIGS = quick ? [1, 4] : [1, 2, 4, 8];
// ============================================================================
// Test data — realistic-ish chunks of varying length
// ============================================================================
const QUERY = "How do AI agents work and what are their limitations?";
function generateDocs(n: number): string[] {
const templates = [
"Artificial intelligence agents are software systems that perceive their environment and take actions to achieve goals. They use techniques like reinforcement learning, planning, and natural language processing to operate autonomously.",
"The transformer architecture, introduced in 2017, revolutionized natural language processing. Self-attention mechanisms allow models to weigh the importance of different parts of input sequences when generating outputs.",
"Machine learning models require careful evaluation to avoid overfitting. Cross-validation, holdout sets, and metrics like precision, recall, and F1 score help assess generalization performance.",
"Retrieval-augmented generation combines information retrieval with language models. Documents are embedded into vector spaces, retrieved based on query similarity, and used as context for generation.",
"Neural network training involves forward propagation, loss computation, and backpropagation. Optimizers like Adam and SGD adjust weights to minimize the loss function over training iterations.",
"Large language models exhibit emergent capabilities at scale, including few-shot learning, chain-of-thought reasoning, and instruction following. These properties were not explicitly trained for.",
"Embedding models convert text into dense vector representations that capture semantic meaning. Similar texts produce similar vectors, enabling efficient similarity search and clustering.",
"Autonomous agents face challenges including hallucination, lack of grounding, limited planning horizons, and difficulty with multi-step reasoning. Safety and alignment remain open research problems.",
"The attention mechanism computes query-key-value interactions to determine which parts of the input are most relevant. Multi-head attention allows the model to attend to different representation subspaces.",
"Fine-tuning adapts a pre-trained model to specific tasks using domain-specific data. Techniques like LoRA reduce the number of trainable parameters while maintaining performance.",
];
return Array.from({ length: n }, (_, i) => templates[i % templates.length]!);
}
// ============================================================================
// Helpers
// ============================================================================
function formatBytes(bytes: number): string {
if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)} KB`;
if (bytes < 1024 * 1024 * 1024) return `${(bytes / (1024 * 1024)).toFixed(1)} MB`;
return `${(bytes / (1024 * 1024 * 1024)).toFixed(2)} GB`;
}
function getMemUsage(): { rss: number; heapUsed: number } {
const m = process.memoryUsage();
return { rss: m.rss, heapUsed: m.heapUsed };
}
function median(arr: number[]): number {
const sorted = [...arr].sort((a, b) => a - b);
const mid = Math.floor(sorted.length / 2);
return sorted.length % 2 !== 0 ? sorted[mid]! : (sorted[mid - 1]! + sorted[mid]!) / 2;
}
// ============================================================================
// Benchmark runner
// ============================================================================
interface BenchResult {
parallelism: number;
contextSize: number;
flashAttention: boolean;
times: number[]; // ms per run
medianMs: number;
docsPerSec: number;
vramPerContext: number; // bytes
totalVram: number; // bytes
peakRss: number; // bytes
}
async function benchmarkConfig(
model: LlamaModel,
llama: Llama,
docs: string[],
parallelism: number,
flash: boolean,
): Promise<BenchResult> {
// Measure VRAM before
const vramBefore = llama.gpu ? await llama.getVramState() : null;
const rssBefore = getMemUsage().rss;
// Create contexts. On CPU, split threads evenly across contexts.
const cpuThreads = !llama.gpu ? Math.floor(llama.cpuMathCores / parallelism) : 0;
const contexts = [];
for (let i = 0; i < parallelism; i++) {
try {
contexts.push(await model.createRankingContext({
contextSize: CONTEXT_SIZE,
flashAttention: flash,
...(cpuThreads > 0 ? { threads: cpuThreads } : {}),
}));
} catch {
if (contexts.length === 0) {
// Try without flash
contexts.push(await model.createRankingContext({
contextSize: CONTEXT_SIZE,
...(cpuThreads > 0 ? { threads: cpuThreads } : {}),
}));
}
break;
}
}
const actualParallelism = contexts.length;
// Measure VRAM after context creation
const vramAfter = llama.gpu ? await llama.getVramState() : null;
const vramUsed = vramBefore && vramAfter ? vramAfter.used - vramBefore.used : 0;
const vramPerCtx = actualParallelism > 0 ? vramUsed / actualParallelism : 0;
// Warm up
await contexts[0]!.rankAll(QUERY, docs.slice(0, 2));
// Benchmark iterations
const times: number[] = [];
let peakRss = getMemUsage().rss;
for (let iter = 0; iter < ITERATIONS; iter++) {
const chunkSize = Math.ceil(docs.length / actualParallelism);
const t0 = performance.now();
const allScores = await Promise.all(
Array.from({ length: actualParallelism }, (_, i) => {
const chunk = docs.slice(i * chunkSize, (i + 1) * chunkSize);
return chunk.length > 0 ? contexts[i]!.rankAll(QUERY, chunk) : Promise.resolve([]);
})
);
const elapsed = performance.now() - t0;
times.push(elapsed);
// Verify scores are valid
const flat = allScores.flat();
if (flat.some(s => s < 0 || s > 1 || isNaN(s))) {
throw new Error("Invalid scores detected");
}
const currentRss = getMemUsage().rss;
if (currentRss > peakRss) peakRss = currentRss;
}
// Cleanup
for (const ctx of contexts) await ctx.dispose();
const med = median(times);
return {
parallelism: actualParallelism,
contextSize: CONTEXT_SIZE,
flashAttention: flash,
times,
medianMs: med,
docsPerSec: (docs.length / med) * 1000,
vramPerContext: vramPerCtx,
totalVram: vramUsed,
peakRss,
};
}
// ============================================================================
// Main
// ============================================================================
async function main() {
console.log("═══════════════════════════════════════════════════════════════");
console.log(" QMD Reranker Benchmark");
console.log("═══════════════════════════════════════════════════════════════\n");
// Detect GPU
const gpuTypes = await getLlamaGpuTypes();
const preferred = (["cuda", "metal", "vulkan"] as const).find(g => gpuTypes.includes(g));
let llama: Llama;
let gpuLabel: string;
if (preferred) {
try {
llama = await getLlama({ gpu: preferred, logLevel: LlamaLogLevel.error });
gpuLabel = `${preferred}`;
} catch {
llama = await getLlama({ gpu: false, logLevel: LlamaLogLevel.error });
gpuLabel = "cpu (gpu init failed)";
}
} else {
llama = await getLlama({ gpu: false, logLevel: LlamaLogLevel.error });
gpuLabel = "cpu";
}
// System info
const cpuInfo = cpus();
const cpuModel = cpuInfo[0]?.model || "unknown";
const cpuCount = cpuInfo.length;
console.log("System");
console.log(` CPU: ${cpuModel}`);
console.log(` Cores: ${cpuCount} (${llama.cpuMathCores} math)`);
console.log(` Device: ${gpuLabel}`);
if (llama.gpu) {
const gpuNames = await llama.getGpuDeviceNames();
const counts = new Map<string, number>();
for (const name of gpuNames) counts.set(name, (counts.get(name) || 0) + 1);
const devStr = Array.from(counts.entries())
.map(([name, n]) => n > 1 ? `${n}× ${name}` : name).join(", ");
console.log(` GPU: ${devStr}`);
const vram = await llama.getVramState();
console.log(` VRAM: ${formatBytes(vram.total)} total, ${formatBytes(vram.free)} free`);
}
console.log(` RAM: ${formatBytes(getMemUsage().rss)} RSS at start`);
// Load model
console.log(`\nModel`);
console.log(` URI: ${RERANK_MODEL}`);
const modelPath = await resolveModelFile(RERANK_MODEL, MODEL_CACHE);
const vramPreModel = llama.gpu ? await llama.getVramState() : null;
const model = await llama.loadModel({ modelPath });
const vramPostModel = llama.gpu ? await llama.getVramState() : null;
const modelVram = vramPreModel && vramPostModel ? vramPostModel.used - vramPreModel.used : 0;
console.log(` Params: ${model.trainContextSize} train ctx`);
if (modelVram > 0) console.log(` VRAM: ${formatBytes(modelVram)} (model weights)`);
// Generate test docs
const docs = generateDocs(DOC_COUNT);
console.log(`\nBenchmark`);
console.log(` Documents: ${DOC_COUNT}`);
console.log(` Ctx size: ${CONTEXT_SIZE}`);
console.log(` Iterations:${ITERATIONS}`);
console.log(` Query: "${QUERY.slice(0, 50)}..."`);
// Run benchmarks
const results: BenchResult[] = [];
for (const p of PARALLEL_CONFIGS) {
if (!llama.gpu && p > 1) {
// CPU: only test if we have enough cores (at least 4 per context)
if (llama.cpuMathCores < p * 4) {
console.log(`\n [${p} ctx] skipped (need ${p * 4} cores, have ${llama.cpuMathCores})`);
continue;
}
}
// Test with flash attention
process.stdout.write(`\n [${p} ctx, flash] running...`);
try {
const r = await benchmarkConfig(model, llama, docs, p, true);
results.push(r);
process.stdout.write(` ${r.medianMs.toFixed(0)}ms (${r.docsPerSec.toFixed(1)} docs/s)\n`);
} catch (e: any) {
process.stdout.write(` failed: ${e.message}\n`);
// Try without flash
process.stdout.write(` [${p} ctx, no flash] running...`);
try {
const r = await benchmarkConfig(model, llama, docs, p, false);
results.push(r);
process.stdout.write(` ${r.medianMs.toFixed(0)}ms (${r.docsPerSec.toFixed(1)} docs/s)\n`);
} catch (e2: any) {
process.stdout.write(` failed: ${e2.message}\n`);
}
}
}
// Summary table
console.log("\n═══════════════════════════════════════════════════════════════");
console.log(" Results");
console.log("═══════════════════════════════════════════════════════════════\n");
const header = " Ctx Flash Median Docs/s VRAM/ctx Total VRAM Peak RSS";
const sep = " ─── ───── ────── ────── ──────── ────────── ────────";
console.log(header);
console.log(sep);
const baseline = results[0]?.medianMs ?? 1;
for (const r of results) {
const speedup = baseline / r.medianMs;
const speedupStr = r === results[0] ? " " : `(${speedup.toFixed(1)}×)`;
console.log(
` ${String(r.parallelism).padStart(3)} ` +
`${r.flashAttention ? " yes " : " no "} ` +
`${r.medianMs.toFixed(0).padStart(5)}ms ` +
`${r.docsPerSec.toFixed(1).padStart(6)} ` +
`${formatBytes(r.vramPerContext).padStart(8)} ` +
`${formatBytes(r.totalVram).padStart(10)} ` +
`${formatBytes(r.peakRss).padStart(8)} ` +
speedupStr
);
}
// Best config
if (results.length > 0) {
const best = results.reduce((a, b) => a.docsPerSec > b.docsPerSec ? a : b);
console.log(`\n Best: ${best.parallelism} contexts, flash=${best.flashAttention}`);
console.log(` ${best.medianMs.toFixed(0)}ms for ${DOC_COUNT} docs (${best.docsPerSec.toFixed(1)} docs/s)`);
if (best.totalVram > 0) console.log(` ${formatBytes(best.totalVram)} VRAM`);
}
console.log("");
await model.dispose();
await llama.dispose();
}
main().catch(console.error);

View File

@ -221,10 +221,15 @@ describe.skipIf(!!process.env.CI)("LlamaCpp Integration", () => {
const successCount = allResults.filter(r => r !== null).length;
expect(successCount).toBe(10);
// THE KEY ASSERTION: Only 1 context should be created, not 5
// Without the fix, contextCreateCount would be 5 (one per concurrent embedBatch call)
console.log(`Context creation count: ${contextCreateCount} (expected: 1)`);
expect(contextCreateCount).toBe(1);
// THE KEY ASSERTION: Contexts should be created once (by ensureEmbedContexts),
// not duplicated per concurrent embedBatch call. The exact count depends on
// available VRAM (computeParallelism), but should not be 5 (one per call).
// Without the fix, contextCreateCount would be 5× the intended count (one set per concurrent call).
// With the promise guard, contexts are created exactly once regardless of concurrent callers.
// The count depends on VRAM (computeParallelism), but should be ≤ 8 (the cap).
console.log(`Context creation count: ${contextCreateCount} (expected: ≤ 8, not 5× duplicated)`);
expect(contextCreateCount).toBeGreaterThanOrEqual(1);
expect(contextCreateCount).toBeLessThanOrEqual(8);
await freshLlm.dispose();
}, 60000);

View File

@ -6,6 +6,7 @@
import {
getLlama,
getLlamaGpuTypes,
resolveModelFile,
LlamaChatSession,
LlamaLogLevel,
@ -354,10 +355,10 @@ const DEFAULT_INACTIVITY_TIMEOUT_MS = 5 * 60 * 1000;
export class LlamaCpp implements LLM {
private llama: Llama | null = null;
private embedModel: LlamaModel | null = null;
private embedContext: LlamaEmbeddingContext | null = null;
private embedContexts: LlamaEmbeddingContext[] = [];
private generateModel: LlamaModel | null = null;
private rerankModel: LlamaModel | null = null;
private rerankContext: Awaited<ReturnType<LlamaModel["createRankingContext"]>> | null = null;
private rerankContexts: Awaited<ReturnType<LlamaModel["createRankingContext"]>>[] = [];
private embedModelUri: string;
private generateModelUri: string;
@ -366,7 +367,6 @@ export class LlamaCpp implements LLM {
// Ensure we don't load the same model/context concurrently (which can allocate duplicate VRAM).
private embedModelLoadPromise: Promise<LlamaModel> | null = null;
private embedContextCreatePromise: Promise<LlamaEmbeddingContext> | null = null;
private generateModelLoadPromise: Promise<LlamaModel> | null = null;
private rerankModelLoadPromise: Promise<LlamaModel> | null = null;
@ -423,7 +423,7 @@ export class LlamaCpp implements LLM {
* Check if any contexts are currently loaded (and therefore worth unloading on inactivity).
*/
private hasLoadedContexts(): boolean {
return !!(this.embedContext || this.rerankContext);
return !!(this.embedContexts.length > 0 || this.rerankContexts.length > 0);
}
/**
@ -445,14 +445,14 @@ export class LlamaCpp implements LLM {
}
// Dispose contexts first
if (this.embedContext) {
await this.embedContext.dispose();
this.embedContext = null;
for (const ctx of this.embedContexts) {
await ctx.dispose();
}
if (this.rerankContext) {
await this.rerankContext.dispose();
this.rerankContext = null;
this.embedContexts = [];
for (const ctx of this.rerankContexts) {
await ctx.dispose();
}
this.rerankContexts = [];
// Optionally dispose models too (opt-in)
if (this.disposeModelsOnInactivity) {
@ -491,7 +491,33 @@ export class LlamaCpp implements LLM {
*/
private async ensureLlama(): Promise<Llama> {
if (!this.llama) {
this.llama = await getLlama({ logLevel: LlamaLogLevel.error });
// Detect available GPU types and use the best one.
// We can't rely on gpu:"auto" — it returns false even when CUDA is available
// (likely a binary/build config issue in node-llama-cpp).
const gpuTypes = await getLlamaGpuTypes();
// Prefer CUDA > Metal > Vulkan > CPU
const preferred = (["cuda", "metal", "vulkan"] as const).find(g => gpuTypes.includes(g));
let llama: Llama;
if (preferred) {
try {
llama = await getLlama({ gpu: preferred, logLevel: LlamaLogLevel.error });
} catch {
llama = await getLlama({ gpu: false, logLevel: LlamaLogLevel.error });
process.stderr.write(
`QMD Warning: ${preferred} reported available but failed to initialize. Falling back to CPU.\n`
);
}
} else {
llama = await getLlama({ gpu: false, logLevel: LlamaLogLevel.error });
}
if (!llama.gpu) {
process.stderr.write(
"QMD Warning: no GPU acceleration, running on CPU (slow). Run 'qmd status' for details.\n"
);
}
this.llama = llama;
}
return this.llama;
}
@ -535,34 +561,92 @@ export class LlamaCpp implements LLM {
}
/**
* Load embedding context (lazy). Context can be disposed and recreated without reloading the model.
* Uses promise guard to prevent concurrent context creation race condition.
* Compute how many parallel contexts to create.
*
* GPU: constrained by VRAM (25% of free, capped at 8).
* CPU: constrained by cores. Splitting threads across contexts enables
* true parallelism (each context runs on its own cores). Use at most
* half the math cores, with at least 4 threads per context.
*/
private async ensureEmbedContext(): Promise<LlamaEmbeddingContext> {
if (!this.embedContext) {
// If context creation is already in progress, wait for it
if (this.embedContextCreatePromise) {
return await this.embedContextCreatePromise;
}
// Start context creation and store promise so concurrent calls wait
this.embedContextCreatePromise = (async () => {
const model = await this.ensureEmbedModel();
const context = await model.createEmbeddingContext();
this.embedContext = context;
return context;
})();
private async computeParallelism(perContextMB: number): Promise<number> {
const llama = await this.ensureLlama();
if (llama.gpu) {
try {
const context = await this.embedContextCreatePromise;
this.touchActivity();
return context;
} finally {
this.embedContextCreatePromise = null;
const vram = await llama.getVramState();
const freeMB = vram.free / (1024 * 1024);
const maxByVram = Math.floor((freeMB * 0.25) / perContextMB);
return Math.max(1, Math.min(8, maxByVram));
} catch {
return 2;
}
}
this.touchActivity();
return this.embedContext;
// CPU: split cores across contexts. At least 4 threads per context.
const cores = llama.cpuMathCores || 4;
const maxContexts = Math.floor(cores / 4);
return Math.max(1, Math.min(4, maxContexts));
}
/**
* Get the number of threads each context should use, given N parallel contexts.
* Splits available math cores evenly across contexts.
*/
private async threadsPerContext(parallelism: number): Promise<number> {
const llama = await this.ensureLlama();
if (llama.gpu) return 0; // GPU: let the library decide
const cores = llama.cpuMathCores || 4;
return Math.max(1, Math.floor(cores / parallelism));
}
/**
* Load embedding contexts (lazy). Creates multiple for parallel embedding.
* Uses promise guard to prevent concurrent context creation race condition.
*/
private embedContextsCreatePromise: Promise<LlamaEmbeddingContext[]> | null = null;
private async ensureEmbedContexts(): Promise<LlamaEmbeddingContext[]> {
if (this.embedContexts.length > 0) {
this.touchActivity();
return this.embedContexts;
}
if (this.embedContextsCreatePromise) {
return await this.embedContextsCreatePromise;
}
this.embedContextsCreatePromise = (async () => {
const model = await this.ensureEmbedModel();
// Embed contexts are ~143 MB each (nomic-embed 2048 ctx)
const n = await this.computeParallelism(150);
const threads = await this.threadsPerContext(n);
for (let i = 0; i < n; i++) {
try {
this.embedContexts.push(await model.createEmbeddingContext({
...(threads > 0 ? { threads } : {}),
}));
} catch {
if (this.embedContexts.length === 0) throw new Error("Failed to create any embedding context");
break;
}
}
this.touchActivity();
return this.embedContexts;
})();
try {
return await this.embedContextsCreatePromise;
} finally {
this.embedContextsCreatePromise = null;
}
}
/**
* Get a single embed context (for single-embed calls). Uses first from pool.
*/
private async ensureEmbedContext(): Promise<LlamaEmbeddingContext> {
const contexts = await this.ensureEmbedContexts();
return contexts[0]!;
}
/**
@ -624,15 +708,50 @@ export class LlamaCpp implements LLM {
}
/**
* Load rerank context (lazy). Context can be disposed and recreated without reloading the model.
* Load rerank contexts (lazy). Creates multiple contexts for parallel ranking.
* Each context has its own sequence, so they can evaluate independently.
*
* Tuning choices:
* - contextSize 1024: reranking chunks are ~800 tokens max, 1024 is plenty
* - flashAttention: ~20% less VRAM per context (568 vs 711 MB)
* - Combined: drops from 11.6 GB (auto, no flash) to 568 MB per context (20×)
*/
private async ensureRerankContext(): Promise<Awaited<ReturnType<LlamaModel["createRankingContext"]>>> {
if (!this.rerankContext) {
// Qwen3 reranker template adds ~200 tokens overhead (system prompt, tags, etc.)
// Chunks are max 800 tokens, so 800 + 200 + query ≈ 1100 tokens typical.
// Use 2048 for safety margin. Still 17× less than auto (40960).
private static readonly RERANK_CONTEXT_SIZE = 2048;
private async ensureRerankContexts(): Promise<Awaited<ReturnType<LlamaModel["createRankingContext"]>>[]> {
if (this.rerankContexts.length === 0) {
const model = await this.ensureRerankModel();
this.rerankContext = await model.createRankingContext();
// ~960 MB per context with flash attention at contextSize 2048
const n = await this.computeParallelism(1000);
const threads = await this.threadsPerContext(n);
for (let i = 0; i < n; i++) {
try {
this.rerankContexts.push(await model.createRankingContext({
contextSize: LlamaCpp.RERANK_CONTEXT_SIZE,
flashAttention: true,
...(threads > 0 ? { threads } : {}),
}));
} catch {
if (this.rerankContexts.length === 0) {
// Flash attention might not be supported — retry without it
try {
this.rerankContexts.push(await model.createRankingContext({
contextSize: LlamaCpp.RERANK_CONTEXT_SIZE,
...(threads > 0 ? { threads } : {}),
}));
} catch {
throw new Error("Failed to create any rerank context");
}
}
break;
}
}
}
this.touchActivity();
return this.rerankContext;
return this.rerankContexts;
}
// ==========================================================================
@ -703,26 +822,51 @@ export class LlamaCpp implements LLM {
if (texts.length === 0) return [];
try {
const context = await this.ensureEmbedContext();
const contexts = await this.ensureEmbedContexts();
const n = contexts.length;
// node-llama-cpp handles batching internally when we make parallel requests
const embeddings = await Promise.all(
texts.map(async (text) => {
if (n === 1) {
// Single context: sequential (no point splitting)
const context = contexts[0]!;
const embeddings = [];
for (const text of texts) {
try {
const embedding = await context.getEmbeddingFor(text);
this.touchActivity(); // Keep-alive during slow batches
return {
embedding: Array.from(embedding.vector),
model: this.embedModelUri,
};
this.touchActivity();
embeddings.push({ embedding: Array.from(embedding.vector), model: this.embedModelUri });
} catch (err) {
console.error("Embedding error for text:", err);
return null;
embeddings.push(null);
}
}
return embeddings;
}
// Multiple contexts: split texts across contexts for parallel evaluation
const chunkSize = Math.ceil(texts.length / n);
const chunks = Array.from({ length: n }, (_, i) =>
texts.slice(i * chunkSize, (i + 1) * chunkSize)
);
const chunkResults = await Promise.all(
chunks.map(async (chunk, i) => {
const ctx = contexts[i]!;
const results: (EmbeddingResult | null)[] = [];
for (const text of chunk) {
try {
const embedding = await ctx.getEmbeddingFor(text);
this.touchActivity();
results.push({ embedding: Array.from(embedding.vector), model: this.embedModelUri });
} catch (err) {
console.error("Embedding error for text:", err);
results.push(null);
}
}
return results;
})
);
return embeddings;
return chunkResults.flat();
} catch (error) {
console.error("Batch embedding error:", error);
return texts.map(() => null);
@ -879,7 +1023,7 @@ export class LlamaCpp implements LLM {
// Ping activity at start to keep models alive during this operation
this.touchActivity();
const context = await this.ensureRerankContext();
const contexts = await this.ensureRerankContexts();
// Build a map from document text to original indices (for lookup after sorting)
const textToDoc = new Map<string, { file: string; index: number }>();
@ -890,8 +1034,24 @@ export class LlamaCpp implements LLM {
// Extract just the text for ranking
const texts = documents.map((doc) => doc.text);
// Use the proper ranking API - returns [{document: string, score: number}] sorted by score
const ranked = await context.rankAndSort(query, texts);
// Split documents across contexts for parallel evaluation.
// Each context has its own sequence with a lock, so parallelism comes
// from multiple contexts evaluating different chunks simultaneously.
const n = contexts.length;
const chunkSize = Math.ceil(texts.length / n);
const chunks = Array.from({ length: n }, (_, i) =>
texts.slice(i * chunkSize, (i + 1) * chunkSize)
).filter(chunk => chunk.length > 0);
const allScores = await Promise.all(
chunks.map((chunk, i) => contexts[i]!.rankAll(query, chunk))
);
// Reassemble scores in original order and sort
const flatScores = allScores.flat();
const ranked = texts
.map((text, i) => ({ document: text, score: flatScores[i]! }))
.sort((a, b) => b.score - a.score);
// Map back to our result format using the text-to-doc map
const results: RerankDocumentResult[] = ranked.map((item) => {
@ -909,6 +1069,35 @@ export class LlamaCpp implements LLM {
};
}
/**
* Get device/GPU info for status display.
* Initializes llama if not already done.
*/
async getDeviceInfo(): Promise<{
gpu: string | false;
gpuOffloading: boolean;
gpuDevices: string[];
vram?: { total: number; used: number; free: number };
cpuCores: number;
}> {
const llama = await this.ensureLlama();
const gpuDevices = await llama.getGpuDeviceNames();
let vram: { total: number; used: number; free: number } | undefined;
if (llama.gpu) {
try {
const state = await llama.getVramState();
vram = { total: state.total, used: state.used, free: state.free };
} catch { /* no vram info */ }
}
return {
gpu: llama.gpu,
gpuOffloading: llama.supportsGpuOffloading,
gpuDevices,
vram,
cpuCores: llama.cpuMathCores,
};
}
async dispose(): Promise<void> {
// Prevent double-dispose
if (this.disposed) {
@ -932,8 +1121,8 @@ export class LlamaCpp implements LLM {
}
// Clear references
this.embedContext = null;
this.rerankContext = null;
this.embedContexts = [];
this.rerankContexts = [];
this.embedModel = null;
this.generateModel = null;
this.rerankModel = null;
@ -941,7 +1130,7 @@ export class LlamaCpp implements LLM {
// Clear any in-flight load/create promises
this.embedModelLoadPromise = null;
this.embedContextCreatePromise = null;
this.embedContextsCreatePromise = null;
this.generateModelLoadPromise = null;
this.rerankModelLoadPromise = null;
}

View File

@ -1,4 +1,3 @@
#!/usr/bin/env bun
/**
* QMD MCP Server - Model Context Protocol server for QMD
*
@ -8,6 +7,8 @@
* Follows MCP spec 2025-06-18 for proper response types.
*/
import { createServer, type IncomingMessage, type ServerResponse } from "node:http";
import { fileURLToPath } from "url";
import { McpServer, ResourceTemplate } from "@modelcontextprotocol/sdk/server/mcp.js";
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
import { WebStandardStreamableHTTPServerTransport }
@ -147,7 +148,7 @@ function buildInstructions(store: Store): string {
*/
function createMcpServer(store: Store): McpServer {
const server = new McpServer(
{ name: "qmd", version: "1.0.0" },
{ name: "qmd", version: "0.9.9" },
{ instructions: buildInstructions(store) },
);
@ -237,9 +238,7 @@ function createMcpServer(store: Store): McpServer {
},
},
async ({ query, limit, minScore, collection }) => {
// Note: Collection filtering is now done post-search since collections are managed in YAML
const results = store.searchFTS(query, limit || 10)
.filter(r => !collection || r.collectionName === collection);
const results = store.searchFTS(query, limit || 10, collection);
const filtered: SearchResultItem[] = results
.filter(r => r.score >= (minScore || 0))
.map(r => {
@ -541,7 +540,7 @@ export async function startMcpServer(): Promise<void> {
// =============================================================================
export type HttpServerHandle = {
httpServer: ReturnType<typeof Bun.serve>;
httpServer: import("http").Server;
port: number;
stop: () => Promise<void>;
};
@ -588,47 +587,79 @@ export async function startMcpHttpServer(port: number, options?: { quiet?: boole
if (!quiet) console.error(msg);
}
const httpServer = Bun.serve({
port,
hostname: "localhost",
async fetch(req) {
const reqStart = Date.now();
const pathname = new URL(req.url).pathname;
// Helper to collect request body
async function collectBody(req: IncomingMessage): Promise<string> {
const chunks: Buffer[] = [];
for await (const chunk of req) chunks.push(chunk as Buffer);
return Buffer.concat(chunks).toString();
}
if (pathname === "/health" && req.method === "GET") {
const res = Response.json({
status: "ok",
uptime: Math.floor((Date.now() - startTime) / 1000),
});
const httpServer = createServer(async (nodeReq: IncomingMessage, nodeRes: ServerResponse) => {
const reqStart = Date.now();
const pathname = nodeReq.url || "/";
try {
if (pathname === "/health" && nodeReq.method === "GET") {
const body = JSON.stringify({ status: "ok", uptime: Math.floor((Date.now() - startTime) / 1000) });
nodeRes.writeHead(200, { "Content-Type": "application/json" });
nodeRes.end(body);
log(`${ts()} GET /health (${Date.now() - reqStart}ms)`);
return res;
return;
}
if (pathname === "/mcp" && req.method === "POST") {
const body = await req.json();
if (pathname === "/mcp" && nodeReq.method === "POST") {
const rawBody = await collectBody(nodeReq);
const body = JSON.parse(rawBody);
const label = describeRequest(body);
const res = await transport.handleRequest(req, { parsedBody: body });
const url = `http://localhost:${port}${pathname}`;
const headers: Record<string, string> = {};
for (const [k, v] of Object.entries(nodeReq.headers)) {
if (typeof v === "string") headers[k] = v;
}
const request = new Request(url, { method: "POST", headers, body: rawBody });
const response = await transport.handleRequest(request, { parsedBody: body });
nodeRes.writeHead(response.status, Object.fromEntries(response.headers));
nodeRes.end(Buffer.from(await response.arrayBuffer()));
log(`${ts()} POST /mcp ${label} (${Date.now() - reqStart}ms)`);
return res;
return;
}
// Pass other methods (GET, DELETE) to transport for protocol handling
if (pathname === "/mcp") {
return transport.handleRequest(req);
const url = `http://localhost:${port}${pathname}`;
const headers: Record<string, string> = {};
for (const [k, v] of Object.entries(nodeReq.headers)) {
if (typeof v === "string") headers[k] = v;
}
const rawBody = nodeReq.method !== "GET" && nodeReq.method !== "HEAD" ? await collectBody(nodeReq) : undefined;
const request = new Request(url, { method: nodeReq.method || "GET", headers, ...(rawBody ? { body: rawBody } : {}) });
const response = await transport.handleRequest(request);
nodeRes.writeHead(response.status, Object.fromEntries(response.headers));
nodeRes.end(Buffer.from(await response.arrayBuffer()));
return;
}
return new Response("Not Found", { status: 404 });
},
nodeRes.writeHead(404);
nodeRes.end("Not Found");
} catch (err) {
console.error("HTTP handler error:", err);
nodeRes.writeHead(500);
nodeRes.end("Internal Server Error");
}
});
const actualPort = httpServer.port;
await new Promise<void>((resolve, reject) => {
httpServer.on("error", reject);
httpServer.listen(port, "localhost", () => resolve());
});
const actualPort = (httpServer.address() as import("net").AddressInfo).port;
let stopping = false;
const stop = async () => {
if (stopping) return;
stopping = true;
await transport.close();
httpServer.stop();
httpServer.close();
store.close();
await disposeDefaultLlamaCpp();
};
@ -649,6 +680,6 @@ export async function startMcpHttpServer(port: number, options?: { quiet?: boole
}
// Run if this is the main module
if (import.meta.main) {
if (fileURLToPath(import.meta.url) === process.argv[1] || process.argv[1]?.endsWith("/mcp.ts")) {
startMcpServer().catch(console.error);
}

View File

@ -65,7 +65,7 @@ import {
createStore,
getDefaultDbPath,
} from "./store.js";
import { disposeDefaultLlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "./llm.js";
import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "./llm.js";
import {
formatSearchResults,
formatDocuments,
@ -249,7 +249,7 @@ function formatBytes(bytes: number): string {
return `${(bytes / (1024 * 1024 * 1024)).toFixed(1)} GB`;
}
function showStatus(): void {
async function showStatus(): Promise<void> {
const dbPath = getDbPath();
const db = getDb();
@ -362,6 +362,36 @@ function showStatus(): void {
console.log(`\n${c.dim}No collections. Run 'qmd collection add .' to index markdown files.${c.reset}`);
}
// Device / GPU info
try {
const llm = getDefaultLlamaCpp();
const device = await llm.getDeviceInfo();
console.log(`\n${c.bold}Device${c.reset}`);
if (device.gpu) {
console.log(` GPU: ${c.green}${device.gpu}${c.reset} (offloading: ${device.gpuOffloading ? 'yes' : 'no'})`);
if (device.gpuDevices.length > 0) {
// Deduplicate and count GPUs
const counts = new Map<string, number>();
for (const name of device.gpuDevices) {
counts.set(name, (counts.get(name) || 0) + 1);
}
const deviceStr = Array.from(counts.entries())
.map(([name, count]) => count > 1 ? `${count}× ${name}` : name)
.join(', ');
console.log(` Devices: ${deviceStr}`);
}
if (device.vram) {
console.log(` VRAM: ${formatBytes(device.vram.free)} free / ${formatBytes(device.vram.total)} total`);
}
} else {
console.log(` GPU: ${c.yellow}none${c.reset} (running on CPU — models will be slow)`);
console.log(` ${c.dim}Tip: Install CUDA, Vulkan, or Metal support for GPU acceleration.${c.reset}`);
}
console.log(` CPU: ${device.cpuCores} math cores`);
} catch {
// Don't fail status if LLM init fails
}
closeDb();
}
@ -1871,8 +1901,7 @@ function search(query: string, opts: OutputOptions): void {
// Use large limit for --all, otherwise fetch more than needed and let outputResults filter
const fetchLimit = opts.all ? 100000 : Math.max(50, opts.limit * 2);
// searchFTS accepts collection name as number parameter for legacy reasons (will be fixed in store.ts)
const results = searchFTS(db, query, fetchLimit, collectionName as any);
const results = searchFTS(db, query, fetchLimit, collectionName);
// Add context to results
const resultsWithContext = results.map(r => ({
@ -2348,7 +2377,7 @@ if (import.meta.main) {
}
case "status":
showStatus();
await showStatus();
break;
case "update":

View File

@ -1219,8 +1219,8 @@ describe("FTS Search", () => {
const allResults = store.searchFTS("searchable", 10);
expect(allResults).toHaveLength(2);
// Filter by collection name (collectionId is now treated as collection name string)
const filtered = store.searchFTS("searchable", 10, collection1 as unknown as number);
// Filter by collection name
const filtered = store.searchFTS("searchable", 10, collection1);
expect(filtered).toHaveLength(1);
expect(filtered[0]!.displayPath).toBe(`${collection1}/doc1.md`);

View File

@ -830,8 +830,8 @@ export type Store = {
toVirtualPath: (absolutePath: string) => string | null;
// Search
searchFTS: (query: string, limit?: number, collectionId?: number) => SearchResult[];
searchVec: (query: string, model: string, limit?: number, collectionName?: string) => Promise<SearchResult[]>;
searchFTS: (query: string, limit?: number, collectionName?: string) => SearchResult[];
searchVec: (query: string, model: string, limit?: number, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[]) => Promise<SearchResult[]>;
// Query expansion & reranking
expandQuery: (query: string, model?: string) => Promise<ExpandedQuery[]>;
@ -913,8 +913,8 @@ export function createStore(dbPath?: string): Store {
toVirtualPath: (absolutePath: string) => toVirtualPath(db, absolutePath),
// Search
searchFTS: (query: string, limit?: number, collectionId?: number) => searchFTS(db, query, limit, collectionId),
searchVec: (query: string, model: string, limit?: number, collectionName?: string) => searchVec(db, query, model, limit, collectionName),
searchFTS: (query: string, limit?: number, collectionName?: string) => searchFTS(db, query, limit, collectionName),
searchVec: (query: string, model: string, limit?: number, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[]) => searchVec(db, query, model, limit, collectionName, session, precomputedEmbedding),
// Query expansion & reranking
expandQuery: (query: string, model?: string) => expandQuery(query, model, db),
@ -2020,7 +2020,7 @@ function buildFTS5Query(query: string): string | null {
return terms.map(t => `"${t}"*`).join(' AND ');
}
export function searchFTS(db: Database, query: string, limit: number = 20, collectionId?: number): SearchResult[] {
export function searchFTS(db: Database, query: string, limit: number = 20, collectionName?: string): SearchResult[] {
const ftsQuery = buildFTS5Query(query);
if (!ftsQuery) return [];
@ -2039,12 +2039,9 @@ export function searchFTS(db: Database, query: string, limit: number = 20, colle
`;
const params: (string | number)[] = [ftsQuery];
if (collectionId) {
// Note: collectionId is a legacy parameter that should be phased out
// Collections are now managed in YAML. For now, we interpret it as a collection name filter.
// This code path is likely unused as collection filtering should be done at CLI level.
if (collectionName) {
sql += ` AND d.collection = ?`;
params.push(String(collectionId));
params.push(String(collectionName));
}
// bm25 lower is better; sort ascending.
@ -2080,11 +2077,11 @@ export function searchFTS(db: Database, query: string, limit: number = 20, colle
// Vector Search
// =============================================================================
export async function searchVec(db: Database, query: string, model: string, limit: number = 20, collectionName?: string, session?: ILLMSession): Promise<SearchResult[]> {
export async function searchVec(db: Database, query: string, model: string, limit: number = 20, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[]): Promise<SearchResult[]> {
const tableExists = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
if (!tableExists) return [];
const embedding = await getEmbedding(query, model, true, session);
const embedding = precomputedEmbedding ?? await getEmbedding(query, model, true, session);
if (!embedding) return [];
// IMPORTANT: We use a two-step query approach here because sqlite-vec virtual tables
@ -2848,8 +2845,8 @@ export async function hybridQuery(
).get();
// Step 1: BM25 probe — strong signal skips expensive LLM expansion
const initialFts = store.searchFTS(query, 20)
.filter(r => !collection || r.collectionName === collection);
// Pass collection directly into FTS query (filter at SQL level, not post-hoc)
const initialFts = store.searchFTS(query, 20, collection);
const topScore = initialFts[0]?.score ?? 0;
const secondScore = initialFts[1]?.score ?? 0;
const hasStrongSignal = initialFts.length > 0
@ -2875,26 +2872,15 @@ export async function hybridQuery(
}
// Step 3: Route searches by query type
// Original query → vector search (FTS already covered by probe in step 1).
// Vector searches run sequentially — node-llama-cpp's embed context
// hangs on concurrent embed() calls (known limitation).
if (hasVectors) {
const vecResults = await store.searchVec(query, DEFAULT_EMBED_MODEL, 20, collection);
if (vecResults.length > 0) {
for (const r of vecResults) docidMap.set(r.filepath, r.docid);
rankedLists.push(vecResults.map(r => ({
file: r.filepath, displayPath: r.displayPath,
title: r.title, body: r.body || "", score: r.score,
})));
}
}
//
// Strategy: run all FTS queries immediately (they're sync/instant), then
// batch-embed all vector queries in one embedBatch() call, then run
// sqlite-vec lookups with pre-computed embeddings.
// Expanded queries → route by type: lex→FTS only, vec/hyde→vector only.
// This restores the CLI's query-type-aware routing that was lost in the initial refactor.
// 3a: Run FTS for all lex expansions right away (no LLM needed)
for (const q of expanded) {
if (q.type === 'lex') {
const ftsResults = store.searchFTS(q.text, 20)
.filter(r => !collection || r.collectionName === collection);
const ftsResults = store.searchFTS(q.text, 20, collection);
if (ftsResults.length > 0) {
for (const r of ftsResults) docidMap.set(r.filepath, r.docid);
rankedLists.push(ftsResults.map(r => ({
@ -2902,17 +2888,40 @@ export async function hybridQuery(
title: r.title, body: r.body || "", score: r.score,
})));
}
} else {
// vec or hyde → vector search only
if (hasVectors) {
const vecResults = await store.searchVec(q.text, DEFAULT_EMBED_MODEL, 20, collection);
if (vecResults.length > 0) {
for (const r of vecResults) docidMap.set(r.filepath, r.docid);
rankedLists.push(vecResults.map(r => ({
file: r.filepath, displayPath: r.displayPath,
title: r.title, body: r.body || "", score: r.score,
})));
}
}
}
// 3b: Collect all texts that need vector search (original query + vec/hyde expansions)
if (hasVectors) {
const vecQueries: { text: string; isOriginal: boolean }[] = [
{ text: query, isOriginal: true },
];
for (const q of expanded) {
if (q.type === 'vec' || q.type === 'hyde') {
vecQueries.push({ text: q.text, isOriginal: false });
}
}
// Batch embed all vector queries in a single call
const llm = getDefaultLlamaCpp();
const textsToEmbed = vecQueries.map(q => formatQueryForEmbedding(q.text));
const embeddings = await llm.embedBatch(textsToEmbed);
// Run sqlite-vec lookups with pre-computed embeddings
for (let i = 0; i < vecQueries.length; i++) {
const embedding = embeddings[i]?.embedding;
if (!embedding) continue;
const vecResults = await store.searchVec(
vecQueries[i]!.text, DEFAULT_EMBED_MODEL, 20, collection,
undefined, embedding
);
if (vecResults.length > 0) {
for (const r of vecResults) docidMap.set(r.filepath, r.docid);
rankedLists.push(vecResults.map(r => ({
file: r.filepath, displayPath: r.displayPath,
title: r.title, body: r.body || "", score: r.score,
})));
}
}
}