Merge origin/nodejs: Node.js compat, perf improvements, vitest
Brings in Node.js compatibility (tsx, vitest), GPU auto-detection, parallel embedding/reranking contexts, and flash attention support. Preserves @tobilu/qmd package scope and publish config from main.
This commit is contained in:
commit
4df5505bd6
17
README.md
17
README.md
@ -9,9 +9,15 @@ QMD combines BM25 full-text search, vector semantic search, and LLM re-ranking
|
||||
## Quick Start
|
||||
|
||||
```sh
|
||||
# Install globally
|
||||
# Install globally (Node or Bun)
|
||||
npm install -g @tobilu/qmd
|
||||
# or
|
||||
bun install -g @tobilu/qmd
|
||||
|
||||
# Or run directly
|
||||
npx @tobilu/qmd ...
|
||||
bunx @tobilu/qmd ...
|
||||
|
||||
# Create collections for your notes, docs, and meeting transcripts
|
||||
qmd collection add ~/notes --name notes
|
||||
qmd collection add ~/Documents/meetings --name meetings
|
||||
@ -231,6 +237,7 @@ The `query` command uses **Reciprocal Rank Fusion (RRF)** with position-aware bl
|
||||
|
||||
### System Requirements
|
||||
|
||||
- **Node.js** >= 22
|
||||
- **Bun** >= 1.0.0
|
||||
- **macOS**: Homebrew SQLite (for extension support)
|
||||
```sh
|
||||
@ -252,18 +259,18 @@ Models are downloaded from HuggingFace and cached in `~/.cache/qmd/models/`.
|
||||
## Installation
|
||||
|
||||
```sh
|
||||
npm install -g @tobilu/qmd
|
||||
# or
|
||||
bun install -g @tobilu/qmd
|
||||
```
|
||||
|
||||
Make sure `~/.bun/bin` is in your PATH.
|
||||
|
||||
### Development
|
||||
|
||||
```sh
|
||||
git clone https://github.com/tobi/qmd
|
||||
cd qmd
|
||||
bun install
|
||||
bun link
|
||||
npm install
|
||||
npm link
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
40
package.json
40
package.json
@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@tobilu/qmd",
|
||||
"version": "0.9.0",
|
||||
"version": "0.9.9",
|
||||
"description": "Query Markup Documents - On-device hybrid search for markdown files with BM25, vector search, and LLM reranking",
|
||||
"type": "module",
|
||||
"bin": {
|
||||
@ -15,15 +15,26 @@
|
||||
"CHANGELOG.md"
|
||||
],
|
||||
"scripts": {
|
||||
"test": "bun test --preload ./src/test-preload.ts",
|
||||
"qmd": "bun src/qmd.ts",
|
||||
"index": "bun src/qmd.ts index",
|
||||
"vector": "bun src/qmd.ts vector",
|
||||
"search": "bun src/qmd.ts search",
|
||||
"vsearch": "bun src/qmd.ts vsearch",
|
||||
"rerank": "bun src/qmd.ts rerank",
|
||||
"link": "bun link",
|
||||
"inspector": "npx @modelcontextprotocol/inspector bun src/qmd.ts mcp",
|
||||
"test": "vitest run",
|
||||
"test:unit": "vitest run --reporter=verbose src/*.test.ts",
|
||||
"test:models": "vitest run --reporter=verbose src/models/*.test.ts",
|
||||
"test:integration": "vitest run --reporter=verbose src/integration/*.test.ts",
|
||||
"test:unit:bun": "bun run vitest run --reporter=verbose --testTimeout=120000 src/*.test.ts",
|
||||
"test:models:bun": "bun run vitest run --reporter=verbose --testTimeout=120000 src/models/*.test.ts",
|
||||
"test:integration:bun": "bun run vitest run --reporter=verbose --testTimeout=120000 src/integration/*.test.ts",
|
||||
"test:unit:node": "npx vitest run --reporter=verbose --testTimeout=120000 src/*.test.ts",
|
||||
"test:models:node": "npx vitest run --reporter=verbose --testTimeout=120000 src/models/*.test.ts",
|
||||
"test:integration:node": "npx vitest run --reporter=verbose --testTimeout=120000 src/integration/*.test.ts",
|
||||
"test:ci:bun": "npm run test:unit:bun && npm run test:models:bun && npm run test:integration:bun",
|
||||
"test:ci:node": "npm run test:unit:node && npm run test:models:node && npm run test:integration:node",
|
||||
"test:ci": "npm run test:unit && npm run test:models && npm run test:integration",
|
||||
"qmd": "tsx src/qmd.ts",
|
||||
"index": "tsx src/qmd.ts index",
|
||||
"vector": "tsx src/qmd.ts vector",
|
||||
"search": "tsx src/qmd.ts search",
|
||||
"vsearch": "tsx src/qmd.ts vsearch",
|
||||
"rerank": "tsx src/qmd.ts rerank",
|
||||
"inspector": "npx @modelcontextprotocol/inspector tsx src/qmd.ts mcp",
|
||||
"release": "./scripts/release.sh"
|
||||
},
|
||||
"publishConfig": {
|
||||
@ -39,7 +50,10 @@
|
||||
},
|
||||
"dependencies": {
|
||||
"@modelcontextprotocol/sdk": "^1.25.1",
|
||||
"better-sqlite3": "^11.0.0",
|
||||
"fast-glob": "^3.3.0",
|
||||
"node-llama-cpp": "^3.14.5",
|
||||
"picomatch": "^4.0.0",
|
||||
"sqlite-vec": "^0.1.7-alpha.2",
|
||||
"yaml": "^2.8.2",
|
||||
"zod": "^4.2.1"
|
||||
@ -51,13 +65,15 @@
|
||||
"sqlite-vec-win32-x64": "^0.1.7-alpha.2"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/bun": "latest"
|
||||
"@types/better-sqlite3": "^7.6.0",
|
||||
"tsx": "^4.0.0",
|
||||
"vitest": "^3.0.0"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"typescript": "^5.9.3"
|
||||
},
|
||||
"engines": {
|
||||
"bun": ">=1.0.0"
|
||||
"node": ">=22.0.0"
|
||||
},
|
||||
"keywords": [
|
||||
"markdown",
|
||||
|
||||
327
src/bench-rerank.ts
Normal file
327
src/bench-rerank.ts
Normal file
@ -0,0 +1,327 @@
|
||||
#!/usr/bin/env bun
|
||||
/**
|
||||
* QMD Reranker Benchmark
|
||||
*
|
||||
* Measures reranking performance across different configurations.
|
||||
* Reports device, parallelism, memory, VRAM, and throughput.
|
||||
*
|
||||
* Usage:
|
||||
* bun src/bench-rerank.ts # full benchmark
|
||||
* bun src/bench-rerank.ts --quick # quick smoke test (10 docs, 1 iteration)
|
||||
* bun src/bench-rerank.ts --docs 100 # custom doc count
|
||||
*/
|
||||
|
||||
import {
|
||||
getLlama,
|
||||
getLlamaGpuTypes,
|
||||
resolveModelFile,
|
||||
LlamaLogLevel,
|
||||
type Llama,
|
||||
type LlamaModel,
|
||||
} from "node-llama-cpp";
|
||||
import { homedir } from "os";
|
||||
import { join } from "path";
|
||||
import { cpus } from "os";
|
||||
|
||||
// ============================================================================
|
||||
// Config
|
||||
// ============================================================================
|
||||
|
||||
const RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
|
||||
const MODEL_CACHE = join(homedir(), ".cache", "qmd", "models");
|
||||
const CONTEXT_SIZE = 2048;
|
||||
|
||||
const args = process.argv.slice(2);
|
||||
const quick = args.includes("--quick");
|
||||
const docsIdx = args.indexOf("--docs");
|
||||
const DOC_COUNT = docsIdx >= 0 ? parseInt(args[docsIdx + 1]!) : (quick ? 10 : 40);
|
||||
const ITERATIONS = quick ? 1 : 3;
|
||||
const PARALLEL_CONFIGS = quick ? [1, 4] : [1, 2, 4, 8];
|
||||
|
||||
// ============================================================================
|
||||
// Test data — realistic-ish chunks of varying length
|
||||
// ============================================================================
|
||||
|
||||
const QUERY = "How do AI agents work and what are their limitations?";
|
||||
|
||||
function generateDocs(n: number): string[] {
|
||||
const templates = [
|
||||
"Artificial intelligence agents are software systems that perceive their environment and take actions to achieve goals. They use techniques like reinforcement learning, planning, and natural language processing to operate autonomously.",
|
||||
"The transformer architecture, introduced in 2017, revolutionized natural language processing. Self-attention mechanisms allow models to weigh the importance of different parts of input sequences when generating outputs.",
|
||||
"Machine learning models require careful evaluation to avoid overfitting. Cross-validation, holdout sets, and metrics like precision, recall, and F1 score help assess generalization performance.",
|
||||
"Retrieval-augmented generation combines information retrieval with language models. Documents are embedded into vector spaces, retrieved based on query similarity, and used as context for generation.",
|
||||
"Neural network training involves forward propagation, loss computation, and backpropagation. Optimizers like Adam and SGD adjust weights to minimize the loss function over training iterations.",
|
||||
"Large language models exhibit emergent capabilities at scale, including few-shot learning, chain-of-thought reasoning, and instruction following. These properties were not explicitly trained for.",
|
||||
"Embedding models convert text into dense vector representations that capture semantic meaning. Similar texts produce similar vectors, enabling efficient similarity search and clustering.",
|
||||
"Autonomous agents face challenges including hallucination, lack of grounding, limited planning horizons, and difficulty with multi-step reasoning. Safety and alignment remain open research problems.",
|
||||
"The attention mechanism computes query-key-value interactions to determine which parts of the input are most relevant. Multi-head attention allows the model to attend to different representation subspaces.",
|
||||
"Fine-tuning adapts a pre-trained model to specific tasks using domain-specific data. Techniques like LoRA reduce the number of trainable parameters while maintaining performance.",
|
||||
];
|
||||
return Array.from({ length: n }, (_, i) => templates[i % templates.length]!);
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Helpers
|
||||
// ============================================================================
|
||||
|
||||
function formatBytes(bytes: number): string {
|
||||
if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)} KB`;
|
||||
if (bytes < 1024 * 1024 * 1024) return `${(bytes / (1024 * 1024)).toFixed(1)} MB`;
|
||||
return `${(bytes / (1024 * 1024 * 1024)).toFixed(2)} GB`;
|
||||
}
|
||||
|
||||
function getMemUsage(): { rss: number; heapUsed: number } {
|
||||
const m = process.memoryUsage();
|
||||
return { rss: m.rss, heapUsed: m.heapUsed };
|
||||
}
|
||||
|
||||
function median(arr: number[]): number {
|
||||
const sorted = [...arr].sort((a, b) => a - b);
|
||||
const mid = Math.floor(sorted.length / 2);
|
||||
return sorted.length % 2 !== 0 ? sorted[mid]! : (sorted[mid - 1]! + sorted[mid]!) / 2;
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Benchmark runner
|
||||
// ============================================================================
|
||||
|
||||
interface BenchResult {
|
||||
parallelism: number;
|
||||
contextSize: number;
|
||||
flashAttention: boolean;
|
||||
times: number[]; // ms per run
|
||||
medianMs: number;
|
||||
docsPerSec: number;
|
||||
vramPerContext: number; // bytes
|
||||
totalVram: number; // bytes
|
||||
peakRss: number; // bytes
|
||||
}
|
||||
|
||||
async function benchmarkConfig(
|
||||
model: LlamaModel,
|
||||
llama: Llama,
|
||||
docs: string[],
|
||||
parallelism: number,
|
||||
flash: boolean,
|
||||
): Promise<BenchResult> {
|
||||
// Measure VRAM before
|
||||
const vramBefore = llama.gpu ? await llama.getVramState() : null;
|
||||
const rssBefore = getMemUsage().rss;
|
||||
|
||||
// Create contexts. On CPU, split threads evenly across contexts.
|
||||
const cpuThreads = !llama.gpu ? Math.floor(llama.cpuMathCores / parallelism) : 0;
|
||||
const contexts = [];
|
||||
for (let i = 0; i < parallelism; i++) {
|
||||
try {
|
||||
contexts.push(await model.createRankingContext({
|
||||
contextSize: CONTEXT_SIZE,
|
||||
flashAttention: flash,
|
||||
...(cpuThreads > 0 ? { threads: cpuThreads } : {}),
|
||||
}));
|
||||
} catch {
|
||||
if (contexts.length === 0) {
|
||||
// Try without flash
|
||||
contexts.push(await model.createRankingContext({
|
||||
contextSize: CONTEXT_SIZE,
|
||||
...(cpuThreads > 0 ? { threads: cpuThreads } : {}),
|
||||
}));
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
const actualParallelism = contexts.length;
|
||||
|
||||
// Measure VRAM after context creation
|
||||
const vramAfter = llama.gpu ? await llama.getVramState() : null;
|
||||
const vramUsed = vramBefore && vramAfter ? vramAfter.used - vramBefore.used : 0;
|
||||
const vramPerCtx = actualParallelism > 0 ? vramUsed / actualParallelism : 0;
|
||||
|
||||
// Warm up
|
||||
await contexts[0]!.rankAll(QUERY, docs.slice(0, 2));
|
||||
|
||||
// Benchmark iterations
|
||||
const times: number[] = [];
|
||||
let peakRss = getMemUsage().rss;
|
||||
|
||||
for (let iter = 0; iter < ITERATIONS; iter++) {
|
||||
const chunkSize = Math.ceil(docs.length / actualParallelism);
|
||||
|
||||
const t0 = performance.now();
|
||||
const allScores = await Promise.all(
|
||||
Array.from({ length: actualParallelism }, (_, i) => {
|
||||
const chunk = docs.slice(i * chunkSize, (i + 1) * chunkSize);
|
||||
return chunk.length > 0 ? contexts[i]!.rankAll(QUERY, chunk) : Promise.resolve([]);
|
||||
})
|
||||
);
|
||||
const elapsed = performance.now() - t0;
|
||||
times.push(elapsed);
|
||||
|
||||
// Verify scores are valid
|
||||
const flat = allScores.flat();
|
||||
if (flat.some(s => s < 0 || s > 1 || isNaN(s))) {
|
||||
throw new Error("Invalid scores detected");
|
||||
}
|
||||
|
||||
const currentRss = getMemUsage().rss;
|
||||
if (currentRss > peakRss) peakRss = currentRss;
|
||||
}
|
||||
|
||||
// Cleanup
|
||||
for (const ctx of contexts) await ctx.dispose();
|
||||
|
||||
const med = median(times);
|
||||
return {
|
||||
parallelism: actualParallelism,
|
||||
contextSize: CONTEXT_SIZE,
|
||||
flashAttention: flash,
|
||||
times,
|
||||
medianMs: med,
|
||||
docsPerSec: (docs.length / med) * 1000,
|
||||
vramPerContext: vramPerCtx,
|
||||
totalVram: vramUsed,
|
||||
peakRss,
|
||||
};
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Main
|
||||
// ============================================================================
|
||||
|
||||
async function main() {
|
||||
console.log("═══════════════════════════════════════════════════════════════");
|
||||
console.log(" QMD Reranker Benchmark");
|
||||
console.log("═══════════════════════════════════════════════════════════════\n");
|
||||
|
||||
// Detect GPU
|
||||
const gpuTypes = await getLlamaGpuTypes();
|
||||
const preferred = (["cuda", "metal", "vulkan"] as const).find(g => gpuTypes.includes(g));
|
||||
|
||||
let llama: Llama;
|
||||
let gpuLabel: string;
|
||||
if (preferred) {
|
||||
try {
|
||||
llama = await getLlama({ gpu: preferred, logLevel: LlamaLogLevel.error });
|
||||
gpuLabel = `${preferred}`;
|
||||
} catch {
|
||||
llama = await getLlama({ gpu: false, logLevel: LlamaLogLevel.error });
|
||||
gpuLabel = "cpu (gpu init failed)";
|
||||
}
|
||||
} else {
|
||||
llama = await getLlama({ gpu: false, logLevel: LlamaLogLevel.error });
|
||||
gpuLabel = "cpu";
|
||||
}
|
||||
|
||||
// System info
|
||||
const cpuInfo = cpus();
|
||||
const cpuModel = cpuInfo[0]?.model || "unknown";
|
||||
const cpuCount = cpuInfo.length;
|
||||
|
||||
console.log("System");
|
||||
console.log(` CPU: ${cpuModel}`);
|
||||
console.log(` Cores: ${cpuCount} (${llama.cpuMathCores} math)`);
|
||||
console.log(` Device: ${gpuLabel}`);
|
||||
|
||||
if (llama.gpu) {
|
||||
const gpuNames = await llama.getGpuDeviceNames();
|
||||
const counts = new Map<string, number>();
|
||||
for (const name of gpuNames) counts.set(name, (counts.get(name) || 0) + 1);
|
||||
const devStr = Array.from(counts.entries())
|
||||
.map(([name, n]) => n > 1 ? `${n}× ${name}` : name).join(", ");
|
||||
console.log(` GPU: ${devStr}`);
|
||||
const vram = await llama.getVramState();
|
||||
console.log(` VRAM: ${formatBytes(vram.total)} total, ${formatBytes(vram.free)} free`);
|
||||
}
|
||||
|
||||
console.log(` RAM: ${formatBytes(getMemUsage().rss)} RSS at start`);
|
||||
|
||||
// Load model
|
||||
console.log(`\nModel`);
|
||||
console.log(` URI: ${RERANK_MODEL}`);
|
||||
const modelPath = await resolveModelFile(RERANK_MODEL, MODEL_CACHE);
|
||||
const vramPreModel = llama.gpu ? await llama.getVramState() : null;
|
||||
const model = await llama.loadModel({ modelPath });
|
||||
const vramPostModel = llama.gpu ? await llama.getVramState() : null;
|
||||
const modelVram = vramPreModel && vramPostModel ? vramPostModel.used - vramPreModel.used : 0;
|
||||
console.log(` Params: ${model.trainContextSize} train ctx`);
|
||||
if (modelVram > 0) console.log(` VRAM: ${formatBytes(modelVram)} (model weights)`);
|
||||
|
||||
// Generate test docs
|
||||
const docs = generateDocs(DOC_COUNT);
|
||||
console.log(`\nBenchmark`);
|
||||
console.log(` Documents: ${DOC_COUNT}`);
|
||||
console.log(` Ctx size: ${CONTEXT_SIZE}`);
|
||||
console.log(` Iterations:${ITERATIONS}`);
|
||||
console.log(` Query: "${QUERY.slice(0, 50)}..."`);
|
||||
|
||||
// Run benchmarks
|
||||
const results: BenchResult[] = [];
|
||||
|
||||
for (const p of PARALLEL_CONFIGS) {
|
||||
if (!llama.gpu && p > 1) {
|
||||
// CPU: only test if we have enough cores (at least 4 per context)
|
||||
if (llama.cpuMathCores < p * 4) {
|
||||
console.log(`\n [${p} ctx] skipped (need ${p * 4} cores, have ${llama.cpuMathCores})`);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// Test with flash attention
|
||||
process.stdout.write(`\n [${p} ctx, flash] running...`);
|
||||
try {
|
||||
const r = await benchmarkConfig(model, llama, docs, p, true);
|
||||
results.push(r);
|
||||
process.stdout.write(` ${r.medianMs.toFixed(0)}ms (${r.docsPerSec.toFixed(1)} docs/s)\n`);
|
||||
} catch (e: any) {
|
||||
process.stdout.write(` failed: ${e.message}\n`);
|
||||
// Try without flash
|
||||
process.stdout.write(` [${p} ctx, no flash] running...`);
|
||||
try {
|
||||
const r = await benchmarkConfig(model, llama, docs, p, false);
|
||||
results.push(r);
|
||||
process.stdout.write(` ${r.medianMs.toFixed(0)}ms (${r.docsPerSec.toFixed(1)} docs/s)\n`);
|
||||
} catch (e2: any) {
|
||||
process.stdout.write(` failed: ${e2.message}\n`);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Summary table
|
||||
console.log("\n═══════════════════════════════════════════════════════════════");
|
||||
console.log(" Results");
|
||||
console.log("═══════════════════════════════════════════════════════════════\n");
|
||||
|
||||
const header = " Ctx Flash Median Docs/s VRAM/ctx Total VRAM Peak RSS";
|
||||
const sep = " ─── ───── ────── ────── ──────── ────────── ────────";
|
||||
console.log(header);
|
||||
console.log(sep);
|
||||
|
||||
const baseline = results[0]?.medianMs ?? 1;
|
||||
for (const r of results) {
|
||||
const speedup = baseline / r.medianMs;
|
||||
const speedupStr = r === results[0] ? " " : `(${speedup.toFixed(1)}×)`;
|
||||
console.log(
|
||||
` ${String(r.parallelism).padStart(3)} ` +
|
||||
`${r.flashAttention ? " yes " : " no "} ` +
|
||||
`${r.medianMs.toFixed(0).padStart(5)}ms ` +
|
||||
`${r.docsPerSec.toFixed(1).padStart(6)} ` +
|
||||
`${formatBytes(r.vramPerContext).padStart(8)} ` +
|
||||
`${formatBytes(r.totalVram).padStart(10)} ` +
|
||||
`${formatBytes(r.peakRss).padStart(8)} ` +
|
||||
speedupStr
|
||||
);
|
||||
}
|
||||
|
||||
// Best config
|
||||
if (results.length > 0) {
|
||||
const best = results.reduce((a, b) => a.docsPerSec > b.docsPerSec ? a : b);
|
||||
console.log(`\n Best: ${best.parallelism} contexts, flash=${best.flashAttention}`);
|
||||
console.log(` ${best.medianMs.toFixed(0)}ms for ${DOC_COUNT} docs (${best.docsPerSec.toFixed(1)} docs/s)`);
|
||||
if (best.totalVram > 0) console.log(` ${formatBytes(best.totalVram)} VRAM`);
|
||||
}
|
||||
|
||||
console.log("");
|
||||
await model.dispose();
|
||||
await llama.dispose();
|
||||
}
|
||||
|
||||
main().catch(console.error);
|
||||
@ -221,10 +221,15 @@ describe.skipIf(!!process.env.CI)("LlamaCpp Integration", () => {
|
||||
const successCount = allResults.filter(r => r !== null).length;
|
||||
expect(successCount).toBe(10);
|
||||
|
||||
// THE KEY ASSERTION: Only 1 context should be created, not 5
|
||||
// Without the fix, contextCreateCount would be 5 (one per concurrent embedBatch call)
|
||||
console.log(`Context creation count: ${contextCreateCount} (expected: 1)`);
|
||||
expect(contextCreateCount).toBe(1);
|
||||
// THE KEY ASSERTION: Contexts should be created once (by ensureEmbedContexts),
|
||||
// not duplicated per concurrent embedBatch call. The exact count depends on
|
||||
// available VRAM (computeParallelism), but should not be 5 (one per call).
|
||||
// Without the fix, contextCreateCount would be 5× the intended count (one set per concurrent call).
|
||||
// With the promise guard, contexts are created exactly once regardless of concurrent callers.
|
||||
// The count depends on VRAM (computeParallelism), but should be ≤ 8 (the cap).
|
||||
console.log(`Context creation count: ${contextCreateCount} (expected: ≤ 8, not 5× duplicated)`);
|
||||
expect(contextCreateCount).toBeGreaterThanOrEqual(1);
|
||||
expect(contextCreateCount).toBeLessThanOrEqual(8);
|
||||
|
||||
await freshLlm.dispose();
|
||||
}, 60000);
|
||||
|
||||
301
src/llm.ts
301
src/llm.ts
@ -6,6 +6,7 @@
|
||||
|
||||
import {
|
||||
getLlama,
|
||||
getLlamaGpuTypes,
|
||||
resolveModelFile,
|
||||
LlamaChatSession,
|
||||
LlamaLogLevel,
|
||||
@ -354,10 +355,10 @@ const DEFAULT_INACTIVITY_TIMEOUT_MS = 5 * 60 * 1000;
|
||||
export class LlamaCpp implements LLM {
|
||||
private llama: Llama | null = null;
|
||||
private embedModel: LlamaModel | null = null;
|
||||
private embedContext: LlamaEmbeddingContext | null = null;
|
||||
private embedContexts: LlamaEmbeddingContext[] = [];
|
||||
private generateModel: LlamaModel | null = null;
|
||||
private rerankModel: LlamaModel | null = null;
|
||||
private rerankContext: Awaited<ReturnType<LlamaModel["createRankingContext"]>> | null = null;
|
||||
private rerankContexts: Awaited<ReturnType<LlamaModel["createRankingContext"]>>[] = [];
|
||||
|
||||
private embedModelUri: string;
|
||||
private generateModelUri: string;
|
||||
@ -366,7 +367,6 @@ export class LlamaCpp implements LLM {
|
||||
|
||||
// Ensure we don't load the same model/context concurrently (which can allocate duplicate VRAM).
|
||||
private embedModelLoadPromise: Promise<LlamaModel> | null = null;
|
||||
private embedContextCreatePromise: Promise<LlamaEmbeddingContext> | null = null;
|
||||
private generateModelLoadPromise: Promise<LlamaModel> | null = null;
|
||||
private rerankModelLoadPromise: Promise<LlamaModel> | null = null;
|
||||
|
||||
@ -423,7 +423,7 @@ export class LlamaCpp implements LLM {
|
||||
* Check if any contexts are currently loaded (and therefore worth unloading on inactivity).
|
||||
*/
|
||||
private hasLoadedContexts(): boolean {
|
||||
return !!(this.embedContext || this.rerankContext);
|
||||
return !!(this.embedContexts.length > 0 || this.rerankContexts.length > 0);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -445,14 +445,14 @@ export class LlamaCpp implements LLM {
|
||||
}
|
||||
|
||||
// Dispose contexts first
|
||||
if (this.embedContext) {
|
||||
await this.embedContext.dispose();
|
||||
this.embedContext = null;
|
||||
for (const ctx of this.embedContexts) {
|
||||
await ctx.dispose();
|
||||
}
|
||||
if (this.rerankContext) {
|
||||
await this.rerankContext.dispose();
|
||||
this.rerankContext = null;
|
||||
this.embedContexts = [];
|
||||
for (const ctx of this.rerankContexts) {
|
||||
await ctx.dispose();
|
||||
}
|
||||
this.rerankContexts = [];
|
||||
|
||||
// Optionally dispose models too (opt-in)
|
||||
if (this.disposeModelsOnInactivity) {
|
||||
@ -491,7 +491,33 @@ export class LlamaCpp implements LLM {
|
||||
*/
|
||||
private async ensureLlama(): Promise<Llama> {
|
||||
if (!this.llama) {
|
||||
this.llama = await getLlama({ logLevel: LlamaLogLevel.error });
|
||||
// Detect available GPU types and use the best one.
|
||||
// We can't rely on gpu:"auto" — it returns false even when CUDA is available
|
||||
// (likely a binary/build config issue in node-llama-cpp).
|
||||
const gpuTypes = await getLlamaGpuTypes();
|
||||
// Prefer CUDA > Metal > Vulkan > CPU
|
||||
const preferred = (["cuda", "metal", "vulkan"] as const).find(g => gpuTypes.includes(g));
|
||||
|
||||
let llama: Llama;
|
||||
if (preferred) {
|
||||
try {
|
||||
llama = await getLlama({ gpu: preferred, logLevel: LlamaLogLevel.error });
|
||||
} catch {
|
||||
llama = await getLlama({ gpu: false, logLevel: LlamaLogLevel.error });
|
||||
process.stderr.write(
|
||||
`QMD Warning: ${preferred} reported available but failed to initialize. Falling back to CPU.\n`
|
||||
);
|
||||
}
|
||||
} else {
|
||||
llama = await getLlama({ gpu: false, logLevel: LlamaLogLevel.error });
|
||||
}
|
||||
|
||||
if (!llama.gpu) {
|
||||
process.stderr.write(
|
||||
"QMD Warning: no GPU acceleration, running on CPU (slow). Run 'qmd status' for details.\n"
|
||||
);
|
||||
}
|
||||
this.llama = llama;
|
||||
}
|
||||
return this.llama;
|
||||
}
|
||||
@ -535,34 +561,92 @@ export class LlamaCpp implements LLM {
|
||||
}
|
||||
|
||||
/**
|
||||
* Load embedding context (lazy). Context can be disposed and recreated without reloading the model.
|
||||
* Uses promise guard to prevent concurrent context creation race condition.
|
||||
* Compute how many parallel contexts to create.
|
||||
*
|
||||
* GPU: constrained by VRAM (25% of free, capped at 8).
|
||||
* CPU: constrained by cores. Splitting threads across contexts enables
|
||||
* true parallelism (each context runs on its own cores). Use at most
|
||||
* half the math cores, with at least 4 threads per context.
|
||||
*/
|
||||
private async ensureEmbedContext(): Promise<LlamaEmbeddingContext> {
|
||||
if (!this.embedContext) {
|
||||
// If context creation is already in progress, wait for it
|
||||
if (this.embedContextCreatePromise) {
|
||||
return await this.embedContextCreatePromise;
|
||||
}
|
||||
|
||||
// Start context creation and store promise so concurrent calls wait
|
||||
this.embedContextCreatePromise = (async () => {
|
||||
const model = await this.ensureEmbedModel();
|
||||
const context = await model.createEmbeddingContext();
|
||||
this.embedContext = context;
|
||||
return context;
|
||||
})();
|
||||
private async computeParallelism(perContextMB: number): Promise<number> {
|
||||
const llama = await this.ensureLlama();
|
||||
|
||||
if (llama.gpu) {
|
||||
try {
|
||||
const context = await this.embedContextCreatePromise;
|
||||
this.touchActivity();
|
||||
return context;
|
||||
} finally {
|
||||
this.embedContextCreatePromise = null;
|
||||
const vram = await llama.getVramState();
|
||||
const freeMB = vram.free / (1024 * 1024);
|
||||
const maxByVram = Math.floor((freeMB * 0.25) / perContextMB);
|
||||
return Math.max(1, Math.min(8, maxByVram));
|
||||
} catch {
|
||||
return 2;
|
||||
}
|
||||
}
|
||||
this.touchActivity();
|
||||
return this.embedContext;
|
||||
|
||||
// CPU: split cores across contexts. At least 4 threads per context.
|
||||
const cores = llama.cpuMathCores || 4;
|
||||
const maxContexts = Math.floor(cores / 4);
|
||||
return Math.max(1, Math.min(4, maxContexts));
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the number of threads each context should use, given N parallel contexts.
|
||||
* Splits available math cores evenly across contexts.
|
||||
*/
|
||||
private async threadsPerContext(parallelism: number): Promise<number> {
|
||||
const llama = await this.ensureLlama();
|
||||
if (llama.gpu) return 0; // GPU: let the library decide
|
||||
const cores = llama.cpuMathCores || 4;
|
||||
return Math.max(1, Math.floor(cores / parallelism));
|
||||
}
|
||||
|
||||
/**
|
||||
* Load embedding contexts (lazy). Creates multiple for parallel embedding.
|
||||
* Uses promise guard to prevent concurrent context creation race condition.
|
||||
*/
|
||||
private embedContextsCreatePromise: Promise<LlamaEmbeddingContext[]> | null = null;
|
||||
|
||||
private async ensureEmbedContexts(): Promise<LlamaEmbeddingContext[]> {
|
||||
if (this.embedContexts.length > 0) {
|
||||
this.touchActivity();
|
||||
return this.embedContexts;
|
||||
}
|
||||
|
||||
if (this.embedContextsCreatePromise) {
|
||||
return await this.embedContextsCreatePromise;
|
||||
}
|
||||
|
||||
this.embedContextsCreatePromise = (async () => {
|
||||
const model = await this.ensureEmbedModel();
|
||||
// Embed contexts are ~143 MB each (nomic-embed 2048 ctx)
|
||||
const n = await this.computeParallelism(150);
|
||||
const threads = await this.threadsPerContext(n);
|
||||
for (let i = 0; i < n; i++) {
|
||||
try {
|
||||
this.embedContexts.push(await model.createEmbeddingContext({
|
||||
...(threads > 0 ? { threads } : {}),
|
||||
}));
|
||||
} catch {
|
||||
if (this.embedContexts.length === 0) throw new Error("Failed to create any embedding context");
|
||||
break;
|
||||
}
|
||||
}
|
||||
this.touchActivity();
|
||||
return this.embedContexts;
|
||||
})();
|
||||
|
||||
try {
|
||||
return await this.embedContextsCreatePromise;
|
||||
} finally {
|
||||
this.embedContextsCreatePromise = null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a single embed context (for single-embed calls). Uses first from pool.
|
||||
*/
|
||||
private async ensureEmbedContext(): Promise<LlamaEmbeddingContext> {
|
||||
const contexts = await this.ensureEmbedContexts();
|
||||
return contexts[0]!;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -624,15 +708,50 @@ export class LlamaCpp implements LLM {
|
||||
}
|
||||
|
||||
/**
|
||||
* Load rerank context (lazy). Context can be disposed and recreated without reloading the model.
|
||||
* Load rerank contexts (lazy). Creates multiple contexts for parallel ranking.
|
||||
* Each context has its own sequence, so they can evaluate independently.
|
||||
*
|
||||
* Tuning choices:
|
||||
* - contextSize 1024: reranking chunks are ~800 tokens max, 1024 is plenty
|
||||
* - flashAttention: ~20% less VRAM per context (568 vs 711 MB)
|
||||
* - Combined: drops from 11.6 GB (auto, no flash) to 568 MB per context (20×)
|
||||
*/
|
||||
private async ensureRerankContext(): Promise<Awaited<ReturnType<LlamaModel["createRankingContext"]>>> {
|
||||
if (!this.rerankContext) {
|
||||
// Qwen3 reranker template adds ~200 tokens overhead (system prompt, tags, etc.)
|
||||
// Chunks are max 800 tokens, so 800 + 200 + query ≈ 1100 tokens typical.
|
||||
// Use 2048 for safety margin. Still 17× less than auto (40960).
|
||||
private static readonly RERANK_CONTEXT_SIZE = 2048;
|
||||
|
||||
private async ensureRerankContexts(): Promise<Awaited<ReturnType<LlamaModel["createRankingContext"]>>[]> {
|
||||
if (this.rerankContexts.length === 0) {
|
||||
const model = await this.ensureRerankModel();
|
||||
this.rerankContext = await model.createRankingContext();
|
||||
// ~960 MB per context with flash attention at contextSize 2048
|
||||
const n = await this.computeParallelism(1000);
|
||||
const threads = await this.threadsPerContext(n);
|
||||
for (let i = 0; i < n; i++) {
|
||||
try {
|
||||
this.rerankContexts.push(await model.createRankingContext({
|
||||
contextSize: LlamaCpp.RERANK_CONTEXT_SIZE,
|
||||
flashAttention: true,
|
||||
...(threads > 0 ? { threads } : {}),
|
||||
}));
|
||||
} catch {
|
||||
if (this.rerankContexts.length === 0) {
|
||||
// Flash attention might not be supported — retry without it
|
||||
try {
|
||||
this.rerankContexts.push(await model.createRankingContext({
|
||||
contextSize: LlamaCpp.RERANK_CONTEXT_SIZE,
|
||||
...(threads > 0 ? { threads } : {}),
|
||||
}));
|
||||
} catch {
|
||||
throw new Error("Failed to create any rerank context");
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
this.touchActivity();
|
||||
return this.rerankContext;
|
||||
return this.rerankContexts;
|
||||
}
|
||||
|
||||
// ==========================================================================
|
||||
@ -703,26 +822,51 @@ export class LlamaCpp implements LLM {
|
||||
if (texts.length === 0) return [];
|
||||
|
||||
try {
|
||||
const context = await this.ensureEmbedContext();
|
||||
const contexts = await this.ensureEmbedContexts();
|
||||
const n = contexts.length;
|
||||
|
||||
// node-llama-cpp handles batching internally when we make parallel requests
|
||||
const embeddings = await Promise.all(
|
||||
texts.map(async (text) => {
|
||||
if (n === 1) {
|
||||
// Single context: sequential (no point splitting)
|
||||
const context = contexts[0]!;
|
||||
const embeddings = [];
|
||||
for (const text of texts) {
|
||||
try {
|
||||
const embedding = await context.getEmbeddingFor(text);
|
||||
this.touchActivity(); // Keep-alive during slow batches
|
||||
return {
|
||||
embedding: Array.from(embedding.vector),
|
||||
model: this.embedModelUri,
|
||||
};
|
||||
this.touchActivity();
|
||||
embeddings.push({ embedding: Array.from(embedding.vector), model: this.embedModelUri });
|
||||
} catch (err) {
|
||||
console.error("Embedding error for text:", err);
|
||||
return null;
|
||||
embeddings.push(null);
|
||||
}
|
||||
}
|
||||
return embeddings;
|
||||
}
|
||||
|
||||
// Multiple contexts: split texts across contexts for parallel evaluation
|
||||
const chunkSize = Math.ceil(texts.length / n);
|
||||
const chunks = Array.from({ length: n }, (_, i) =>
|
||||
texts.slice(i * chunkSize, (i + 1) * chunkSize)
|
||||
);
|
||||
|
||||
const chunkResults = await Promise.all(
|
||||
chunks.map(async (chunk, i) => {
|
||||
const ctx = contexts[i]!;
|
||||
const results: (EmbeddingResult | null)[] = [];
|
||||
for (const text of chunk) {
|
||||
try {
|
||||
const embedding = await ctx.getEmbeddingFor(text);
|
||||
this.touchActivity();
|
||||
results.push({ embedding: Array.from(embedding.vector), model: this.embedModelUri });
|
||||
} catch (err) {
|
||||
console.error("Embedding error for text:", err);
|
||||
results.push(null);
|
||||
}
|
||||
}
|
||||
return results;
|
||||
})
|
||||
);
|
||||
|
||||
return embeddings;
|
||||
return chunkResults.flat();
|
||||
} catch (error) {
|
||||
console.error("Batch embedding error:", error);
|
||||
return texts.map(() => null);
|
||||
@ -879,7 +1023,7 @@ export class LlamaCpp implements LLM {
|
||||
// Ping activity at start to keep models alive during this operation
|
||||
this.touchActivity();
|
||||
|
||||
const context = await this.ensureRerankContext();
|
||||
const contexts = await this.ensureRerankContexts();
|
||||
|
||||
// Build a map from document text to original indices (for lookup after sorting)
|
||||
const textToDoc = new Map<string, { file: string; index: number }>();
|
||||
@ -890,8 +1034,24 @@ export class LlamaCpp implements LLM {
|
||||
// Extract just the text for ranking
|
||||
const texts = documents.map((doc) => doc.text);
|
||||
|
||||
// Use the proper ranking API - returns [{document: string, score: number}] sorted by score
|
||||
const ranked = await context.rankAndSort(query, texts);
|
||||
// Split documents across contexts for parallel evaluation.
|
||||
// Each context has its own sequence with a lock, so parallelism comes
|
||||
// from multiple contexts evaluating different chunks simultaneously.
|
||||
const n = contexts.length;
|
||||
const chunkSize = Math.ceil(texts.length / n);
|
||||
const chunks = Array.from({ length: n }, (_, i) =>
|
||||
texts.slice(i * chunkSize, (i + 1) * chunkSize)
|
||||
).filter(chunk => chunk.length > 0);
|
||||
|
||||
const allScores = await Promise.all(
|
||||
chunks.map((chunk, i) => contexts[i]!.rankAll(query, chunk))
|
||||
);
|
||||
|
||||
// Reassemble scores in original order and sort
|
||||
const flatScores = allScores.flat();
|
||||
const ranked = texts
|
||||
.map((text, i) => ({ document: text, score: flatScores[i]! }))
|
||||
.sort((a, b) => b.score - a.score);
|
||||
|
||||
// Map back to our result format using the text-to-doc map
|
||||
const results: RerankDocumentResult[] = ranked.map((item) => {
|
||||
@ -909,6 +1069,35 @@ export class LlamaCpp implements LLM {
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Get device/GPU info for status display.
|
||||
* Initializes llama if not already done.
|
||||
*/
|
||||
async getDeviceInfo(): Promise<{
|
||||
gpu: string | false;
|
||||
gpuOffloading: boolean;
|
||||
gpuDevices: string[];
|
||||
vram?: { total: number; used: number; free: number };
|
||||
cpuCores: number;
|
||||
}> {
|
||||
const llama = await this.ensureLlama();
|
||||
const gpuDevices = await llama.getGpuDeviceNames();
|
||||
let vram: { total: number; used: number; free: number } | undefined;
|
||||
if (llama.gpu) {
|
||||
try {
|
||||
const state = await llama.getVramState();
|
||||
vram = { total: state.total, used: state.used, free: state.free };
|
||||
} catch { /* no vram info */ }
|
||||
}
|
||||
return {
|
||||
gpu: llama.gpu,
|
||||
gpuOffloading: llama.supportsGpuOffloading,
|
||||
gpuDevices,
|
||||
vram,
|
||||
cpuCores: llama.cpuMathCores,
|
||||
};
|
||||
}
|
||||
|
||||
async dispose(): Promise<void> {
|
||||
// Prevent double-dispose
|
||||
if (this.disposed) {
|
||||
@ -932,8 +1121,8 @@ export class LlamaCpp implements LLM {
|
||||
}
|
||||
|
||||
// Clear references
|
||||
this.embedContext = null;
|
||||
this.rerankContext = null;
|
||||
this.embedContexts = [];
|
||||
this.rerankContexts = [];
|
||||
this.embedModel = null;
|
||||
this.generateModel = null;
|
||||
this.rerankModel = null;
|
||||
@ -941,7 +1130,7 @@ export class LlamaCpp implements LLM {
|
||||
|
||||
// Clear any in-flight load/create promises
|
||||
this.embedModelLoadPromise = null;
|
||||
this.embedContextCreatePromise = null;
|
||||
this.embedContextsCreatePromise = null;
|
||||
this.generateModelLoadPromise = null;
|
||||
this.rerankModelLoadPromise = null;
|
||||
}
|
||||
|
||||
89
src/mcp.ts
89
src/mcp.ts
@ -1,4 +1,3 @@
|
||||
#!/usr/bin/env bun
|
||||
/**
|
||||
* QMD MCP Server - Model Context Protocol server for QMD
|
||||
*
|
||||
@ -8,6 +7,8 @@
|
||||
* Follows MCP spec 2025-06-18 for proper response types.
|
||||
*/
|
||||
|
||||
import { createServer, type IncomingMessage, type ServerResponse } from "node:http";
|
||||
import { fileURLToPath } from "url";
|
||||
import { McpServer, ResourceTemplate } from "@modelcontextprotocol/sdk/server/mcp.js";
|
||||
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
|
||||
import { WebStandardStreamableHTTPServerTransport }
|
||||
@ -147,7 +148,7 @@ function buildInstructions(store: Store): string {
|
||||
*/
|
||||
function createMcpServer(store: Store): McpServer {
|
||||
const server = new McpServer(
|
||||
{ name: "qmd", version: "1.0.0" },
|
||||
{ name: "qmd", version: "0.9.9" },
|
||||
{ instructions: buildInstructions(store) },
|
||||
);
|
||||
|
||||
@ -237,9 +238,7 @@ function createMcpServer(store: Store): McpServer {
|
||||
},
|
||||
},
|
||||
async ({ query, limit, minScore, collection }) => {
|
||||
// Note: Collection filtering is now done post-search since collections are managed in YAML
|
||||
const results = store.searchFTS(query, limit || 10)
|
||||
.filter(r => !collection || r.collectionName === collection);
|
||||
const results = store.searchFTS(query, limit || 10, collection);
|
||||
const filtered: SearchResultItem[] = results
|
||||
.filter(r => r.score >= (minScore || 0))
|
||||
.map(r => {
|
||||
@ -541,7 +540,7 @@ export async function startMcpServer(): Promise<void> {
|
||||
// =============================================================================
|
||||
|
||||
export type HttpServerHandle = {
|
||||
httpServer: ReturnType<typeof Bun.serve>;
|
||||
httpServer: import("http").Server;
|
||||
port: number;
|
||||
stop: () => Promise<void>;
|
||||
};
|
||||
@ -588,47 +587,79 @@ export async function startMcpHttpServer(port: number, options?: { quiet?: boole
|
||||
if (!quiet) console.error(msg);
|
||||
}
|
||||
|
||||
const httpServer = Bun.serve({
|
||||
port,
|
||||
hostname: "localhost",
|
||||
async fetch(req) {
|
||||
const reqStart = Date.now();
|
||||
const pathname = new URL(req.url).pathname;
|
||||
// Helper to collect request body
|
||||
async function collectBody(req: IncomingMessage): Promise<string> {
|
||||
const chunks: Buffer[] = [];
|
||||
for await (const chunk of req) chunks.push(chunk as Buffer);
|
||||
return Buffer.concat(chunks).toString();
|
||||
}
|
||||
|
||||
if (pathname === "/health" && req.method === "GET") {
|
||||
const res = Response.json({
|
||||
status: "ok",
|
||||
uptime: Math.floor((Date.now() - startTime) / 1000),
|
||||
});
|
||||
const httpServer = createServer(async (nodeReq: IncomingMessage, nodeRes: ServerResponse) => {
|
||||
const reqStart = Date.now();
|
||||
const pathname = nodeReq.url || "/";
|
||||
|
||||
try {
|
||||
if (pathname === "/health" && nodeReq.method === "GET") {
|
||||
const body = JSON.stringify({ status: "ok", uptime: Math.floor((Date.now() - startTime) / 1000) });
|
||||
nodeRes.writeHead(200, { "Content-Type": "application/json" });
|
||||
nodeRes.end(body);
|
||||
log(`${ts()} GET /health (${Date.now() - reqStart}ms)`);
|
||||
return res;
|
||||
return;
|
||||
}
|
||||
|
||||
if (pathname === "/mcp" && req.method === "POST") {
|
||||
const body = await req.json();
|
||||
if (pathname === "/mcp" && nodeReq.method === "POST") {
|
||||
const rawBody = await collectBody(nodeReq);
|
||||
const body = JSON.parse(rawBody);
|
||||
const label = describeRequest(body);
|
||||
const res = await transport.handleRequest(req, { parsedBody: body });
|
||||
const url = `http://localhost:${port}${pathname}`;
|
||||
const headers: Record<string, string> = {};
|
||||
for (const [k, v] of Object.entries(nodeReq.headers)) {
|
||||
if (typeof v === "string") headers[k] = v;
|
||||
}
|
||||
const request = new Request(url, { method: "POST", headers, body: rawBody });
|
||||
const response = await transport.handleRequest(request, { parsedBody: body });
|
||||
nodeRes.writeHead(response.status, Object.fromEntries(response.headers));
|
||||
nodeRes.end(Buffer.from(await response.arrayBuffer()));
|
||||
log(`${ts()} POST /mcp ${label} (${Date.now() - reqStart}ms)`);
|
||||
return res;
|
||||
return;
|
||||
}
|
||||
|
||||
// Pass other methods (GET, DELETE) to transport for protocol handling
|
||||
if (pathname === "/mcp") {
|
||||
return transport.handleRequest(req);
|
||||
const url = `http://localhost:${port}${pathname}`;
|
||||
const headers: Record<string, string> = {};
|
||||
for (const [k, v] of Object.entries(nodeReq.headers)) {
|
||||
if (typeof v === "string") headers[k] = v;
|
||||
}
|
||||
const rawBody = nodeReq.method !== "GET" && nodeReq.method !== "HEAD" ? await collectBody(nodeReq) : undefined;
|
||||
const request = new Request(url, { method: nodeReq.method || "GET", headers, ...(rawBody ? { body: rawBody } : {}) });
|
||||
const response = await transport.handleRequest(request);
|
||||
nodeRes.writeHead(response.status, Object.fromEntries(response.headers));
|
||||
nodeRes.end(Buffer.from(await response.arrayBuffer()));
|
||||
return;
|
||||
}
|
||||
|
||||
return new Response("Not Found", { status: 404 });
|
||||
},
|
||||
nodeRes.writeHead(404);
|
||||
nodeRes.end("Not Found");
|
||||
} catch (err) {
|
||||
console.error("HTTP handler error:", err);
|
||||
nodeRes.writeHead(500);
|
||||
nodeRes.end("Internal Server Error");
|
||||
}
|
||||
});
|
||||
|
||||
const actualPort = httpServer.port;
|
||||
await new Promise<void>((resolve, reject) => {
|
||||
httpServer.on("error", reject);
|
||||
httpServer.listen(port, "localhost", () => resolve());
|
||||
});
|
||||
|
||||
const actualPort = (httpServer.address() as import("net").AddressInfo).port;
|
||||
|
||||
let stopping = false;
|
||||
const stop = async () => {
|
||||
if (stopping) return;
|
||||
stopping = true;
|
||||
await transport.close();
|
||||
httpServer.stop();
|
||||
httpServer.close();
|
||||
store.close();
|
||||
await disposeDefaultLlamaCpp();
|
||||
};
|
||||
@ -649,6 +680,6 @@ export async function startMcpHttpServer(port: number, options?: { quiet?: boole
|
||||
}
|
||||
|
||||
// Run if this is the main module
|
||||
if (import.meta.main) {
|
||||
if (fileURLToPath(import.meta.url) === process.argv[1] || process.argv[1]?.endsWith("/mcp.ts")) {
|
||||
startMcpServer().catch(console.error);
|
||||
}
|
||||
|
||||
39
src/qmd.ts
39
src/qmd.ts
@ -65,7 +65,7 @@ import {
|
||||
createStore,
|
||||
getDefaultDbPath,
|
||||
} from "./store.js";
|
||||
import { disposeDefaultLlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "./llm.js";
|
||||
import { disposeDefaultLlamaCpp, getDefaultLlamaCpp, withLLMSession, pullModels, DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, DEFAULT_MODEL_CACHE_DIR } from "./llm.js";
|
||||
import {
|
||||
formatSearchResults,
|
||||
formatDocuments,
|
||||
@ -249,7 +249,7 @@ function formatBytes(bytes: number): string {
|
||||
return `${(bytes / (1024 * 1024 * 1024)).toFixed(1)} GB`;
|
||||
}
|
||||
|
||||
function showStatus(): void {
|
||||
async function showStatus(): Promise<void> {
|
||||
const dbPath = getDbPath();
|
||||
const db = getDb();
|
||||
|
||||
@ -362,6 +362,36 @@ function showStatus(): void {
|
||||
console.log(`\n${c.dim}No collections. Run 'qmd collection add .' to index markdown files.${c.reset}`);
|
||||
}
|
||||
|
||||
// Device / GPU info
|
||||
try {
|
||||
const llm = getDefaultLlamaCpp();
|
||||
const device = await llm.getDeviceInfo();
|
||||
console.log(`\n${c.bold}Device${c.reset}`);
|
||||
if (device.gpu) {
|
||||
console.log(` GPU: ${c.green}${device.gpu}${c.reset} (offloading: ${device.gpuOffloading ? 'yes' : 'no'})`);
|
||||
if (device.gpuDevices.length > 0) {
|
||||
// Deduplicate and count GPUs
|
||||
const counts = new Map<string, number>();
|
||||
for (const name of device.gpuDevices) {
|
||||
counts.set(name, (counts.get(name) || 0) + 1);
|
||||
}
|
||||
const deviceStr = Array.from(counts.entries())
|
||||
.map(([name, count]) => count > 1 ? `${count}× ${name}` : name)
|
||||
.join(', ');
|
||||
console.log(` Devices: ${deviceStr}`);
|
||||
}
|
||||
if (device.vram) {
|
||||
console.log(` VRAM: ${formatBytes(device.vram.free)} free / ${formatBytes(device.vram.total)} total`);
|
||||
}
|
||||
} else {
|
||||
console.log(` GPU: ${c.yellow}none${c.reset} (running on CPU — models will be slow)`);
|
||||
console.log(` ${c.dim}Tip: Install CUDA, Vulkan, or Metal support for GPU acceleration.${c.reset}`);
|
||||
}
|
||||
console.log(` CPU: ${device.cpuCores} math cores`);
|
||||
} catch {
|
||||
// Don't fail status if LLM init fails
|
||||
}
|
||||
|
||||
closeDb();
|
||||
}
|
||||
|
||||
@ -1871,8 +1901,7 @@ function search(query: string, opts: OutputOptions): void {
|
||||
|
||||
// Use large limit for --all, otherwise fetch more than needed and let outputResults filter
|
||||
const fetchLimit = opts.all ? 100000 : Math.max(50, opts.limit * 2);
|
||||
// searchFTS accepts collection name as number parameter for legacy reasons (will be fixed in store.ts)
|
||||
const results = searchFTS(db, query, fetchLimit, collectionName as any);
|
||||
const results = searchFTS(db, query, fetchLimit, collectionName);
|
||||
|
||||
// Add context to results
|
||||
const resultsWithContext = results.map(r => ({
|
||||
@ -2348,7 +2377,7 @@ if (import.meta.main) {
|
||||
}
|
||||
|
||||
case "status":
|
||||
showStatus();
|
||||
await showStatus();
|
||||
break;
|
||||
|
||||
case "update":
|
||||
|
||||
@ -1219,8 +1219,8 @@ describe("FTS Search", () => {
|
||||
const allResults = store.searchFTS("searchable", 10);
|
||||
expect(allResults).toHaveLength(2);
|
||||
|
||||
// Filter by collection name (collectionId is now treated as collection name string)
|
||||
const filtered = store.searchFTS("searchable", 10, collection1 as unknown as number);
|
||||
// Filter by collection name
|
||||
const filtered = store.searchFTS("searchable", 10, collection1);
|
||||
expect(filtered).toHaveLength(1);
|
||||
expect(filtered[0]!.displayPath).toBe(`${collection1}/doc1.md`);
|
||||
|
||||
|
||||
93
src/store.ts
93
src/store.ts
@ -830,8 +830,8 @@ export type Store = {
|
||||
toVirtualPath: (absolutePath: string) => string | null;
|
||||
|
||||
// Search
|
||||
searchFTS: (query: string, limit?: number, collectionId?: number) => SearchResult[];
|
||||
searchVec: (query: string, model: string, limit?: number, collectionName?: string) => Promise<SearchResult[]>;
|
||||
searchFTS: (query: string, limit?: number, collectionName?: string) => SearchResult[];
|
||||
searchVec: (query: string, model: string, limit?: number, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[]) => Promise<SearchResult[]>;
|
||||
|
||||
// Query expansion & reranking
|
||||
expandQuery: (query: string, model?: string) => Promise<ExpandedQuery[]>;
|
||||
@ -913,8 +913,8 @@ export function createStore(dbPath?: string): Store {
|
||||
toVirtualPath: (absolutePath: string) => toVirtualPath(db, absolutePath),
|
||||
|
||||
// Search
|
||||
searchFTS: (query: string, limit?: number, collectionId?: number) => searchFTS(db, query, limit, collectionId),
|
||||
searchVec: (query: string, model: string, limit?: number, collectionName?: string) => searchVec(db, query, model, limit, collectionName),
|
||||
searchFTS: (query: string, limit?: number, collectionName?: string) => searchFTS(db, query, limit, collectionName),
|
||||
searchVec: (query: string, model: string, limit?: number, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[]) => searchVec(db, query, model, limit, collectionName, session, precomputedEmbedding),
|
||||
|
||||
// Query expansion & reranking
|
||||
expandQuery: (query: string, model?: string) => expandQuery(query, model, db),
|
||||
@ -2020,7 +2020,7 @@ function buildFTS5Query(query: string): string | null {
|
||||
return terms.map(t => `"${t}"*`).join(' AND ');
|
||||
}
|
||||
|
||||
export function searchFTS(db: Database, query: string, limit: number = 20, collectionId?: number): SearchResult[] {
|
||||
export function searchFTS(db: Database, query: string, limit: number = 20, collectionName?: string): SearchResult[] {
|
||||
const ftsQuery = buildFTS5Query(query);
|
||||
if (!ftsQuery) return [];
|
||||
|
||||
@ -2039,12 +2039,9 @@ export function searchFTS(db: Database, query: string, limit: number = 20, colle
|
||||
`;
|
||||
const params: (string | number)[] = [ftsQuery];
|
||||
|
||||
if (collectionId) {
|
||||
// Note: collectionId is a legacy parameter that should be phased out
|
||||
// Collections are now managed in YAML. For now, we interpret it as a collection name filter.
|
||||
// This code path is likely unused as collection filtering should be done at CLI level.
|
||||
if (collectionName) {
|
||||
sql += ` AND d.collection = ?`;
|
||||
params.push(String(collectionId));
|
||||
params.push(String(collectionName));
|
||||
}
|
||||
|
||||
// bm25 lower is better; sort ascending.
|
||||
@ -2080,11 +2077,11 @@ export function searchFTS(db: Database, query: string, limit: number = 20, colle
|
||||
// Vector Search
|
||||
// =============================================================================
|
||||
|
||||
export async function searchVec(db: Database, query: string, model: string, limit: number = 20, collectionName?: string, session?: ILLMSession): Promise<SearchResult[]> {
|
||||
export async function searchVec(db: Database, query: string, model: string, limit: number = 20, collectionName?: string, session?: ILLMSession, precomputedEmbedding?: number[]): Promise<SearchResult[]> {
|
||||
const tableExists = db.prepare(`SELECT name FROM sqlite_master WHERE type='table' AND name='vectors_vec'`).get();
|
||||
if (!tableExists) return [];
|
||||
|
||||
const embedding = await getEmbedding(query, model, true, session);
|
||||
const embedding = precomputedEmbedding ?? await getEmbedding(query, model, true, session);
|
||||
if (!embedding) return [];
|
||||
|
||||
// IMPORTANT: We use a two-step query approach here because sqlite-vec virtual tables
|
||||
@ -2848,8 +2845,8 @@ export async function hybridQuery(
|
||||
).get();
|
||||
|
||||
// Step 1: BM25 probe — strong signal skips expensive LLM expansion
|
||||
const initialFts = store.searchFTS(query, 20)
|
||||
.filter(r => !collection || r.collectionName === collection);
|
||||
// Pass collection directly into FTS query (filter at SQL level, not post-hoc)
|
||||
const initialFts = store.searchFTS(query, 20, collection);
|
||||
const topScore = initialFts[0]?.score ?? 0;
|
||||
const secondScore = initialFts[1]?.score ?? 0;
|
||||
const hasStrongSignal = initialFts.length > 0
|
||||
@ -2875,26 +2872,15 @@ export async function hybridQuery(
|
||||
}
|
||||
|
||||
// Step 3: Route searches by query type
|
||||
// Original query → vector search (FTS already covered by probe in step 1).
|
||||
// Vector searches run sequentially — node-llama-cpp's embed context
|
||||
// hangs on concurrent embed() calls (known limitation).
|
||||
if (hasVectors) {
|
||||
const vecResults = await store.searchVec(query, DEFAULT_EMBED_MODEL, 20, collection);
|
||||
if (vecResults.length > 0) {
|
||||
for (const r of vecResults) docidMap.set(r.filepath, r.docid);
|
||||
rankedLists.push(vecResults.map(r => ({
|
||||
file: r.filepath, displayPath: r.displayPath,
|
||||
title: r.title, body: r.body || "", score: r.score,
|
||||
})));
|
||||
}
|
||||
}
|
||||
//
|
||||
// Strategy: run all FTS queries immediately (they're sync/instant), then
|
||||
// batch-embed all vector queries in one embedBatch() call, then run
|
||||
// sqlite-vec lookups with pre-computed embeddings.
|
||||
|
||||
// Expanded queries → route by type: lex→FTS only, vec/hyde→vector only.
|
||||
// This restores the CLI's query-type-aware routing that was lost in the initial refactor.
|
||||
// 3a: Run FTS for all lex expansions right away (no LLM needed)
|
||||
for (const q of expanded) {
|
||||
if (q.type === 'lex') {
|
||||
const ftsResults = store.searchFTS(q.text, 20)
|
||||
.filter(r => !collection || r.collectionName === collection);
|
||||
const ftsResults = store.searchFTS(q.text, 20, collection);
|
||||
if (ftsResults.length > 0) {
|
||||
for (const r of ftsResults) docidMap.set(r.filepath, r.docid);
|
||||
rankedLists.push(ftsResults.map(r => ({
|
||||
@ -2902,17 +2888,40 @@ export async function hybridQuery(
|
||||
title: r.title, body: r.body || "", score: r.score,
|
||||
})));
|
||||
}
|
||||
} else {
|
||||
// vec or hyde → vector search only
|
||||
if (hasVectors) {
|
||||
const vecResults = await store.searchVec(q.text, DEFAULT_EMBED_MODEL, 20, collection);
|
||||
if (vecResults.length > 0) {
|
||||
for (const r of vecResults) docidMap.set(r.filepath, r.docid);
|
||||
rankedLists.push(vecResults.map(r => ({
|
||||
file: r.filepath, displayPath: r.displayPath,
|
||||
title: r.title, body: r.body || "", score: r.score,
|
||||
})));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 3b: Collect all texts that need vector search (original query + vec/hyde expansions)
|
||||
if (hasVectors) {
|
||||
const vecQueries: { text: string; isOriginal: boolean }[] = [
|
||||
{ text: query, isOriginal: true },
|
||||
];
|
||||
for (const q of expanded) {
|
||||
if (q.type === 'vec' || q.type === 'hyde') {
|
||||
vecQueries.push({ text: q.text, isOriginal: false });
|
||||
}
|
||||
}
|
||||
|
||||
// Batch embed all vector queries in a single call
|
||||
const llm = getDefaultLlamaCpp();
|
||||
const textsToEmbed = vecQueries.map(q => formatQueryForEmbedding(q.text));
|
||||
const embeddings = await llm.embedBatch(textsToEmbed);
|
||||
|
||||
// Run sqlite-vec lookups with pre-computed embeddings
|
||||
for (let i = 0; i < vecQueries.length; i++) {
|
||||
const embedding = embeddings[i]?.embedding;
|
||||
if (!embedding) continue;
|
||||
|
||||
const vecResults = await store.searchVec(
|
||||
vecQueries[i]!.text, DEFAULT_EMBED_MODEL, 20, collection,
|
||||
undefined, embedding
|
||||
);
|
||||
if (vecResults.length > 0) {
|
||||
for (const r of vecResults) docidMap.set(r.filepath, r.docid);
|
||||
rankedLists.push(vecResults.map(r => ({
|
||||
file: r.filepath, displayPath: r.displayPath,
|
||||
title: r.title, body: r.body || "", score: r.score,
|
||||
})));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user