perf: CPU parallelism via multi-context thread splitting

Our assumption that CPU can't benefit from multiple contexts was
wrong. The withLock in node-llama-cpp serializes within a single
context, but separate contexts with split threads run on different
cores in true parallel.

Key changes:
- computeParallelism() can now return >1 on CPU (floor(cores / 4), clamped to 1–4)
- threadsPerContext() splits math cores evenly across contexts
- Both embed and rerank contexts get proper thread counts
- Benchmark updated to test CPU parallelism

Before (CPU, 40 docs): 9.7s (4.1 docs/s) — 6 threads, 1 context
After  (CPU, 40 docs): 2.3s (17.2 docs/s) — 32 threads, 8 contexts

Two fixes stacked:
1. Thread count: default was 6 (library hardcode), now uses all
   math cores — 2× improvement alone
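2. Multi-context: splitting cores across 8 contexts gives another
   2.2× on top (benchmark configuration; computeParallelism() caps
   the CPU default at 4 contexts, which measured 1.7×)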

End-to-end 'qmd query' on CPU: 10.3s → 2.9s

CPU benchmark (Threadripper PRO 7975WX, 32 math cores):
  1 ctx: 5001ms (8.0 docs/s)
  2 ctx: 3585ms (11.2 docs/s)  1.4×
  4 ctx: 2874ms (13.9 docs/s)  1.7×
  8 ctx: 2323ms (17.2 docs/s)  2.2×
This commit is contained in:
Tobi Lütke 2026-02-15 11:21:45 -05:00
parent bf42223086
commit 392934e78a
No known key found for this signature in database
2 changed files with 50 additions and 16 deletions

View File

@ -108,18 +108,23 @@ async function benchmarkConfig(
const vramBefore = llama.gpu ? await llama.getVramState() : null;
const rssBefore = getMemUsage().rss;
// Create contexts
// Create contexts. On CPU, split threads evenly across contexts.
const cpuThreads = !llama.gpu ? Math.floor(llama.cpuMathCores / parallelism) : 0;
const contexts = [];
for (let i = 0; i < parallelism; i++) {
try {
contexts.push(await model.createRankingContext({
contextSize: CONTEXT_SIZE,
flashAttention: flash,
...(cpuThreads > 0 ? { threads: cpuThreads } : {}),
}));
} catch {
if (contexts.length === 0) {
// Try without flash
contexts.push(await model.createRankingContext({ contextSize: CONTEXT_SIZE }));
contexts.push(await model.createRankingContext({
contextSize: CONTEXT_SIZE,
...(cpuThreads > 0 ? { threads: cpuThreads } : {}),
}));
}
break;
}
@ -253,8 +258,11 @@ async function main() {
for (const p of PARALLEL_CONFIGS) {
if (!llama.gpu && p > 1) {
console.log(`\n [${p} ctx] skipped (CPU — no benefit from parallelism)`);
continue;
// CPU: only test if we have enough cores (at least 4 per context)
if (llama.cpuMathCores < p * 4) {
console.log(`\n [${p} ctx] skipped (need ${p * 4} cores, have ${llama.cpuMathCores})`);
continue;
}
}
// Test with flash attention

View File

@ -561,22 +561,42 @@ export class LlamaCpp implements LLM {
}
/**
* Compute how many parallel contexts to create based on available VRAM.
* Conservative: uses at most 25% of free VRAM for contexts, capped at 8.
* Compute how many parallel contexts to create.
*
* GPU: constrained by VRAM (25% of free, capped at 8).
* CPU: constrained by cores. Splitting threads across contexts enables
* true parallelism (each context runs on its own cores). Use at most
* 4 contexts, with at least 4 threads per context.
*/
private async computeParallelism(perContextMB: number): Promise<number> {
const llama = await this.ensureLlama();
if (!llama.gpu) return 1; // CPU: no benefit from multiple contexts
try {
const vram = await llama.getVramState();
const freeMB = vram.free / (1024 * 1024);
// Use at most 25% of free VRAM, min 1, max 8
const maxByVram = Math.floor((freeMB * 0.25) / perContextMB);
return Math.max(1, Math.min(8, maxByVram));
} catch {
return 2; // Conservative fallback
if (llama.gpu) {
try {
const vram = await llama.getVramState();
const freeMB = vram.free / (1024 * 1024);
const maxByVram = Math.floor((freeMB * 0.25) / perContextMB);
return Math.max(1, Math.min(8, maxByVram));
} catch {
return 2;
}
}
// CPU: split cores across contexts. At least 4 threads per context.
const cores = llama.cpuMathCores || 4;
const maxContexts = Math.floor(cores / 4);
return Math.max(1, Math.min(4, maxContexts));
}
/**
 * Get the number of threads each context should use, given N parallel contexts.
 *
 * GPU: returns 0, which the context-creation code treats as "do not pass a
 * `threads` option" (it only spreads `threads` when the value is > 0).
 * CPU: splits `llama.cpuMathCores` (falling back to 4 when it is 0 or unset)
 * evenly across the contexts, rounding down, and never returns less than 1.
 *
 * @param parallelism Number of contexts that will run side by side. Values
 *   that are not >= 1 (0, negatives, NaN) are treated as a single context so
 *   the division can never yield Infinity or NaN.
 * @returns Threads per context on CPU; 0 on GPU.
 */
private async threadsPerContext(parallelism: number): Promise<number> {
  const llama = await this.ensureLlama();
  if (llama.gpu) return 0; // GPU: let the library decide
  // `||` rather than `??` is deliberate: a reported core count of 0 must also fall back.
  const cores = llama.cpuMathCores || 4;
  // Guard the divisor: cores / 0 is Infinity, and NaN would slip through Math.max.
  const contexts = parallelism >= 1 ? parallelism : 1;
  return Math.max(1, Math.floor(cores / contexts));
}
/**
@ -599,9 +619,12 @@ export class LlamaCpp implements LLM {
const model = await this.ensureEmbedModel();
// Embed contexts are ~143 MB each (nomic-embed 2048 ctx)
const n = await this.computeParallelism(150);
const threads = await this.threadsPerContext(n);
for (let i = 0; i < n; i++) {
try {
this.embedContexts.push(await model.createEmbeddingContext());
this.embedContexts.push(await model.createEmbeddingContext({
...(threads > 0 ? { threads } : {}),
}));
} catch {
if (this.embedContexts.length === 0) throw new Error("Failed to create any embedding context");
break;
@ -703,11 +726,13 @@ export class LlamaCpp implements LLM {
const model = await this.ensureRerankModel();
// ~960 MB per context with flash attention at contextSize 2048
const n = await this.computeParallelism(1000);
const threads = await this.threadsPerContext(n);
for (let i = 0; i < n; i++) {
try {
this.rerankContexts.push(await model.createRankingContext({
contextSize: LlamaCpp.RERANK_CONTEXT_SIZE,
flashAttention: true,
...(threads > 0 ? { threads } : {}),
}));
} catch {
if (this.rerankContexts.length === 0) {
@ -715,6 +740,7 @@ export class LlamaCpp implements LLM {
try {
this.rerankContexts.push(await model.createRankingContext({
contextSize: LlamaCpp.RERANK_CONTEXT_SIZE,
...(threads > 0 ? { threads } : {}),
}));
} catch {
throw new Error("Failed to create any rerank context");