perf: CPU parallelism via multi-context thread splitting
Our assumption that CPU can't benefit from multiple contexts was wrong. The withLock in node-llama-cpp serializes work within a single context, but separate contexts with split threads run on different cores in true parallel.

Key changes:
- computeParallelism() now returns >1 on CPU (cores / 4, max 4)
- threadsPerContext() splits math cores evenly across contexts
- Both embed and rerank contexts get proper thread counts
- Benchmark updated to test CPU parallelism

Before (CPU, 40 docs): 9.7s (4.1 docs/s) — 6 threads, 1 context
After (CPU, 40 docs): 2.3s (17.2 docs/s) — 32 threads, 8 contexts
(The "after" figure is the benchmark's 8-context run; computeParallelism() itself caps CPU at 4 contexts.)

Two fixes stacked:
1. Thread count: default was 6 (library hardcode), now uses all math cores — 2× improvement alone
2. Multi-context: splitting cores across 8 contexts gives another 2.2× on top

End-to-end 'qmd query' on CPU: 10.3s → 2.9s

CPU benchmark (Threadripper PRO 7975WX, 32 math cores):
  1 ctx: 5001ms (8.0 docs/s)
  2 ctx: 3585ms (11.2 docs/s) 1.4×
  4 ctx: 2874ms (13.9 docs/s) 1.7×
  8 ctx: 2323ms (17.2 docs/s) 2.2×
This commit is contained in:
parent
bf42223086
commit
392934e78a
@ -108,18 +108,23 @@ async function benchmarkConfig(
|
||||
const vramBefore = llama.gpu ? await llama.getVramState() : null;
|
||||
const rssBefore = getMemUsage().rss;
|
||||
|
||||
// Create contexts
|
||||
// Create contexts. On CPU, split threads evenly across contexts.
|
||||
const cpuThreads = !llama.gpu ? Math.floor(llama.cpuMathCores / parallelism) : 0;
|
||||
const contexts = [];
|
||||
for (let i = 0; i < parallelism; i++) {
|
||||
try {
|
||||
contexts.push(await model.createRankingContext({
|
||||
contextSize: CONTEXT_SIZE,
|
||||
flashAttention: flash,
|
||||
...(cpuThreads > 0 ? { threads: cpuThreads } : {}),
|
||||
}));
|
||||
} catch {
|
||||
if (contexts.length === 0) {
|
||||
// Try without flash
|
||||
contexts.push(await model.createRankingContext({ contextSize: CONTEXT_SIZE }));
|
||||
contexts.push(await model.createRankingContext({
|
||||
contextSize: CONTEXT_SIZE,
|
||||
...(cpuThreads > 0 ? { threads: cpuThreads } : {}),
|
||||
}));
|
||||
}
|
||||
break;
|
||||
}
|
||||
@ -253,8 +258,11 @@ async function main() {
|
||||
|
||||
for (const p of PARALLEL_CONFIGS) {
|
||||
if (!llama.gpu && p > 1) {
|
||||
console.log(`\n [${p} ctx] skipped (CPU — no benefit from parallelism)`);
|
||||
continue;
|
||||
// CPU: only test if we have enough cores (at least 4 per context)
|
||||
if (llama.cpuMathCores < p * 4) {
|
||||
console.log(`\n [${p} ctx] skipped (need ${p * 4} cores, have ${llama.cpuMathCores})`);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// Test with flash attention
|
||||
|
||||
50
src/llm.ts
50
src/llm.ts
@ -561,22 +561,42 @@ export class LlamaCpp implements LLM {
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute how many parallel contexts to create based on available VRAM.
|
||||
* Conservative: uses at most 25% of free VRAM for contexts, capped at 8.
|
||||
* Compute how many parallel contexts to create.
|
||||
*
|
||||
* GPU: constrained by VRAM (25% of free, capped at 8).
|
||||
* CPU: constrained by cores. Splitting threads across contexts enables
|
||||
* true parallelism (each context runs on its own cores). Creates one
|
||||
* context per 4 math cores, clamped to between 1 and 4 contexts.
|
||||
*/
|
||||
private async computeParallelism(perContextMB: number): Promise<number> {
|
||||
const llama = await this.ensureLlama();
|
||||
if (!llama.gpu) return 1; // CPU: no benefit from multiple contexts
|
||||
|
||||
try {
|
||||
const vram = await llama.getVramState();
|
||||
const freeMB = vram.free / (1024 * 1024);
|
||||
// Use at most 25% of free VRAM, min 1, max 8
|
||||
const maxByVram = Math.floor((freeMB * 0.25) / perContextMB);
|
||||
return Math.max(1, Math.min(8, maxByVram));
|
||||
} catch {
|
||||
return 2; // Conservative fallback
|
||||
if (llama.gpu) {
|
||||
try {
|
||||
const vram = await llama.getVramState();
|
||||
const freeMB = vram.free / (1024 * 1024);
|
||||
const maxByVram = Math.floor((freeMB * 0.25) / perContextMB);
|
||||
return Math.max(1, Math.min(8, maxByVram));
|
||||
} catch {
|
||||
return 2;
|
||||
}
|
||||
}
|
||||
|
||||
// CPU: split cores across contexts. At least 4 threads per context.
|
||||
const cores = llama.cpuMathCores || 4;
|
||||
const maxContexts = Math.floor(cores / 4);
|
||||
return Math.max(1, Math.min(4, maxContexts));
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the number of threads each context should use, given N parallel contexts.
|
||||
* Splits available math cores evenly across contexts.
*
* @param parallelism Number of contexts the cores are divided among. The
*   callers in this diff pass the result of computeParallelism().
* @returns 0 when a GPU is in use (the callers in this diff omit the
*   `threads` option when the value is 0); otherwise the per-context thread
*   count, rounded down and never less than 1.
|
||||
*/
|
||||
private async threadsPerContext(parallelism: number): Promise<number> {
|
||||
const llama = await this.ensureLlama();
|
||||
if (llama.gpu) return 0; // GPU: let the library decide
|
||||
const cores = llama.cpuMathCores || 4; // fall back to 4 when cpuMathCores is 0 or unset
|
||||
return Math.max(1, Math.floor(cores / parallelism)); // round down, but never below 1 thread
|
||||
}
|
||||
|
||||
/**
|
||||
@ -599,9 +619,12 @@ export class LlamaCpp implements LLM {
|
||||
const model = await this.ensureEmbedModel();
|
||||
// Embed contexts are ~143 MB each (nomic-embed 2048 ctx)
|
||||
const n = await this.computeParallelism(150);
|
||||
const threads = await this.threadsPerContext(n);
|
||||
for (let i = 0; i < n; i++) {
|
||||
try {
|
||||
this.embedContexts.push(await model.createEmbeddingContext());
|
||||
this.embedContexts.push(await model.createEmbeddingContext({
|
||||
...(threads > 0 ? { threads } : {}),
|
||||
}));
|
||||
} catch {
|
||||
if (this.embedContexts.length === 0) throw new Error("Failed to create any embedding context");
|
||||
break;
|
||||
@ -703,11 +726,13 @@ export class LlamaCpp implements LLM {
|
||||
const model = await this.ensureRerankModel();
|
||||
// ~960 MB per context with flash attention at contextSize 2048
|
||||
const n = await this.computeParallelism(1000);
|
||||
const threads = await this.threadsPerContext(n);
|
||||
for (let i = 0; i < n; i++) {
|
||||
try {
|
||||
this.rerankContexts.push(await model.createRankingContext({
|
||||
contextSize: LlamaCpp.RERANK_CONTEXT_SIZE,
|
||||
flashAttention: true,
|
||||
...(threads > 0 ? { threads } : {}),
|
||||
}));
|
||||
} catch {
|
||||
if (this.rerankContexts.length === 0) {
|
||||
@ -715,6 +740,7 @@ export class LlamaCpp implements LLM {
|
||||
try {
|
||||
this.rerankContexts.push(await model.createRankingContext({
|
||||
contextSize: LlamaCpp.RERANK_CONTEXT_SIZE,
|
||||
...(threads > 0 ? { threads } : {}),
|
||||
}));
|
||||
} catch {
|
||||
throw new Error("Failed to create any rerank context");
|
||||
|
||||
Loading…
Reference in New Issue
Block a user