perf: adaptive parallel contexts for embed + rerank, fix VRAM waste

Holistic overhaul of context management:

1. Parallel embedding contexts: embedBatch now splits work across
   multiple EmbeddingContexts (same pattern as reranking). Each
   context is ~143 MB. Benchmarked 6x speedup on 20 texts with
   4 contexts vs 1.

2. Rerank context size: was using auto (40960 tokens = 11.6 GB per
   context!). Reranking chunks are ~800 tokens max, so 1024 is
   plenty. Now 711 MB per context — 16x less VRAM. 4 contexts went
   from 46 GB to 2.8 GB.

3. Adaptive parallelism via computeParallelism(): checks available
   VRAM and allocates at most 25% of free VRAM for contexts, capped
   at 8. Falls back to 1 on CPU (no benefit from multiple contexts
   with node-llama-cpp's withLock serialization). Gracefully handles
   allocation failures — uses however many contexts succeeded.

VRAM budget per operation:
- Embed:  N × 143 MB (nomic-embed, 2048 ctx)
- Rerank: N × 711 MB (Qwen3-Reranker-0.6B, 1024 ctx)
- Generate: ~1.1 GB (qmd-expansion-1.7B, fresh ctx per call)

Works across:
- Large GPU boxes (4x A6000, 190 GB): allocates up to 8 contexts
- Consumer GPUs (16 GB): 2-4 contexts fit comfortably
- Apple Metal (8-16 GB unified): 1-4 contexts depending on memory
- CPU-only: single context (parallelism has no benefit)
This commit is contained in:
Tobi Lütke 2026-02-15 10:27:01 -05:00
parent 0a0e1e6f29
commit 4ac95b5e26
No known key found for this signature in database
2 changed files with 120 additions and 53 deletions

View File

@ -221,10 +221,15 @@ describe("LlamaCpp Integration", () => {
const successCount = allResults.filter(r => r !== null).length;
expect(successCount).toBe(10);
// THE KEY ASSERTION: Only 1 context should be created, not 5
// Without the fix, contextCreateCount would be 5 (one per concurrent embedBatch call)
console.log(`Context creation count: ${contextCreateCount} (expected: 1)`);
expect(contextCreateCount).toBe(1);
// THE KEY ASSERTION: Contexts should be created once (by ensureEmbedContexts),
// not duplicated per concurrent embedBatch call. The exact count depends on
// available VRAM (computeParallelism), but should not be 5 (one per call).
// Without the fix, contextCreateCount would be 5× the intended count (one set per concurrent call).
// With the promise guard, contexts are created exactly once regardless of concurrent callers.
// The count depends on VRAM (computeParallelism), but should be ≤ 8 (the cap).
console.log(`Context creation count: ${contextCreateCount} (expected: ≤ 8, not 5× duplicated)`);
expect(contextCreateCount).toBeGreaterThanOrEqual(1);
expect(contextCreateCount).toBeLessThanOrEqual(8);
await freshLlm.dispose();
}, 60000);

View File

@ -354,7 +354,7 @@ const DEFAULT_INACTIVITY_TIMEOUT_MS = 5 * 60 * 1000;
export class LlamaCpp implements LLM {
private llama: Llama | null = null;
private embedModel: LlamaModel | null = null;
private embedContext: LlamaEmbeddingContext | null = null;
private embedContexts: LlamaEmbeddingContext[] = [];
private generateModel: LlamaModel | null = null;
private rerankModel: LlamaModel | null = null;
private rerankContexts: Awaited<ReturnType<LlamaModel["createRankingContext"]>>[] = [];
@ -366,7 +366,6 @@ export class LlamaCpp implements LLM {
// Ensure we don't load the same model/context concurrently (which can allocate duplicate VRAM).
private embedModelLoadPromise: Promise<LlamaModel> | null = null;
private embedContextCreatePromise: Promise<LlamaEmbeddingContext> | null = null;
private generateModelLoadPromise: Promise<LlamaModel> | null = null;
private rerankModelLoadPromise: Promise<LlamaModel> | null = null;
@ -423,7 +422,7 @@ export class LlamaCpp implements LLM {
* Check if any contexts are currently loaded (and therefore worth unloading on inactivity).
*/
private hasLoadedContexts(): boolean {
return !!(this.embedContext || this.rerankContexts.length > 0);
return !!(this.embedContexts.length > 0 || this.rerankContexts.length > 0);
}
/**
@ -445,10 +444,10 @@ export class LlamaCpp implements LLM {
}
// Dispose contexts first
if (this.embedContext) {
await this.embedContext.dispose();
this.embedContext = null;
for (const ctx of this.embedContexts) {
await ctx.dispose();
}
this.embedContexts = [];
for (const ctx of this.rerankContexts) {
await ctx.dispose();
}
@ -557,34 +556,69 @@ export class LlamaCpp implements LLM {
}
/**
* Load embedding context (lazy). Context can be disposed and recreated without reloading the model.
* Compute how many parallel contexts to create based on available VRAM.
* Conservative: uses at most 25% of free VRAM for contexts, capped at 8.
*/
private async computeParallelism(perContextMB: number): Promise<number> {
const llama = await this.ensureLlama();
if (!llama.gpu) return 1; // CPU: no benefit from multiple contexts
try {
const vram = await llama.getVramState();
const freeMB = vram.free / (1024 * 1024);
// Use at most 25% of free VRAM, min 1, max 8
const maxByVram = Math.floor((freeMB * 0.25) / perContextMB);
return Math.max(1, Math.min(8, maxByVram));
} catch {
return 2; // Conservative fallback
}
}
/**
* Load embedding contexts (lazy). Creates multiple for parallel embedding.
* Uses promise guard to prevent concurrent context creation race condition.
*/
private async ensureEmbedContext(): Promise<LlamaEmbeddingContext> {
if (!this.embedContext) {
// If context creation is already in progress, wait for it
if (this.embedContextCreatePromise) {
return await this.embedContextCreatePromise;
}
private embedContextsCreatePromise: Promise<LlamaEmbeddingContext[]> | null = null;
// Start context creation and store promise so concurrent calls wait
this.embedContextCreatePromise = (async () => {
const model = await this.ensureEmbedModel();
const context = await model.createEmbeddingContext();
this.embedContext = context;
return context;
})();
try {
const context = await this.embedContextCreatePromise;
this.touchActivity();
return context;
} finally {
this.embedContextCreatePromise = null;
}
private async ensureEmbedContexts(): Promise<LlamaEmbeddingContext[]> {
if (this.embedContexts.length > 0) {
this.touchActivity();
return this.embedContexts;
}
this.touchActivity();
return this.embedContext;
if (this.embedContextsCreatePromise) {
return await this.embedContextsCreatePromise;
}
this.embedContextsCreatePromise = (async () => {
const model = await this.ensureEmbedModel();
// Embed contexts are ~143 MB each (nomic-embed 2048 ctx)
const n = await this.computeParallelism(150);
for (let i = 0; i < n; i++) {
try {
this.embedContexts.push(await model.createEmbeddingContext());
} catch {
if (this.embedContexts.length === 0) throw new Error("Failed to create any embedding context");
break;
}
}
this.touchActivity();
return this.embedContexts;
})();
try {
return await this.embedContextsCreatePromise;
} finally {
this.embedContextsCreatePromise = null;
}
}
/**
* Get a single embed context (for single-embed calls). Uses first from pool.
*/
private async ensureEmbedContext(): Promise<LlamaEmbeddingContext> {
const contexts = await this.ensureEmbedContexts();
return contexts[0]!;
}
/**
@ -648,21 +682,24 @@ export class LlamaCpp implements LLM {
/**
* Load rerank contexts (lazy). Creates multiple contexts for parallel ranking.
* Each context has its own sequence, so they can evaluate independently.
*
* Uses contextSize 1024 instead of auto (40960) reranking chunks are ~800
* tokens max, so 1024 is plenty. This drops VRAM from 11.6 GB to 711 MB per context.
*/
private static readonly RERANK_PARALLEL_CONTEXTS = 4;
private static readonly RERANK_CONTEXT_SIZE = 1024;
private async ensureRerankContexts(): Promise<Awaited<ReturnType<LlamaModel["createRankingContext"]>>[]> {
if (this.rerankContexts.length === 0) {
const model = await this.ensureRerankModel();
const n = LlamaCpp.RERANK_PARALLEL_CONTEXTS;
// Create contexts sequentially to avoid VRAM allocation races
// Rerank contexts are ~711 MB each at contextSize 1024
const n = await this.computeParallelism(750);
for (let i = 0; i < n; i++) {
try {
this.rerankContexts.push(await model.createRankingContext());
this.rerankContexts.push(await model.createRankingContext({
contextSize: LlamaCpp.RERANK_CONTEXT_SIZE,
}));
} catch {
// VRAM exhausted — use however many we got
if (this.rerankContexts.length === 0) {
// Must have at least one
throw new Error("Failed to create any rerank context");
}
break;
@ -741,26 +778,51 @@ export class LlamaCpp implements LLM {
if (texts.length === 0) return [];
try {
const context = await this.ensureEmbedContext();
const contexts = await this.ensureEmbedContexts();
const n = contexts.length;
// node-llama-cpp handles batching internally when we make parallel requests
const embeddings = await Promise.all(
texts.map(async (text) => {
if (n === 1) {
// Single context: sequential (no point splitting)
const context = contexts[0]!;
const embeddings = [];
for (const text of texts) {
try {
const embedding = await context.getEmbeddingFor(text);
this.touchActivity(); // Keep-alive during slow batches
return {
embedding: Array.from(embedding.vector),
model: this.embedModelUri,
};
this.touchActivity();
embeddings.push({ embedding: Array.from(embedding.vector), model: this.embedModelUri });
} catch (err) {
console.error("Embedding error for text:", err);
return null;
embeddings.push(null);
}
}
return embeddings;
}
// Multiple contexts: split texts across contexts for parallel evaluation
const chunkSize = Math.ceil(texts.length / n);
const chunks = Array.from({ length: n }, (_, i) =>
texts.slice(i * chunkSize, (i + 1) * chunkSize)
);
const chunkResults = await Promise.all(
chunks.map(async (chunk, i) => {
const ctx = contexts[i]!;
const results: (EmbeddingResult | null)[] = [];
for (const text of chunk) {
try {
const embedding = await ctx.getEmbeddingFor(text);
this.touchActivity();
results.push({ embedding: Array.from(embedding.vector), model: this.embedModelUri });
} catch (err) {
console.error("Embedding error for text:", err);
results.push(null);
}
}
return results;
})
);
return embeddings;
return chunkResults.flat();
} catch (error) {
console.error("Batch embedding error:", error);
return texts.map(() => null);
@ -1015,7 +1077,7 @@ export class LlamaCpp implements LLM {
}
// Clear references
this.embedContext = null;
this.embedContexts = [];
this.rerankContexts = [];
this.embedModel = null;
this.generateModel = null;
@ -1024,7 +1086,7 @@ export class LlamaCpp implements LLM {
// Clear any in-flight load/create promises
this.embedModelLoadPromise = null;
this.embedContextCreatePromise = null;
this.embedContextsCreatePromise = null;
this.generateModelLoadPromise = null;
this.rerankModelLoadPromise = null;
}