fix(llm): set explicit embed context size, default 2048, configurable via env var (#500)

Without an explicit contextSize, node-llama-cpp defaults to "auto", which
allocates the model's full training context (often 32k). For embedding
chunks that are typically ~900 tokens, this wastes ~3.5 GB of KV cache
per context on Apple Silicon unified memory.

Default to 2048, resolved the same way as the rerank context size (env var
with a built-in fallback), and allow override via QMD_EMBED_CONTEXT_SIZE
for users with larger chunks.

Addresses #329, related to #297

Co-authored-by: JohnRichardEnders <john@telli.com>
This commit is contained in:
John R. Enders 2026-04-05 22:45:12 +02:00 committed by GitHub
parent 698b44fe87
commit 54550a3366
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -665,6 +665,7 @@ export class LlamaCpp implements LLM {
for (let i = 0; i < n; i++) {
try {
this.embedContexts.push(await model.createEmbeddingContext({
contextSize: LlamaCpp.EMBED_CONTEXT_SIZE,
...(threads > 0 ? { threads } : {}),
}));
} catch {
@ -769,6 +770,11 @@ export class LlamaCpp implements LLM {
const v = parseInt(process.env.QMD_RERANK_CONTEXT_SIZE ?? "", 10);
return Number.isFinite(v) && v > 0 ? v : 4096;
})();
/**
 * Context size passed as `contextSize` when creating embedding contexts.
 *
 * Resolved once, when the class is initialized, from the
 * `QMD_EMBED_CONTEXT_SIZE` environment variable, parsed as a base-10
 * integer. Falls back to 2048 when the variable is unset or does not parse
 * to a finite, strictly positive number.
 */
private static readonly EMBED_CONTEXT_SIZE: number = (() => {
  const fallback = 2048;
  const raw = process.env.QMD_EMBED_CONTEXT_SIZE ?? "";
  const parsed = Number.parseInt(raw, 10);
  // NaN (unset/garbage), zero, and negative values all fall back.
  if (!Number.isFinite(parsed) || !(parsed > 0)) {
    return fallback;
  }
  return parsed;
})();
private async ensureRerankContexts(): Promise<Awaited<ReturnType<LlamaModel["createRankingContext"]>>[]> {
if (this.rerankContexts.length === 0) {
const model = await this.ensureRerankModel();