fix(llm): set explicit embed context size, default 2048, configurable via env var (#500)

Without an explicit contextSize, node-llama-cpp defaults to "auto", which allocates the model's full training context (often 32k). For embedding chunks that are typically ~900 tokens, this wastes ~3.5 GB of KV cache per context on Apple Silicon unified memory. Default to 2048 (matching the rerank context pattern) and allow override via QMD_EMBED_CONTEXT_SIZE for users with larger chunks. Addresses #329; related to #297. Co-authored-by: JohnRichardEnders <john@telli.com>
2026-04-05 22:45:12 +02:00 · 2026-04-05 22:45:12 +02:00 · 54550a3366
commit 54550a3366
parent 698b44fe87
1 changed files with 6 additions and 0 deletions
--- a/src/llm.ts
+++ b/src/llm.ts
@ -665,6 +665,7 @@ export class LlamaCpp implements LLM {
      for (let i = 0; i < n; i++) {
        try {
          this.embedContexts.push(await model.createEmbeddingContext({
+            contextSize: LlamaCpp.EMBED_CONTEXT_SIZE,
            ...(threads > 0 ? { threads } : {}),
          }));
        } catch {
@ -769,6 +770,11 @@ export class LlamaCpp implements LLM {
    const v = parseInt(process.env.QMD_RERANK_CONTEXT_SIZE ?? "", 10);
    return Number.isFinite(v) && v > 0 ? v : 4096;
  })();
+
+  private static readonly EMBED_CONTEXT_SIZE: number = (() => { // embedding context size in tokens, resolved once
+    const requested = Number.parseInt(process.env.QMD_EMBED_CONTEXT_SIZE ?? "", 10); // unset env var parses to NaN
+    return !Number.isFinite(requested) || requested <= 0 ? 2048 : requested; // fall back to 2048 unless a positive integer was given
+  })();
  private async ensureRerankContexts(): Promise<Awaited<ReturnType<LlamaModel["createRankingContext"]>>[]> {
    if (this.rerankContexts.length === 0) {
      const model = await this.ensureRerankModel();