fix(llm): set explicit embed context size, default 2048, configurable via env var (#500)
Without an explicit contextSize, node-llama-cpp defaults to "auto", which allocates the model's full training context (often 32k). For embedding chunks, which are typically ~900 tokens, this wastes ~3.5 GB of KV cache per context on Apple Silicon unified memory.

Default to 2048 (following the same env-var pattern as the rerank context size) and allow an override via QMD_EMBED_CONTEXT_SIZE for users with larger chunks.

Addresses #329; related to #297.

Co-authored-by: JohnRichardEnders <john@telli.com>
This commit is contained in:
parent
698b44fe87
commit
54550a3366
@ -665,6 +665,7 @@ export class LlamaCpp implements LLM {
|
||||
for (let i = 0; i < n; i++) {
|
||||
try {
|
||||
this.embedContexts.push(await model.createEmbeddingContext({
|
||||
contextSize: LlamaCpp.EMBED_CONTEXT_SIZE,
|
||||
...(threads > 0 ? { threads } : {}),
|
||||
}));
|
||||
} catch {
|
||||
@ -769,6 +770,11 @@ export class LlamaCpp implements LLM {
|
||||
const v = parseInt(process.env.QMD_RERANK_CONTEXT_SIZE ?? "", 10);
|
||||
return Number.isFinite(v) && v > 0 ? v : 4096;
|
||||
})();
|
||||
|
||||
/**
 * Context size passed as `contextSize` when embedding contexts are created.
 *
 * Resolved once, in this static initializer, from the
 * `QMD_EMBED_CONTEXT_SIZE` environment variable. The value is parsed as a
 * base-10 integer; when the variable is unset, does not parse to a finite
 * number, or is zero/negative, the default of 2048 is used instead.
 */
private static readonly EMBED_CONTEXT_SIZE: number = (() => {
  const requested = parseInt(process.env.QMD_EMBED_CONTEXT_SIZE ?? "", 10);
  // Reject NaN/Infinity and non-positive sizes up front; fall back to the default.
  if (!Number.isFinite(requested) || requested <= 0) {
    return 2048;
  }
  return requested;
})();
|
||||
private async ensureRerankContexts(): Promise<Awaited<ReturnType<LlamaModel["createRankingContext"]>>[]> {
|
||||
if (this.rerankContexts.length === 0) {
|
||||
const model = await this.ensureRerankModel();
|
||||
|
||||
Loading…
Reference in New Issue
Block a user