From 9718d3767caef55def847b7a4621e42d6de357f8 Mon Sep 17 00:00:00 2001 From: edy Date: Fri, 13 Mar 2026 09:35:17 +0800 Subject: [PATCH] fix: truncate oversized text before embedding to prevent GGML crash When a chunk exceeds the embedding model's context window (trainContextSize), node-llama-cpp's getEmbeddingFor() triggers a native SIGABRT in GGML/Metal, crashing the entire process. Fix: Add truncateToContextSize() guard in embed() and embedBatch() that uses the model's own tokenizer to check token count before calling getEmbeddingFor(). Oversized text is truncated to (trainContextSize - 4) tokens with a warning, preserving partial embedding coverage instead of crashing. Fixes #303 --- src/llm.ts | 43 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 40 insertions(+), 3 deletions(-) diff --git a/src/llm.ts b/src/llm.ts index 0ea94bc..13714bd 100644 --- a/src/llm.ts +++ b/src/llm.ts @@ -800,13 +800,42 @@ export class LlamaCpp implements LLM { // Core API methods // ========================================================================== + /** + * Truncate text to fit within the embedding model's context window. + * Uses the model's own tokenizer for accurate token counting, then + * detokenizes back to text if truncation is needed. + * Returns the (possibly truncated) text and whether truncation occurred. + */ + private async truncateToContextSize(text: string): Promise<{ text: string; truncated: boolean }> { + if (!this.embedModel) return { text, truncated: false }; + + const maxTokens = this.embedModel.trainContextSize; + if (maxTokens <= 0) return { text, truncated: false }; + + const tokens = this.embedModel.tokenize(text); + if (tokens.length <= maxTokens) return { text, truncated: false }; + + // Leave a small margin (4 tokens) for BOS/EOS overhead + const safeLimit = Math.max(1, maxTokens - 4); + const truncatedTokens = tokens.slice(0, safeLimit); + const truncatedText = this.embedModel.detokenize(truncatedTokens); + return { text: truncatedText, truncated: true }; + } + async embed(text: string, options: EmbedOptions = {}): Promise { // Ping activity at start to keep models alive during this operation this.touchActivity(); try { const context = await this.ensureEmbedContext(); - const embedding = await context.getEmbeddingFor(text); + + // Guard: truncate text that exceeds model context window to prevent GGML crash + const { text: safeText, truncated } = await this.truncateToContextSize(text); + if (truncated) { + console.warn(`⚠ Text truncated to fit embedding context (${this.embedModel?.trainContextSize} tokens)`); + } + + const embedding = await context.getEmbeddingFor(safeText); return { embedding: Array.from(embedding.vector), @@ -838,7 +867,11 @@ export class LlamaCpp implements LLM { const embeddings: ({ embedding: number[]; model: string } | null)[] = []; for (const text of texts) { try { - const embedding = await context.getEmbeddingFor(text); + const { text: safeText, truncated } = await this.truncateToContextSize(text); + if (truncated) { + console.warn(`⚠ Batch text truncated to fit embedding context (${this.embedModel?.trainContextSize} tokens)`); + } + const embedding = await context.getEmbeddingFor(safeText); this.touchActivity(); embeddings.push({ embedding: Array.from(embedding.vector), model: this.embedModelUri }); } catch (err) { @@ -861,7 +894,11 @@ export class LlamaCpp implements LLM { const results: (EmbeddingResult | null)[] = []; for (const text of chunk) { try { - const embedding = await ctx.getEmbeddingFor(text); + const { text: safeText, truncated } = await this.truncateToContextSize(text); + if (truncated) { + console.warn(`⚠ Batch text truncated to fit embedding context (${this.embedModel?.trainContextSize} tokens)`); + } + const embedding = await ctx.getEmbeddingFor(safeText); this.touchActivity(); results.push({ embedding: Array.from(embedding.vector), model: this.embedModelUri }); } catch (err) {