From 5233e676d9e351155950bac1afd6f0faf1d80f7e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tobi=20L=C3=BCtke?=
Date: Fri, 20 Feb 2026 14:17:38 -0500
Subject: [PATCH] fix(rerank): truncate documents exceeding 2048-token context size

node-llama-cpp throws a hard error when any document + query + template
overhead exceeds the ranking context size. Truncate oversized documents
using the rerank model's tokenizer before passing them to rankAll().

The per-document token budget is floored at zero so that a query larger
than the remaining context can never turn into a negative slice() end.

Co-Authored-By: Claude Opus 4.6
---
 src/llm.ts       | 20 ++++++++++++++++++--
 test/llm.test.ts | 39 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+), 2 deletions(-)

diff --git a/src/llm.ts b/src/llm.ts
index 46c6295..0ea94bc 100644
--- a/src/llm.ts
+++ b/src/llm.ts
@@ -1022,6 +1022,9 @@ export class LlamaCpp implements LLM {
     }
   }
 
+  // Qwen3 reranker chat template overhead in tokens (system prompt, tags, separators)
+  private static readonly RERANK_TEMPLATE_OVERHEAD = 200;
+
   async rerank(
     query: string,
     documents: RerankDocument[],
@@ -1031,15 +1034,28 @@ export class LlamaCpp implements LLM {
     this.touchActivity();
 
     const contexts = await this.ensureRerankContexts();
+    const model = await this.ensureRerankModel();
+
+    // Truncate documents that would exceed the rerank context size.
+    // Budget = contextSize - template overhead - query tokens, floored at 0 (a negative slice() end would trim the tail instead of capping the length).
+    const queryTokens = model.tokenize(query).length;
+    const maxDocTokens = Math.max(0, LlamaCpp.RERANK_CONTEXT_SIZE - LlamaCpp.RERANK_TEMPLATE_OVERHEAD - queryTokens);
+
+    const truncatedDocs = documents.map((doc) => {
+      const tokens = model.tokenize(doc.text);
+      if (tokens.length <= maxDocTokens) return doc;
+      const truncatedText = model.detokenize(tokens.slice(0, maxDocTokens));
+      return { ...doc, text: truncatedText };
+    });
 
     // Build a map from document text to original indices (for lookup after sorting)
     const textToDoc = new Map();
-    documents.forEach((doc, index) => {
+    truncatedDocs.forEach((doc, index) => {
       textToDoc.set(doc.text, { file: doc.file, index });
     });
 
     // Extract just the text for ranking
-    const texts = documents.map((doc) => doc.text);
+    const texts = truncatedDocs.map((doc) => doc.text);
 
     // Split documents across contexts for parallel evaluation.
     // Each context has its own sequence with a lock, so parallelism comes
diff --git a/test/llm.test.ts b/test/llm.test.ts
index 662d11c..228eb8b 100644
--- a/test/llm.test.ts
+++ b/test/llm.test.ts
@@ -365,6 +365,45 @@ describe.skipIf(!!process.env.CI)("LlamaCpp Integration", () => {
       // Log timing for monitoring batch performance
       console.log(`Batch rerank of 10 docs took ${elapsed}ms`);
     });
+
+    test("truncates and reranks document exceeding 2048 token context size", async () => {
+      // The reranker context is created with contextSize=2048. Documents that
+      // exceed the token budget (contextSize - template overhead - query tokens)
+      // should be silently truncated rather than crashing.
+      const paragraph = "The quick brown fox jumps over the lazy dog near the riverbank. " +
+        "Authentication tokens must be validated on every request to ensure security. " +
+        "Database queries should use prepared statements to prevent SQL injection attacks. " +
+        "The deployment pipeline includes linting, testing, building, and publishing stages. ";
+      // ~320 chars per paragraph, repeat 40 times = ~12800 chars ≈ 3200 tokens
+      const longText = paragraph.repeat(40);
+
+      const query = "How do I configure authentication?";
+      const documents: RerankDocument[] = [
+        { file: "short-relevant.md", text: "Authentication can be configured by setting AUTH_SECRET." },
+        { file: "long-doc.md", text: longText },
+        { file: "short-irrelevant.md", text: "The weather is sunny today." },
+      ];
+
+      console.log(`Long doc length: ${longText.length} chars (~${Math.round(longText.length / 4)} tokens)`);
+
+      const result = await llm.rerank(query, documents);
+
+      // Should return all 3 documents without crashing
+      expect(result.results).toHaveLength(3);
+
+      // All scores should be valid numbers in [0, 1]
+      for (const doc of result.results) {
+        expect(doc.score).toBeGreaterThanOrEqual(0);
+        expect(doc.score).toBeLessThanOrEqual(1);
+        expect(Number.isNaN(doc.score)).toBe(false);
+      }
+
+      // Log the ranking for manual inspection; the ordering itself is not asserted here
+      console.log("Rerank results for long doc test:");
+      for (const doc of result.results) {
+        console.log(` ${doc.file}: ${doc.score.toFixed(4)}`);
+      }
+    });
   });
 
   describe("expandQuery", () => {