fix(rerank): truncate documents exceeding 2048-token context size
node-llama-cpp throws a hard error when the combined token count of any document, the query, and the chat-template overhead exceeds the ranking context size. Truncate oversized documents using the rerank model's tokenizer before passing them to rankAll().

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
1d7d167b29
commit
5233e676d9
20
src/llm.ts
20
src/llm.ts
@ -1022,6 +1022,9 @@ export class LlamaCpp implements LLM {
|
||||
}
|
||||
}
|
||||
|
||||
// Qwen3 reranker chat template overhead (system prompt, tags, separators)
|
||||
private static readonly RERANK_TEMPLATE_OVERHEAD = 200;
|
||||
|
||||
async rerank(
|
||||
query: string,
|
||||
documents: RerankDocument[],
|
||||
@ -1031,15 +1034,28 @@ export class LlamaCpp implements LLM {
|
||||
this.touchActivity();
|
||||
|
||||
const contexts = await this.ensureRerankContexts();
|
||||
const model = await this.ensureRerankModel();
|
||||
|
||||
// Truncate documents that would exceed the rerank context size.
|
||||
// Budget = contextSize - template overhead - query tokens
|
||||
const queryTokens = model.tokenize(query).length;
|
||||
const maxDocTokens = LlamaCpp.RERANK_CONTEXT_SIZE - LlamaCpp.RERANK_TEMPLATE_OVERHEAD - queryTokens;
|
||||
|
||||
const truncatedDocs = documents.map((doc) => {
|
||||
const tokens = model.tokenize(doc.text);
|
||||
if (tokens.length <= maxDocTokens) return doc;
|
||||
const truncatedText = model.detokenize(tokens.slice(0, maxDocTokens));
|
||||
return { ...doc, text: truncatedText };
|
||||
});
|
||||
|
||||
// Build a map from document text to original indices (for lookup after sorting)
|
||||
const textToDoc = new Map<string, { file: string; index: number }>();
|
||||
documents.forEach((doc, index) => {
|
||||
truncatedDocs.forEach((doc, index) => {
|
||||
textToDoc.set(doc.text, { file: doc.file, index });
|
||||
});
|
||||
|
||||
// Extract just the text for ranking
|
||||
const texts = documents.map((doc) => doc.text);
|
||||
const texts = truncatedDocs.map((doc) => doc.text);
|
||||
|
||||
// Split documents across contexts for parallel evaluation.
|
||||
// Each context has its own sequence with a lock, so parallelism comes
|
||||
|
||||
@ -365,6 +365,45 @@ describe.skipIf(!!process.env.CI)("LlamaCpp Integration", () => {
|
||||
// Log timing for monitoring batch performance
|
||||
console.log(`Batch rerank of 10 docs took ${elapsed}ms`);
|
||||
});
|
||||
|
||||
test("truncates and reranks document exceeding 2048 token context size", async () => {
  // The reranker context is created with contextSize=2048. Documents that
  // exceed the token budget (contextSize - template overhead - query tokens)
  // should be silently truncated rather than crashing.
  const paragraph = "The quick brown fox jumps over the lazy dog near the riverbank. " +
    "Authentication tokens must be validated on every request to ensure security. " +
    "Database queries should use prepared statements to prevent SQL injection attacks. " +
    "The deployment pipeline includes linting, testing, building, and publishing stages. ";
  // 307 chars per paragraph, repeated 40 times = 12280 chars ≈ 3000 tokens
  // (at the rough 4-chars-per-token estimate used in the log line below),
  // comfortably above the 2048-token context.
  const longText = paragraph.repeat(40);

  const query = "How do I configure authentication?";
  const documents: RerankDocument[] = [
    { file: "short-relevant.md", text: "Authentication can be configured by setting AUTH_SECRET." },
    { file: "long-doc.md", text: longText },
    { file: "short-irrelevant.md", text: "The weather is sunny today." },
  ];

  console.log(`Long doc length: ${longText.length} chars (~${Math.round(longText.length / 4)} tokens)`);

  const result = await llm.rerank(query, documents);

  // Should return all 3 documents without crashing
  expect(result.results).toHaveLength(3);

  // Every input file must come back exactly once: the truncated document has
  // to be mapped back to its original file, not dropped or duplicated.
  const returnedFiles = result.results.map((doc) => doc.file).sort();
  const expectedFiles = documents.map((doc) => doc.file).sort();
  expect(returnedFiles).toEqual(expectedFiles);

  // All scores should be valid numbers in [0, 1]
  for (const doc of result.results) {
    expect(doc.score).toBeGreaterThanOrEqual(0);
    expect(doc.score).toBeLessThanOrEqual(1);
    expect(Number.isNaN(doc.score)).toBe(false);
  }

  // Log the ranking for manual inspection
  console.log("Rerank results for long doc test:");
  for (const doc of result.results) {
    console.log(`  ${doc.file}: ${doc.score.toFixed(4)}`);
  }

  // Truncating the long document must not distort the scores of the others:
  // the short, directly relevant doc should still outrank the irrelevant one.
  const relevant = result.results.find((doc) => doc.file === "short-relevant.md");
  const irrelevant = result.results.find((doc) => doc.file === "short-irrelevant.md");
  expect(relevant).toBeDefined();
  expect(irrelevant).toBeDefined();
  // The infinity fallbacks make the comparison fail (rather than throw a
  // TypeError) if either lookup unexpectedly came back undefined.
  expect(relevant?.score ?? Number.NEGATIVE_INFINITY).toBeGreaterThan(
    irrelevant?.score ?? Number.POSITIVE_INFINITY,
  );
});
|
||||
});
|
||||
|
||||
describe("expandQuery", () => {
|
||||
|
||||
Loading…
Reference in New Issue
Block a user