fix: handle dense content (code) that tokenizes to more tokens than estimated

The 4 chars/token estimate is accurate for prose, but code can be as
dense as 1.7-2 chars/token. This caused chunks to exceed the embedding
model's 2048-token context limit.

- Use 3 chars/token as the initial estimate (balanced for mixed content)
- Add a safety net: re-chunk any chunk that still exceeds the token limit
- Use the oversized chunk's measured chars/token ratio when re-chunking, for accuracy

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Tobi Lutke 2026-01-31 15:19:28 -05:00
parent 537d15a9e6
commit 31dd977c32
No known key found for this signature in database

View File

@ -1453,29 +1453,44 @@ export async function chunkDocumentByTokens(
): Promise<{ text: string; pos: number; tokens: number }[]> {
const llm = getDefaultLlamaCpp();
// Convert token params to character params (~4 chars per token)
const avgCharsPerToken = 4;
// Use moderate chars/token estimate (prose ~4, code ~2, mixed ~3)
// If chunks exceed limit, they'll be re-split with actual ratio
const avgCharsPerToken = 3;
const maxChars = maxTokens * avgCharsPerToken;
const overlapChars = overlapTokens * avgCharsPerToken;
const windowChars = windowTokens * avgCharsPerToken;
// Chunk entirely in character space
const charChunks = chunkDocument(content, maxChars, overlapChars, windowChars);
// Chunk in character space with conservative estimate
let charChunks = chunkDocument(content, maxChars, overlapChars, windowChars);
// Batch tokenize: get token counts for all chunks
const tokenCounts = await Promise.all(
charChunks.map(async (chunk) => {
const tokens = await llm.tokenize(chunk.text);
return tokens.length;
})
);
// Tokenize and split any chunks that still exceed limit
const results: { text: string; pos: number; tokens: number }[] = [];
// Combine chunks with their token counts
return charChunks.map((chunk, i) => ({
text: chunk.text,
pos: chunk.pos,
tokens: tokenCounts[i]!,
}));
for (const chunk of charChunks) {
const tokens = await llm.tokenize(chunk.text);
if (tokens.length <= maxTokens) {
results.push({ text: chunk.text, pos: chunk.pos, tokens: tokens.length });
} else {
// Chunk is still too large - split it further
// Use actual token count to estimate better char limit
const actualCharsPerToken = chunk.text.length / tokens.length;
const safeMaxChars = Math.floor(maxTokens * actualCharsPerToken * 0.95); // 5% safety margin
const subChunks = chunkDocument(chunk.text, safeMaxChars, Math.floor(overlapChars * actualCharsPerToken / 2), Math.floor(windowChars * actualCharsPerToken / 2));
for (const subChunk of subChunks) {
const subTokens = await llm.tokenize(subChunk.text);
results.push({
text: subChunk.text,
pos: chunk.pos + subChunk.pos,
tokens: subTokens.length,
});
}
}
}
return results;
}
// =============================================================================