fix: handle dense content (code) that tokenizes to more than expected
The 4 chars/token estimate is accurate for prose, but code can be as dense as 1.7-2 chars/token. This caused chunks to exceed the embedding model's 2048-token context limit.

- Use 3 chars/token as the initial estimate (balanced for mixed content)
- Add a safety net: re-chunk any chunks that still exceed the token limit
- Use the actual chars/token ratio when re-chunking for accuracy

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
537d15a9e6
commit
31dd977c32
49
src/store.ts
49
src/store.ts
@@ -1453,29 +1453,44 @@ export async function chunkDocumentByTokens(
|
||||
): Promise<{ text: string; pos: number; tokens: number }[]> {
|
||||
const llm = getDefaultLlamaCpp();
|
||||
|
||||
// Convert token params to character params (~4 chars per token)
|
||||
const avgCharsPerToken = 4;
|
||||
// Use moderate chars/token estimate (prose ~4, code ~2, mixed ~3)
|
||||
// If chunks exceed limit, they'll be re-split with actual ratio
|
||||
const avgCharsPerToken = 3;
|
||||
const maxChars = maxTokens * avgCharsPerToken;
|
||||
const overlapChars = overlapTokens * avgCharsPerToken;
|
||||
const windowChars = windowTokens * avgCharsPerToken;
|
||||
|
||||
// Chunk entirely in character space
|
||||
const charChunks = chunkDocument(content, maxChars, overlapChars, windowChars);
|
||||
// Chunk in character space with conservative estimate
|
||||
let charChunks = chunkDocument(content, maxChars, overlapChars, windowChars);
|
||||
|
||||
// Batch tokenize: get token counts for all chunks
|
||||
const tokenCounts = await Promise.all(
|
||||
charChunks.map(async (chunk) => {
|
||||
const tokens = await llm.tokenize(chunk.text);
|
||||
return tokens.length;
|
||||
})
|
||||
);
|
||||
// Tokenize and split any chunks that still exceed limit
|
||||
const results: { text: string; pos: number; tokens: number }[] = [];
|
||||
|
||||
// Combine chunks with their token counts
|
||||
return charChunks.map((chunk, i) => ({
|
||||
text: chunk.text,
|
||||
pos: chunk.pos,
|
||||
tokens: tokenCounts[i]!,
|
||||
}));
|
||||
for (const chunk of charChunks) {
|
||||
const tokens = await llm.tokenize(chunk.text);
|
||||
|
||||
if (tokens.length <= maxTokens) {
|
||||
results.push({ text: chunk.text, pos: chunk.pos, tokens: tokens.length });
|
||||
} else {
|
||||
// Chunk is still too large - split it further
|
||||
// Use actual token count to estimate better char limit
|
||||
const actualCharsPerToken = chunk.text.length / tokens.length;
|
||||
const safeMaxChars = Math.floor(maxTokens * actualCharsPerToken * 0.95); // 5% safety margin
|
||||
|
||||
const subChunks = chunkDocument(chunk.text, safeMaxChars, Math.floor(overlapChars * actualCharsPerToken / 2), Math.floor(windowChars * actualCharsPerToken / 2));
|
||||
|
||||
for (const subChunk of subChunks) {
|
||||
const subTokens = await llm.tokenize(subChunk.text);
|
||||
results.push({
|
||||
text: subChunk.text,
|
||||
pos: chunk.pos + subChunk.pos,
|
||||
tokens: subTokens.length,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
// =============================================================================
|
||||
|
||||
Loading…
Reference in New Issue
Block a user