fix: handle dense content (code) that tokenizes to more tokens than expected

The 4 chars/token estimate is accurate for prose, but code can be as dense as 1.7-2 chars/token. This caused chunks to exceed the embedding model's 2048-token context limit.

- Use 3 chars/token as the initial estimate (balanced for mixed content)
- Add a safety net: re-chunk any chunks that still exceed the token limit
- Use the actual chars/token ratio when re-chunking for accuracy

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-31 15:19:28 -05:00 · 2026-01-31 15:19:28 -05:00 · 31dd977c32
commit 31dd977c32
parent 537d15a9e6
1 changed files with 32 additions and 17 deletions
--- a/src/store.ts
+++ b/src/store.ts
@ -1453,29 +1453,44 @@ export async function chunkDocumentByTokens(
 ): Promise<{ text: string; pos: number; tokens: number }[]> {
  const llm = getDefaultLlamaCpp();

-  // Convert token params to character params (~4 chars per token)
-  const avgCharsPerToken = 4;
+  // Use moderate chars/token estimate (prose ~4, code ~2, mixed ~3)
+  // If chunks exceed limit, they'll be re-split with actual ratio
+  const avgCharsPerToken = 3;
  const maxChars = maxTokens * avgCharsPerToken;
  const overlapChars = overlapTokens * avgCharsPerToken;
  const windowChars = windowTokens * avgCharsPerToken;

-  // Chunk entirely in character space
-  const charChunks = chunkDocument(content, maxChars, overlapChars, windowChars);
+  // Chunk in character space with conservative estimate
+  let charChunks = chunkDocument(content, maxChars, overlapChars, windowChars);

-  // Batch tokenize: get token counts for all chunks
-  const tokenCounts = await Promise.all(
-    charChunks.map(async (chunk) => {
-      const tokens = await llm.tokenize(chunk.text);
-      return tokens.length;
-    })
-  );
+  // Tokenize and split any chunks that still exceed limit
+  const results: { text: string; pos: number; tokens: number }[] = [];

-  // Combine chunks with their token counts
-  return charChunks.map((chunk, i) => ({
-    text: chunk.text,
-    pos: chunk.pos,
-    tokens: tokenCounts[i]!,
-  }));
+  for (const chunk of charChunks) {
+    const tokens = await llm.tokenize(chunk.text);
+
+    if (tokens.length <= maxTokens) {
+      results.push({ text: chunk.text, pos: chunk.pos, tokens: tokens.length });
+    } else {
+      // Chunk is still too large - split it further
+      // Use actual token count to estimate better char limit
+      const actualCharsPerToken = chunk.text.length / tokens.length;
+      const safeMaxChars = Math.floor(maxTokens * actualCharsPerToken * 0.95); // 5% safety margin
+
+      const subChunks = chunkDocument(chunk.text, safeMaxChars, Math.floor(overlapChars * actualCharsPerToken / 2), Math.floor(windowChars * actualCharsPerToken / 2));
+
+      for (const subChunk of subChunks) {
+        const subTokens = await llm.tokenize(subChunk.text);
+        results.push({
+          text: subChunk.text,
+          pos: chunk.pos + subChunk.pos,
+          tokens: subTokens.length,
+        });
+      }
+    }
+  }
+
+  return results;
 }

 // =============================================================================