From 46010e6342add1c3a1ba5bb0f295c7bac62bf855 Mon Sep 17 00:00:00 2001 From: Tobi Lutke Date: Mon, 8 Dec 2025 07:55:24 -0500 Subject: [PATCH] Improve embed: truncate large docs, better error messages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Truncate documents > 64KB with warning showing filenames - Show document title in error messages instead of hash - Format total time as "15m 4s" instead of "904.2s" 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- qmd.ts | 41 +++++++++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/qmd.ts b/qmd.ts index 6877869..4adfcf5 100755 --- a/qmd.ts +++ b/qmd.ts @@ -678,11 +678,27 @@ async function vectorIndex(model: string = DEFAULT_EMBED_MODEL, force: boolean = } // Calculate total bytes for accurate progress tracking, skip empty files + // Truncate documents larger than 64KB + const MAX_EMBED_BYTES = 64 * 1024; + const truncated: string[] = []; + const itemsWithSize = hashesToEmbed - .map(item => ({ - ...item, - bytes: new TextEncoder().encode(item.body).length - })) + .map(item => { + const originalBytes = new TextEncoder().encode(item.body).length; + let body = item.body; + if (originalBytes > MAX_EMBED_BYTES) { + // Truncate to MAX_EMBED_BYTES + const encoder = new TextEncoder(); + const decoder = new TextDecoder(); + body = decoder.decode(encoder.encode(item.body).slice(0, MAX_EMBED_BYTES)); + truncated.push(item.title); + } + return { + ...item, + body, + bytes: new TextEncoder().encode(body).length + }; + }) .filter(item => item.bytes > 0); // Skip empty documents if (itemsWithSize.length === 0) { @@ -699,6 +715,15 @@ async function vectorIndex(model: string = DEFAULT_EMBED_MODEL, force: boolean = if (skipped > 0) { console.log(`${c.dim}Skipped ${skipped} empty documents${c.reset}`); } + if (truncated.length > 0) { + console.log(`${c.yellow}⚠ Truncated ${truncated.length} large documents to 64KB:${c.reset}`); + for (const title of truncated.slice(0, 5)) { + console.log(`${c.dim} - ${title}${c.reset}`); + } + if (truncated.length > 5) { + console.log(`${c.dim} ... and ${truncated.length - 5} more${c.reset}`); + } + } console.log(`${c.dim}Model: ${model}${c.reset}\n`); progress.indeterminate(); @@ -729,7 +754,7 @@ async function vectorIndex(model: string = DEFAULT_EMBED_MODEL, force: boolean = errors++; bytesProcessed += item.bytes; progress.error(); - console.error(`\n${c.yellow}⚠ Error embedding ${item.hash.slice(0, 8)}...: ${err}${c.reset}`); + console.error(`\n${c.yellow}⚠ Error embedding "${item.title}": ${err}${c.reset}`); } const processed = embedded + errors; @@ -751,11 +776,11 @@ async function vectorIndex(model: string = DEFAULT_EMBED_MODEL, force: boolean = } progress.clear(); - const totalTime = ((Date.now() - startTime) / 1000).toFixed(1); - const avgThroughput = formatBytes(totalBytes / parseFloat(totalTime)); + const totalTimeSec = (Date.now() - startTime) / 1000; + const avgThroughput = formatBytes(totalBytes / totalTimeSec); console.log(`\r${c.green}${renderProgressBar(100)}${c.reset} ${c.bold}100%${c.reset} `); - console.log(`\n${c.green}✓ Done!${c.reset} Embedded ${c.bold}${embedded}${c.reset} documents in ${c.bold}${totalTime}s${c.reset} ${c.dim}(${avgThroughput}/s)${c.reset}`); + console.log(`\n${c.green}✓ Done!${c.reset} Embedded ${c.bold}${embedded}${c.reset} documents in ${c.bold}${formatETA(totalTimeSec)}${c.reset} ${c.dim}(${avgThroughput}/s)${c.reset}`); if (errors > 0) { console.log(`${c.yellow}⚠ ${errors} documents failed${c.reset}`); }