diff --git a/CHANGELOG.md b/CHANGELOG.md index 2dd9ad2..1558305 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,8 +5,11 @@ ### Fixes - Embedding: default to an external OpenAI-compatible embeddings API - (`text-embedding-3-small`) and require explicit `hf:`/`.gguf` - configuration to use local node-llama-cpp embedding models. + (`nvidia/llama-3.2-nv-embedqa-1b-v2`) and require + `QMD_ENABLE_LOCAL_MODELS=1` for local node-llama-cpp embedding, reranking, + and query expansion models. +- Embedding: use approximate token counts in external embedding mode so + chunking does not load a local GGUF tokenizer. - GPU: respect explicit `QMD_LLAMA_GPU=metal|vulkan|cuda` backend overrides instead of always using auto GPU selection. #529 - Fix: preserve original filename case in `handelize()`. The previous `.toLowerCase()` call made indexed paths unreachable on case-sensitive diff --git a/README.md b/README.md index 74efadd..a51d8e9 100644 --- a/README.md +++ b/README.md @@ -490,19 +490,20 @@ by default. Configure it with: export NVIDIA_API_KEY="..." export QMD_EMBED_API_BASE_URL="https://integrate.api.nvidia.com/v1" export QMD_EMBED_MODEL="nvidia/llama-3.2-nv-embedqa-1b-v2" -export QMD_DISABLE_LOCAL_MODELS=1 ``` QMD reads `NVIDIA_API_KEY` when `QMD_EMBED_API_KEY` is not set and sends NVIDIA's required `input_type` automatically (`passage` while indexing, `query` while searching). -`QMD_DISABLE_LOCAL_MODELS=1` is recommended for deployments that must not load -local GGUF models. In that mode QMD rejects local embedding model URIs, skips -local query expansion, and defaults search reranking off while still using the -configured external embedding service for vector search. +Local GGUF models are disabled by default. In the default mode QMD rejects local +embedding model URIs, skips local query expansion, and search reranking uses RRF +scores only while still using the configured external embedding service for +vector search. -Reranking and query expansion still use local GGUF models via node-llama-cpp: +Set `QMD_ENABLE_LOCAL_MODELS=1` to opt into local GGUF model loading. The first +query expansion or reranking call can download and load the configured local +model, which may take a while. | Model | Purpose | Size | |-------|---------|------| @@ -513,10 +514,12 @@ Models are downloaded from HuggingFace and cached in `~/.cache/qmd/models/`. ### Local Embedding Model -Set `QMD_EMBED_MODEL` to an `hf:` URI or `.gguf` path to opt into local node-llama-cpp embeddings. +Set `QMD_ENABLE_LOCAL_MODELS=1` and `QMD_EMBED_MODEL` to an `hf:` URI or `.gguf` +path to opt into local node-llama-cpp embeddings. ```sh # Use Qwen3-Embedding-0.6B locally +export QMD_ENABLE_LOCAL_MODELS=1 export QMD_EMBED_MODEL="hf:Qwen/Qwen3-Embedding-0.6B-GGUF/Qwen3-Embedding-0.6B-Q8_0.gguf" # After changing the model, re-embed all collections: @@ -943,8 +946,9 @@ YAML configuration can override those defaults; see `example-index.yml` for a co ```yaml models: embed: nvidia/llama-3.2-nv-embedqa-1b-v2 - rerank: hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf - generate: hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf + # Optional local models, used only when QMD_ENABLE_LOCAL_MODELS=1: + # rerank: hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf + # generate: hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf ``` ### EmbeddingGemma Prompt Format diff --git a/example-index.yml b/example-index.yml index d9afe1a..62f8279 100644 --- a/example-index.yml +++ b/example-index.yml @@ -9,14 +9,16 @@ global_context: "If you see a relevant [[WikiWord]], you can search for that WikiWord to get more context." # Model overrides. -# Embeddings use an external OpenAI-compatible /embeddings API by default. -# Set QMD_EMBED_API_KEY or OPENAI_API_KEY in the environment for API auth. +# Embeddings use NVIDIA's OpenAI-compatible /embeddings API by default. +# Set NVIDIA_API_KEY, QMD_EMBED_API_KEY, or OPENAI_API_KEY in the environment for API auth. +# Local GGUF models are disabled unless QMD_ENABLE_LOCAL_MODELS=1 is set. models: - embed: text-embedding-3-small + embed: nvidia/llama-3.2-nv-embedqa-1b-v2 # Optional local embedding model instead of the external API: # embed: hf:Qwen/Qwen3-Embedding-0.6B-GGUF/Qwen3-Embedding-0.6B-Q8_0.gguf - rerank: hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf - generate: hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf + # Optional local rerank/generation models: + # rerank: hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf + # generate: hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf # Collection definitions collections: diff --git a/src/cli/qmd.ts b/src/cli/qmd.ts index bfcd392..3ee10aa 100755 --- a/src/cli/qmd.ts +++ b/src/cli/qmd.ts @@ -463,8 +463,8 @@ async function showStatus(): Promise { }; console.log(`\n${c.bold}Models${c.reset}`); console.log(` Embedding: ${hfLink(DEFAULT_EMBED_MODEL_URI)}`); - console.log(` Reranking: ${hfLink(DEFAULT_RERANK_MODEL_URI)}`); - console.log(` Generation: ${hfLink(DEFAULT_GENERATE_MODEL_URI)}`); + console.log(` Reranking: ${hfLink(DEFAULT_RERANK_MODEL_URI)} ${c.dim}(optional; set QMD_ENABLE_LOCAL_MODELS=1)${c.reset}`); + console.log(` Generation: ${hfLink(DEFAULT_GENERATE_MODEL_URI)} ${c.dim}(optional; set QMD_ENABLE_LOCAL_MODELS=1)${c.reset}`); } // Device / GPU info @@ -3125,11 +3125,13 @@ if (isMain) { case "pull": { const refresh = cli.values.refresh === undefined ? false : Boolean(cli.values.refresh); + const isLocalModelUri = (uri: string) => + uri.startsWith("hf:") || uri.endsWith(".gguf") || uri.startsWith("/") || uri.startsWith("./") || uri.startsWith("../"); const models = [ DEFAULT_EMBED_MODEL_URI, DEFAULT_GENERATE_MODEL_URI, DEFAULT_RERANK_MODEL_URI, - ]; + ].filter(isLocalModelUri); console.log(`${c.bold}Pulling models${c.reset}`); const results = await pullModels(models, { refresh, diff --git a/src/llm.ts b/src/llm.ts index cb8c6a2..e8be977 100644 --- a/src/llm.ts +++ b/src/llm.ts @@ -192,7 +192,7 @@ export type RerankDocument = { // ============================================================================= // Embeddings use NVIDIA's OpenAI-compatible API by default. -// Override QMD_EMBED_MODEL with hf:/path/.gguf to opt into local node-llama-cpp embeddings. +// Set QMD_ENABLE_LOCAL_MODELS=1 before using any local node-llama-cpp GGUF models. const DEFAULT_EMBED_MODEL = "nvidia/llama-3.2-nv-embedqa-1b-v2"; const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf"; // const DEFAULT_GENERATE_MODEL = "hf:ggml-org/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf"; @@ -216,14 +216,24 @@ export const DEFAULT_MODEL_CACHE_DIR = MODEL_CACHE_DIR; const DEFAULT_EMBED_API_BASE_URL = "https://integrate.api.nvidia.com/v1"; -export function localModelsDisabled(): boolean { - return /^(1|true|yes|on)$/i.test(process.env.QMD_DISABLE_LOCAL_MODELS ?? ""); +export function localModelsEnabled(): boolean { + return /^(1|true|yes|on)$/i.test(process.env.QMD_ENABLE_LOCAL_MODELS ?? ""); } function isLocalEmbeddingModel(model: string): boolean { return model.startsWith("hf:") || model.endsWith(".gguf") || model.startsWith("/") || model.startsWith("./") || model.startsWith("../"); } +export function approximateTokenCount(text: string): number { + if (text.length === 0) return 0; + return Math.max(1, Math.ceil(text.length / 3)); +} + +export function truncateByApproxTokens(text: string, maxTokens: number): string { + if (maxTokens <= 0) return ""; + return text.slice(0, Math.max(1, maxTokens * 3)); +} + export type PullResult = { model: string; path: string; @@ -929,10 +939,17 @@ export class LlamaCpp implements LLM { // ========================================================================== /** - * Tokenize text using the embedding model's tokenizer - * Returns tokenizer tokens (opaque type from node-llama-cpp) + * Tokenize text using the embedding model's tokenizer when local embeddings + * are explicitly active. External embedding mode uses a conservative + * approximation and must not load a local tokenizer. */ async tokenize(text: string): Promise { + if (!this.usesLocalEmbedding) { + return Array.from( + { length: approximateTokenCount(text) }, + (_, index) => index as unknown as LlamaToken, + ); + } await this.ensureEmbedContext(); // Ensure model is loaded if (!this.embedModel) { throw new Error("Embed model not loaded"); @@ -941,17 +958,25 @@ export class LlamaCpp implements LLM { } /** - * Count tokens in text using the embedding model's tokenizer + * Count tokens in text. External embedding mode uses an approximation so + * chunking never pulls in a local GGUF tokenizer by accident. */ async countTokens(text: string): Promise { + if (!this.usesLocalEmbedding) { + return approximateTokenCount(text); + } const tokens = await this.tokenize(text); return tokens.length; } /** - * Detokenize token IDs back to text + * Detokenize token IDs back to text. External embedding mode has no local + * tokenizer, so return an approximate-width placeholder for guardrail paths. */ async detokenize(tokens: readonly LlamaToken[]): Promise { + if (!this.usesLocalEmbedding) { + return " ".repeat(tokens.length * 3); + } await this.ensureEmbedContext(); if (!this.embedModel) { throw new Error("Embed model not loaded"); @@ -1047,8 +1072,8 @@ export class LlamaCpp implements LLM { async embed(text: string, options: EmbedOptions = {}): Promise { const model = options.model ?? this.embedModelUri; - if (localModelsDisabled() && isLocalEmbeddingModel(model)) { - throw new Error("Local embedding models are disabled. Set QMD_EMBED_MODEL to an external API model."); + if (!localModelsEnabled() && isLocalEmbeddingModel(model)) { + throw new Error("Local embedding models are disabled. Set QMD_ENABLE_LOCAL_MODELS=1 to use local GGUF models."); } if (!isLocalEmbeddingModel(model)) { const results = await this.embedExternal([text], model, options); @@ -1085,8 +1110,8 @@ export class LlamaCpp implements LLM { */ async embedBatch(texts: string[], options: EmbedOptions = {}): Promise<(EmbeddingResult | null)[]> { const model = options.model ?? this.embedModelUri; - if (localModelsDisabled() && isLocalEmbeddingModel(model)) { - throw new Error("Local embedding models are disabled. Set QMD_EMBED_MODEL to an external API model."); + if (!localModelsEnabled() && isLocalEmbeddingModel(model)) { + throw new Error("Local embedding models are disabled. Set QMD_ENABLE_LOCAL_MODELS=1 to use local GGUF models."); } if (!isLocalEmbeddingModel(model)) { return this.embedExternal(texts, model, options); @@ -1219,7 +1244,7 @@ export class LlamaCpp implements LLM { // ========================================================================== async expandQuery(query: string, options: { context?: string, includeLexical?: boolean, intent?: string } = {}): Promise { - if (localModelsDisabled()) return []; + if (!localModelsEnabled()) return []; if (this._ciMode) throw new Error("LLM operations are disabled in CI (set CI=true)"); // Ping activity at start to keep models alive during this operation this.touchActivity(); @@ -1319,7 +1344,7 @@ export class LlamaCpp implements LLM { documents: RerankDocument[], options: RerankOptions = {} ): Promise { - if (localModelsDisabled()) { + if (!localModelsEnabled()) { return { model: "disabled", results: documents.map((doc, index) => ({ file: doc.file, score: 0, index })), diff --git a/src/store.ts b/src/store.ts index f7c853a..b0f4ee3 100644 --- a/src/store.ts +++ b/src/store.ts @@ -21,11 +21,13 @@ import fastGlob from "fast-glob"; import { LlamaCpp, getDefaultLlamaCpp, + approximateTokenCount, + truncateByApproxTokens, formatQueryForEmbedding, formatDocForEmbedding, withLLMSessionForLlm, DEFAULT_EMBED_MODEL_URI, - localModelsDisabled, + localModelsEnabled, type RerankDocument, type ILLMSession, } from "./llm.js"; @@ -2279,6 +2281,15 @@ export async function chunkDocumentByTokens( signal?: AbortSignal ): Promise<{ text: string; pos: number; tokens: number }[]> { const llm = getDefaultLlamaCpp(); + const useLocalTokenizer = typeof (llm as any).usesLocalEmbedding === "boolean" + ? Boolean((llm as any).usesLocalEmbedding) + : true; + + const countTokens = async (text: string): Promise => { + if (!useLocalTokenizer) return approximateTokenCount(text); + const tokens = await llm.tokenize(text); + return tokens.length; + }; // Use moderate chars/token estimate (prose ~4, code ~2, mixed ~3) // If chunks exceed limit, they'll be re-split with actual ratio @@ -2301,13 +2312,13 @@ export async function chunkDocumentByTokens( const pushChunkWithinTokenLimit = async (text: string, pos: number): Promise => { if (signal?.aborted) return; - const tokens = await llm.tokenize(text); - if (tokens.length <= maxTokens || text.length <= 1) { - results.push({ text, pos, tokens: tokens.length }); + const tokenCount = await countTokens(text); + if (tokenCount <= maxTokens || text.length <= 1) { + results.push({ text, pos, tokens: tokenCount }); return; } - const actualCharsPerToken = text.length / tokens.length; + const actualCharsPerToken = text.length / tokenCount; let safeMaxChars = Math.floor(maxTokens * actualCharsPerToken * 0.95); if (!Number.isFinite(safeMaxChars) || safeMaxChars < 1) { safeMaxChars = Math.floor(text.length / 2); @@ -2337,12 +2348,14 @@ export async function chunkDocumentByTokens( subChunks.length <= 1 || subChunks[0]?.text.length === text.length ) { - const fallbackTokens = tokens.slice(0, Math.max(1, maxTokens)); - const truncatedText = await llm.detokenize(fallbackTokens); + const tokenLimit = Math.max(1, maxTokens); + const truncatedText = useLocalTokenizer + ? await llm.detokenize((await llm.tokenize(text)).slice(0, tokenLimit)) + : truncateByApproxTokens(text, tokenLimit); results.push({ text: truncatedText, pos, - tokens: fallbackTokens.length, + tokens: tokenLimit, }); return; } @@ -4013,7 +4026,7 @@ export async function hybridQuery( const collection = options?.collection; const explain = options?.explain ?? false; const intent = options?.intent; - const skipRerank = options?.skipRerank ?? localModelsDisabled(); + const skipRerank = options?.skipRerank ?? !localModelsEnabled(); const hooks = options?.hooks; const rankedLists: RankedResult[][] = []; @@ -4408,7 +4421,7 @@ export async function structuredSearch( const candidateLimit = options?.candidateLimit ?? RERANK_CANDIDATE_LIMIT; const explain = options?.explain ?? false; const intent = options?.intent; - const skipRerank = options?.skipRerank ?? localModelsDisabled(); + const skipRerank = options?.skipRerank ?? !localModelsEnabled(); const hooks = options?.hooks; const collections = options?.collections; diff --git a/test/llm.test.ts b/test/llm.test.ts index f1b09ae..8ee11a2 100644 --- a/test/llm.test.ts +++ b/test/llm.test.ts @@ -193,9 +193,11 @@ describe("LlamaCpp model resolution (config > env > default)", () => { }); test("default embedding uses NVIDIA OpenAI-compatible API", async () => { + const prevModel = process.env.QMD_EMBED_MODEL; const prevKey = process.env.QMD_EMBED_API_KEY; const prevNvidiaKey = process.env.NVIDIA_API_KEY; const prevBaseUrl = process.env.QMD_EMBED_API_BASE_URL; + delete process.env.QMD_EMBED_MODEL; process.env.QMD_EMBED_API_KEY = "test-key"; delete process.env.NVIDIA_API_KEY; delete process.env.QMD_EMBED_API_BASE_URL; @@ -225,6 +227,8 @@ describe("LlamaCpp model resolution (config > env > default)", () => { }); } finally { fetchMock.mockRestore(); + if (prevModel === undefined) delete process.env.QMD_EMBED_MODEL; + else process.env.QMD_EMBED_MODEL = prevModel; if (prevKey === undefined) delete process.env.QMD_EMBED_API_KEY; else process.env.QMD_EMBED_API_KEY = prevKey; if (prevNvidiaKey === undefined) delete process.env.NVIDIA_API_KEY; @@ -274,9 +278,9 @@ describe("LlamaCpp model resolution (config > env > default)", () => { expect(llm.usesLocalEmbedding).toBe(true); }); - test("QMD_DISABLE_LOCAL_MODELS rejects local embedding models and bypasses local LLMs", async () => { - const prev = process.env.QMD_DISABLE_LOCAL_MODELS; - process.env.QMD_DISABLE_LOCAL_MODELS = "1"; + test("local models are disabled by default", async () => { + const prev = process.env.QMD_ENABLE_LOCAL_MODELS; + delete process.env.QMD_ENABLE_LOCAL_MODELS; try { const llm = new LlamaCpp({ embedModel: "hf:custom/embed.gguf" }); await expect(llm.embed("hello")).rejects.toThrow("Local embedding models are disabled"); @@ -286,14 +290,54 @@ describe("LlamaCpp model resolution (config > env > default)", () => { results: [{ file: "doc.md", score: 0, index: 0 }], }); } finally { - if (prev === undefined) delete process.env.QMD_DISABLE_LOCAL_MODELS; - else process.env.QMD_DISABLE_LOCAL_MODELS = prev; + if (prev === undefined) delete process.env.QMD_ENABLE_LOCAL_MODELS; + else process.env.QMD_ENABLE_LOCAL_MODELS = prev; } }); + + test("QMD_ENABLE_LOCAL_MODELS allows explicit local embedding models", async () => { + const prev = process.env.QMD_ENABLE_LOCAL_MODELS; + process.env.QMD_ENABLE_LOCAL_MODELS = "1"; + try { + const llm = new LlamaCpp({ embedModel: "hf:custom/embed.gguf" }) as any; + llm._ciMode = false; + llm.touchActivity = vi.fn(); + llm.ensureEmbedContext = vi.fn().mockResolvedValue({ + getEmbeddingFor: vi.fn(async () => ({ vector: new Float32Array([0.1, 0.2]) })), + }); + llm.truncateToContextSize = vi.fn(async (text: string) => ({ + text, + truncated: false, + limit: 2048, + })); + + await expect(llm.embed("hello")).resolves.toEqual({ + embedding: [expect.closeTo(0.1), expect.closeTo(0.2)], + model: "hf:custom/embed.gguf", + }); + expect(llm.ensureEmbedContext).toHaveBeenCalled(); + } finally { + if (prev === undefined) delete process.env.QMD_ENABLE_LOCAL_MODELS; + else process.env.QMD_ENABLE_LOCAL_MODELS = prev; + } + }); + + test("external embedding token counting does not load a local tokenizer", async () => { + const llm = new LlamaCpp({ embedModel: "nvidia/llama-3.2-nv-embedqa-1b-v2" }) as any; + llm.ensureEmbedContext = vi.fn(async () => { + throw new Error("should not load local tokenizer"); + }); + + await expect(llm.countTokens("abcdef")).resolves.toBe(2); + await expect(llm.tokenize("abcdef")).resolves.toHaveLength(2); + expect(llm.ensureEmbedContext).not.toHaveBeenCalled(); + }); }); describe("LlamaCpp embedding truncation", () => { test("truncates against the active embedding context limit, not the model train context", async () => { + const prev = process.env.QMD_ENABLE_LOCAL_MODELS; + process.env.QMD_ENABLE_LOCAL_MODELS = "1"; const llm = new LlamaCpp({ embedModel: "hf:test/embed.gguf" }) as any; const getEmbeddingFor = vi.fn(async (text: string) => ({ vector: new Float32Array([0.25, 0.5]), @@ -308,18 +352,25 @@ describe("LlamaCpp embedding truncation", () => { }; llm.ensureEmbedContext = vi.fn().mockResolvedValue({ getEmbeddingFor }); - const result = await llm.embed("x".repeat(3000)); + try { + const result = await llm.embed("x".repeat(3000)); - expect(getEmbeddingFor).toHaveBeenCalledWith("x".repeat(2044)); - expect(result).toEqual({ - embedding: [0.25, 0.5], - model: llm.embedModelUri, - }); + expect(getEmbeddingFor).toHaveBeenCalledWith("x".repeat(2044)); + expect(result).toEqual({ + embedding: [0.25, 0.5], + model: llm.embedModelUri, + }); + } finally { + if (prev === undefined) delete process.env.QMD_ENABLE_LOCAL_MODELS; + else process.env.QMD_ENABLE_LOCAL_MODELS = prev; + } }); }); describe("LlamaCpp rerank deduping", () => { test("deduplicates identical document texts before scoring", async () => { + const prev = process.env.QMD_ENABLE_LOCAL_MODELS; + process.env.QMD_ENABLE_LOCAL_MODELS = "1"; const llm = new LlamaCpp({}) as any; llm._ciMode = false; // allow unit test even in CI (mocked, no real models) const rankAll = vi.fn(async (_query: string, docs: string[]) => @@ -333,20 +384,25 @@ describe("LlamaCpp rerank deduping", () => { detokenize: (tokens: string[]) => tokens.join(""), }); - const result = await llm.rerank("query", [ - { file: "a.md", text: "shared chunk" }, - { file: "b.md", text: "shared chunk" }, - { file: "c.md", text: "different chunk" }, - ]); + try { + const result = await llm.rerank("query", [ + { file: "a.md", text: "shared chunk" }, + { file: "b.md", text: "shared chunk" }, + { file: "c.md", text: "different chunk" }, + ]); - expect(rankAll).toHaveBeenCalledTimes(1); - expect(rankAll).toHaveBeenCalledWith("query", ["shared chunk", "different chunk"]); - expect(result.results).toHaveLength(3); + expect(rankAll).toHaveBeenCalledTimes(1); + expect(rankAll).toHaveBeenCalledWith("query", ["shared chunk", "different chunk"]); + expect(result.results).toHaveLength(3); - const scoreByFile = new Map(result.results.map((item) => [item.file, item.score])); - expect(scoreByFile.get("a.md")).toBe(0.9); - expect(scoreByFile.get("b.md")).toBe(0.9); - expect(scoreByFile.get("c.md")).toBe(0.2); + const scoreByFile = new Map(result.results.map((item) => [item.file, item.score])); + expect(scoreByFile.get("a.md")).toBe(0.9); + expect(scoreByFile.get("b.md")).toBe(0.9); + expect(scoreByFile.get("c.md")).toBe(0.2); + } finally { + if (prev === undefined) delete process.env.QMD_ENABLE_LOCAL_MODELS; + else process.env.QMD_ENABLE_LOCAL_MODELS = prev; + } }); }); diff --git a/test/store.test.ts b/test/store.test.ts index 848ec96..9fe57ff 100644 --- a/test/store.test.ts +++ b/test/store.test.ts @@ -2820,6 +2820,33 @@ describe("Embedding batching", () => { }); describe("Token chunking guardrails", () => { + test("chunkDocumentByTokens uses approximate counts for external embeddings without tokenizer load", async () => { + const tokenize = vi.fn(async () => { + throw new Error("should not tokenize through local GGUF"); + }); + const detokenize = vi.fn(async () => { + throw new Error("should not detokenize through local GGUF"); + }); + + setDefaultLlamaCpp({ + usesLocalEmbedding: false, + tokenize, + detokenize, + } as any); + + try { + const chunks = await chunkDocumentByTokens("x".repeat(1200), 100, 15, 20); + + expect(chunks.length).toBeGreaterThan(1); + expect(chunks.every((chunk) => chunk.tokens <= 100)).toBe(true); + expect(chunks[0]!.text.length).toBeLessThanOrEqual(300); + expect(tokenize).not.toHaveBeenCalled(); + expect(detokenize).not.toHaveBeenCalled(); + } finally { + setDefaultLlamaCpp(null); + } + }); + test("chunkDocumentByTokens keeps pathological single-line blobs under the token limit", async () => { setDefaultLlamaCpp({ async tokenize(text: string) {