fix: disable local qmd models by default

This commit is contained in:
Haitao Pan 2026-05-23 11:04:48 +08:00
parent 7c17c8bcce
commit e3711767c6
8 changed files with 197 additions and 65 deletions

View File

@ -5,8 +5,11 @@
### Fixes
- Embedding: default to an external OpenAI-compatible embeddings API
(`text-embedding-3-small`) and require explicit `hf:`/`.gguf`
configuration to use local node-llama-cpp embedding models.
(`nvidia/llama-3.2-nv-embedqa-1b-v2`) and require
`QMD_ENABLE_LOCAL_MODELS=1` for local node-llama-cpp embedding, reranking,
and query expansion models.
- Embedding: use approximate token counts in external embedding mode so
chunking does not load a local GGUF tokenizer.
- GPU: respect explicit `QMD_LLAMA_GPU=metal|vulkan|cuda` backend overrides instead of always using auto GPU selection. #529
- Fix: preserve original filename case in `handelize()`. The previous
`.toLowerCase()` call made indexed paths unreachable on case-sensitive

View File

@ -490,19 +490,20 @@ by default. Configure it with:
export NVIDIA_API_KEY="..."
export QMD_EMBED_API_BASE_URL="https://integrate.api.nvidia.com/v1"
export QMD_EMBED_MODEL="nvidia/llama-3.2-nv-embedqa-1b-v2"
export QMD_DISABLE_LOCAL_MODELS=1
```
QMD reads `NVIDIA_API_KEY` when `QMD_EMBED_API_KEY` is not set and sends
NVIDIA's required `input_type` automatically (`passage` while indexing, `query`
while searching).
`QMD_DISABLE_LOCAL_MODELS=1` is recommended for deployments that must not load
local GGUF models. In that mode QMD rejects local embedding model URIs, skips
local query expansion, and defaults search reranking off while still using the
configured external embedding service for vector search.
Local GGUF models are disabled by default. In the default mode QMD rejects local
embedding model URIs, skips local query expansion, and ranks search results by
RRF scores alone (no reranking), while still using the configured external
embedding service for vector search.
Reranking and query expansion still use local GGUF models via node-llama-cpp:
Set `QMD_ENABLE_LOCAL_MODELS=1` to opt into local GGUF model loading. The first
query expansion or reranking call can download and load the configured local
model, which may take a while.
| Model | Purpose | Size |
|-------|---------|------|
@ -513,10 +514,12 @@ Models are downloaded from HuggingFace and cached in `~/.cache/qmd/models/`.
### Local Embedding Model
Set `QMD_EMBED_MODEL` to an `hf:` URI or `.gguf` path to opt into local node-llama-cpp embeddings.
Set `QMD_ENABLE_LOCAL_MODELS=1` and `QMD_EMBED_MODEL` to an `hf:` URI or `.gguf`
path to opt into local node-llama-cpp embeddings.
```sh
# Use Qwen3-Embedding-0.6B locally
export QMD_ENABLE_LOCAL_MODELS=1
export QMD_EMBED_MODEL="hf:Qwen/Qwen3-Embedding-0.6B-GGUF/Qwen3-Embedding-0.6B-Q8_0.gguf"
# After changing the model, re-embed all collections:
@ -943,8 +946,9 @@ YAML configuration can override those defaults; see `example-index.yml` for a co
```yaml
models:
embed: nvidia/llama-3.2-nv-embedqa-1b-v2
rerank: hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf
generate: hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf
# Optional local models, used only when QMD_ENABLE_LOCAL_MODELS=1:
# rerank: hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf
# generate: hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf
```
### EmbeddingGemma Prompt Format

View File

@ -9,14 +9,16 @@
global_context: "If you see a relevant [[WikiWord]], you can search for that WikiWord to get more context."
# Model overrides.
# Embeddings use an external OpenAI-compatible /embeddings API by default.
# Set QMD_EMBED_API_KEY or OPENAI_API_KEY in the environment for API auth.
# Embeddings use NVIDIA's OpenAI-compatible /embeddings API by default.
# Set NVIDIA_API_KEY, QMD_EMBED_API_KEY, or OPENAI_API_KEY in the environment for API auth.
# Local GGUF models are disabled unless QMD_ENABLE_LOCAL_MODELS=1 is set.
models:
embed: text-embedding-3-small
embed: nvidia/llama-3.2-nv-embedqa-1b-v2
# Optional local embedding model instead of the external API:
# embed: hf:Qwen/Qwen3-Embedding-0.6B-GGUF/Qwen3-Embedding-0.6B-Q8_0.gguf
rerank: hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf
generate: hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf
# Optional local rerank/generation models:
# rerank: hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf
# generate: hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf
# Collection definitions
collections:

View File

@ -463,8 +463,8 @@ async function showStatus(): Promise<void> {
};
console.log(`\n${c.bold}Models${c.reset}`);
console.log(` Embedding: ${hfLink(DEFAULT_EMBED_MODEL_URI)}`);
console.log(` Reranking: ${hfLink(DEFAULT_RERANK_MODEL_URI)}`);
console.log(` Generation: ${hfLink(DEFAULT_GENERATE_MODEL_URI)}`);
console.log(` Reranking: ${hfLink(DEFAULT_RERANK_MODEL_URI)} ${c.dim}(optional; set QMD_ENABLE_LOCAL_MODELS=1)${c.reset}`);
console.log(` Generation: ${hfLink(DEFAULT_GENERATE_MODEL_URI)} ${c.dim}(optional; set QMD_ENABLE_LOCAL_MODELS=1)${c.reset}`);
}
// Device / GPU info
@ -3125,11 +3125,13 @@ if (isMain) {
case "pull": {
const refresh = cli.values.refresh === undefined ? false : Boolean(cli.values.refresh);
const isLocalModelUri = (uri: string) =>
uri.startsWith("hf:") || uri.endsWith(".gguf") || uri.startsWith("/") || uri.startsWith("./") || uri.startsWith("../");
const models = [
DEFAULT_EMBED_MODEL_URI,
DEFAULT_GENERATE_MODEL_URI,
DEFAULT_RERANK_MODEL_URI,
];
].filter(isLocalModelUri);
console.log(`${c.bold}Pulling models${c.reset}`);
const results = await pullModels(models, {
refresh,

View File

@ -192,7 +192,7 @@ export type RerankDocument = {
// =============================================================================
// Embeddings use NVIDIA's OpenAI-compatible API by default.
// Override QMD_EMBED_MODEL with hf:/path/.gguf to opt into local node-llama-cpp embeddings.
// Set QMD_ENABLE_LOCAL_MODELS=1 before using any local node-llama-cpp GGUF models.
const DEFAULT_EMBED_MODEL = "nvidia/llama-3.2-nv-embedqa-1b-v2";
const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
// const DEFAULT_GENERATE_MODEL = "hf:ggml-org/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf";
@ -216,14 +216,24 @@ export const DEFAULT_MODEL_CACHE_DIR = MODEL_CACHE_DIR;
const DEFAULT_EMBED_API_BASE_URL = "https://integrate.api.nvidia.com/v1";
export function localModelsDisabled(): boolean {
return /^(1|true|yes|on)$/i.test(process.env.QMD_DISABLE_LOCAL_MODELS ?? "");
/**
 * Report whether local node-llama-cpp GGUF models have been explicitly enabled.
 *
 * Reads `QMD_ENABLE_LOCAL_MODELS` from the environment on every call, so a
 * change made at runtime (e.g. in tests) takes effect immediately.
 *
 * @returns `true` when the variable is `1`, `true`, `yes`, or `on`
 *   (case-insensitive, surrounding whitespace ignored); `false` when it is
 *   unset or holds any other value.
 */
export function localModelsEnabled(): boolean {
  const raw = process.env.QMD_ENABLE_LOCAL_MODELS ?? "";
  // Trim first: shells such as cmd.exe (`set VAR=1 && cmd`) keep trailing
  // whitespace in the value, which would otherwise silently disable the opt-in.
  return /^(1|true|yes|on)$/i.test(raw.trim());
}
/**
 * Decide whether a model identifier refers to a local GGUF model rather than
 * an external API model name.
 *
 * An identifier counts as local when it ends in `.gguf` or starts with `hf:`,
 * `/`, `./`, or `../`.
 */
function isLocalEmbeddingModel(model: string): boolean {
  if (model.endsWith(".gguf")) return true;
  const localPrefixes = ["hf:", "/", "./", "../"];
  return localPrefixes.some((prefix) => model.startsWith(prefix));
}
/**
 * Estimate the token count of `text` without a tokenizer, assuming roughly
 * three characters (UTF-16 code units) per token.
 *
 * @returns 0 for the empty string, otherwise `ceil(length / 3)` with a floor
 *   of 1.
 */
export function approximateTokenCount(text: string): number {
  const charCount = text.length;
  return charCount === 0 ? 0 : Math.max(1, Math.ceil(charCount / 3));
}
/**
 * Truncate `text` to at most `maxTokens` approximate tokens, using the same
 * three-characters-per-token estimate as the approximate token counter.
 *
 * The cut is made on UTF-16 code units but never between the two halves of a
 * surrogate pair: if the limit lands inside a pair, the whole pair is dropped
 * so the result contains no lone surrogate (which would be invalid text when
 * sent to an external embedding API).
 *
 * @param text - Text to truncate.
 * @param maxTokens - Approximate token budget; values that are not greater
 *   than zero (including NaN) yield the empty string.
 * @returns A prefix of `text` no longer than `maxTokens * 3` code units
 *   (minimum cut length of 1 for positive budgets).
 */
export function truncateByApproxTokens(text: string, maxTokens: number): string {
  if (!(maxTokens > 0)) return "";

  const limit = Math.min(text.length, Math.floor(Math.max(1, maxTokens * 3)));
  if (limit <= 0 || limit >= text.length) return text.slice(0, limit);

  // Detect a cut that would separate a high surrogate from its low surrogate.
  const before = text.charCodeAt(limit - 1);
  const after = text.charCodeAt(limit);
  const splitsSurrogatePair =
    before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;

  return text.slice(0, splitsSurrogatePair ? limit - 1 : limit);
}
export type PullResult = {
model: string;
path: string;
@ -929,10 +939,17 @@ export class LlamaCpp implements LLM {
// ==========================================================================
/**
* Tokenize text using the embedding model's tokenizer
* Returns tokenizer tokens (opaque type from node-llama-cpp)
* Tokenize text using the embedding model's tokenizer when local embeddings
* are explicitly active. External embedding mode uses a conservative
* approximation and must not load a local tokenizer.
*/
async tokenize(text: string): Promise<readonly LlamaToken[]> {
if (!this.usesLocalEmbedding) {
return Array.from(
{ length: approximateTokenCount(text) },
(_, index) => index as unknown as LlamaToken,
);
}
await this.ensureEmbedContext(); // Ensure model is loaded
if (!this.embedModel) {
throw new Error("Embed model not loaded");
@ -941,17 +958,25 @@ export class LlamaCpp implements LLM {
}
/**
* Count tokens in text using the embedding model's tokenizer
* Count tokens in text. External embedding mode uses an approximation so
* chunking never pulls in a local GGUF tokenizer by accident.
*/
async countTokens(text: string): Promise<number> {
  // Local embeddings: measure with the tokenizer via tokenize().
  if (this.usesLocalEmbedding) {
    return (await this.tokenize(text)).length;
  }
  // External embeddings: estimate without going through tokenize().
  return approximateTokenCount(text);
}
/**
* Detokenize token IDs back to text
* Detokenize token IDs back to text. External embedding mode has no local
* tokenizer, so return an approximate-width placeholder for guardrail paths.
*/
async detokenize(tokens: readonly LlamaToken[]): Promise<string> {
if (!this.usesLocalEmbedding) {
return " ".repeat(tokens.length * 3);
}
await this.ensureEmbedContext();
if (!this.embedModel) {
throw new Error("Embed model not loaded");
@ -1047,8 +1072,8 @@ export class LlamaCpp implements LLM {
async embed(text: string, options: EmbedOptions = {}): Promise<EmbeddingResult | null> {
const model = options.model ?? this.embedModelUri;
if (localModelsDisabled() && isLocalEmbeddingModel(model)) {
throw new Error("Local embedding models are disabled. Set QMD_EMBED_MODEL to an external API model.");
if (!localModelsEnabled() && isLocalEmbeddingModel(model)) {
throw new Error("Local embedding models are disabled. Set QMD_ENABLE_LOCAL_MODELS=1 to use local GGUF models.");
}
if (!isLocalEmbeddingModel(model)) {
const results = await this.embedExternal([text], model, options);
@ -1085,8 +1110,8 @@ export class LlamaCpp implements LLM {
*/
async embedBatch(texts: string[], options: EmbedOptions = {}): Promise<(EmbeddingResult | null)[]> {
const model = options.model ?? this.embedModelUri;
if (localModelsDisabled() && isLocalEmbeddingModel(model)) {
throw new Error("Local embedding models are disabled. Set QMD_EMBED_MODEL to an external API model.");
if (!localModelsEnabled() && isLocalEmbeddingModel(model)) {
throw new Error("Local embedding models are disabled. Set QMD_ENABLE_LOCAL_MODELS=1 to use local GGUF models.");
}
if (!isLocalEmbeddingModel(model)) {
return this.embedExternal(texts, model, options);
@ -1219,7 +1244,7 @@ export class LlamaCpp implements LLM {
// ==========================================================================
async expandQuery(query: string, options: { context?: string, includeLexical?: boolean, intent?: string } = {}): Promise<Queryable[]> {
if (localModelsDisabled()) return [];
if (!localModelsEnabled()) return [];
if (this._ciMode) throw new Error("LLM operations are disabled in CI (set CI=true)");
// Ping activity at start to keep models alive during this operation
this.touchActivity();
@ -1319,7 +1344,7 @@ export class LlamaCpp implements LLM {
documents: RerankDocument[],
options: RerankOptions = {}
): Promise<RerankResult> {
if (localModelsDisabled()) {
if (!localModelsEnabled()) {
return {
model: "disabled",
results: documents.map((doc, index) => ({ file: doc.file, score: 0, index })),

View File

@ -21,11 +21,13 @@ import fastGlob from "fast-glob";
import {
LlamaCpp,
getDefaultLlamaCpp,
approximateTokenCount,
truncateByApproxTokens,
formatQueryForEmbedding,
formatDocForEmbedding,
withLLMSessionForLlm,
DEFAULT_EMBED_MODEL_URI,
localModelsDisabled,
localModelsEnabled,
type RerankDocument,
type ILLMSession,
} from "./llm.js";
@ -2279,6 +2281,15 @@ export async function chunkDocumentByTokens(
signal?: AbortSignal
): Promise<{ text: string; pos: number; tokens: number }[]> {
const llm = getDefaultLlamaCpp();
const useLocalTokenizer = typeof (llm as any).usesLocalEmbedding === "boolean"
? Boolean((llm as any).usesLocalEmbedding)
: true;
const countTokens = async (text: string): Promise<number> => {
if (!useLocalTokenizer) return approximateTokenCount(text);
const tokens = await llm.tokenize(text);
return tokens.length;
};
// Use moderate chars/token estimate (prose ~4, code ~2, mixed ~3)
// If chunks exceed limit, they'll be re-split with actual ratio
@ -2301,13 +2312,13 @@ export async function chunkDocumentByTokens(
const pushChunkWithinTokenLimit = async (text: string, pos: number): Promise<void> => {
if (signal?.aborted) return;
const tokens = await llm.tokenize(text);
if (tokens.length <= maxTokens || text.length <= 1) {
results.push({ text, pos, tokens: tokens.length });
const tokenCount = await countTokens(text);
if (tokenCount <= maxTokens || text.length <= 1) {
results.push({ text, pos, tokens: tokenCount });
return;
}
const actualCharsPerToken = text.length / tokens.length;
const actualCharsPerToken = text.length / tokenCount;
let safeMaxChars = Math.floor(maxTokens * actualCharsPerToken * 0.95);
if (!Number.isFinite(safeMaxChars) || safeMaxChars < 1) {
safeMaxChars = Math.floor(text.length / 2);
@ -2337,12 +2348,14 @@ export async function chunkDocumentByTokens(
subChunks.length <= 1
|| subChunks[0]?.text.length === text.length
) {
const fallbackTokens = tokens.slice(0, Math.max(1, maxTokens));
const truncatedText = await llm.detokenize(fallbackTokens);
const tokenLimit = Math.max(1, maxTokens);
const truncatedText = useLocalTokenizer
? await llm.detokenize((await llm.tokenize(text)).slice(0, tokenLimit))
: truncateByApproxTokens(text, tokenLimit);
results.push({
text: truncatedText,
pos,
tokens: fallbackTokens.length,
tokens: tokenLimit,
});
return;
}
@ -4013,7 +4026,7 @@ export async function hybridQuery(
const collection = options?.collection;
const explain = options?.explain ?? false;
const intent = options?.intent;
const skipRerank = options?.skipRerank ?? localModelsDisabled();
const skipRerank = options?.skipRerank ?? !localModelsEnabled();
const hooks = options?.hooks;
const rankedLists: RankedResult[][] = [];
@ -4408,7 +4421,7 @@ export async function structuredSearch(
const candidateLimit = options?.candidateLimit ?? RERANK_CANDIDATE_LIMIT;
const explain = options?.explain ?? false;
const intent = options?.intent;
const skipRerank = options?.skipRerank ?? localModelsDisabled();
const skipRerank = options?.skipRerank ?? !localModelsEnabled();
const hooks = options?.hooks;
const collections = options?.collections;

View File

@ -193,9 +193,11 @@ describe("LlamaCpp model resolution (config > env > default)", () => {
});
test("default embedding uses NVIDIA OpenAI-compatible API", async () => {
const prevModel = process.env.QMD_EMBED_MODEL;
const prevKey = process.env.QMD_EMBED_API_KEY;
const prevNvidiaKey = process.env.NVIDIA_API_KEY;
const prevBaseUrl = process.env.QMD_EMBED_API_BASE_URL;
delete process.env.QMD_EMBED_MODEL;
process.env.QMD_EMBED_API_KEY = "test-key";
delete process.env.NVIDIA_API_KEY;
delete process.env.QMD_EMBED_API_BASE_URL;
@ -225,6 +227,8 @@ describe("LlamaCpp model resolution (config > env > default)", () => {
});
} finally {
fetchMock.mockRestore();
if (prevModel === undefined) delete process.env.QMD_EMBED_MODEL;
else process.env.QMD_EMBED_MODEL = prevModel;
if (prevKey === undefined) delete process.env.QMD_EMBED_API_KEY;
else process.env.QMD_EMBED_API_KEY = prevKey;
if (prevNvidiaKey === undefined) delete process.env.NVIDIA_API_KEY;
@ -274,9 +278,9 @@ describe("LlamaCpp model resolution (config > env > default)", () => {
expect(llm.usesLocalEmbedding).toBe(true);
});
test("QMD_DISABLE_LOCAL_MODELS rejects local embedding models and bypasses local LLMs", async () => {
const prev = process.env.QMD_DISABLE_LOCAL_MODELS;
process.env.QMD_DISABLE_LOCAL_MODELS = "1";
test("local models are disabled by default", async () => {
const prev = process.env.QMD_ENABLE_LOCAL_MODELS;
delete process.env.QMD_ENABLE_LOCAL_MODELS;
try {
const llm = new LlamaCpp({ embedModel: "hf:custom/embed.gguf" });
await expect(llm.embed("hello")).rejects.toThrow("Local embedding models are disabled");
@ -286,14 +290,54 @@ describe("LlamaCpp model resolution (config > env > default)", () => {
results: [{ file: "doc.md", score: 0, index: 0 }],
});
} finally {
if (prev === undefined) delete process.env.QMD_DISABLE_LOCAL_MODELS;
else process.env.QMD_DISABLE_LOCAL_MODELS = prev;
if (prev === undefined) delete process.env.QMD_ENABLE_LOCAL_MODELS;
else process.env.QMD_ENABLE_LOCAL_MODELS = prev;
}
});
test("QMD_ENABLE_LOCAL_MODELS allows explicit local embedding models", async () => {
const prev = process.env.QMD_ENABLE_LOCAL_MODELS;
process.env.QMD_ENABLE_LOCAL_MODELS = "1";
try {
const llm = new LlamaCpp({ embedModel: "hf:custom/embed.gguf" }) as any;
llm._ciMode = false;
llm.touchActivity = vi.fn();
llm.ensureEmbedContext = vi.fn().mockResolvedValue({
getEmbeddingFor: vi.fn(async () => ({ vector: new Float32Array([0.1, 0.2]) })),
});
llm.truncateToContextSize = vi.fn(async (text: string) => ({
text,
truncated: false,
limit: 2048,
}));
await expect(llm.embed("hello")).resolves.toEqual({
embedding: [expect.closeTo(0.1), expect.closeTo(0.2)],
model: "hf:custom/embed.gguf",
});
expect(llm.ensureEmbedContext).toHaveBeenCalled();
} finally {
if (prev === undefined) delete process.env.QMD_ENABLE_LOCAL_MODELS;
else process.env.QMD_ENABLE_LOCAL_MODELS = prev;
}
});
test("external embedding token counting does not load a local tokenizer", async () => {
const llm = new LlamaCpp({ embedModel: "nvidia/llama-3.2-nv-embedqa-1b-v2" }) as any;
llm.ensureEmbedContext = vi.fn(async () => {
throw new Error("should not load local tokenizer");
});
await expect(llm.countTokens("abcdef")).resolves.toBe(2);
await expect(llm.tokenize("abcdef")).resolves.toHaveLength(2);
expect(llm.ensureEmbedContext).not.toHaveBeenCalled();
});
});
describe("LlamaCpp embedding truncation", () => {
test("truncates against the active embedding context limit, not the model train context", async () => {
const prev = process.env.QMD_ENABLE_LOCAL_MODELS;
process.env.QMD_ENABLE_LOCAL_MODELS = "1";
const llm = new LlamaCpp({ embedModel: "hf:test/embed.gguf" }) as any;
const getEmbeddingFor = vi.fn(async (text: string) => ({
vector: new Float32Array([0.25, 0.5]),
@ -308,18 +352,25 @@ describe("LlamaCpp embedding truncation", () => {
};
llm.ensureEmbedContext = vi.fn().mockResolvedValue({ getEmbeddingFor });
const result = await llm.embed("x".repeat(3000));
try {
const result = await llm.embed("x".repeat(3000));
expect(getEmbeddingFor).toHaveBeenCalledWith("x".repeat(2044));
expect(result).toEqual({
embedding: [0.25, 0.5],
model: llm.embedModelUri,
});
expect(getEmbeddingFor).toHaveBeenCalledWith("x".repeat(2044));
expect(result).toEqual({
embedding: [0.25, 0.5],
model: llm.embedModelUri,
});
} finally {
if (prev === undefined) delete process.env.QMD_ENABLE_LOCAL_MODELS;
else process.env.QMD_ENABLE_LOCAL_MODELS = prev;
}
});
});
describe("LlamaCpp rerank deduping", () => {
test("deduplicates identical document texts before scoring", async () => {
const prev = process.env.QMD_ENABLE_LOCAL_MODELS;
process.env.QMD_ENABLE_LOCAL_MODELS = "1";
const llm = new LlamaCpp({}) as any;
llm._ciMode = false; // allow unit test even in CI (mocked, no real models)
const rankAll = vi.fn(async (_query: string, docs: string[]) =>
@ -333,20 +384,25 @@ describe("LlamaCpp rerank deduping", () => {
detokenize: (tokens: string[]) => tokens.join(""),
});
const result = await llm.rerank("query", [
{ file: "a.md", text: "shared chunk" },
{ file: "b.md", text: "shared chunk" },
{ file: "c.md", text: "different chunk" },
]);
try {
const result = await llm.rerank("query", [
{ file: "a.md", text: "shared chunk" },
{ file: "b.md", text: "shared chunk" },
{ file: "c.md", text: "different chunk" },
]);
expect(rankAll).toHaveBeenCalledTimes(1);
expect(rankAll).toHaveBeenCalledWith("query", ["shared chunk", "different chunk"]);
expect(result.results).toHaveLength(3);
expect(rankAll).toHaveBeenCalledTimes(1);
expect(rankAll).toHaveBeenCalledWith("query", ["shared chunk", "different chunk"]);
expect(result.results).toHaveLength(3);
const scoreByFile = new Map(result.results.map((item) => [item.file, item.score]));
expect(scoreByFile.get("a.md")).toBe(0.9);
expect(scoreByFile.get("b.md")).toBe(0.9);
expect(scoreByFile.get("c.md")).toBe(0.2);
const scoreByFile = new Map(result.results.map((item) => [item.file, item.score]));
expect(scoreByFile.get("a.md")).toBe(0.9);
expect(scoreByFile.get("b.md")).toBe(0.9);
expect(scoreByFile.get("c.md")).toBe(0.2);
} finally {
if (prev === undefined) delete process.env.QMD_ENABLE_LOCAL_MODELS;
else process.env.QMD_ENABLE_LOCAL_MODELS = prev;
}
});
});

View File

@ -2820,6 +2820,33 @@ describe("Embedding batching", () => {
});
describe("Token chunking guardrails", () => {
test("chunkDocumentByTokens uses approximate counts for external embeddings without tokenizer load", async () => {
const tokenize = vi.fn(async () => {
throw new Error("should not tokenize through local GGUF");
});
const detokenize = vi.fn(async () => {
throw new Error("should not detokenize through local GGUF");
});
setDefaultLlamaCpp({
usesLocalEmbedding: false,
tokenize,
detokenize,
} as any);
try {
const chunks = await chunkDocumentByTokens("x".repeat(1200), 100, 15, 20);
expect(chunks.length).toBeGreaterThan(1);
expect(chunks.every((chunk) => chunk.tokens <= 100)).toBe(true);
expect(chunks[0]!.text.length).toBeLessThanOrEqual(300);
expect(tokenize).not.toHaveBeenCalled();
expect(detokenize).not.toHaveBeenCalled();
} finally {
setDefaultLlamaCpp(null);
}
});
test("chunkDocumentByTokens keeps pathological single-line blobs under the token limit", async () => {
setDefaultLlamaCpp({
async tokenize(text: string) {