fix: disable local qmd models by default

This commit is contained in:
Haitao Pan 2026-05-23 11:04:48 +08:00
parent 7c17c8bcce
commit e3711767c6
8 changed files with 197 additions and 65 deletions

View File

@ -5,8 +5,11 @@
### Fixes ### Fixes
- Embedding: default to an external OpenAI-compatible embeddings API - Embedding: default to an external OpenAI-compatible embeddings API
(`text-embedding-3-small`) and require explicit `hf:`/`.gguf` (`nvidia/llama-3.2-nv-embedqa-1b-v2`) and require
configuration to use local node-llama-cpp embedding models. `QMD_ENABLE_LOCAL_MODELS=1` for local node-llama-cpp embedding, reranking,
and query expansion models.
- Embedding: use approximate token counts in external embedding mode so
chunking does not load a local GGUF tokenizer.
- GPU: respect explicit `QMD_LLAMA_GPU=metal|vulkan|cuda` backend overrides instead of always using auto GPU selection. #529 - GPU: respect explicit `QMD_LLAMA_GPU=metal|vulkan|cuda` backend overrides instead of always using auto GPU selection. #529
- Fix: preserve original filename case in `handelize()`. The previous - Fix: preserve original filename case in `handelize()`. The previous
`.toLowerCase()` call made indexed paths unreachable on case-sensitive `.toLowerCase()` call made indexed paths unreachable on case-sensitive

View File

@ -490,19 +490,20 @@ by default. Configure it with:
export NVIDIA_API_KEY="..." export NVIDIA_API_KEY="..."
export QMD_EMBED_API_BASE_URL="https://integrate.api.nvidia.com/v1" export QMD_EMBED_API_BASE_URL="https://integrate.api.nvidia.com/v1"
export QMD_EMBED_MODEL="nvidia/llama-3.2-nv-embedqa-1b-v2" export QMD_EMBED_MODEL="nvidia/llama-3.2-nv-embedqa-1b-v2"
export QMD_DISABLE_LOCAL_MODELS=1
``` ```
QMD reads `NVIDIA_API_KEY` when `QMD_EMBED_API_KEY` is not set and sends QMD reads `NVIDIA_API_KEY` when `QMD_EMBED_API_KEY` is not set and sends
NVIDIA's required `input_type` automatically (`passage` while indexing, `query` NVIDIA's required `input_type` automatically (`passage` while indexing, `query`
while searching). while searching).
`QMD_DISABLE_LOCAL_MODELS=1` is recommended for deployments that must not load Local GGUF models are disabled by default. In the default mode QMD rejects local
local GGUF models. In that mode QMD rejects local embedding model URIs, skips embedding model URIs, skips local query expansion, and search reranking uses RRF
local query expansion, and defaults search reranking off while still using the scores only while still using the configured external embedding service for
configured external embedding service for vector search. vector search.
Reranking and query expansion still use local GGUF models via node-llama-cpp: Set `QMD_ENABLE_LOCAL_MODELS=1` to opt into local GGUF model loading. The first
query expansion or reranking call can download and load the configured local
model, which may take a while.
| Model | Purpose | Size | | Model | Purpose | Size |
|-------|---------|------| |-------|---------|------|
@ -513,10 +514,12 @@ Models are downloaded from HuggingFace and cached in `~/.cache/qmd/models/`.
### Local Embedding Model ### Local Embedding Model
Set `QMD_EMBED_MODEL` to an `hf:` URI or `.gguf` path to opt into local node-llama-cpp embeddings. Set `QMD_ENABLE_LOCAL_MODELS=1` and `QMD_EMBED_MODEL` to an `hf:` URI or `.gguf`
path to opt into local node-llama-cpp embeddings.
```sh ```sh
# Use Qwen3-Embedding-0.6B locally # Use Qwen3-Embedding-0.6B locally
export QMD_ENABLE_LOCAL_MODELS=1
export QMD_EMBED_MODEL="hf:Qwen/Qwen3-Embedding-0.6B-GGUF/Qwen3-Embedding-0.6B-Q8_0.gguf" export QMD_EMBED_MODEL="hf:Qwen/Qwen3-Embedding-0.6B-GGUF/Qwen3-Embedding-0.6B-Q8_0.gguf"
# After changing the model, re-embed all collections: # After changing the model, re-embed all collections:
@ -943,8 +946,9 @@ YAML configuration can override those defaults; see `example-index.yml` for a co
```yaml ```yaml
models: models:
embed: nvidia/llama-3.2-nv-embedqa-1b-v2 embed: nvidia/llama-3.2-nv-embedqa-1b-v2
rerank: hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf # Optional local models, used only when QMD_ENABLE_LOCAL_MODELS=1:
generate: hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf # rerank: hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf
# generate: hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf
``` ```
### EmbeddingGemma Prompt Format ### EmbeddingGemma Prompt Format

View File

@ -9,14 +9,16 @@
global_context: "If you see a relevant [[WikiWord]], you can search for that WikiWord to get more context." global_context: "If you see a relevant [[WikiWord]], you can search for that WikiWord to get more context."
# Model overrides. # Model overrides.
# Embeddings use an external OpenAI-compatible /embeddings API by default. # Embeddings use NVIDIA's OpenAI-compatible /embeddings API by default.
# Set QMD_EMBED_API_KEY or OPENAI_API_KEY in the environment for API auth. # Set NVIDIA_API_KEY, QMD_EMBED_API_KEY, or OPENAI_API_KEY in the environment for API auth.
# Local GGUF models are disabled unless QMD_ENABLE_LOCAL_MODELS=1 is set.
models: models:
embed: text-embedding-3-small embed: nvidia/llama-3.2-nv-embedqa-1b-v2
# Optional local embedding model instead of the external API: # Optional local embedding model instead of the external API:
# embed: hf:Qwen/Qwen3-Embedding-0.6B-GGUF/Qwen3-Embedding-0.6B-Q8_0.gguf # embed: hf:Qwen/Qwen3-Embedding-0.6B-GGUF/Qwen3-Embedding-0.6B-Q8_0.gguf
rerank: hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf # Optional local rerank/generation models:
generate: hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf # rerank: hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf
# generate: hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf
# Collection definitions # Collection definitions
collections: collections:

View File

@ -463,8 +463,8 @@ async function showStatus(): Promise<void> {
}; };
console.log(`\n${c.bold}Models${c.reset}`); console.log(`\n${c.bold}Models${c.reset}`);
console.log(` Embedding: ${hfLink(DEFAULT_EMBED_MODEL_URI)}`); console.log(` Embedding: ${hfLink(DEFAULT_EMBED_MODEL_URI)}`);
console.log(` Reranking: ${hfLink(DEFAULT_RERANK_MODEL_URI)}`); console.log(` Reranking: ${hfLink(DEFAULT_RERANK_MODEL_URI)} ${c.dim}(optional; set QMD_ENABLE_LOCAL_MODELS=1)${c.reset}`);
console.log(` Generation: ${hfLink(DEFAULT_GENERATE_MODEL_URI)}`); console.log(` Generation: ${hfLink(DEFAULT_GENERATE_MODEL_URI)} ${c.dim}(optional; set QMD_ENABLE_LOCAL_MODELS=1)${c.reset}`);
} }
// Device / GPU info // Device / GPU info
@ -3125,11 +3125,13 @@ if (isMain) {
case "pull": { case "pull": {
const refresh = cli.values.refresh === undefined ? false : Boolean(cli.values.refresh); const refresh = cli.values.refresh === undefined ? false : Boolean(cli.values.refresh);
// A model URI is treated as local (pullable) when it is a HuggingFace `hf:`
// reference, names a `.gguf` file, or is an absolute / dot-relative path.
const isLocalModelUri = (uri: string) => {
  if (uri.endsWith(".gguf")) return true;
  return ["hf:", "/", "./", "../"].some((prefix) => uri.startsWith(prefix));
};
const models = [ const models = [
DEFAULT_EMBED_MODEL_URI, DEFAULT_EMBED_MODEL_URI,
DEFAULT_GENERATE_MODEL_URI, DEFAULT_GENERATE_MODEL_URI,
DEFAULT_RERANK_MODEL_URI, DEFAULT_RERANK_MODEL_URI,
]; ].filter(isLocalModelUri);
console.log(`${c.bold}Pulling models${c.reset}`); console.log(`${c.bold}Pulling models${c.reset}`);
const results = await pullModels(models, { const results = await pullModels(models, {
refresh, refresh,

View File

@ -192,7 +192,7 @@ export type RerankDocument = {
// ============================================================================= // =============================================================================
// Embeddings use NVIDIA's OpenAI-compatible API by default. // Embeddings use NVIDIA's OpenAI-compatible API by default.
// Override QMD_EMBED_MODEL with hf:/path/.gguf to opt into local node-llama-cpp embeddings. // Set QMD_ENABLE_LOCAL_MODELS=1 before using any local node-llama-cpp GGUF models.
const DEFAULT_EMBED_MODEL = "nvidia/llama-3.2-nv-embedqa-1b-v2"; const DEFAULT_EMBED_MODEL = "nvidia/llama-3.2-nv-embedqa-1b-v2";
const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf"; const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
// const DEFAULT_GENERATE_MODEL = "hf:ggml-org/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf"; // const DEFAULT_GENERATE_MODEL = "hf:ggml-org/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf";
@ -216,14 +216,24 @@ export const DEFAULT_MODEL_CACHE_DIR = MODEL_CACHE_DIR;
const DEFAULT_EMBED_API_BASE_URL = "https://integrate.api.nvidia.com/v1"; const DEFAULT_EMBED_API_BASE_URL = "https://integrate.api.nvidia.com/v1";
export function localModelsDisabled(): boolean { export function localModelsEnabled(): boolean {
return /^(1|true|yes|on)$/i.test(process.env.QMD_DISABLE_LOCAL_MODELS ?? ""); return /^(1|true|yes|on)$/i.test(process.env.QMD_ENABLE_LOCAL_MODELS ?? "");
} }
function isLocalEmbeddingModel(model: string): boolean { function isLocalEmbeddingModel(model: string): boolean {
return model.startsWith("hf:") || model.endsWith(".gguf") || model.startsWith("/") || model.startsWith("./") || model.startsWith("../"); return model.startsWith("hf:") || model.endsWith(".gguf") || model.startsWith("/") || model.startsWith("./") || model.startsWith("../");
} }
/**
 * Estimate the token count of `text` without a tokenizer, assuming roughly
 * three characters (UTF-16 code units) per token.
 *
 * @param text - Text to measure.
 * @returns 0 for the empty string, otherwise `ceil(length / 3)` with a floor of 1.
 */
export function approximateTokenCount(text: string): number {
  const charCount = text.length;
  return charCount === 0 ? 0 : Math.max(1, Math.ceil(charCount / 3));
}
/**
 * Truncate `text` to an approximate token budget, assuming roughly three
 * characters (UTF-16 code units) per token.
 *
 * The cut never lands inside a surrogate pair: slicing between the two halves
 * of an astral character (emoji, some CJK) would leave a lone high surrogate
 * at the end of the chunk, which is malformed text.
 *
 * @param text - Text to truncate.
 * @param maxTokens - Approximate token budget; values `<= 0` (or NaN) yield `""`.
 * @returns `text` unchanged when it already fits, otherwise a prefix of at
 *   most `max(1, maxTokens * 3)` code units ending on a code-point boundary.
 */
export function truncateByApproxTokens(text: string, maxTokens: number): string {
  // `!(x > 0)` also covers NaN, which previously produced "" via slice(0, NaN).
  if (!(maxTokens > 0)) return "";

  const limit = Math.max(1, maxTokens * 3);
  if (text.length <= limit) return text;

  let end = Math.floor(limit);
  const before = text.charCodeAt(end - 1);
  const after = text.charCodeAt(end);
  const splitsSurrogatePair =
    before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
  if (splitsSurrogatePair) {
    // Back off to the code-point boundary. If that would leave nothing, keep
    // the pair whole so a positive budget never yields an empty string.
    end = end > 1 ? end - 1 : end + 1;
  }
  return text.slice(0, end);
}
export type PullResult = { export type PullResult = {
model: string; model: string;
path: string; path: string;
@ -929,10 +939,17 @@ export class LlamaCpp implements LLM {
// ========================================================================== // ==========================================================================
/** /**
* Tokenize text using the embedding model's tokenizer * Tokenize text using the embedding model's tokenizer when local embeddings
* Returns tokenizer tokens (opaque type from node-llama-cpp) * are explicitly active. External embedding mode uses a conservative
* approximation and must not load a local tokenizer.
*/ */
async tokenize(text: string): Promise<readonly LlamaToken[]> { async tokenize(text: string): Promise<readonly LlamaToken[]> {
if (!this.usesLocalEmbedding) {
return Array.from(
{ length: approximateTokenCount(text) },
(_, index) => index as unknown as LlamaToken,
);
}
await this.ensureEmbedContext(); // Ensure model is loaded await this.ensureEmbedContext(); // Ensure model is loaded
if (!this.embedModel) { if (!this.embedModel) {
throw new Error("Embed model not loaded"); throw new Error("Embed model not loaded");
@ -941,17 +958,25 @@ export class LlamaCpp implements LLM {
} }
/** /**
* Count tokens in text using the embedding model's tokenizer * Count tokens in text. External embedding mode uses an approximation so
* chunking never pulls in a local GGUF tokenizer by accident.
*/ */
async countTokens(text: string): Promise<number> { async countTokens(text: string): Promise<number> {
if (!this.usesLocalEmbedding) {
return approximateTokenCount(text);
}
const tokens = await this.tokenize(text); const tokens = await this.tokenize(text);
return tokens.length; return tokens.length;
} }
/** /**
* Detokenize token IDs back to text * Detokenize token IDs back to text. External embedding mode has no local
* tokenizer, so return an approximate-width placeholder for guardrail paths.
*/ */
async detokenize(tokens: readonly LlamaToken[]): Promise<string> { async detokenize(tokens: readonly LlamaToken[]): Promise<string> {
if (!this.usesLocalEmbedding) {
return " ".repeat(tokens.length * 3);
}
await this.ensureEmbedContext(); await this.ensureEmbedContext();
if (!this.embedModel) { if (!this.embedModel) {
throw new Error("Embed model not loaded"); throw new Error("Embed model not loaded");
@ -1047,8 +1072,8 @@ export class LlamaCpp implements LLM {
async embed(text: string, options: EmbedOptions = {}): Promise<EmbeddingResult | null> { async embed(text: string, options: EmbedOptions = {}): Promise<EmbeddingResult | null> {
const model = options.model ?? this.embedModelUri; const model = options.model ?? this.embedModelUri;
if (localModelsDisabled() && isLocalEmbeddingModel(model)) { if (!localModelsEnabled() && isLocalEmbeddingModel(model)) {
throw new Error("Local embedding models are disabled. Set QMD_EMBED_MODEL to an external API model."); throw new Error("Local embedding models are disabled. Set QMD_ENABLE_LOCAL_MODELS=1 to use local GGUF models.");
} }
if (!isLocalEmbeddingModel(model)) { if (!isLocalEmbeddingModel(model)) {
const results = await this.embedExternal([text], model, options); const results = await this.embedExternal([text], model, options);
@ -1085,8 +1110,8 @@ export class LlamaCpp implements LLM {
*/ */
async embedBatch(texts: string[], options: EmbedOptions = {}): Promise<(EmbeddingResult | null)[]> { async embedBatch(texts: string[], options: EmbedOptions = {}): Promise<(EmbeddingResult | null)[]> {
const model = options.model ?? this.embedModelUri; const model = options.model ?? this.embedModelUri;
if (localModelsDisabled() && isLocalEmbeddingModel(model)) { if (!localModelsEnabled() && isLocalEmbeddingModel(model)) {
throw new Error("Local embedding models are disabled. Set QMD_EMBED_MODEL to an external API model."); throw new Error("Local embedding models are disabled. Set QMD_ENABLE_LOCAL_MODELS=1 to use local GGUF models.");
} }
if (!isLocalEmbeddingModel(model)) { if (!isLocalEmbeddingModel(model)) {
return this.embedExternal(texts, model, options); return this.embedExternal(texts, model, options);
@ -1219,7 +1244,7 @@ export class LlamaCpp implements LLM {
// ========================================================================== // ==========================================================================
async expandQuery(query: string, options: { context?: string, includeLexical?: boolean, intent?: string } = {}): Promise<Queryable[]> { async expandQuery(query: string, options: { context?: string, includeLexical?: boolean, intent?: string } = {}): Promise<Queryable[]> {
if (localModelsDisabled()) return []; if (!localModelsEnabled()) return [];
if (this._ciMode) throw new Error("LLM operations are disabled in CI (set CI=true)"); if (this._ciMode) throw new Error("LLM operations are disabled in CI (set CI=true)");
// Ping activity at start to keep models alive during this operation // Ping activity at start to keep models alive during this operation
this.touchActivity(); this.touchActivity();
@ -1319,7 +1344,7 @@ export class LlamaCpp implements LLM {
documents: RerankDocument[], documents: RerankDocument[],
options: RerankOptions = {} options: RerankOptions = {}
): Promise<RerankResult> { ): Promise<RerankResult> {
if (localModelsDisabled()) { if (!localModelsEnabled()) {
return { return {
model: "disabled", model: "disabled",
results: documents.map((doc, index) => ({ file: doc.file, score: 0, index })), results: documents.map((doc, index) => ({ file: doc.file, score: 0, index })),

View File

@ -21,11 +21,13 @@ import fastGlob from "fast-glob";
import { import {
LlamaCpp, LlamaCpp,
getDefaultLlamaCpp, getDefaultLlamaCpp,
approximateTokenCount,
truncateByApproxTokens,
formatQueryForEmbedding, formatQueryForEmbedding,
formatDocForEmbedding, formatDocForEmbedding,
withLLMSessionForLlm, withLLMSessionForLlm,
DEFAULT_EMBED_MODEL_URI, DEFAULT_EMBED_MODEL_URI,
localModelsDisabled, localModelsEnabled,
type RerankDocument, type RerankDocument,
type ILLMSession, type ILLMSession,
} from "./llm.js"; } from "./llm.js";
@ -2279,6 +2281,15 @@ export async function chunkDocumentByTokens(
signal?: AbortSignal signal?: AbortSignal
): Promise<{ text: string; pos: number; tokens: number }[]> { ): Promise<{ text: string; pos: number; tokens: number }[]> {
const llm = getDefaultLlamaCpp(); const llm = getDefaultLlamaCpp();
const useLocalTokenizer = typeof (llm as any).usesLocalEmbedding === "boolean"
? Boolean((llm as any).usesLocalEmbedding)
: true;
const countTokens = async (text: string): Promise<number> => {
if (!useLocalTokenizer) return approximateTokenCount(text);
const tokens = await llm.tokenize(text);
return tokens.length;
};
// Use moderate chars/token estimate (prose ~4, code ~2, mixed ~3) // Use moderate chars/token estimate (prose ~4, code ~2, mixed ~3)
// If chunks exceed limit, they'll be re-split with actual ratio // If chunks exceed limit, they'll be re-split with actual ratio
@ -2301,13 +2312,13 @@ export async function chunkDocumentByTokens(
const pushChunkWithinTokenLimit = async (text: string, pos: number): Promise<void> => { const pushChunkWithinTokenLimit = async (text: string, pos: number): Promise<void> => {
if (signal?.aborted) return; if (signal?.aborted) return;
const tokens = await llm.tokenize(text); const tokenCount = await countTokens(text);
if (tokens.length <= maxTokens || text.length <= 1) { if (tokenCount <= maxTokens || text.length <= 1) {
results.push({ text, pos, tokens: tokens.length }); results.push({ text, pos, tokens: tokenCount });
return; return;
} }
const actualCharsPerToken = text.length / tokens.length; const actualCharsPerToken = text.length / tokenCount;
let safeMaxChars = Math.floor(maxTokens * actualCharsPerToken * 0.95); let safeMaxChars = Math.floor(maxTokens * actualCharsPerToken * 0.95);
if (!Number.isFinite(safeMaxChars) || safeMaxChars < 1) { if (!Number.isFinite(safeMaxChars) || safeMaxChars < 1) {
safeMaxChars = Math.floor(text.length / 2); safeMaxChars = Math.floor(text.length / 2);
@ -2337,12 +2348,14 @@ export async function chunkDocumentByTokens(
subChunks.length <= 1 subChunks.length <= 1
|| subChunks[0]?.text.length === text.length || subChunks[0]?.text.length === text.length
) { ) {
const fallbackTokens = tokens.slice(0, Math.max(1, maxTokens)); const tokenLimit = Math.max(1, maxTokens);
const truncatedText = await llm.detokenize(fallbackTokens); const truncatedText = useLocalTokenizer
? await llm.detokenize((await llm.tokenize(text)).slice(0, tokenLimit))
: truncateByApproxTokens(text, tokenLimit);
results.push({ results.push({
text: truncatedText, text: truncatedText,
pos, pos,
tokens: fallbackTokens.length, tokens: tokenLimit,
}); });
return; return;
} }
@ -4013,7 +4026,7 @@ export async function hybridQuery(
const collection = options?.collection; const collection = options?.collection;
const explain = options?.explain ?? false; const explain = options?.explain ?? false;
const intent = options?.intent; const intent = options?.intent;
const skipRerank = options?.skipRerank ?? localModelsDisabled(); const skipRerank = options?.skipRerank ?? !localModelsEnabled();
const hooks = options?.hooks; const hooks = options?.hooks;
const rankedLists: RankedResult[][] = []; const rankedLists: RankedResult[][] = [];
@ -4408,7 +4421,7 @@ export async function structuredSearch(
const candidateLimit = options?.candidateLimit ?? RERANK_CANDIDATE_LIMIT; const candidateLimit = options?.candidateLimit ?? RERANK_CANDIDATE_LIMIT;
const explain = options?.explain ?? false; const explain = options?.explain ?? false;
const intent = options?.intent; const intent = options?.intent;
const skipRerank = options?.skipRerank ?? localModelsDisabled(); const skipRerank = options?.skipRerank ?? !localModelsEnabled();
const hooks = options?.hooks; const hooks = options?.hooks;
const collections = options?.collections; const collections = options?.collections;

View File

@ -193,9 +193,11 @@ describe("LlamaCpp model resolution (config > env > default)", () => {
}); });
test("default embedding uses NVIDIA OpenAI-compatible API", async () => { test("default embedding uses NVIDIA OpenAI-compatible API", async () => {
const prevModel = process.env.QMD_EMBED_MODEL;
const prevKey = process.env.QMD_EMBED_API_KEY; const prevKey = process.env.QMD_EMBED_API_KEY;
const prevNvidiaKey = process.env.NVIDIA_API_KEY; const prevNvidiaKey = process.env.NVIDIA_API_KEY;
const prevBaseUrl = process.env.QMD_EMBED_API_BASE_URL; const prevBaseUrl = process.env.QMD_EMBED_API_BASE_URL;
delete process.env.QMD_EMBED_MODEL;
process.env.QMD_EMBED_API_KEY = "test-key"; process.env.QMD_EMBED_API_KEY = "test-key";
delete process.env.NVIDIA_API_KEY; delete process.env.NVIDIA_API_KEY;
delete process.env.QMD_EMBED_API_BASE_URL; delete process.env.QMD_EMBED_API_BASE_URL;
@ -225,6 +227,8 @@ describe("LlamaCpp model resolution (config > env > default)", () => {
}); });
} finally { } finally {
fetchMock.mockRestore(); fetchMock.mockRestore();
if (prevModel === undefined) delete process.env.QMD_EMBED_MODEL;
else process.env.QMD_EMBED_MODEL = prevModel;
if (prevKey === undefined) delete process.env.QMD_EMBED_API_KEY; if (prevKey === undefined) delete process.env.QMD_EMBED_API_KEY;
else process.env.QMD_EMBED_API_KEY = prevKey; else process.env.QMD_EMBED_API_KEY = prevKey;
if (prevNvidiaKey === undefined) delete process.env.NVIDIA_API_KEY; if (prevNvidiaKey === undefined) delete process.env.NVIDIA_API_KEY;
@ -274,9 +278,9 @@ describe("LlamaCpp model resolution (config > env > default)", () => {
expect(llm.usesLocalEmbedding).toBe(true); expect(llm.usesLocalEmbedding).toBe(true);
}); });
test("QMD_DISABLE_LOCAL_MODELS rejects local embedding models and bypasses local LLMs", async () => { test("local models are disabled by default", async () => {
const prev = process.env.QMD_DISABLE_LOCAL_MODELS; const prev = process.env.QMD_ENABLE_LOCAL_MODELS;
process.env.QMD_DISABLE_LOCAL_MODELS = "1"; delete process.env.QMD_ENABLE_LOCAL_MODELS;
try { try {
const llm = new LlamaCpp({ embedModel: "hf:custom/embed.gguf" }); const llm = new LlamaCpp({ embedModel: "hf:custom/embed.gguf" });
await expect(llm.embed("hello")).rejects.toThrow("Local embedding models are disabled"); await expect(llm.embed("hello")).rejects.toThrow("Local embedding models are disabled");
@ -286,14 +290,54 @@ describe("LlamaCpp model resolution (config > env > default)", () => {
results: [{ file: "doc.md", score: 0, index: 0 }], results: [{ file: "doc.md", score: 0, index: 0 }],
}); });
} finally { } finally {
if (prev === undefined) delete process.env.QMD_DISABLE_LOCAL_MODELS; if (prev === undefined) delete process.env.QMD_ENABLE_LOCAL_MODELS;
else process.env.QMD_DISABLE_LOCAL_MODELS = prev; else process.env.QMD_ENABLE_LOCAL_MODELS = prev;
} }
}); });
// Opt-in path: with QMD_ENABLE_LOCAL_MODELS=1, embed() on an `hf:` model resolves
// through the (mocked) local embed context instead of throwing.
test("QMD_ENABLE_LOCAL_MODELS allows explicit local embedding models", async () => {
// Remember the prior value so the finally block can restore the environment.
const prev = process.env.QMD_ENABLE_LOCAL_MODELS;
process.env.QMD_ENABLE_LOCAL_MODELS = "1";
try {
const llm = new LlamaCpp({ embedModel: "hf:custom/embed.gguf" }) as any;
// Replace internals with stubs so the test never loads a real GGUF model.
llm._ciMode = false;
llm.touchActivity = vi.fn();
llm.ensureEmbedContext = vi.fn().mockResolvedValue({
getEmbeddingFor: vi.fn(async () => ({ vector: new Float32Array([0.1, 0.2]) })),
});
// Pass the text through untouched; truncation is not under test here.
llm.truncateToContextSize = vi.fn(async (text: string) => ({
text,
truncated: false,
limit: 2048,
}));
// Float32Array values are not exact doubles, hence closeTo rather than equality.
await expect(llm.embed("hello")).resolves.toEqual({
embedding: [expect.closeTo(0.1), expect.closeTo(0.2)],
model: "hf:custom/embed.gguf",
});
expect(llm.ensureEmbedContext).toHaveBeenCalled();
} finally {
// Restore the variable exactly, including the "was unset" case.
if (prev === undefined) delete process.env.QMD_ENABLE_LOCAL_MODELS;
else process.env.QMD_ENABLE_LOCAL_MODELS = prev;
}
});
// With a non-local (external API) embed model, countTokens/tokenize must answer
// from the character-based approximation and never touch the embed context.
test("external embedding token counting does not load a local tokenizer", async () => {
const llm = new LlamaCpp({ embedModel: "nvidia/llama-3.2-nv-embedqa-1b-v2" }) as any;
// Tripwire: any attempt to load the local embed context fails the test.
llm.ensureEmbedContext = vi.fn(async () => {
throw new Error("should not load local tokenizer");
});
// "abcdef" is 6 characters; at ~3 characters per token that is 2 tokens.
await expect(llm.countTokens("abcdef")).resolves.toBe(2);
await expect(llm.tokenize("abcdef")).resolves.toHaveLength(2);
expect(llm.ensureEmbedContext).not.toHaveBeenCalled();
});
}); });
describe("LlamaCpp embedding truncation", () => { describe("LlamaCpp embedding truncation", () => {
test("truncates against the active embedding context limit, not the model train context", async () => { test("truncates against the active embedding context limit, not the model train context", async () => {
const prev = process.env.QMD_ENABLE_LOCAL_MODELS;
process.env.QMD_ENABLE_LOCAL_MODELS = "1";
const llm = new LlamaCpp({ embedModel: "hf:test/embed.gguf" }) as any; const llm = new LlamaCpp({ embedModel: "hf:test/embed.gguf" }) as any;
const getEmbeddingFor = vi.fn(async (text: string) => ({ const getEmbeddingFor = vi.fn(async (text: string) => ({
vector: new Float32Array([0.25, 0.5]), vector: new Float32Array([0.25, 0.5]),
@ -308,18 +352,25 @@ describe("LlamaCpp embedding truncation", () => {
}; };
llm.ensureEmbedContext = vi.fn().mockResolvedValue({ getEmbeddingFor }); llm.ensureEmbedContext = vi.fn().mockResolvedValue({ getEmbeddingFor });
const result = await llm.embed("x".repeat(3000)); try {
const result = await llm.embed("x".repeat(3000));
expect(getEmbeddingFor).toHaveBeenCalledWith("x".repeat(2044)); expect(getEmbeddingFor).toHaveBeenCalledWith("x".repeat(2044));
expect(result).toEqual({ expect(result).toEqual({
embedding: [0.25, 0.5], embedding: [0.25, 0.5],
model: llm.embedModelUri, model: llm.embedModelUri,
}); });
} finally {
if (prev === undefined) delete process.env.QMD_ENABLE_LOCAL_MODELS;
else process.env.QMD_ENABLE_LOCAL_MODELS = prev;
}
}); });
}); });
describe("LlamaCpp rerank deduping", () => { describe("LlamaCpp rerank deduping", () => {
test("deduplicates identical document texts before scoring", async () => { test("deduplicates identical document texts before scoring", async () => {
const prev = process.env.QMD_ENABLE_LOCAL_MODELS;
process.env.QMD_ENABLE_LOCAL_MODELS = "1";
const llm = new LlamaCpp({}) as any; const llm = new LlamaCpp({}) as any;
llm._ciMode = false; // allow unit test even in CI (mocked, no real models) llm._ciMode = false; // allow unit test even in CI (mocked, no real models)
const rankAll = vi.fn(async (_query: string, docs: string[]) => const rankAll = vi.fn(async (_query: string, docs: string[]) =>
@ -333,20 +384,25 @@ describe("LlamaCpp rerank deduping", () => {
detokenize: (tokens: string[]) => tokens.join(""), detokenize: (tokens: string[]) => tokens.join(""),
}); });
const result = await llm.rerank("query", [ try {
{ file: "a.md", text: "shared chunk" }, const result = await llm.rerank("query", [
{ file: "b.md", text: "shared chunk" }, { file: "a.md", text: "shared chunk" },
{ file: "c.md", text: "different chunk" }, { file: "b.md", text: "shared chunk" },
]); { file: "c.md", text: "different chunk" },
]);
expect(rankAll).toHaveBeenCalledTimes(1); expect(rankAll).toHaveBeenCalledTimes(1);
expect(rankAll).toHaveBeenCalledWith("query", ["shared chunk", "different chunk"]); expect(rankAll).toHaveBeenCalledWith("query", ["shared chunk", "different chunk"]);
expect(result.results).toHaveLength(3); expect(result.results).toHaveLength(3);
const scoreByFile = new Map(result.results.map((item) => [item.file, item.score])); const scoreByFile = new Map(result.results.map((item) => [item.file, item.score]));
expect(scoreByFile.get("a.md")).toBe(0.9); expect(scoreByFile.get("a.md")).toBe(0.9);
expect(scoreByFile.get("b.md")).toBe(0.9); expect(scoreByFile.get("b.md")).toBe(0.9);
expect(scoreByFile.get("c.md")).toBe(0.2); expect(scoreByFile.get("c.md")).toBe(0.2);
} finally {
if (prev === undefined) delete process.env.QMD_ENABLE_LOCAL_MODELS;
else process.env.QMD_ENABLE_LOCAL_MODELS = prev;
}
}); });
}); });

View File

@ -2820,6 +2820,33 @@ describe("Embedding batching", () => {
}); });
describe("Token chunking guardrails", () => { describe("Token chunking guardrails", () => {
// When the default LLM reports usesLocalEmbedding === false, chunkDocumentByTokens
// must size chunks from approximate counts and never call tokenize/detokenize.
test("chunkDocumentByTokens uses approximate counts for external embeddings without tokenizer load", async () => {
// Tripwires: both tokenizer entry points throw if the chunker reaches them.
const tokenize = vi.fn(async () => {
throw new Error("should not tokenize through local GGUF");
});
const detokenize = vi.fn(async () => {
throw new Error("should not detokenize through local GGUF");
});
// Inject a minimal stand-in for the default LlamaCpp instance.
setDefaultLlamaCpp({
usesLocalEmbedding: false,
tokenize,
detokenize,
} as any);
try {
// 1200 characters is ~400 approximate tokens, so a 100-token limit forces several chunks.
const chunks = await chunkDocumentByTokens("x".repeat(1200), 100, 15, 20);
expect(chunks.length).toBeGreaterThan(1);
expect(chunks.every((chunk) => chunk.tokens <= 100)).toBe(true);
// 100 tokens at ~3 characters per token caps a chunk at 300 characters.
expect(chunks[0]!.text.length).toBeLessThanOrEqual(300);
expect(tokenize).not.toHaveBeenCalled();
expect(detokenize).not.toHaveBeenCalled();
} finally {
// Presumably clears the injected default so later tests get a fresh instance — confirm against setDefaultLlamaCpp.
setDefaultLlamaCpp(null);
}
});
test("chunkDocumentByTokens keeps pathological single-line blobs under the token limit", async () => { test("chunkDocumentByTokens keeps pathological single-line blobs under the token limit", async () => {
setDefaultLlamaCpp({ setDefaultLlamaCpp({
async tokenize(text: string) { async tokenize(text: string) {