fix: disable local qmd models by default
This commit is contained in:
parent
7c17c8bcce
commit
e3711767c6
@ -5,8 +5,11 @@
|
|||||||
### Fixes
|
### Fixes
|
||||||
|
|
||||||
- Embedding: default to an external OpenAI-compatible embeddings API
|
- Embedding: default to an external OpenAI-compatible embeddings API
|
||||||
(`text-embedding-3-small`) and require explicit `hf:`/`.gguf`
|
(`nvidia/llama-3.2-nv-embedqa-1b-v2`) and require
|
||||||
configuration to use local node-llama-cpp embedding models.
|
`QMD_ENABLE_LOCAL_MODELS=1` for local node-llama-cpp embedding, reranking,
|
||||||
|
and query expansion models.
|
||||||
|
- Embedding: use approximate token counts in external embedding mode so
|
||||||
|
chunking does not load a local GGUF tokenizer.
|
||||||
- GPU: respect explicit `QMD_LLAMA_GPU=metal|vulkan|cuda` backend overrides instead of always using auto GPU selection. #529
|
- GPU: respect explicit `QMD_LLAMA_GPU=metal|vulkan|cuda` backend overrides instead of always using auto GPU selection. #529
|
||||||
- Fix: preserve original filename case in `handelize()`. The previous
|
- Fix: preserve original filename case in `handelize()`. The previous
|
||||||
`.toLowerCase()` call made indexed paths unreachable on case-sensitive
|
`.toLowerCase()` call made indexed paths unreachable on case-sensitive
|
||||||
|
|||||||
22
README.md
22
README.md
@ -490,19 +490,20 @@ by default. Configure it with:
|
|||||||
export NVIDIA_API_KEY="..."
|
export NVIDIA_API_KEY="..."
|
||||||
export QMD_EMBED_API_BASE_URL="https://integrate.api.nvidia.com/v1"
|
export QMD_EMBED_API_BASE_URL="https://integrate.api.nvidia.com/v1"
|
||||||
export QMD_EMBED_MODEL="nvidia/llama-3.2-nv-embedqa-1b-v2"
|
export QMD_EMBED_MODEL="nvidia/llama-3.2-nv-embedqa-1b-v2"
|
||||||
export QMD_DISABLE_LOCAL_MODELS=1
|
|
||||||
```
|
```
|
||||||
|
|
||||||
QMD reads `NVIDIA_API_KEY` when `QMD_EMBED_API_KEY` is not set and sends
|
QMD reads `NVIDIA_API_KEY` when `QMD_EMBED_API_KEY` is not set and sends
|
||||||
NVIDIA's required `input_type` automatically (`passage` while indexing, `query`
|
NVIDIA's required `input_type` automatically (`passage` while indexing, `query`
|
||||||
while searching).
|
while searching).
|
||||||
|
|
||||||
`QMD_DISABLE_LOCAL_MODELS=1` is recommended for deployments that must not load
|
Local GGUF models are disabled by default. In the default mode QMD rejects local
|
||||||
local GGUF models. In that mode QMD rejects local embedding model URIs, skips
|
embedding model URIs, skips local query expansion, and ranks results by RRF
|
||||||
local query expansion, and defaults search reranking off while still using the
|
scores only while still using the configured external embedding service for
|
||||||
configured external embedding service for vector search.
|
vector search.
|
||||||
|
|
||||||
Reranking and query expansion still use local GGUF models via node-llama-cpp:
|
Set `QMD_ENABLE_LOCAL_MODELS=1` to opt into local GGUF model loading. The first
|
||||||
|
query expansion or reranking call can download and load the configured local
|
||||||
|
model, which may take a while.
|
||||||
|
|
||||||
| Model | Purpose | Size |
|
| Model | Purpose | Size |
|
||||||
|-------|---------|------|
|
|-------|---------|------|
|
||||||
@ -513,10 +514,12 @@ Models are downloaded from HuggingFace and cached in `~/.cache/qmd/models/`.
|
|||||||
|
|
||||||
### Local Embedding Model
|
### Local Embedding Model
|
||||||
|
|
||||||
Set `QMD_EMBED_MODEL` to an `hf:` URI or `.gguf` path to opt into local node-llama-cpp embeddings.
|
Set `QMD_ENABLE_LOCAL_MODELS=1` and `QMD_EMBED_MODEL` to an `hf:` URI or `.gguf`
|
||||||
|
path to opt into local node-llama-cpp embeddings.
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
# Use Qwen3-Embedding-0.6B locally
|
# Use Qwen3-Embedding-0.6B locally
|
||||||
|
export QMD_ENABLE_LOCAL_MODELS=1
|
||||||
export QMD_EMBED_MODEL="hf:Qwen/Qwen3-Embedding-0.6B-GGUF/Qwen3-Embedding-0.6B-Q8_0.gguf"
|
export QMD_EMBED_MODEL="hf:Qwen/Qwen3-Embedding-0.6B-GGUF/Qwen3-Embedding-0.6B-Q8_0.gguf"
|
||||||
|
|
||||||
# After changing the model, re-embed all collections:
|
# After changing the model, re-embed all collections:
|
||||||
@ -943,8 +946,9 @@ YAML configuration can override those defaults; see `example-index.yml` for a co
|
|||||||
```yaml
|
```yaml
|
||||||
models:
|
models:
|
||||||
embed: nvidia/llama-3.2-nv-embedqa-1b-v2
|
embed: nvidia/llama-3.2-nv-embedqa-1b-v2
|
||||||
rerank: hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf
|
# Optional local models, used only when QMD_ENABLE_LOCAL_MODELS=1:
|
||||||
generate: hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf
|
# rerank: hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf
|
||||||
|
# generate: hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf
|
||||||
```
|
```
|
||||||
|
|
||||||
### EmbeddingGemma Prompt Format
|
### EmbeddingGemma Prompt Format
|
||||||
|
|||||||
@ -9,14 +9,16 @@
|
|||||||
global_context: "If you see a relevant [[WikiWord]], you can search for that WikiWord to get more context."
|
global_context: "If you see a relevant [[WikiWord]], you can search for that WikiWord to get more context."
|
||||||
|
|
||||||
# Model overrides.
|
# Model overrides.
|
||||||
# Embeddings use an external OpenAI-compatible /embeddings API by default.
|
# Embeddings use NVIDIA's OpenAI-compatible /embeddings API by default.
|
||||||
# Set QMD_EMBED_API_KEY or OPENAI_API_KEY in the environment for API auth.
|
# Set NVIDIA_API_KEY, QMD_EMBED_API_KEY, or OPENAI_API_KEY in the environment for API auth.
|
||||||
|
# Local GGUF models are disabled unless QMD_ENABLE_LOCAL_MODELS=1 is set.
|
||||||
models:
|
models:
|
||||||
embed: text-embedding-3-small
|
embed: nvidia/llama-3.2-nv-embedqa-1b-v2
|
||||||
# Optional local embedding model instead of the external API:
|
# Optional local embedding model instead of the external API:
|
||||||
# embed: hf:Qwen/Qwen3-Embedding-0.6B-GGUF/Qwen3-Embedding-0.6B-Q8_0.gguf
|
# embed: hf:Qwen/Qwen3-Embedding-0.6B-GGUF/Qwen3-Embedding-0.6B-Q8_0.gguf
|
||||||
rerank: hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf
|
# Optional local rerank/generation models (used only when QMD_ENABLE_LOCAL_MODELS=1):
|
||||||
generate: hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf
|
# rerank: hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf
|
||||||
|
# generate: hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf
|
||||||
|
|
||||||
# Collection definitions
|
# Collection definitions
|
||||||
collections:
|
collections:
|
||||||
|
|||||||
@ -463,8 +463,8 @@ async function showStatus(): Promise<void> {
|
|||||||
};
|
};
|
||||||
console.log(`\n${c.bold}Models${c.reset}`);
|
console.log(`\n${c.bold}Models${c.reset}`);
|
||||||
console.log(` Embedding: ${hfLink(DEFAULT_EMBED_MODEL_URI)}`);
|
console.log(` Embedding: ${hfLink(DEFAULT_EMBED_MODEL_URI)}`);
|
||||||
console.log(` Reranking: ${hfLink(DEFAULT_RERANK_MODEL_URI)}`);
|
console.log(` Reranking: ${hfLink(DEFAULT_RERANK_MODEL_URI)} ${c.dim}(optional; set QMD_ENABLE_LOCAL_MODELS=1)${c.reset}`);
|
||||||
console.log(` Generation: ${hfLink(DEFAULT_GENERATE_MODEL_URI)}`);
|
console.log(` Generation: ${hfLink(DEFAULT_GENERATE_MODEL_URI)} ${c.dim}(optional; set QMD_ENABLE_LOCAL_MODELS=1)${c.reset}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Device / GPU info
|
// Device / GPU info
|
||||||
@ -3125,11 +3125,13 @@ if (isMain) {
|
|||||||
|
|
||||||
case "pull": {
|
case "pull": {
|
||||||
const refresh = cli.values.refresh === undefined ? false : Boolean(cli.values.refresh);
|
const refresh = cli.values.refresh === undefined ? false : Boolean(cli.values.refresh);
|
||||||
|
const isLocalModelUri = (uri: string) =>
|
||||||
|
uri.startsWith("hf:") || uri.endsWith(".gguf") || uri.startsWith("/") || uri.startsWith("./") || uri.startsWith("../");
|
||||||
const models = [
|
const models = [
|
||||||
DEFAULT_EMBED_MODEL_URI,
|
DEFAULT_EMBED_MODEL_URI,
|
||||||
DEFAULT_GENERATE_MODEL_URI,
|
DEFAULT_GENERATE_MODEL_URI,
|
||||||
DEFAULT_RERANK_MODEL_URI,
|
DEFAULT_RERANK_MODEL_URI,
|
||||||
];
|
].filter(isLocalModelUri);
|
||||||
console.log(`${c.bold}Pulling models${c.reset}`);
|
console.log(`${c.bold}Pulling models${c.reset}`);
|
||||||
const results = await pullModels(models, {
|
const results = await pullModels(models, {
|
||||||
refresh,
|
refresh,
|
||||||
|
|||||||
51
src/llm.ts
51
src/llm.ts
@ -192,7 +192,7 @@ export type RerankDocument = {
|
|||||||
// =============================================================================
|
// =============================================================================
|
||||||
|
|
||||||
// Embeddings use NVIDIA's OpenAI-compatible API by default.
|
// Embeddings use NVIDIA's OpenAI-compatible API by default.
|
||||||
// Override QMD_EMBED_MODEL with hf:/path/.gguf to opt into local node-llama-cpp embeddings.
|
// Set QMD_ENABLE_LOCAL_MODELS=1 before using any local node-llama-cpp GGUF models.
|
||||||
const DEFAULT_EMBED_MODEL = "nvidia/llama-3.2-nv-embedqa-1b-v2";
|
const DEFAULT_EMBED_MODEL = "nvidia/llama-3.2-nv-embedqa-1b-v2";
|
||||||
const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
|
const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
|
||||||
// const DEFAULT_GENERATE_MODEL = "hf:ggml-org/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf";
|
// const DEFAULT_GENERATE_MODEL = "hf:ggml-org/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf";
|
||||||
@ -216,14 +216,24 @@ export const DEFAULT_MODEL_CACHE_DIR = MODEL_CACHE_DIR;
|
|||||||
|
|
||||||
const DEFAULT_EMBED_API_BASE_URL = "https://integrate.api.nvidia.com/v1";
|
const DEFAULT_EMBED_API_BASE_URL = "https://integrate.api.nvidia.com/v1";
|
||||||
|
|
||||||
export function localModelsDisabled(): boolean {
|
export function localModelsEnabled(): boolean {
|
||||||
return /^(1|true|yes|on)$/i.test(process.env.QMD_DISABLE_LOCAL_MODELS ?? "");
|
return /^(1|true|yes|on)$/i.test(process.env.QMD_ENABLE_LOCAL_MODELS ?? "");
|
||||||
}
|
}
|
||||||
|
|
||||||
function isLocalEmbeddingModel(model: string): boolean {
|
function isLocalEmbeddingModel(model: string): boolean {
|
||||||
return model.startsWith("hf:") || model.endsWith(".gguf") || model.startsWith("/") || model.startsWith("./") || model.startsWith("../");
|
return model.startsWith("hf:") || model.endsWith(".gguf") || model.startsWith("/") || model.startsWith("./") || model.startsWith("../");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Estimate how many tokens `text` occupies without loading a tokenizer.
 *
 * Uses a fixed ratio of three UTF-16 code units per token, rounded up, so the
 * empty string counts as zero tokens and any non-empty string counts as at
 * least one.
 *
 * @param text - Text to measure.
 * @returns The estimated token count (a non-negative integer).
 */
export function approximateTokenCount(text: string): number {
  // Math.ceil maps length 0 to 0 and any positive length to >= 1, so neither
  // a separate empty-string guard nor a lower clamp is required.
  return Math.ceil(text.length / 3);
}
|
||||||
|
|
||||||
|
/**
 * Truncate `text` to roughly `maxTokens` tokens without a tokenizer.
 *
 * The budget is three UTF-16 code units per token. For a positive
 * `maxTokens` at least one code unit is kept, so non-empty input never
 * truncates to an empty string.
 *
 * The cut never lands between the two halves of a surrogate pair: a dangling
 * high surrogate is dropped so the result stays well-formed UTF-16. If that
 * would leave nothing, the whole pair is kept instead (one code unit over
 * budget).
 *
 * @param text - Text to truncate.
 * @param maxTokens - Approximate token budget; values <= 0 yield `""`.
 * @returns A prefix of `text` that fits the approximate budget.
 */
export function truncateByApproxTokens(text: string, maxTokens: number): string {
  if (maxTokens <= 0) return "";

  const truncated = text.slice(0, Math.max(1, maxTokens * 3));
  // Nothing was cut (or nothing was kept, e.g. a NaN budget): no boundary to repair.
  if (truncated.length === text.length || truncated.length === 0) return truncated;

  const lastKept = truncated.charCodeAt(truncated.length - 1);
  const firstDropped = text.charCodeAt(truncated.length);
  const splitsSurrogatePair =
    lastKept >= 0xd800 && lastKept <= 0xdbff &&
    firstDropped >= 0xdc00 && firstDropped <= 0xdfff;
  if (!splitsSurrogatePair) return truncated;

  // Prefer staying under budget; only keep the full pair when dropping the
  // dangling half would produce an empty result.
  return truncated.length > 1 ? truncated.slice(0, -1) : text.slice(0, 2);
}
|
||||||
|
|
||||||
export type PullResult = {
|
export type PullResult = {
|
||||||
model: string;
|
model: string;
|
||||||
path: string;
|
path: string;
|
||||||
@ -929,10 +939,17 @@ export class LlamaCpp implements LLM {
|
|||||||
// ==========================================================================
|
// ==========================================================================
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Tokenize text using the embedding model's tokenizer
|
* Tokenize text using the embedding model's tokenizer when local embeddings
|
||||||
* Returns tokenizer tokens (opaque type from node-llama-cpp)
|
* are explicitly active. External embedding mode uses a conservative
|
||||||
|
* approximation and must not load a local tokenizer.
|
||||||
*/
|
*/
|
||||||
async tokenize(text: string): Promise<readonly LlamaToken[]> {
|
async tokenize(text: string): Promise<readonly LlamaToken[]> {
|
||||||
|
if (!this.usesLocalEmbedding) {
|
||||||
|
return Array.from(
|
||||||
|
{ length: approximateTokenCount(text) },
|
||||||
|
(_, index) => index as unknown as LlamaToken,
|
||||||
|
);
|
||||||
|
}
|
||||||
await this.ensureEmbedContext(); // Ensure model is loaded
|
await this.ensureEmbedContext(); // Ensure model is loaded
|
||||||
if (!this.embedModel) {
|
if (!this.embedModel) {
|
||||||
throw new Error("Embed model not loaded");
|
throw new Error("Embed model not loaded");
|
||||||
@ -941,17 +958,25 @@ export class LlamaCpp implements LLM {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Count tokens in text using the embedding model's tokenizer
|
* Count tokens in text. External embedding mode uses an approximation so
|
||||||
|
* chunking never pulls in a local GGUF tokenizer by accident.
|
||||||
*/
|
*/
|
||||||
async countTokens(text: string): Promise<number> {
|
async countTokens(text: string): Promise<number> {
|
||||||
|
if (!this.usesLocalEmbedding) {
|
||||||
|
return approximateTokenCount(text);
|
||||||
|
}
|
||||||
const tokens = await this.tokenize(text);
|
const tokens = await this.tokenize(text);
|
||||||
return tokens.length;
|
return tokens.length;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Detokenize token IDs back to text
|
* Detokenize token IDs back to text. External embedding mode has no local
|
||||||
|
* tokenizer, so return an approximate-width placeholder for guardrail paths.
|
||||||
*/
|
*/
|
||||||
async detokenize(tokens: readonly LlamaToken[]): Promise<string> {
|
async detokenize(tokens: readonly LlamaToken[]): Promise<string> {
|
||||||
|
if (!this.usesLocalEmbedding) {
|
||||||
|
return " ".repeat(tokens.length * 3);
|
||||||
|
}
|
||||||
await this.ensureEmbedContext();
|
await this.ensureEmbedContext();
|
||||||
if (!this.embedModel) {
|
if (!this.embedModel) {
|
||||||
throw new Error("Embed model not loaded");
|
throw new Error("Embed model not loaded");
|
||||||
@ -1047,8 +1072,8 @@ export class LlamaCpp implements LLM {
|
|||||||
|
|
||||||
async embed(text: string, options: EmbedOptions = {}): Promise<EmbeddingResult | null> {
|
async embed(text: string, options: EmbedOptions = {}): Promise<EmbeddingResult | null> {
|
||||||
const model = options.model ?? this.embedModelUri;
|
const model = options.model ?? this.embedModelUri;
|
||||||
if (localModelsDisabled() && isLocalEmbeddingModel(model)) {
|
if (!localModelsEnabled() && isLocalEmbeddingModel(model)) {
|
||||||
throw new Error("Local embedding models are disabled. Set QMD_EMBED_MODEL to an external API model.");
|
throw new Error("Local embedding models are disabled. Set QMD_ENABLE_LOCAL_MODELS=1 to use local GGUF models.");
|
||||||
}
|
}
|
||||||
if (!isLocalEmbeddingModel(model)) {
|
if (!isLocalEmbeddingModel(model)) {
|
||||||
const results = await this.embedExternal([text], model, options);
|
const results = await this.embedExternal([text], model, options);
|
||||||
@ -1085,8 +1110,8 @@ export class LlamaCpp implements LLM {
|
|||||||
*/
|
*/
|
||||||
async embedBatch(texts: string[], options: EmbedOptions = {}): Promise<(EmbeddingResult | null)[]> {
|
async embedBatch(texts: string[], options: EmbedOptions = {}): Promise<(EmbeddingResult | null)[]> {
|
||||||
const model = options.model ?? this.embedModelUri;
|
const model = options.model ?? this.embedModelUri;
|
||||||
if (localModelsDisabled() && isLocalEmbeddingModel(model)) {
|
if (!localModelsEnabled() && isLocalEmbeddingModel(model)) {
|
||||||
throw new Error("Local embedding models are disabled. Set QMD_EMBED_MODEL to an external API model.");
|
throw new Error("Local embedding models are disabled. Set QMD_ENABLE_LOCAL_MODELS=1 to use local GGUF models.");
|
||||||
}
|
}
|
||||||
if (!isLocalEmbeddingModel(model)) {
|
if (!isLocalEmbeddingModel(model)) {
|
||||||
return this.embedExternal(texts, model, options);
|
return this.embedExternal(texts, model, options);
|
||||||
@ -1219,7 +1244,7 @@ export class LlamaCpp implements LLM {
|
|||||||
// ==========================================================================
|
// ==========================================================================
|
||||||
|
|
||||||
async expandQuery(query: string, options: { context?: string, includeLexical?: boolean, intent?: string } = {}): Promise<Queryable[]> {
|
async expandQuery(query: string, options: { context?: string, includeLexical?: boolean, intent?: string } = {}): Promise<Queryable[]> {
|
||||||
if (localModelsDisabled()) return [];
|
if (!localModelsEnabled()) return [];
|
||||||
if (this._ciMode) throw new Error("LLM operations are disabled in CI (set CI=true)");
|
if (this._ciMode) throw new Error("LLM operations are disabled in CI (set CI=true)");
|
||||||
// Ping activity at start to keep models alive during this operation
|
// Ping activity at start to keep models alive during this operation
|
||||||
this.touchActivity();
|
this.touchActivity();
|
||||||
@ -1319,7 +1344,7 @@ export class LlamaCpp implements LLM {
|
|||||||
documents: RerankDocument[],
|
documents: RerankDocument[],
|
||||||
options: RerankOptions = {}
|
options: RerankOptions = {}
|
||||||
): Promise<RerankResult> {
|
): Promise<RerankResult> {
|
||||||
if (localModelsDisabled()) {
|
if (!localModelsEnabled()) {
|
||||||
return {
|
return {
|
||||||
model: "disabled",
|
model: "disabled",
|
||||||
results: documents.map((doc, index) => ({ file: doc.file, score: 0, index })),
|
results: documents.map((doc, index) => ({ file: doc.file, score: 0, index })),
|
||||||
|
|||||||
33
src/store.ts
33
src/store.ts
@ -21,11 +21,13 @@ import fastGlob from "fast-glob";
|
|||||||
import {
|
import {
|
||||||
LlamaCpp,
|
LlamaCpp,
|
||||||
getDefaultLlamaCpp,
|
getDefaultLlamaCpp,
|
||||||
|
approximateTokenCount,
|
||||||
|
truncateByApproxTokens,
|
||||||
formatQueryForEmbedding,
|
formatQueryForEmbedding,
|
||||||
formatDocForEmbedding,
|
formatDocForEmbedding,
|
||||||
withLLMSessionForLlm,
|
withLLMSessionForLlm,
|
||||||
DEFAULT_EMBED_MODEL_URI,
|
DEFAULT_EMBED_MODEL_URI,
|
||||||
localModelsDisabled,
|
localModelsEnabled,
|
||||||
type RerankDocument,
|
type RerankDocument,
|
||||||
type ILLMSession,
|
type ILLMSession,
|
||||||
} from "./llm.js";
|
} from "./llm.js";
|
||||||
@ -2279,6 +2281,15 @@ export async function chunkDocumentByTokens(
|
|||||||
signal?: AbortSignal
|
signal?: AbortSignal
|
||||||
): Promise<{ text: string; pos: number; tokens: number }[]> {
|
): Promise<{ text: string; pos: number; tokens: number }[]> {
|
||||||
const llm = getDefaultLlamaCpp();
|
const llm = getDefaultLlamaCpp();
|
||||||
|
const useLocalTokenizer = typeof (llm as any).usesLocalEmbedding === "boolean"
|
||||||
|
? Boolean((llm as any).usesLocalEmbedding)
|
||||||
|
: true;
|
||||||
|
|
||||||
|
const countTokens = async (text: string): Promise<number> => {
|
||||||
|
if (!useLocalTokenizer) return approximateTokenCount(text);
|
||||||
|
const tokens = await llm.tokenize(text);
|
||||||
|
return tokens.length;
|
||||||
|
};
|
||||||
|
|
||||||
// Use moderate chars/token estimate (prose ~4, code ~2, mixed ~3)
|
// Use moderate chars/token estimate (prose ~4, code ~2, mixed ~3)
|
||||||
// If chunks exceed limit, they'll be re-split with actual ratio
|
// If chunks exceed limit, they'll be re-split with actual ratio
|
||||||
@ -2301,13 +2312,13 @@ export async function chunkDocumentByTokens(
|
|||||||
const pushChunkWithinTokenLimit = async (text: string, pos: number): Promise<void> => {
|
const pushChunkWithinTokenLimit = async (text: string, pos: number): Promise<void> => {
|
||||||
if (signal?.aborted) return;
|
if (signal?.aborted) return;
|
||||||
|
|
||||||
const tokens = await llm.tokenize(text);
|
const tokenCount = await countTokens(text);
|
||||||
if (tokens.length <= maxTokens || text.length <= 1) {
|
if (tokenCount <= maxTokens || text.length <= 1) {
|
||||||
results.push({ text, pos, tokens: tokens.length });
|
results.push({ text, pos, tokens: tokenCount });
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const actualCharsPerToken = text.length / tokens.length;
|
const actualCharsPerToken = text.length / tokenCount;
|
||||||
let safeMaxChars = Math.floor(maxTokens * actualCharsPerToken * 0.95);
|
let safeMaxChars = Math.floor(maxTokens * actualCharsPerToken * 0.95);
|
||||||
if (!Number.isFinite(safeMaxChars) || safeMaxChars < 1) {
|
if (!Number.isFinite(safeMaxChars) || safeMaxChars < 1) {
|
||||||
safeMaxChars = Math.floor(text.length / 2);
|
safeMaxChars = Math.floor(text.length / 2);
|
||||||
@ -2337,12 +2348,14 @@ export async function chunkDocumentByTokens(
|
|||||||
subChunks.length <= 1
|
subChunks.length <= 1
|
||||||
|| subChunks[0]?.text.length === text.length
|
|| subChunks[0]?.text.length === text.length
|
||||||
) {
|
) {
|
||||||
const fallbackTokens = tokens.slice(0, Math.max(1, maxTokens));
|
const tokenLimit = Math.max(1, maxTokens);
|
||||||
const truncatedText = await llm.detokenize(fallbackTokens);
|
const truncatedText = useLocalTokenizer
|
||||||
|
? await llm.detokenize((await llm.tokenize(text)).slice(0, tokenLimit))
|
||||||
|
: truncateByApproxTokens(text, tokenLimit);
|
||||||
results.push({
|
results.push({
|
||||||
text: truncatedText,
|
text: truncatedText,
|
||||||
pos,
|
pos,
|
||||||
tokens: fallbackTokens.length,
|
tokens: tokenLimit,
|
||||||
});
|
});
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -4013,7 +4026,7 @@ export async function hybridQuery(
|
|||||||
const collection = options?.collection;
|
const collection = options?.collection;
|
||||||
const explain = options?.explain ?? false;
|
const explain = options?.explain ?? false;
|
||||||
const intent = options?.intent;
|
const intent = options?.intent;
|
||||||
const skipRerank = options?.skipRerank ?? localModelsDisabled();
|
const skipRerank = options?.skipRerank ?? !localModelsEnabled();
|
||||||
const hooks = options?.hooks;
|
const hooks = options?.hooks;
|
||||||
|
|
||||||
const rankedLists: RankedResult[][] = [];
|
const rankedLists: RankedResult[][] = [];
|
||||||
@ -4408,7 +4421,7 @@ export async function structuredSearch(
|
|||||||
const candidateLimit = options?.candidateLimit ?? RERANK_CANDIDATE_LIMIT;
|
const candidateLimit = options?.candidateLimit ?? RERANK_CANDIDATE_LIMIT;
|
||||||
const explain = options?.explain ?? false;
|
const explain = options?.explain ?? false;
|
||||||
const intent = options?.intent;
|
const intent = options?.intent;
|
||||||
const skipRerank = options?.skipRerank ?? localModelsDisabled();
|
const skipRerank = options?.skipRerank ?? !localModelsEnabled();
|
||||||
const hooks = options?.hooks;
|
const hooks = options?.hooks;
|
||||||
|
|
||||||
const collections = options?.collections;
|
const collections = options?.collections;
|
||||||
|
|||||||
102
test/llm.test.ts
102
test/llm.test.ts
@ -193,9 +193,11 @@ describe("LlamaCpp model resolution (config > env > default)", () => {
|
|||||||
});
|
});
|
||||||
|
|
||||||
test("default embedding uses NVIDIA OpenAI-compatible API", async () => {
|
test("default embedding uses NVIDIA OpenAI-compatible API", async () => {
|
||||||
|
const prevModel = process.env.QMD_EMBED_MODEL;
|
||||||
const prevKey = process.env.QMD_EMBED_API_KEY;
|
const prevKey = process.env.QMD_EMBED_API_KEY;
|
||||||
const prevNvidiaKey = process.env.NVIDIA_API_KEY;
|
const prevNvidiaKey = process.env.NVIDIA_API_KEY;
|
||||||
const prevBaseUrl = process.env.QMD_EMBED_API_BASE_URL;
|
const prevBaseUrl = process.env.QMD_EMBED_API_BASE_URL;
|
||||||
|
delete process.env.QMD_EMBED_MODEL;
|
||||||
process.env.QMD_EMBED_API_KEY = "test-key";
|
process.env.QMD_EMBED_API_KEY = "test-key";
|
||||||
delete process.env.NVIDIA_API_KEY;
|
delete process.env.NVIDIA_API_KEY;
|
||||||
delete process.env.QMD_EMBED_API_BASE_URL;
|
delete process.env.QMD_EMBED_API_BASE_URL;
|
||||||
@ -225,6 +227,8 @@ describe("LlamaCpp model resolution (config > env > default)", () => {
|
|||||||
});
|
});
|
||||||
} finally {
|
} finally {
|
||||||
fetchMock.mockRestore();
|
fetchMock.mockRestore();
|
||||||
|
if (prevModel === undefined) delete process.env.QMD_EMBED_MODEL;
|
||||||
|
else process.env.QMD_EMBED_MODEL = prevModel;
|
||||||
if (prevKey === undefined) delete process.env.QMD_EMBED_API_KEY;
|
if (prevKey === undefined) delete process.env.QMD_EMBED_API_KEY;
|
||||||
else process.env.QMD_EMBED_API_KEY = prevKey;
|
else process.env.QMD_EMBED_API_KEY = prevKey;
|
||||||
if (prevNvidiaKey === undefined) delete process.env.NVIDIA_API_KEY;
|
if (prevNvidiaKey === undefined) delete process.env.NVIDIA_API_KEY;
|
||||||
@ -274,9 +278,9 @@ describe("LlamaCpp model resolution (config > env > default)", () => {
|
|||||||
expect(llm.usesLocalEmbedding).toBe(true);
|
expect(llm.usesLocalEmbedding).toBe(true);
|
||||||
});
|
});
|
||||||
|
|
||||||
test("QMD_DISABLE_LOCAL_MODELS rejects local embedding models and bypasses local LLMs", async () => {
|
test("local models are disabled by default", async () => {
|
||||||
const prev = process.env.QMD_DISABLE_LOCAL_MODELS;
|
const prev = process.env.QMD_ENABLE_LOCAL_MODELS;
|
||||||
process.env.QMD_DISABLE_LOCAL_MODELS = "1";
|
delete process.env.QMD_ENABLE_LOCAL_MODELS;
|
||||||
try {
|
try {
|
||||||
const llm = new LlamaCpp({ embedModel: "hf:custom/embed.gguf" });
|
const llm = new LlamaCpp({ embedModel: "hf:custom/embed.gguf" });
|
||||||
await expect(llm.embed("hello")).rejects.toThrow("Local embedding models are disabled");
|
await expect(llm.embed("hello")).rejects.toThrow("Local embedding models are disabled");
|
||||||
@ -286,14 +290,54 @@ describe("LlamaCpp model resolution (config > env > default)", () => {
|
|||||||
results: [{ file: "doc.md", score: 0, index: 0 }],
|
results: [{ file: "doc.md", score: 0, index: 0 }],
|
||||||
});
|
});
|
||||||
} finally {
|
} finally {
|
||||||
if (prev === undefined) delete process.env.QMD_DISABLE_LOCAL_MODELS;
|
if (prev === undefined) delete process.env.QMD_ENABLE_LOCAL_MODELS;
|
||||||
else process.env.QMD_DISABLE_LOCAL_MODELS = prev;
|
else process.env.QMD_ENABLE_LOCAL_MODELS = prev;
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test("QMD_ENABLE_LOCAL_MODELS allows explicit local embedding models", async () => {
|
||||||
|
const prev = process.env.QMD_ENABLE_LOCAL_MODELS;
|
||||||
|
process.env.QMD_ENABLE_LOCAL_MODELS = "1";
|
||||||
|
try {
|
||||||
|
const llm = new LlamaCpp({ embedModel: "hf:custom/embed.gguf" }) as any;
|
||||||
|
llm._ciMode = false;
|
||||||
|
llm.touchActivity = vi.fn();
|
||||||
|
llm.ensureEmbedContext = vi.fn().mockResolvedValue({
|
||||||
|
getEmbeddingFor: vi.fn(async () => ({ vector: new Float32Array([0.1, 0.2]) })),
|
||||||
|
});
|
||||||
|
llm.truncateToContextSize = vi.fn(async (text: string) => ({
|
||||||
|
text,
|
||||||
|
truncated: false,
|
||||||
|
limit: 2048,
|
||||||
|
}));
|
||||||
|
|
||||||
|
await expect(llm.embed("hello")).resolves.toEqual({
|
||||||
|
embedding: [expect.closeTo(0.1), expect.closeTo(0.2)],
|
||||||
|
model: "hf:custom/embed.gguf",
|
||||||
|
});
|
||||||
|
expect(llm.ensureEmbedContext).toHaveBeenCalled();
|
||||||
|
} finally {
|
||||||
|
if (prev === undefined) delete process.env.QMD_ENABLE_LOCAL_MODELS;
|
||||||
|
else process.env.QMD_ENABLE_LOCAL_MODELS = prev;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
test("external embedding token counting does not load a local tokenizer", async () => {
|
||||||
|
const llm = new LlamaCpp({ embedModel: "nvidia/llama-3.2-nv-embedqa-1b-v2" }) as any;
|
||||||
|
llm.ensureEmbedContext = vi.fn(async () => {
|
||||||
|
throw new Error("should not load local tokenizer");
|
||||||
|
});
|
||||||
|
|
||||||
|
await expect(llm.countTokens("abcdef")).resolves.toBe(2);
|
||||||
|
await expect(llm.tokenize("abcdef")).resolves.toHaveLength(2);
|
||||||
|
expect(llm.ensureEmbedContext).not.toHaveBeenCalled();
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
describe("LlamaCpp embedding truncation", () => {
|
describe("LlamaCpp embedding truncation", () => {
|
||||||
test("truncates against the active embedding context limit, not the model train context", async () => {
|
test("truncates against the active embedding context limit, not the model train context", async () => {
|
||||||
|
const prev = process.env.QMD_ENABLE_LOCAL_MODELS;
|
||||||
|
process.env.QMD_ENABLE_LOCAL_MODELS = "1";
|
||||||
const llm = new LlamaCpp({ embedModel: "hf:test/embed.gguf" }) as any;
|
const llm = new LlamaCpp({ embedModel: "hf:test/embed.gguf" }) as any;
|
||||||
const getEmbeddingFor = vi.fn(async (text: string) => ({
|
const getEmbeddingFor = vi.fn(async (text: string) => ({
|
||||||
vector: new Float32Array([0.25, 0.5]),
|
vector: new Float32Array([0.25, 0.5]),
|
||||||
@ -308,18 +352,25 @@ describe("LlamaCpp embedding truncation", () => {
|
|||||||
};
|
};
|
||||||
llm.ensureEmbedContext = vi.fn().mockResolvedValue({ getEmbeddingFor });
|
llm.ensureEmbedContext = vi.fn().mockResolvedValue({ getEmbeddingFor });
|
||||||
|
|
||||||
const result = await llm.embed("x".repeat(3000));
|
try {
|
||||||
|
const result = await llm.embed("x".repeat(3000));
|
||||||
|
|
||||||
expect(getEmbeddingFor).toHaveBeenCalledWith("x".repeat(2044));
|
expect(getEmbeddingFor).toHaveBeenCalledWith("x".repeat(2044));
|
||||||
expect(result).toEqual({
|
expect(result).toEqual({
|
||||||
embedding: [0.25, 0.5],
|
embedding: [0.25, 0.5],
|
||||||
model: llm.embedModelUri,
|
model: llm.embedModelUri,
|
||||||
});
|
});
|
||||||
|
} finally {
|
||||||
|
if (prev === undefined) delete process.env.QMD_ENABLE_LOCAL_MODELS;
|
||||||
|
else process.env.QMD_ENABLE_LOCAL_MODELS = prev;
|
||||||
|
}
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
describe("LlamaCpp rerank deduping", () => {
|
describe("LlamaCpp rerank deduping", () => {
|
||||||
test("deduplicates identical document texts before scoring", async () => {
|
test("deduplicates identical document texts before scoring", async () => {
|
||||||
|
const prev = process.env.QMD_ENABLE_LOCAL_MODELS;
|
||||||
|
process.env.QMD_ENABLE_LOCAL_MODELS = "1";
|
||||||
const llm = new LlamaCpp({}) as any;
|
const llm = new LlamaCpp({}) as any;
|
||||||
llm._ciMode = false; // allow unit test even in CI (mocked, no real models)
|
llm._ciMode = false; // allow unit test even in CI (mocked, no real models)
|
||||||
const rankAll = vi.fn(async (_query: string, docs: string[]) =>
|
const rankAll = vi.fn(async (_query: string, docs: string[]) =>
|
||||||
@ -333,20 +384,25 @@ describe("LlamaCpp rerank deduping", () => {
|
|||||||
detokenize: (tokens: string[]) => tokens.join(""),
|
detokenize: (tokens: string[]) => tokens.join(""),
|
||||||
});
|
});
|
||||||
|
|
||||||
const result = await llm.rerank("query", [
|
try {
|
||||||
{ file: "a.md", text: "shared chunk" },
|
const result = await llm.rerank("query", [
|
||||||
{ file: "b.md", text: "shared chunk" },
|
{ file: "a.md", text: "shared chunk" },
|
||||||
{ file: "c.md", text: "different chunk" },
|
{ file: "b.md", text: "shared chunk" },
|
||||||
]);
|
{ file: "c.md", text: "different chunk" },
|
||||||
|
]);
|
||||||
|
|
||||||
expect(rankAll).toHaveBeenCalledTimes(1);
|
expect(rankAll).toHaveBeenCalledTimes(1);
|
||||||
expect(rankAll).toHaveBeenCalledWith("query", ["shared chunk", "different chunk"]);
|
expect(rankAll).toHaveBeenCalledWith("query", ["shared chunk", "different chunk"]);
|
||||||
expect(result.results).toHaveLength(3);
|
expect(result.results).toHaveLength(3);
|
||||||
|
|
||||||
const scoreByFile = new Map(result.results.map((item) => [item.file, item.score]));
|
const scoreByFile = new Map(result.results.map((item) => [item.file, item.score]));
|
||||||
expect(scoreByFile.get("a.md")).toBe(0.9);
|
expect(scoreByFile.get("a.md")).toBe(0.9);
|
||||||
expect(scoreByFile.get("b.md")).toBe(0.9);
|
expect(scoreByFile.get("b.md")).toBe(0.9);
|
||||||
expect(scoreByFile.get("c.md")).toBe(0.2);
|
expect(scoreByFile.get("c.md")).toBe(0.2);
|
||||||
|
} finally {
|
||||||
|
if (prev === undefined) delete process.env.QMD_ENABLE_LOCAL_MODELS;
|
||||||
|
else process.env.QMD_ENABLE_LOCAL_MODELS = prev;
|
||||||
|
}
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
@ -2820,6 +2820,33 @@ describe("Embedding batching", () => {
|
|||||||
});
|
});
|
||||||
|
|
||||||
describe("Token chunking guardrails", () => {
|
describe("Token chunking guardrails", () => {
|
||||||
|
test("chunkDocumentByTokens uses approximate counts for external embeddings without tokenizer load", async () => {
|
||||||
|
const tokenize = vi.fn(async () => {
|
||||||
|
throw new Error("should not tokenize through local GGUF");
|
||||||
|
});
|
||||||
|
const detokenize = vi.fn(async () => {
|
||||||
|
throw new Error("should not detokenize through local GGUF");
|
||||||
|
});
|
||||||
|
|
||||||
|
setDefaultLlamaCpp({
|
||||||
|
usesLocalEmbedding: false,
|
||||||
|
tokenize,
|
||||||
|
detokenize,
|
||||||
|
} as any);
|
||||||
|
|
||||||
|
try {
|
||||||
|
const chunks = await chunkDocumentByTokens("x".repeat(1200), 100, 15, 20);
|
||||||
|
|
||||||
|
expect(chunks.length).toBeGreaterThan(1);
|
||||||
|
expect(chunks.every((chunk) => chunk.tokens <= 100)).toBe(true);
|
||||||
|
expect(chunks[0]!.text.length).toBeLessThanOrEqual(300);
|
||||||
|
expect(tokenize).not.toHaveBeenCalled();
|
||||||
|
expect(detokenize).not.toHaveBeenCalled();
|
||||||
|
} finally {
|
||||||
|
setDefaultLlamaCpp(null);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
test("chunkDocumentByTokens keeps pathological single-line blobs under the token limit", async () => {
|
test("chunkDocumentByTokens keeps pathological single-line blobs under the token limit", async () => {
|
||||||
setDefaultLlamaCpp({
|
setDefaultLlamaCpp({
|
||||||
async tokenize(text: string) {
|
async tokenize(text: string) {
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user