feat: default to NVIDIA embeddings
This commit is contained in:
parent
fbad5791e3
commit
7c17c8bcce
28
README.md
28
README.md
@ -483,25 +483,25 @@ The `query` command uses **Reciprocal Rank Fusion (RRF)** with position-aware bl
|
|||||||
|
|
||||||
### Models
|
### Models
|
||||||
|
|
||||||
QMD uses `text-embedding-3-small` through an OpenAI-compatible `/embeddings` API for vector embeddings by default. Configure it with:
|
QMD uses NVIDIA NIM's OpenAI-compatible `/embeddings` API for vector embeddings
|
||||||
|
by default. Configure it with:
|
||||||
```sh
|
|
||||||
export QMD_EMBED_API_KEY="..."
|
|
||||||
# Optional for non-OpenAI-compatible gateways:
|
|
||||||
export QMD_EMBED_API_BASE_URL="https://api.openai.com/v1"
|
|
||||||
export QMD_EMBED_MODEL="text-embedding-3-small"
|
|
||||||
```
|
|
||||||
|
|
||||||
NVIDIA NIM's OpenAI-compatible endpoint can be used directly. QMD reads
|
|
||||||
`NVIDIA_API_KEY` when `QMD_EMBED_API_KEY` is not set and sends NVIDIA's required
|
|
||||||
`input_type` automatically (`passage` while indexing, `query` while searching):
|
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
export NVIDIA_API_KEY="..."
|
export NVIDIA_API_KEY="..."
|
||||||
export QMD_EMBED_API_BASE_URL="https://integrate.api.nvidia.com/v1"
|
export QMD_EMBED_API_BASE_URL="https://integrate.api.nvidia.com/v1"
|
||||||
export QMD_EMBED_MODEL="nvidia/llama-3.2-nv-embedqa-1b-v2"
|
export QMD_EMBED_MODEL="nvidia/llama-3.2-nv-embedqa-1b-v2"
|
||||||
|
export QMD_DISABLE_LOCAL_MODELS=1
|
||||||
```
|
```
|
||||||
|
|
||||||
|
QMD reads `NVIDIA_API_KEY` when `QMD_EMBED_API_KEY` is not set and sends
|
||||||
|
NVIDIA's required `input_type` automatically (`passage` while indexing, `query`
|
||||||
|
while searching).
|
||||||
|
|
||||||
|
`QMD_DISABLE_LOCAL_MODELS=1` is recommended for deployments that must not load
|
||||||
|
local GGUF models. In that mode QMD rejects local embedding model URIs, skips
|
||||||
|
local query expansion, and defaults search reranking off while still using the
|
||||||
|
configured external embedding service for vector search.
|
||||||
|
|
||||||
Reranking and query expansion still use local GGUF models via node-llama-cpp:
|
Unless `QMD_DISABLE_LOCAL_MODELS=1` is set, reranking and query expansion still use local GGUF models via node-llama-cpp:
|
||||||
|
|
||||||
| Model | Purpose | Size |
|
| Model | Purpose | Size |
|
||||||
@ -933,7 +933,7 @@ Query ──► LLM Expansion ──► [Original, Variant 1, Variant 2]
|
|||||||
Models are configured in `src/llm.ts`:
|
Models are configured in `src/llm.ts`:
|
||||||
|
|
||||||
```typescript
|
```typescript
|
||||||
const DEFAULT_EMBED_MODEL = "text-embedding-3-small";
|
const DEFAULT_EMBED_MODEL = "nvidia/llama-3.2-nv-embedqa-1b-v2";
|
||||||
const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
|
const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
|
||||||
const DEFAULT_GENERATE_MODEL = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";
|
const DEFAULT_GENERATE_MODEL = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";
|
||||||
```
|
```
|
||||||
@ -942,7 +942,7 @@ YAML configuration can override those defaults; see `example-index.yml` for a co
|
|||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
models:
|
models:
|
||||||
embed: text-embedding-3-small
|
embed: nvidia/llama-3.2-nv-embedqa-1b-v2
|
||||||
rerank: hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf
|
rerank: hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf
|
||||||
generate: hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf
|
generate: hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf
|
||||||
```
|
```
|
||||||
|
|||||||
7
bin/qmd
7
bin/qmd
@ -16,9 +16,6 @@ done
|
|||||||
DIR="$(cd -P "$(dirname "$SOURCE")/.." && pwd)"
|
DIR="$(cd -P "$(dirname "$SOURCE")/.." && pwd)"
|
||||||
|
|
||||||
# Detect the package manager that installed dependencies by checking lockfiles.
|
# Detect the package manager that installed dependencies by checking lockfiles. If no lockfile is found, fall back to bun (on PATH, then $HOME/.bun/bin/bun) and finally to node.
|
||||||
# $BUN_INSTALL is intentionally NOT checked — it only indicates that bun exists
|
|
||||||
# on the system, not that it was used to install this package (see #361).
|
|
||||||
#
|
|
||||||
# package-lock.json takes priority: if it exists, npm installed the native
|
# package-lock.json takes priority: if it exists, npm installed the native
|
||||||
# modules for Node. The repo ships bun.lock, so without this check, source
|
# modules for Node. The repo ships bun.lock, so without this check, source
|
||||||
# builds that use npm would be incorrectly routed to bun, causing ABI
|
# builds that use npm would be incorrectly routed to bun, causing ABI
|
||||||
@ -27,6 +24,10 @@ if [ -f "$DIR/package-lock.json" ]; then
|
|||||||
exec node "$DIR/dist/cli/qmd.js" "$@"
|
exec node "$DIR/dist/cli/qmd.js" "$@"
|
||||||
elif [ -f "$DIR/bun.lock" ] || [ -f "$DIR/bun.lockb" ]; then
|
elif [ -f "$DIR/bun.lock" ] || [ -f "$DIR/bun.lockb" ]; then
|
||||||
exec bun "$DIR/dist/cli/qmd.js" "$@"
|
exec bun "$DIR/dist/cli/qmd.js" "$@"
|
||||||
|
elif command -v bun >/dev/null 2>&1; then
|
||||||
|
exec bun "$DIR/dist/cli/qmd.js" "$@"
|
||||||
|
elif [ -x "$HOME/.bun/bin/bun" ]; then
|
||||||
|
exec "$HOME/.bun/bin/bun" "$DIR/dist/cli/qmd.js" "$@"
|
||||||
else
|
else
|
||||||
exec node "$DIR/dist/cli/qmd.js" "$@"
|
exec node "$DIR/dist/cli/qmd.js" "$@"
|
||||||
fi
|
fi
|
||||||
|
|||||||
25
src/llm.ts
25
src/llm.ts
@ -191,9 +191,9 @@ export type RerankDocument = {
|
|||||||
// Model Configuration
|
// Model Configuration
|
||||||
// =============================================================================
|
// =============================================================================
|
||||||
|
|
||||||
// Embeddings use an OpenAI-compatible API by default.
|
// Embeddings use NVIDIA's OpenAI-compatible API by default.
|
||||||
// Override QMD_EMBED_MODEL with hf:/path/.gguf to opt into local node-llama-cpp embeddings.
|
// Override QMD_EMBED_MODEL with hf:/path/.gguf to opt into local node-llama-cpp embeddings.
|
||||||
const DEFAULT_EMBED_MODEL = "text-embedding-3-small";
|
const DEFAULT_EMBED_MODEL = "nvidia/llama-3.2-nv-embedqa-1b-v2";
|
||||||
const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
|
const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
|
||||||
// const DEFAULT_GENERATE_MODEL = "hf:ggml-org/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf";
|
// const DEFAULT_GENERATE_MODEL = "hf:ggml-org/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf";
|
||||||
const DEFAULT_GENERATE_MODEL = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";
|
const DEFAULT_GENERATE_MODEL = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";
|
||||||
@ -214,7 +214,11 @@ const MODEL_CACHE_DIR = process.env.XDG_CACHE_HOME
|
|||||||
: join(homedir(), ".cache", "qmd", "models");
|
: join(homedir(), ".cache", "qmd", "models");
|
||||||
export const DEFAULT_MODEL_CACHE_DIR = MODEL_CACHE_DIR;
|
export const DEFAULT_MODEL_CACHE_DIR = MODEL_CACHE_DIR;
|
||||||
|
|
||||||
const DEFAULT_EMBED_API_BASE_URL = "https://api.openai.com/v1";
|
const DEFAULT_EMBED_API_BASE_URL = "https://integrate.api.nvidia.com/v1";
|
||||||
|
|
||||||
|
/**
 * Reports whether local models have been switched off via the environment.
 *
 * Reads `process.env.QMD_DISABLE_LOCAL_MODELS` on every call (an unset variable
 * is treated as the empty string) and returns true only when the whole value is
 * `1`, `true`, `yes` or `on`, compared case-insensitively.
 *
 * @returns true when the variable holds one of the accepted values, else false.
 */
export function localModelsDisabled(): boolean {
|
||||||
|
return /^(1|true|yes|on)$/i.test(process.env.QMD_DISABLE_LOCAL_MODELS ?? "");
|
||||||
|
}
|
||||||
|
|
||||||
function isLocalEmbeddingModel(model: string): boolean {
|
function isLocalEmbeddingModel(model: string): boolean {
|
||||||
return model.startsWith("hf:") || model.endsWith(".gguf") || model.startsWith("/") || model.startsWith("./") || model.startsWith("../");
|
return model.startsWith("hf:") || model.endsWith(".gguf") || model.startsWith("/") || model.startsWith("./") || model.startsWith("../");
|
||||||
@ -999,7 +1003,7 @@ export class LlamaCpp implements LLM {
|
|||||||
if (texts.length === 0) return [];
|
if (texts.length === 0) return [];
|
||||||
if (!this.embedApiKey) {
|
if (!this.embedApiKey) {
|
||||||
throw new Error(
|
throw new Error(
|
||||||
"External embedding API key is required. Set QMD_EMBED_API_KEY or OPENAI_API_KEY. " +
|
"External embedding API key is required. Set NVIDIA_API_KEY, QMD_EMBED_API_KEY, or OPENAI_API_KEY. " +
|
||||||
"For local embeddings, set QMD_EMBED_MODEL to an hf: or .gguf model URI."
|
"For local embeddings, set QMD_EMBED_MODEL to an hf: or .gguf model URI."
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@ -1043,6 +1047,9 @@ export class LlamaCpp implements LLM {
|
|||||||
|
|
||||||
async embed(text: string, options: EmbedOptions = {}): Promise<EmbeddingResult | null> {
|
async embed(text: string, options: EmbedOptions = {}): Promise<EmbeddingResult | null> {
|
||||||
const model = options.model ?? this.embedModelUri;
|
const model = options.model ?? this.embedModelUri;
|
||||||
|
if (localModelsDisabled() && isLocalEmbeddingModel(model)) {
|
||||||
|
throw new Error("Local embedding models are disabled. Set QMD_EMBED_MODEL to an external API model.");
|
||||||
|
}
|
||||||
if (!isLocalEmbeddingModel(model)) {
|
if (!isLocalEmbeddingModel(model)) {
|
||||||
const results = await this.embedExternal([text], model, options);
|
const results = await this.embedExternal([text], model, options);
|
||||||
return results[0] ?? null;
|
return results[0] ?? null;
|
||||||
@ -1078,6 +1085,9 @@ export class LlamaCpp implements LLM {
|
|||||||
*/
|
*/
|
||||||
async embedBatch(texts: string[], options: EmbedOptions = {}): Promise<(EmbeddingResult | null)[]> {
|
async embedBatch(texts: string[], options: EmbedOptions = {}): Promise<(EmbeddingResult | null)[]> {
|
||||||
const model = options.model ?? this.embedModelUri;
|
const model = options.model ?? this.embedModelUri;
|
||||||
|
if (localModelsDisabled() && isLocalEmbeddingModel(model)) {
|
||||||
|
throw new Error("Local embedding models are disabled. Set QMD_EMBED_MODEL to an external API model.");
|
||||||
|
}
|
||||||
if (!isLocalEmbeddingModel(model)) {
|
if (!isLocalEmbeddingModel(model)) {
|
||||||
return this.embedExternal(texts, model, options);
|
return this.embedExternal(texts, model, options);
|
||||||
}
|
}
|
||||||
@ -1209,6 +1219,7 @@ export class LlamaCpp implements LLM {
|
|||||||
// ==========================================================================
|
// ==========================================================================
|
||||||
|
|
||||||
async expandQuery(query: string, options: { context?: string, includeLexical?: boolean, intent?: string } = {}): Promise<Queryable[]> {
|
async expandQuery(query: string, options: { context?: string, includeLexical?: boolean, intent?: string } = {}): Promise<Queryable[]> {
|
||||||
|
if (localModelsDisabled()) return [];
|
||||||
if (this._ciMode) throw new Error("LLM operations are disabled in CI (set CI=true)");
|
if (this._ciMode) throw new Error("LLM operations are disabled in CI (set CI=true)");
|
||||||
// Ping activity at start to keep models alive during this operation
|
// Ping activity at start to keep models alive during this operation
|
||||||
this.touchActivity();
|
this.touchActivity();
|
||||||
@ -1308,6 +1319,12 @@ export class LlamaCpp implements LLM {
|
|||||||
documents: RerankDocument[],
|
documents: RerankDocument[],
|
||||||
options: RerankOptions = {}
|
options: RerankOptions = {}
|
||||||
): Promise<RerankResult> {
|
): Promise<RerankResult> {
|
||||||
|
if (localModelsDisabled()) {
|
||||||
|
return {
|
||||||
|
model: "disabled",
|
||||||
|
results: documents.map((doc, index) => ({ file: doc.file, score: 0, index })),
|
||||||
|
};
|
||||||
|
}
|
||||||
if (this._ciMode) throw new Error("LLM operations are disabled in CI (set CI=true)");
|
if (this._ciMode) throw new Error("LLM operations are disabled in CI (set CI=true)");
|
||||||
// Ping activity at start to keep models alive during this operation
|
// Ping activity at start to keep models alive during this operation
|
||||||
this.touchActivity();
|
this.touchActivity();
|
||||||
|
|||||||
@ -25,6 +25,7 @@ import {
|
|||||||
formatDocForEmbedding,
|
formatDocForEmbedding,
|
||||||
withLLMSessionForLlm,
|
withLLMSessionForLlm,
|
||||||
DEFAULT_EMBED_MODEL_URI,
|
DEFAULT_EMBED_MODEL_URI,
|
||||||
|
localModelsDisabled,
|
||||||
type RerankDocument,
|
type RerankDocument,
|
||||||
type ILLMSession,
|
type ILLMSession,
|
||||||
} from "./llm.js";
|
} from "./llm.js";
|
||||||
@ -4012,7 +4013,7 @@ export async function hybridQuery(
|
|||||||
const collection = options?.collection;
|
const collection = options?.collection;
|
||||||
const explain = options?.explain ?? false;
|
const explain = options?.explain ?? false;
|
||||||
const intent = options?.intent;
|
const intent = options?.intent;
|
||||||
const skipRerank = options?.skipRerank ?? false;
|
const skipRerank = options?.skipRerank ?? localModelsDisabled();
|
||||||
const hooks = options?.hooks;
|
const hooks = options?.hooks;
|
||||||
|
|
||||||
const rankedLists: RankedResult[][] = [];
|
const rankedLists: RankedResult[][] = [];
|
||||||
@ -4407,7 +4408,7 @@ export async function structuredSearch(
|
|||||||
const candidateLimit = options?.candidateLimit ?? RERANK_CANDIDATE_LIMIT;
|
const candidateLimit = options?.candidateLimit ?? RERANK_CANDIDATE_LIMIT;
|
||||||
const explain = options?.explain ?? false;
|
const explain = options?.explain ?? false;
|
||||||
const intent = options?.intent;
|
const intent = options?.intent;
|
||||||
const skipRerank = options?.skipRerank ?? false;
|
const skipRerank = options?.skipRerank ?? localModelsDisabled();
|
||||||
const hooks = options?.hooks;
|
const hooks = options?.hooks;
|
||||||
|
|
||||||
const collections = options?.collections;
|
const collections = options?.collections;
|
||||||
|
|||||||
@ -20,10 +20,16 @@ fail() { printf " %-60s FAIL\n" "$1 (got: $2, expected: $3)"; FAIL=$((FAIL + 1)
|
|||||||
# Instead of exec-ing a runtime, we echo which one would be chosen.
|
# Instead of exec-ing a runtime, we echo which one would be chosen.
|
||||||
detect_runtime() {
|
detect_runtime() {
|
||||||
local DIR="$1"
|
local DIR="$1"
|
||||||
|
local BUN_AVAILABLE="${2:-1}"
|
||||||
|
local HOME_BUN_AVAILABLE="${3:-1}"
|
||||||
if [ -f "$DIR/package-lock.json" ]; then
|
if [ -f "$DIR/package-lock.json" ]; then
|
||||||
echo "node"
|
echo "node"
|
||||||
elif [ -f "$DIR/bun.lock" ] || [ -f "$DIR/bun.lockb" ]; then
|
elif [ -f "$DIR/bun.lock" ] || [ -f "$DIR/bun.lockb" ]; then
|
||||||
echo "bun"
|
echo "bun"
|
||||||
|
elif [[ "$BUN_AVAILABLE" == "1" ]]; then
|
||||||
|
echo "bun"
|
||||||
|
elif [[ "$HOME_BUN_AVAILABLE" == "1" ]]; then
|
||||||
|
echo "home-bun"
|
||||||
else
|
else
|
||||||
echo "node"
|
echo "node"
|
||||||
fi
|
fi
|
||||||
@ -45,10 +51,24 @@ echo "=== bin/qmd runtime detection tests ==="
|
|||||||
|
|
||||||
# --- Test cases ---
|
# --- Test cases ---
|
||||||
|
|
||||||
# 1. No lockfiles → default to node
|
# 1. No lockfiles → default to bun when bun is available
|
||||||
d="$TMPDIR_BASE/no-lockfiles"
|
d="$TMPDIR_BASE/no-lockfiles"
|
||||||
mkdir -p "$d"
|
mkdir -p "$d"
|
||||||
assert_runtime "no lockfiles → node" "$d" "node"
|
assert_runtime "no lockfiles + bun available → bun" "$d" "bun"
|
||||||
|
|
||||||
|
got=$(detect_runtime "$d" 0 0)
|
||||||
|
if [[ "$got" == "node" ]]; then
|
||||||
|
ok "no lockfiles + no bun → node"
|
||||||
|
else
|
||||||
|
fail "no lockfiles + no bun → node" "$got" "node"
|
||||||
|
fi
|
||||||
|
|
||||||
|
got=$(detect_runtime "$d" 0 1)
|
||||||
|
if [[ "$got" == "home-bun" ]]; then
|
||||||
|
ok "no lockfiles + home bun → home-bun"
|
||||||
|
else
|
||||||
|
fail "no lockfiles + home bun → home-bun" "$got" "home-bun"
|
||||||
|
fi
|
||||||
|
|
||||||
# 2. Only bun.lock → bun
|
# 2. Only bun.lock → bun
|
||||||
d="$TMPDIR_BASE/bun-lock-only"
|
d="$TMPDIR_BASE/bun-lock-only"
|
||||||
|
|||||||
@ -150,7 +150,7 @@ describe("LlamaCpp expand context size config", () => {
|
|||||||
});
|
});
|
||||||
|
|
||||||
describe("LlamaCpp model resolution (config > env > default)", () => {
|
describe("LlamaCpp model resolution (config > env > default)", () => {
|
||||||
const HARDCODED_EMBED = "text-embedding-3-small";
|
const HARDCODED_EMBED = "nvidia/llama-3.2-nv-embedqa-1b-v2";
|
||||||
const HARDCODED_RERANK = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
|
const HARDCODED_RERANK = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
|
||||||
const HARDCODED_GENERATE = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";
|
const HARDCODED_GENERATE = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";
|
||||||
|
|
||||||
@ -192,13 +192,17 @@ describe("LlamaCpp model resolution (config > env > default)", () => {
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
test("default embedding uses external OpenAI-compatible API", async () => {
|
test("default embedding uses NVIDIA OpenAI-compatible API", async () => {
|
||||||
const prevKey = process.env.QMD_EMBED_API_KEY;
|
const prevKey = process.env.QMD_EMBED_API_KEY;
|
||||||
|
const prevNvidiaKey = process.env.NVIDIA_API_KEY;
|
||||||
|
const prevBaseUrl = process.env.QMD_EMBED_API_BASE_URL;
|
||||||
process.env.QMD_EMBED_API_KEY = "test-key";
|
process.env.QMD_EMBED_API_KEY = "test-key";
|
||||||
|
delete process.env.NVIDIA_API_KEY;
|
||||||
|
delete process.env.QMD_EMBED_API_BASE_URL;
|
||||||
const fetchMock = vi.spyOn(globalThis, "fetch").mockResolvedValue({
|
const fetchMock = vi.spyOn(globalThis, "fetch").mockResolvedValue({
|
||||||
ok: true,
|
ok: true,
|
||||||
json: async () => ({
|
json: async () => ({
|
||||||
model: "text-embedding-3-small",
|
model: "nvidia/llama-3.2-nv-embedqa-1b-v2",
|
||||||
data: [{ index: 0, embedding: [0.1, 0.2, 0.3] }],
|
data: [{ index: 0, embedding: [0.1, 0.2, 0.3] }],
|
||||||
}),
|
}),
|
||||||
} as Response);
|
} as Response);
|
||||||
@ -206,17 +210,27 @@ describe("LlamaCpp model resolution (config > env > default)", () => {
|
|||||||
try {
|
try {
|
||||||
const llm = new LlamaCpp({});
|
const llm = new LlamaCpp({});
|
||||||
const result = await llm.embed("hello");
|
const result = await llm.embed("hello");
|
||||||
expect(fetchMock).toHaveBeenCalledWith("https://api.openai.com/v1/embeddings", expect.objectContaining({
|
expect(fetchMock).toHaveBeenCalledWith("https://integrate.api.nvidia.com/v1/embeddings", expect.objectContaining({
|
||||||
method: "POST",
|
method: "POST",
|
||||||
}));
|
}));
|
||||||
|
const [, init] = fetchMock.mock.calls[0]!;
|
||||||
|
expect(JSON.parse((init as RequestInit).body as string)).toEqual({
|
||||||
|
model: "nvidia/llama-3.2-nv-embedqa-1b-v2",
|
||||||
|
input: ["hello"],
|
||||||
|
input_type: "passage",
|
||||||
|
});
|
||||||
expect(result).toEqual({
|
expect(result).toEqual({
|
||||||
embedding: [0.1, 0.2, 0.3],
|
embedding: [0.1, 0.2, 0.3],
|
||||||
model: "text-embedding-3-small",
|
model: "nvidia/llama-3.2-nv-embedqa-1b-v2",
|
||||||
});
|
});
|
||||||
} finally {
|
} finally {
|
||||||
fetchMock.mockRestore();
|
fetchMock.mockRestore();
|
||||||
if (prevKey === undefined) delete process.env.QMD_EMBED_API_KEY;
|
if (prevKey === undefined) delete process.env.QMD_EMBED_API_KEY;
|
||||||
else process.env.QMD_EMBED_API_KEY = prevKey;
|
else process.env.QMD_EMBED_API_KEY = prevKey;
|
||||||
|
if (prevNvidiaKey === undefined) delete process.env.NVIDIA_API_KEY;
|
||||||
|
else process.env.NVIDIA_API_KEY = prevNvidiaKey;
|
||||||
|
if (prevBaseUrl === undefined) delete process.env.QMD_EMBED_API_BASE_URL;
|
||||||
|
else process.env.QMD_EMBED_API_BASE_URL = prevBaseUrl;
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -259,6 +273,23 @@ describe("LlamaCpp model resolution (config > env > default)", () => {
|
|||||||
const llm = new LlamaCpp({ embedModel: "hf:custom/embed.gguf" });
|
const llm = new LlamaCpp({ embedModel: "hf:custom/embed.gguf" });
|
||||||
expect(llm.usesLocalEmbedding).toBe(true);
|
expect(llm.usesLocalEmbedding).toBe(true);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test("QMD_DISABLE_LOCAL_MODELS rejects local embedding models and bypasses local LLMs", async () => {
|
||||||
|
const prev = process.env.QMD_DISABLE_LOCAL_MODELS;
|
||||||
|
process.env.QMD_DISABLE_LOCAL_MODELS = "1";
|
||||||
|
try {
|
||||||
|
const llm = new LlamaCpp({ embedModel: "hf:custom/embed.gguf" });
|
||||||
|
await expect(llm.embed("hello")).rejects.toThrow("Local embedding models are disabled");
|
||||||
|
await expect(llm.expandQuery("hello")).resolves.toEqual([]);
|
||||||
|
await expect(llm.rerank("hello", [{ file: "doc.md", text: "hello" }])).resolves.toEqual({
|
||||||
|
model: "disabled",
|
||||||
|
results: [{ file: "doc.md", score: 0, index: 0 }],
|
||||||
|
});
|
||||||
|
} finally {
|
||||||
|
if (prev === undefined) delete process.env.QMD_DISABLE_LOCAL_MODELS;
|
||||||
|
else process.env.QMD_DISABLE_LOCAL_MODELS = prev;
|
||||||
|
}
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
describe("LlamaCpp embedding truncation", () => {
|
describe("LlamaCpp embedding truncation", () => {
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user