feat: default to NVIDIA embeddings

This commit is contained in:
Haitao Pan 2026-05-09 16:50:04 +08:00
parent fbad5791e3
commit 7c17c8bcce
6 changed files with 100 additions and 30 deletions

View File

@ -483,25 +483,25 @@ The `query` command uses **Reciprocal Rank Fusion (RRF)** with position-aware bl
### Models ### Models
QMD uses `text-embedding-3-small` through an OpenAI-compatible `/embeddings` API for vector embeddings by default. Configure it with: QMD uses NVIDIA NIM's OpenAI-compatible `/embeddings` API for vector embeddings
by default. Configure it with:
```sh
export QMD_EMBED_API_KEY="..."
# Optional for non-OpenAI-compatible gateways:
export QMD_EMBED_API_BASE_URL="https://api.openai.com/v1"
export QMD_EMBED_MODEL="text-embedding-3-small"
```
NVIDIA NIM's OpenAI-compatible endpoint can be used directly. QMD reads
`NVIDIA_API_KEY` when `QMD_EMBED_API_KEY` is not set and sends NVIDIA's required
`input_type` automatically (`passage` while indexing, `query` while searching):
```sh ```sh
export NVIDIA_API_KEY="..." export NVIDIA_API_KEY="..."
export QMD_EMBED_API_BASE_URL="https://integrate.api.nvidia.com/v1" export QMD_EMBED_API_BASE_URL="https://integrate.api.nvidia.com/v1"
export QMD_EMBED_MODEL="nvidia/llama-3.2-nv-embedqa-1b-v2" export QMD_EMBED_MODEL="nvidia/llama-3.2-nv-embedqa-1b-v2"
export QMD_DISABLE_LOCAL_MODELS=1
``` ```
QMD reads `NVIDIA_API_KEY` when `QMD_EMBED_API_KEY` is not set and sends
NVIDIA's required `input_type` automatically (`passage` while indexing, `query`
while searching).
`QMD_DISABLE_LOCAL_MODELS=1` is recommended for deployments that must not load
local GGUF models. In that mode QMD rejects local embedding model URIs, skips
local query expansion, and turns search reranking off by default, while still
using the configured external embedding service for vector search.
Reranking and query expansion still use local GGUF models via node-llama-cpp: Reranking and query expansion still use local GGUF models via node-llama-cpp:
| Model | Purpose | Size | | Model | Purpose | Size |
@ -933,7 +933,7 @@ Query ──► LLM Expansion ──► [Original, Variant 1, Variant 2]
Models are configured in `src/llm.ts`: Models are configured in `src/llm.ts`:
```typescript ```typescript
const DEFAULT_EMBED_MODEL = "text-embedding-3-small"; const DEFAULT_EMBED_MODEL = "nvidia/llama-3.2-nv-embedqa-1b-v2";
const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf"; const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
const DEFAULT_GENERATE_MODEL = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf"; const DEFAULT_GENERATE_MODEL = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";
``` ```
@ -942,7 +942,7 @@ YAML configuration can override those defaults; see `example-index.yml` for a co
```yaml ```yaml
models: models:
embed: text-embedding-3-small embed: nvidia/llama-3.2-nv-embedqa-1b-v2
rerank: hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf rerank: hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf
generate: hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf generate: hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf
``` ```

View File

@ -16,9 +16,6 @@ done
DIR="$(cd -P "$(dirname "$SOURCE")/.." && pwd)" DIR="$(cd -P "$(dirname "$SOURCE")/.." && pwd)"
# Detect the package manager that installed dependencies by checking lockfiles. # Detect the package manager that installed dependencies by checking lockfiles.
# $BUN_INSTALL is intentionally NOT checked — it only indicates that bun exists
# on the system, not that it was used to install this package (see #361).
#
# package-lock.json takes priority: if it exists, npm installed the native # package-lock.json takes priority: if it exists, npm installed the native
# modules for Node. The repo ships bun.lock, so without this check, source # modules for Node. The repo ships bun.lock, so without this check, source
# builds that use npm would be incorrectly routed to bun, causing ABI # builds that use npm would be incorrectly routed to bun, causing ABI
@ -27,6 +24,10 @@ if [ -f "$DIR/package-lock.json" ]; then
exec node "$DIR/dist/cli/qmd.js" "$@" exec node "$DIR/dist/cli/qmd.js" "$@"
elif [ -f "$DIR/bun.lock" ] || [ -f "$DIR/bun.lockb" ]; then elif [ -f "$DIR/bun.lock" ] || [ -f "$DIR/bun.lockb" ]; then
exec bun "$DIR/dist/cli/qmd.js" "$@" exec bun "$DIR/dist/cli/qmd.js" "$@"
elif command -v bun >/dev/null 2>&1; then
exec bun "$DIR/dist/cli/qmd.js" "$@"
elif [ -x "$HOME/.bun/bin/bun" ]; then
exec "$HOME/.bun/bin/bun" "$DIR/dist/cli/qmd.js" "$@"
else else
exec node "$DIR/dist/cli/qmd.js" "$@" exec node "$DIR/dist/cli/qmd.js" "$@"
fi fi

View File

@ -191,9 +191,9 @@ export type RerankDocument = {
// Model Configuration // Model Configuration
// ============================================================================= // =============================================================================
// Embeddings use an OpenAI-compatible API by default. // Embeddings use NVIDIA's OpenAI-compatible API by default.
// Override QMD_EMBED_MODEL with hf:/path/.gguf to opt into local node-llama-cpp embeddings. // Override QMD_EMBED_MODEL with hf:/path/.gguf to opt into local node-llama-cpp embeddings.
const DEFAULT_EMBED_MODEL = "text-embedding-3-small"; const DEFAULT_EMBED_MODEL = "nvidia/llama-3.2-nv-embedqa-1b-v2";
const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf"; const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
// const DEFAULT_GENERATE_MODEL = "hf:ggml-org/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf"; // const DEFAULT_GENERATE_MODEL = "hf:ggml-org/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf";
const DEFAULT_GENERATE_MODEL = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf"; const DEFAULT_GENERATE_MODEL = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";
@ -214,7 +214,11 @@ const MODEL_CACHE_DIR = process.env.XDG_CACHE_HOME
: join(homedir(), ".cache", "qmd", "models"); : join(homedir(), ".cache", "qmd", "models");
export const DEFAULT_MODEL_CACHE_DIR = MODEL_CACHE_DIR; export const DEFAULT_MODEL_CACHE_DIR = MODEL_CACHE_DIR;
const DEFAULT_EMBED_API_BASE_URL = "https://api.openai.com/v1"; const DEFAULT_EMBED_API_BASE_URL = "https://integrate.api.nvidia.com/v1";
/**
 * Reports whether local models have been switched off through the
 * `QMD_DISABLE_LOCAL_MODELS` environment variable.
 *
 * The variable is re-read on every call, so changes made to `process.env`
 * at runtime take effect immediately.
 *
 * Accepted "on" spellings are `1`, `true`, `yes` and `on`, matched
 * case-insensitively. Surrounding whitespace is ignored so that values such as
 * `"1 "` (a common result of `set VAR=1 && cmd` on Windows) or a value with a
 * trailing newline do not silently leave local models enabled.
 *
 * @returns `true` when the flag holds one of the accepted values; `false` when
 *          it is unset, empty, or anything else.
 */
export function localModelsDisabled(): boolean {
  const flag = (process.env.QMD_DISABLE_LOCAL_MODELS ?? "").trim();
  return /^(1|true|yes|on)$/i.test(flag);
}
function isLocalEmbeddingModel(model: string): boolean { function isLocalEmbeddingModel(model: string): boolean {
return model.startsWith("hf:") || model.endsWith(".gguf") || model.startsWith("/") || model.startsWith("./") || model.startsWith("../"); return model.startsWith("hf:") || model.endsWith(".gguf") || model.startsWith("/") || model.startsWith("./") || model.startsWith("../");
@ -999,7 +1003,7 @@ export class LlamaCpp implements LLM {
if (texts.length === 0) return []; if (texts.length === 0) return [];
if (!this.embedApiKey) { if (!this.embedApiKey) {
throw new Error( throw new Error(
"External embedding API key is required. Set QMD_EMBED_API_KEY or OPENAI_API_KEY. " + "External embedding API key is required. Set NVIDIA_API_KEY, QMD_EMBED_API_KEY, or OPENAI_API_KEY. " +
"For local embeddings, set QMD_EMBED_MODEL to an hf: or .gguf model URI." "For local embeddings, set QMD_EMBED_MODEL to an hf: or .gguf model URI."
); );
} }
@ -1043,6 +1047,9 @@ export class LlamaCpp implements LLM {
async embed(text: string, options: EmbedOptions = {}): Promise<EmbeddingResult | null> { async embed(text: string, options: EmbedOptions = {}): Promise<EmbeddingResult | null> {
const model = options.model ?? this.embedModelUri; const model = options.model ?? this.embedModelUri;
if (localModelsDisabled() && isLocalEmbeddingModel(model)) {
throw new Error("Local embedding models are disabled. Set QMD_EMBED_MODEL to an external API model.");
}
if (!isLocalEmbeddingModel(model)) { if (!isLocalEmbeddingModel(model)) {
const results = await this.embedExternal([text], model, options); const results = await this.embedExternal([text], model, options);
return results[0] ?? null; return results[0] ?? null;
@ -1078,6 +1085,9 @@ export class LlamaCpp implements LLM {
*/ */
async embedBatch(texts: string[], options: EmbedOptions = {}): Promise<(EmbeddingResult | null)[]> { async embedBatch(texts: string[], options: EmbedOptions = {}): Promise<(EmbeddingResult | null)[]> {
const model = options.model ?? this.embedModelUri; const model = options.model ?? this.embedModelUri;
if (localModelsDisabled() && isLocalEmbeddingModel(model)) {
throw new Error("Local embedding models are disabled. Set QMD_EMBED_MODEL to an external API model.");
}
if (!isLocalEmbeddingModel(model)) { if (!isLocalEmbeddingModel(model)) {
return this.embedExternal(texts, model, options); return this.embedExternal(texts, model, options);
} }
@ -1209,6 +1219,7 @@ export class LlamaCpp implements LLM {
// ========================================================================== // ==========================================================================
async expandQuery(query: string, options: { context?: string, includeLexical?: boolean, intent?: string } = {}): Promise<Queryable[]> { async expandQuery(query: string, options: { context?: string, includeLexical?: boolean, intent?: string } = {}): Promise<Queryable[]> {
if (localModelsDisabled()) return [];
if (this._ciMode) throw new Error("LLM operations are disabled in CI (set CI=true)"); if (this._ciMode) throw new Error("LLM operations are disabled in CI (set CI=true)");
// Ping activity at start to keep models alive during this operation // Ping activity at start to keep models alive during this operation
this.touchActivity(); this.touchActivity();
@ -1308,6 +1319,12 @@ export class LlamaCpp implements LLM {
documents: RerankDocument[], documents: RerankDocument[],
options: RerankOptions = {} options: RerankOptions = {}
): Promise<RerankResult> { ): Promise<RerankResult> {
if (localModelsDisabled()) {
return {
model: "disabled",
results: documents.map((doc, index) => ({ file: doc.file, score: 0, index })),
};
}
if (this._ciMode) throw new Error("LLM operations are disabled in CI (set CI=true)"); if (this._ciMode) throw new Error("LLM operations are disabled in CI (set CI=true)");
// Ping activity at start to keep models alive during this operation // Ping activity at start to keep models alive during this operation
this.touchActivity(); this.touchActivity();

View File

@ -25,6 +25,7 @@ import {
formatDocForEmbedding, formatDocForEmbedding,
withLLMSessionForLlm, withLLMSessionForLlm,
DEFAULT_EMBED_MODEL_URI, DEFAULT_EMBED_MODEL_URI,
localModelsDisabled,
type RerankDocument, type RerankDocument,
type ILLMSession, type ILLMSession,
} from "./llm.js"; } from "./llm.js";
@ -4012,7 +4013,7 @@ export async function hybridQuery(
const collection = options?.collection; const collection = options?.collection;
const explain = options?.explain ?? false; const explain = options?.explain ?? false;
const intent = options?.intent; const intent = options?.intent;
const skipRerank = options?.skipRerank ?? false; const skipRerank = options?.skipRerank ?? localModelsDisabled();
const hooks = options?.hooks; const hooks = options?.hooks;
const rankedLists: RankedResult[][] = []; const rankedLists: RankedResult[][] = [];
@ -4407,7 +4408,7 @@ export async function structuredSearch(
const candidateLimit = options?.candidateLimit ?? RERANK_CANDIDATE_LIMIT; const candidateLimit = options?.candidateLimit ?? RERANK_CANDIDATE_LIMIT;
const explain = options?.explain ?? false; const explain = options?.explain ?? false;
const intent = options?.intent; const intent = options?.intent;
const skipRerank = options?.skipRerank ?? false; const skipRerank = options?.skipRerank ?? localModelsDisabled();
const hooks = options?.hooks; const hooks = options?.hooks;
const collections = options?.collections; const collections = options?.collections;

View File

@ -20,10 +20,16 @@ fail() { printf " %-60s FAIL\n" "$1 (got: $2, expected: $3)"; FAIL=$((FAIL + 1)
# Instead of exec-ing a runtime, we echo which one would be chosen. # Instead of exec-ing a runtime, we echo which one would be chosen.
detect_runtime() { detect_runtime() {
local DIR="$1" local DIR="$1"
local BUN_AVAILABLE="${2:-1}"
local HOME_BUN_AVAILABLE="${3:-1}"
if [ -f "$DIR/package-lock.json" ]; then if [ -f "$DIR/package-lock.json" ]; then
echo "node" echo "node"
elif [ -f "$DIR/bun.lock" ] || [ -f "$DIR/bun.lockb" ]; then elif [ -f "$DIR/bun.lock" ] || [ -f "$DIR/bun.lockb" ]; then
echo "bun" echo "bun"
elif [[ "$BUN_AVAILABLE" == "1" ]]; then
echo "bun"
elif [[ "$HOME_BUN_AVAILABLE" == "1" ]]; then
echo "home-bun"
else else
echo "node" echo "node"
fi fi
@ -45,10 +51,24 @@ echo "=== bin/qmd runtime detection tests ==="
# --- Test cases --- # --- Test cases ---
# 1. No lockfiles → default to node # 1. No lockfiles → default to bun when bun is available
d="$TMPDIR_BASE/no-lockfiles" d="$TMPDIR_BASE/no-lockfiles"
mkdir -p "$d" mkdir -p "$d"
assert_runtime "no lockfiles → node" "$d" "node" assert_runtime "no lockfiles + bun available → bun" "$d" "bun"
got=$(detect_runtime "$d" 0 0)
if [[ "$got" == "node" ]]; then
ok "no lockfiles + no bun → node"
else
fail "no lockfiles + no bun → node" "$got" "node"
fi
got=$(detect_runtime "$d" 0 1)
if [[ "$got" == "home-bun" ]]; then
ok "no lockfiles + home bun → home-bun"
else
fail "no lockfiles + home bun → home-bun" "$got" "home-bun"
fi
# 2. Only bun.lock → bun # 2. Only bun.lock → bun
d="$TMPDIR_BASE/bun-lock-only" d="$TMPDIR_BASE/bun-lock-only"

View File

@ -150,7 +150,7 @@ describe("LlamaCpp expand context size config", () => {
}); });
describe("LlamaCpp model resolution (config > env > default)", () => { describe("LlamaCpp model resolution (config > env > default)", () => {
const HARDCODED_EMBED = "text-embedding-3-small"; const HARDCODED_EMBED = "nvidia/llama-3.2-nv-embedqa-1b-v2";
const HARDCODED_RERANK = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf"; const HARDCODED_RERANK = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
const HARDCODED_GENERATE = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf"; const HARDCODED_GENERATE = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";
@ -192,13 +192,17 @@ describe("LlamaCpp model resolution (config > env > default)", () => {
} }
}); });
test("default embedding uses external OpenAI-compatible API", async () => { test("default embedding uses NVIDIA OpenAI-compatible API", async () => {
const prevKey = process.env.QMD_EMBED_API_KEY; const prevKey = process.env.QMD_EMBED_API_KEY;
const prevNvidiaKey = process.env.NVIDIA_API_KEY;
const prevBaseUrl = process.env.QMD_EMBED_API_BASE_URL;
process.env.QMD_EMBED_API_KEY = "test-key"; process.env.QMD_EMBED_API_KEY = "test-key";
delete process.env.NVIDIA_API_KEY;
delete process.env.QMD_EMBED_API_BASE_URL;
const fetchMock = vi.spyOn(globalThis, "fetch").mockResolvedValue({ const fetchMock = vi.spyOn(globalThis, "fetch").mockResolvedValue({
ok: true, ok: true,
json: async () => ({ json: async () => ({
model: "text-embedding-3-small", model: "nvidia/llama-3.2-nv-embedqa-1b-v2",
data: [{ index: 0, embedding: [0.1, 0.2, 0.3] }], data: [{ index: 0, embedding: [0.1, 0.2, 0.3] }],
}), }),
} as Response); } as Response);
@ -206,17 +210,27 @@ describe("LlamaCpp model resolution (config > env > default)", () => {
try { try {
const llm = new LlamaCpp({}); const llm = new LlamaCpp({});
const result = await llm.embed("hello"); const result = await llm.embed("hello");
expect(fetchMock).toHaveBeenCalledWith("https://api.openai.com/v1/embeddings", expect.objectContaining({ expect(fetchMock).toHaveBeenCalledWith("https://integrate.api.nvidia.com/v1/embeddings", expect.objectContaining({
method: "POST", method: "POST",
})); }));
const [, init] = fetchMock.mock.calls[0]!;
expect(JSON.parse((init as RequestInit).body as string)).toEqual({
model: "nvidia/llama-3.2-nv-embedqa-1b-v2",
input: ["hello"],
input_type: "passage",
});
expect(result).toEqual({ expect(result).toEqual({
embedding: [0.1, 0.2, 0.3], embedding: [0.1, 0.2, 0.3],
model: "text-embedding-3-small", model: "nvidia/llama-3.2-nv-embedqa-1b-v2",
}); });
} finally { } finally {
fetchMock.mockRestore(); fetchMock.mockRestore();
if (prevKey === undefined) delete process.env.QMD_EMBED_API_KEY; if (prevKey === undefined) delete process.env.QMD_EMBED_API_KEY;
else process.env.QMD_EMBED_API_KEY = prevKey; else process.env.QMD_EMBED_API_KEY = prevKey;
if (prevNvidiaKey === undefined) delete process.env.NVIDIA_API_KEY;
else process.env.NVIDIA_API_KEY = prevNvidiaKey;
if (prevBaseUrl === undefined) delete process.env.QMD_EMBED_API_BASE_URL;
else process.env.QMD_EMBED_API_BASE_URL = prevBaseUrl;
} }
}); });
@ -259,6 +273,23 @@ describe("LlamaCpp model resolution (config > env > default)", () => {
const llm = new LlamaCpp({ embedModel: "hf:custom/embed.gguf" }); const llm = new LlamaCpp({ embedModel: "hf:custom/embed.gguf" });
expect(llm.usesLocalEmbedding).toBe(true); expect(llm.usesLocalEmbedding).toBe(true);
}); });
// With QMD_DISABLE_LOCAL_MODELS set, embedding through a local model URI must
// be refused, while query expansion and reranking resolve to inert results.
test("QMD_DISABLE_LOCAL_MODELS rejects local embedding models and bypasses local LLMs", async () => {
  const savedFlag = process.env.QMD_DISABLE_LOCAL_MODELS;
  process.env.QMD_DISABLE_LOCAL_MODELS = "1";
  try {
    const llm = new LlamaCpp({ embedModel: "hf:custom/embed.gguf" });
    const docs = [{ file: "doc.md", text: "hello" }];
    const inertRerank = {
      model: "disabled",
      results: [{ file: "doc.md", score: 0, index: 0 }],
    };

    await expect(llm.embed("hello")).rejects.toThrow("Local embedding models are disabled");
    await expect(llm.expandQuery("hello")).resolves.toEqual([]);
    await expect(llm.rerank("hello", docs)).resolves.toEqual(inertRerank);
  } finally {
    // Put the environment back exactly as it was before the test ran.
    if (savedFlag !== undefined) {
      process.env.QMD_DISABLE_LOCAL_MODELS = savedFlag;
    } else {
      delete process.env.QMD_DISABLE_LOCAL_MODELS;
    }
  }
});
}); });
describe("LlamaCpp embedding truncation", () => { describe("LlamaCpp embedding truncation", () => {