feat: default to NVIDIA embeddings

This commit is contained in:
Haitao Pan 2026-05-09 16:50:04 +08:00
parent fbad5791e3
commit 7c17c8bcce
6 changed files with 100 additions and 30 deletions

View File

@ -483,25 +483,25 @@ The `query` command uses **Reciprocal Rank Fusion (RRF)** with position-aware bl
### Models
QMD uses `text-embedding-3-small` through an OpenAI-compatible `/embeddings` API for vector embeddings by default. Configure it with:
```sh
export QMD_EMBED_API_KEY="..."
# Optional for non-OpenAI-compatible gateways:
export QMD_EMBED_API_BASE_URL="https://api.openai.com/v1"
export QMD_EMBED_MODEL="text-embedding-3-small"
```
NVIDIA NIM's OpenAI-compatible endpoint can be used directly. QMD reads
`NVIDIA_API_KEY` when `QMD_EMBED_API_KEY` is not set and sends NVIDIA's required
`input_type` automatically (`passage` while indexing, `query` while searching):
QMD uses NVIDIA NIM's OpenAI-compatible `/embeddings` API for vector embeddings
by default. Configure it with:
```sh
export NVIDIA_API_KEY="..."
export QMD_EMBED_API_BASE_URL="https://integrate.api.nvidia.com/v1"
export QMD_EMBED_MODEL="nvidia/llama-3.2-nv-embedqa-1b-v2"
export QMD_DISABLE_LOCAL_MODELS=1
```
QMD reads `NVIDIA_API_KEY` when `QMD_EMBED_API_KEY` is not set and sends
NVIDIA's required `input_type` automatically (`passage` while indexing, `query`
while searching).
`QMD_DISABLE_LOCAL_MODELS=1` is recommended for deployments that must not load
local GGUF models. In that mode QMD rejects local embedding model URIs, skips
local query expansion, and defaults search reranking off while still using the
configured external embedding service for vector search.
Unless `QMD_DISABLE_LOCAL_MODELS` is set, reranking and query expansion still use local GGUF models via node-llama-cpp:
| Model | Purpose | Size |
@ -933,7 +933,7 @@ Query ──► LLM Expansion ──► [Original, Variant 1, Variant 2]
Models are configured in `src/llm.ts`:
```typescript
const DEFAULT_EMBED_MODEL = "text-embedding-3-small";
const DEFAULT_EMBED_MODEL = "nvidia/llama-3.2-nv-embedqa-1b-v2";
const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
const DEFAULT_GENERATE_MODEL = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";
```
@ -942,7 +942,7 @@ YAML configuration can override those defaults; see `example-index.yml` for a co
```yaml
models:
embed: text-embedding-3-small
embed: nvidia/llama-3.2-nv-embedqa-1b-v2
rerank: hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf
generate: hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf
```

View File

@ -16,9 +16,6 @@ done
DIR="$(cd -P "$(dirname "$SOURCE")/.." && pwd)"
# Detect the package manager that installed dependencies by checking lockfiles.
# $BUN_INSTALL is intentionally NOT checked — it only indicates that bun exists
# on the system, not that it was used to install this package (see #361).
#
# package-lock.json takes priority: if it exists, npm installed the native
# modules for Node. The repo ships bun.lock, so without this check, source
# builds that use npm would be incorrectly routed to bun, causing ABI
@ -27,6 +24,10 @@ if [ -f "$DIR/package-lock.json" ]; then
exec node "$DIR/dist/cli/qmd.js" "$@"
elif [ -f "$DIR/bun.lock" ] || [ -f "$DIR/bun.lockb" ]; then
exec bun "$DIR/dist/cli/qmd.js" "$@"
elif command -v bun >/dev/null 2>&1; then
exec bun "$DIR/dist/cli/qmd.js" "$@"
elif [ -x "$HOME/.bun/bin/bun" ]; then
exec "$HOME/.bun/bin/bun" "$DIR/dist/cli/qmd.js" "$@"
else
exec node "$DIR/dist/cli/qmd.js" "$@"
fi

View File

@ -191,9 +191,9 @@ export type RerankDocument = {
// Model Configuration
// =============================================================================
// Embeddings use an OpenAI-compatible API by default.
// Embeddings use NVIDIA's OpenAI-compatible API by default.
// Override QMD_EMBED_MODEL with hf:/path/.gguf to opt into local node-llama-cpp embeddings.
const DEFAULT_EMBED_MODEL = "text-embedding-3-small";
const DEFAULT_EMBED_MODEL = "nvidia/llama-3.2-nv-embedqa-1b-v2";
const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
// const DEFAULT_GENERATE_MODEL = "hf:ggml-org/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf";
const DEFAULT_GENERATE_MODEL = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";
@ -214,7 +214,11 @@ const MODEL_CACHE_DIR = process.env.XDG_CACHE_HOME
: join(homedir(), ".cache", "qmd", "models");
export const DEFAULT_MODEL_CACHE_DIR = MODEL_CACHE_DIR;
const DEFAULT_EMBED_API_BASE_URL = "https://api.openai.com/v1";
const DEFAULT_EMBED_API_BASE_URL = "https://integrate.api.nvidia.com/v1";
/**
 * Reports whether local models have been switched off through the
 * `QMD_DISABLE_LOCAL_MODELS` environment variable.
 *
 * The variable is read on every call (nothing is cached), so changes made to
 * `process.env` at runtime take effect immediately.
 *
 * Accepted "enabled" spellings are `1`, `true`, `yes` and `on`, matched
 * case-insensitively. Surrounding whitespace is ignored so that values coming
 * from `.env` files with trailing spaces or CRLF line endings (e.g. `"1\r"`)
 * still disable local models instead of silently failing open.
 *
 * @returns `true` when the flag is set to one of the accepted values;
 *          `false` when it is unset, empty, or any other value.
 */
export function localModelsDisabled(): boolean {
  const flag = (process.env.QMD_DISABLE_LOCAL_MODELS ?? "").trim();
  return /^(1|true|yes|on)$/i.test(flag);
}
function isLocalEmbeddingModel(model: string): boolean {
return model.startsWith("hf:") || model.endsWith(".gguf") || model.startsWith("/") || model.startsWith("./") || model.startsWith("../");
@ -999,7 +1003,7 @@ export class LlamaCpp implements LLM {
if (texts.length === 0) return [];
if (!this.embedApiKey) {
throw new Error(
"External embedding API key is required. Set QMD_EMBED_API_KEY or OPENAI_API_KEY. " +
"External embedding API key is required. Set NVIDIA_API_KEY, QMD_EMBED_API_KEY, or OPENAI_API_KEY. " +
"For local embeddings, set QMD_EMBED_MODEL to an hf: or .gguf model URI."
);
}
@ -1043,6 +1047,9 @@ export class LlamaCpp implements LLM {
async embed(text: string, options: EmbedOptions = {}): Promise<EmbeddingResult | null> {
const model = options.model ?? this.embedModelUri;
if (localModelsDisabled() && isLocalEmbeddingModel(model)) {
throw new Error("Local embedding models are disabled. Set QMD_EMBED_MODEL to an external API model.");
}
if (!isLocalEmbeddingModel(model)) {
const results = await this.embedExternal([text], model, options);
return results[0] ?? null;
@ -1078,6 +1085,9 @@ export class LlamaCpp implements LLM {
*/
async embedBatch(texts: string[], options: EmbedOptions = {}): Promise<(EmbeddingResult | null)[]> {
const model = options.model ?? this.embedModelUri;
if (localModelsDisabled() && isLocalEmbeddingModel(model)) {
throw new Error("Local embedding models are disabled. Set QMD_EMBED_MODEL to an external API model.");
}
if (!isLocalEmbeddingModel(model)) {
return this.embedExternal(texts, model, options);
}
@ -1209,6 +1219,7 @@ export class LlamaCpp implements LLM {
// ==========================================================================
async expandQuery(query: string, options: { context?: string, includeLexical?: boolean, intent?: string } = {}): Promise<Queryable[]> {
if (localModelsDisabled()) return [];
if (this._ciMode) throw new Error("LLM operations are disabled in CI (set CI=true)");
// Ping activity at start to keep models alive during this operation
this.touchActivity();
@ -1308,6 +1319,12 @@ export class LlamaCpp implements LLM {
documents: RerankDocument[],
options: RerankOptions = {}
): Promise<RerankResult> {
if (localModelsDisabled()) {
return {
model: "disabled",
results: documents.map((doc, index) => ({ file: doc.file, score: 0, index })),
};
}
if (this._ciMode) throw new Error("LLM operations are disabled in CI (set CI=true)");
// Ping activity at start to keep models alive during this operation
this.touchActivity();

View File

@ -25,6 +25,7 @@ import {
formatDocForEmbedding,
withLLMSessionForLlm,
DEFAULT_EMBED_MODEL_URI,
localModelsDisabled,
type RerankDocument,
type ILLMSession,
} from "./llm.js";
@ -4012,7 +4013,7 @@ export async function hybridQuery(
const collection = options?.collection;
const explain = options?.explain ?? false;
const intent = options?.intent;
const skipRerank = options?.skipRerank ?? false;
const skipRerank = options?.skipRerank ?? localModelsDisabled();
const hooks = options?.hooks;
const rankedLists: RankedResult[][] = [];
@ -4407,7 +4408,7 @@ export async function structuredSearch(
const candidateLimit = options?.candidateLimit ?? RERANK_CANDIDATE_LIMIT;
const explain = options?.explain ?? false;
const intent = options?.intent;
const skipRerank = options?.skipRerank ?? false;
const skipRerank = options?.skipRerank ?? localModelsDisabled();
const hooks = options?.hooks;
const collections = options?.collections;

View File

@ -20,10 +20,16 @@ fail() { printf " %-60s FAIL\n" "$1 (got: $2, expected: $3)"; FAIL=$((FAIL + 1)
# Instead of exec-ing a runtime, we echo which one would be chosen.
detect_runtime() {
local DIR="$1"
local BUN_AVAILABLE="${2:-1}"
local HOME_BUN_AVAILABLE="${3:-1}"
if [ -f "$DIR/package-lock.json" ]; then
echo "node"
elif [ -f "$DIR/bun.lock" ] || [ -f "$DIR/bun.lockb" ]; then
echo "bun"
elif [[ "$BUN_AVAILABLE" == "1" ]]; then
echo "bun"
elif [[ "$HOME_BUN_AVAILABLE" == "1" ]]; then
echo "home-bun"
else
echo "node"
fi
@ -45,10 +51,24 @@ echo "=== bin/qmd runtime detection tests ==="
# --- Test cases ---
# 1. No lockfiles → default to node
# 1. No lockfiles → default to bun when bun is available
d="$TMPDIR_BASE/no-lockfiles"
mkdir -p "$d"
assert_runtime "no lockfiles → node" "$d" "node"
assert_runtime "no lockfiles + bun available → bun" "$d" "bun"
got=$(detect_runtime "$d" 0 0)
if [[ "$got" == "node" ]]; then
ok "no lockfiles + no bun → node"
else
fail "no lockfiles + no bun → node" "$got" "node"
fi
got=$(detect_runtime "$d" 0 1)
if [[ "$got" == "home-bun" ]]; then
ok "no lockfiles + home bun → home-bun"
else
fail "no lockfiles + home bun → home-bun" "$got" "home-bun"
fi
# 2. Only bun.lock → bun
d="$TMPDIR_BASE/bun-lock-only"

View File

@ -150,7 +150,7 @@ describe("LlamaCpp expand context size config", () => {
});
describe("LlamaCpp model resolution (config > env > default)", () => {
const HARDCODED_EMBED = "text-embedding-3-small";
const HARDCODED_EMBED = "nvidia/llama-3.2-nv-embedqa-1b-v2";
const HARDCODED_RERANK = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
const HARDCODED_GENERATE = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";
@ -192,13 +192,17 @@ describe("LlamaCpp model resolution (config > env > default)", () => {
}
});
test("default embedding uses external OpenAI-compatible API", async () => {
test("default embedding uses NVIDIA OpenAI-compatible API", async () => {
const prevKey = process.env.QMD_EMBED_API_KEY;
const prevNvidiaKey = process.env.NVIDIA_API_KEY;
const prevBaseUrl = process.env.QMD_EMBED_API_BASE_URL;
process.env.QMD_EMBED_API_KEY = "test-key";
delete process.env.NVIDIA_API_KEY;
delete process.env.QMD_EMBED_API_BASE_URL;
const fetchMock = vi.spyOn(globalThis, "fetch").mockResolvedValue({
ok: true,
json: async () => ({
model: "text-embedding-3-small",
model: "nvidia/llama-3.2-nv-embedqa-1b-v2",
data: [{ index: 0, embedding: [0.1, 0.2, 0.3] }],
}),
} as Response);
@ -206,17 +210,27 @@ describe("LlamaCpp model resolution (config > env > default)", () => {
try {
const llm = new LlamaCpp({});
const result = await llm.embed("hello");
expect(fetchMock).toHaveBeenCalledWith("https://api.openai.com/v1/embeddings", expect.objectContaining({
expect(fetchMock).toHaveBeenCalledWith("https://integrate.api.nvidia.com/v1/embeddings", expect.objectContaining({
method: "POST",
}));
const [, init] = fetchMock.mock.calls[0]!;
expect(JSON.parse((init as RequestInit).body as string)).toEqual({
model: "nvidia/llama-3.2-nv-embedqa-1b-v2",
input: ["hello"],
input_type: "passage",
});
expect(result).toEqual({
embedding: [0.1, 0.2, 0.3],
model: "text-embedding-3-small",
model: "nvidia/llama-3.2-nv-embedqa-1b-v2",
});
} finally {
fetchMock.mockRestore();
if (prevKey === undefined) delete process.env.QMD_EMBED_API_KEY;
else process.env.QMD_EMBED_API_KEY = prevKey;
if (prevNvidiaKey === undefined) delete process.env.NVIDIA_API_KEY;
else process.env.NVIDIA_API_KEY = prevNvidiaKey;
if (prevBaseUrl === undefined) delete process.env.QMD_EMBED_API_BASE_URL;
else process.env.QMD_EMBED_API_BASE_URL = prevBaseUrl;
}
});
@ -259,6 +273,23 @@ describe("LlamaCpp model resolution (config > env > default)", () => {
const llm = new LlamaCpp({ embedModel: "hf:custom/embed.gguf" });
expect(llm.usesLocalEmbedding).toBe(true);
});
// Verifies the QMD_DISABLE_LOCAL_MODELS kill switch end to end on a LlamaCpp
// instance: local embedding is refused, while query expansion and reranking
// return inert results instead of running.
test("QMD_DISABLE_LOCAL_MODELS rejects local embedding models and bypasses local LLMs", async () => {
// Remember the caller's value so the environment can be restored afterwards.
const prev = process.env.QMD_DISABLE_LOCAL_MODELS;
process.env.QMD_DISABLE_LOCAL_MODELS = "1";
try {
// An hf: URI is classified as a local embedding model, so embed() must throw.
const llm = new LlamaCpp({ embedModel: "hf:custom/embed.gguf" });
await expect(llm.embed("hello")).rejects.toThrow("Local embedding models are disabled");
// Query expansion short-circuits to an empty list rather than throwing.
await expect(llm.expandQuery("hello")).resolves.toEqual([]);
// Reranking returns a placeholder: model "disabled", input order kept, score 0.
await expect(llm.rerank("hello", [{ file: "doc.md", text: "hello" }])).resolves.toEqual({
model: "disabled",
results: [{ file: "doc.md", score: 0, index: 0 }],
});
} finally {
// Restore the environment exactly (unset vs. previous value).
if (prev === undefined) delete process.env.QMD_DISABLE_LOCAL_MODELS;
else process.env.QMD_DISABLE_LOCAL_MODELS = prev;
}
});
});
describe("LlamaCpp embedding truncation", () => {