diff --git a/CHANGELOG.md b/CHANGELOG.md index fedaa0f..2dd9ad2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,9 @@ ### Fixes +- Embedding: default to an external OpenAI-compatible embeddings API + (`text-embedding-3-small`) and require explicit `hf:`/`.gguf` + configuration to use local node-llama-cpp embedding models. - GPU: respect explicit `QMD_LLAMA_GPU=metal|vulkan|cuda` backend overrides instead of always using auto GPU selection. #529 - Fix: preserve original filename case in `handelize()`. The previous `.toLowerCase()` call made indexed paths unreachable on case-sensitive diff --git a/CLAUDE.md b/CLAUDE.md index dde8e7c..2f82bf5 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -135,7 +135,7 @@ bun test --preload ./src/test-preload.ts test/ - SQLite FTS5 for full-text search (BM25) - sqlite-vec for vector similarity search -- node-llama-cpp for embeddings (embeddinggemma), reranking (qwen3-reranker), and query expansion (Qwen3) +- External OpenAI-compatible API for default embeddings; node-llama-cpp for optional local embeddings, reranking (qwen3-reranker), and query expansion (Qwen3) - Reciprocal Rank Fusion (RRF) for combining results - Smart chunking: 900 tokens/chunk with 15% overlap, prefers markdown headings as boundaries - AST-aware chunking: use `--chunk-strategy auto` to chunk code files (.ts/.js/.py/.go/.rs) at function/class/import boundaries via tree-sitter. Default is `regex` (existing behavior). Markdown and unknown file types always use regex chunking. diff --git a/README.md b/README.md index 6f31844..d1c36bb 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ An on-device search engine for everything you need to remember. Index your markdown notes, meeting transcripts, documentation, and knowledge bases. Search with keywords or natural language. Ideal for your agentic flows. -QMD combines BM25 full-text search, vector semantic search, and LLM re-ranking—all running locally via node-llama-cpp with GGUF models. 
+QMD combines BM25 full-text search, vector semantic search, and LLM re-ranking. Embeddings use an external OpenAI-compatible API by default; local GGUF embedding models are optional. ![QMD Architecture](assets/qmd-architecture.png) @@ -481,26 +481,32 @@ The `query` command uses **Reciprocal Rank Fusion (RRF)** with position-aware bl brew install sqlite ``` -### GGUF Models (via node-llama-cpp) +### Models -QMD uses three local GGUF models (auto-downloaded on first use): +QMD uses `text-embedding-3-small` through an OpenAI-compatible `/embeddings` API for vector embeddings by default. Configure it with: + +```sh +export QMD_EMBED_API_KEY="..." +# Optional: override to use another OpenAI-compatible gateway or model (defaults shown): +export QMD_EMBED_API_BASE_URL="https://api.openai.com/v1" +export QMD_EMBED_MODEL="text-embedding-3-small" +``` + +Reranking and query expansion still use local GGUF models via node-llama-cpp: | Model | Purpose | Size | |-------|---------|------| -| `embeddinggemma-300M-Q8_0` | Vector embeddings (default) | ~300MB | | `qwen3-reranker-0.6b-q8_0` | Re-ranking | ~640MB | | `qmd-query-expansion-1.7B-q4_k_m` | Query expansion (fine-tuned) | ~1.1GB | Models are downloaded from HuggingFace and cached in `~/.cache/qmd/models/`. -### Custom Embedding Model +### Local Embedding Model -Override the default embedding model via the `QMD_EMBED_MODEL` environment variable. -This is useful for multilingual corpora (e.g. Chinese, Japanese, Korean) where -`embeddinggemma-300M` has limited coverage. +Set `QMD_EMBED_MODEL` to an `hf:` URI or `.gguf` path to opt into local node-llama-cpp embeddings. 
```sh -# Use Qwen3-Embedding-0.6B for better multilingual (CJK) support +# Use Qwen3-Embedding-0.6B locally export QMD_EMBED_MODEL="hf:Qwen/Qwen3-Embedding-0.6B-GGUF/Qwen3-Embedding-0.6B-Q8_0.gguf" # After changing the model, re-embed all collections: @@ -508,7 +514,8 @@ qmd embed -f ``` Supported model families: -- **embeddinggemma** (default) — English-optimized, small footprint +- **OpenAI-compatible embedding APIs** — default path +- **embeddinggemma** — optional local model, English-optimized, small footprint - **Qwen3-Embedding** — Multilingual (119 languages including CJK), MTEB top-ranked > **Note:** When switching embedding models, you must re-index with `qmd embed -f` @@ -820,8 +827,8 @@ Collection ──► Glob Pattern ──► Markdown Files ──► Parse Title Documents are chunked into ~900-token pieces with 15% overlap using smart boundary detection: ``` -Document ──► Smart Chunk (~900 tokens) ──► Format each chunk ──► node-llama-cpp ──► Store Vectors - │ "title | text" embedBatch() +Document ──► Smart Chunk (~900 tokens) ──► Format each chunk ──► Embedding API ──► Store Vectors + │ "title | text" /embeddings │ └─► Chunks stored with: - hash: document hash @@ -913,14 +920,23 @@ Query ──► LLM Expansion ──► [Original, Variant 1, Variant 2] ## Model Configuration -Models are configured in `src/llm.ts` as HuggingFace URIs: +Models are configured in `src/llm.ts`: ```typescript -const DEFAULT_EMBED_MODEL = "hf:ggml-org/embeddinggemma-300M-GGUF/embeddinggemma-300M-Q8_0.gguf"; +const DEFAULT_EMBED_MODEL = "text-embedding-3-small"; const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf"; const DEFAULT_GENERATE_MODEL = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf"; ``` +YAML configuration can override those defaults; see `example-index.yml` for a complete config file: + +```yaml +models: + embed: text-embedding-3-small + rerank: 
hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf + generate: hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf +``` + ### EmbeddingGemma Prompt Format ``` diff --git a/example-index.yml b/example-index.yml index a6d2d16..d9afe1a 100644 --- a/example-index.yml +++ b/example-index.yml @@ -8,6 +8,16 @@ # Use this for universal search instructions or patterns global_context: "If you see a relevant [[WikiWord]], you can search for that WikiWord to get more context." +# Model overrides. +# Embeddings use an external OpenAI-compatible /embeddings API by default. +# Set QMD_EMBED_API_KEY or OPENAI_API_KEY in the environment for API auth. +models: + embed: text-embedding-3-small + # Optional local embedding model instead of the external API: + # embed: hf:Qwen/Qwen3-Embedding-0.6B-GGUF/Qwen3-Embedding-0.6B-Q8_0.gguf + rerank: hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf + generate: hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf + # Collection definitions collections: # Meeting notes diff --git a/src/llm.ts b/src/llm.ts index 7cccc3f..bd70e80 100644 --- a/src/llm.ts +++ b/src/llm.ts @@ -1,7 +1,8 @@ /** - * llm.ts - LLM abstraction layer for QMD using node-llama-cpp + * llm.ts - LLM abstraction layer for QMD * - * Provides embeddings, text generation, and reranking using local GGUF models. + * Provides embeddings through an OpenAI-compatible API by default, with optional + * local GGUF embeddings plus local text generation and reranking via node-llama-cpp. */ import { @@ -32,7 +33,7 @@ export function isQwen3EmbeddingModel(modelUri: string): boolean { /** * Format a query for embedding. - * Uses nomic-style task prefix format for embeddinggemma (default). + * Uses generic search task prefix format for default external embedding models. * Uses Qwen3-Embedding instruct format when a Qwen embedding model is active. 
*/ export function formatQueryForEmbedding(query: string, modelUri?: string): string { @@ -190,10 +191,9 @@ export type RerankDocument = { // Model Configuration // ============================================================================= -// HuggingFace model URIs for node-llama-cpp -// Format: hf:// -// Override via QMD_EMBED_MODEL env var (e.g. hf:Qwen/Qwen3-Embedding-0.6B-GGUF/Qwen3-Embedding-0.6B-Q8_0.gguf) -const DEFAULT_EMBED_MODEL = "hf:ggml-org/embeddinggemma-300M-GGUF/embeddinggemma-300M-Q8_0.gguf"; +// Embeddings use an OpenAI-compatible API by default. +// Set QMD_EMBED_MODEL to an hf: URI, a .gguf file, or a filesystem path (/, ./, ../) to opt into local node-llama-cpp embeddings. +const DEFAULT_EMBED_MODEL = "text-embedding-3-small"; const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf"; // const DEFAULT_GENERATE_MODEL = "hf:ggml-org/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf"; const DEFAULT_GENERATE_MODEL = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf"; @@ -214,6 +214,12 @@ const MODEL_CACHE_DIR = process.env.XDG_CACHE_HOME : join(homedir(), ".cache", "qmd", "models"); export const DEFAULT_MODEL_CACHE_DIR = MODEL_CACHE_DIR; +const DEFAULT_EMBED_API_BASE_URL = "https://api.openai.com/v1"; + +function isLocalEmbeddingModel(model: string): boolean { + return model.startsWith("hf:") || model.endsWith(".gguf") || model.startsWith("/") || model.startsWith("./") || model.startsWith("../"); +} + export type PullResult = { model: string; path: string; @@ -406,6 +412,8 @@ export interface LLM { export type LlamaCppConfig = { embedModel?: string; + embedApiBaseUrl?: string; + embedApiKey?: string; generateModel?: string; rerankModel?: string; modelCacheDir?: string; @@ -481,6 +489,8 @@ export class LlamaCpp implements LLM { private rerankContexts: Awaited>[] = []; private embedModelUri: string; + private embedApiBaseUrl: string; + private embedApiKey?: string; private generateModelUri: string; private rerankModelUri: 
string; private modelCacheDir: string; @@ -502,6 +512,8 @@ export class LlamaCpp implements LLM { constructor(config: LlamaCppConfig = {}) { this.embedModelUri = config.embedModel || process.env.QMD_EMBED_MODEL || DEFAULT_EMBED_MODEL; + this.embedApiBaseUrl = (config.embedApiBaseUrl || process.env.QMD_EMBED_API_BASE_URL || process.env.OPENAI_BASE_URL || DEFAULT_EMBED_API_BASE_URL).replace(/\/+$/, ""); + this.embedApiKey = config.embedApiKey || process.env.QMD_EMBED_API_KEY || process.env.OPENAI_API_KEY; this.generateModelUri = config.generateModel || process.env.QMD_GENERATE_MODEL || DEFAULT_GENERATE_MODEL; this.rerankModelUri = config.rerankModel || process.env.QMD_RERANK_MODEL || DEFAULT_RERANK_MODEL; this.modelCacheDir = config.modelCacheDir || MODEL_CACHE_DIR; @@ -514,6 +526,10 @@ export class LlamaCpp implements LLM { return this.embedModelUri; } + get usesLocalEmbedding(): boolean { + return isLocalEmbeddingModel(this.embedModelUri); + } + /** * Reset the inactivity timer. Called after each model operation. * When timer fires, models are unloaded to free memory (if no active sessions). @@ -670,6 +686,9 @@ export class LlamaCpp implements LLM { * Load embedding model (lazy) */ private async ensureEmbedModel(): Promise { + if (!this.usesLocalEmbedding) { + throw new Error("Local embedding model requested while external embedding API is active"); + } if (this.embedModel) { return this.embedModel; } @@ -972,7 +991,55 @@ export class LlamaCpp implements LLM { return { text: truncatedText, truncated: true, limit: maxTokens }; } + private async embedExternal(texts: string[], model: string): Promise<(EmbeddingResult | null)[]> { + if (texts.length === 0) return []; + if (!this.embedApiKey) { + throw new Error( + "External embedding API key is required. Set QMD_EMBED_API_KEY or OPENAI_API_KEY. " + + "For local embeddings, set QMD_EMBED_MODEL to an hf: or .gguf model URI." 
+ ); + } + + const response = await fetch(`${this.embedApiBaseUrl}/embeddings`, { + method: "POST", + headers: { + "Authorization": `Bearer ${this.embedApiKey}`, + "Content-Type": "application/json", + }, + body: JSON.stringify({ model, input: texts }), + }); + + if (!response.ok) { + const body = await response.text().catch(() => ""); + throw new Error(`Embedding API request failed: ${response.status} ${response.statusText}${body ? `\n${body}` : ""}`); + } + + const payload = await response.json() as { + data?: { index?: number; embedding?: number[] }[]; + model?: string; + }; + const byIndex = new Map(); + const data = payload.data ?? []; + for (let i = 0; i < data.length; i++) { + const item = data[i]!; + if (Array.isArray(item.embedding)) { + byIndex.set(typeof item.index === "number" ? item.index : i, item.embedding); + } + } + + return texts.map((_, index) => { + const embedding = byIndex.get(index); + return embedding ? { embedding, model: payload.model ?? model } : null; + }); + } + async embed(text: string, options: EmbedOptions = {}): Promise { + const model = options.model ?? this.embedModelUri; + if (!isLocalEmbeddingModel(model)) { + const results = await this.embedExternal([text], model); + return results[0] ?? null; + } + // Ping activity at start to keep models alive during this operation this.touchActivity(); @@ -989,7 +1056,7 @@ export class LlamaCpp implements LLM { return { embedding: Array.from(embedding.vector), - model: options.model ?? this.embedModelUri, + model, }; } catch (error) { console.error("Embedding error:", error); @@ -1002,6 +1069,11 @@ export class LlamaCpp implements LLM { * Uses Promise.all for parallel embedding - node-llama-cpp handles batching internally */ async embedBatch(texts: string[], options: EmbedOptions = {}): Promise<(EmbeddingResult | null)[]> { + const model = options.model ?? 
this.embedModelUri; + if (!isLocalEmbeddingModel(model)) { + return this.embedExternal(texts, model); + } + if (this._ciMode) throw new Error("LLM operations are disabled in CI (set CI=true)"); // Ping activity at start to keep models alive during this operation this.touchActivity(); @@ -1024,7 +1096,7 @@ export class LlamaCpp implements LLM { } const embedding = await context.getEmbeddingFor(safeText); this.touchActivity(); - embeddings.push({ embedding: Array.from(embedding.vector), model: options.model ?? this.embedModelUri }); + embeddings.push({ embedding: Array.from(embedding.vector), model }); } catch (err) { console.error("Embedding error for text:", err); embeddings.push(null); @@ -1051,7 +1123,7 @@ export class LlamaCpp implements LLM { } const embedding = await ctx.getEmbeddingFor(safeText); this.touchActivity(); - results.push({ embedding: Array.from(embedding.vector), model: options.model ?? this.embedModelUri }); + results.push({ embedding: Array.from(embedding.vector), model }); } catch (err) { console.error("Embedding error for text:", err); results.push(null); diff --git a/src/store.ts b/src/store.ts index 16a55b7..5289917 100644 --- a/src/store.ts +++ b/src/store.ts @@ -24,6 +24,7 @@ import { formatQueryForEmbedding, formatDocForEmbedding, withLLMSessionForLlm, + DEFAULT_EMBED_MODEL_URI, type RerankDocument, type ILLMSession, } from "./llm.js"; @@ -39,7 +40,7 @@ import type { // ============================================================================= const HOME = process.env.HOME || process.env.USERPROFILE || "/tmp"; -export const DEFAULT_EMBED_MODEL = "embeddinggemma"; +export const DEFAULT_EMBED_MODEL = DEFAULT_EMBED_MODEL_URI; export const DEFAULT_RERANK_MODEL = "ExpedientFalcon/qwen3-reranker:0.6b-q8_0"; export const DEFAULT_QUERY_MODEL = "Qwen/Qwen3-1.7B"; export const DEFAULT_GLOB = "**/*.md"; diff --git a/test/llm.test.ts b/test/llm.test.ts index 74b6430..42bce0e 100644 --- a/test/llm.test.ts +++ b/test/llm.test.ts @@ -1,10 +1,9 @@ /** 
- * llm.test.ts - Unit tests for the LLM abstraction layer (node-llama-cpp) + * llm.test.ts - Unit tests for the LLM abstraction layer * * Run with: bun test src/llm.test.ts * - * These tests require the actual models to be downloaded. Run the embed or - * rerank functions first to trigger model downloads. + * Integration tests require the actual local GGUF models to be downloaded. */ import { describe, test, expect, beforeAll, afterAll, vi } from "vitest"; @@ -151,7 +150,7 @@ describe("LlamaCpp expand context size config", () => { }); describe("LlamaCpp model resolution (config > env > default)", () => { - const HARDCODED_EMBED = "hf:ggml-org/embeddinggemma-300M-GGUF/embeddinggemma-300M-Q8_0.gguf"; + const HARDCODED_EMBED = "text-embedding-3-small"; const HARDCODED_RERANK = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf"; const HARDCODED_GENERATE = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf"; @@ -192,11 +191,44 @@ describe("LlamaCpp model resolution (config > env > default)", () => { else process.env.QMD_EMBED_MODEL = prev; } }); + + test("default embedding uses external OpenAI-compatible API", async () => { + const prevKey = process.env.QMD_EMBED_API_KEY; + process.env.QMD_EMBED_API_KEY = "test-key"; + const fetchMock = vi.spyOn(globalThis, "fetch").mockResolvedValue({ + ok: true, + json: async () => ({ + model: "text-embedding-3-small", + data: [{ index: 0, embedding: [0.1, 0.2, 0.3] }], + }), + } as Response); + + try { + const llm = new LlamaCpp({}); + const result = await llm.embed("hello"); + expect(fetchMock).toHaveBeenCalledWith("https://api.openai.com/v1/embeddings", expect.objectContaining({ + method: "POST", + })); + expect(result).toEqual({ + embedding: [0.1, 0.2, 0.3], + model: "text-embedding-3-small", + }); + } finally { + fetchMock.mockRestore(); + if (prevKey === undefined) delete process.env.QMD_EMBED_API_KEY; + else process.env.QMD_EMBED_API_KEY = prevKey; + } + }); + + test("hf 
embedding model opts into local embedding", () => { + const llm = new LlamaCpp({ embedModel: "hf:custom/embed.gguf" }); + expect(llm.usesLocalEmbedding).toBe(true); + }); }); describe("LlamaCpp embedding truncation", () => { test("truncates against the active embedding context limit, not the model train context", async () => { - const llm = new LlamaCpp({}) as any; + const llm = new LlamaCpp({ embedModel: "hf:test/embed.gguf" }) as any; const getEmbeddingFor = vi.fn(async (text: string) => ({ vector: new Float32Array([0.25, 0.5]), text, @@ -283,11 +315,12 @@ describe("LlamaCpp.getDeviceInfo", () => { // ============================================================================= describe.skipIf(!!process.env.CI)("LlamaCpp Integration", () => { - // Use the singleton to avoid multiple Metal contexts - const llm = getDefaultLlamaCpp(); + const LOCAL_EMBED_MODEL = "hf:ggml-org/embeddinggemma-300M-GGUF/embeddinggemma-300M-Q8_0.gguf"; + const llm = new LlamaCpp({ embedModel: LOCAL_EMBED_MODEL }); afterAll(async () => { // Ensure native resources are released to avoid ggml-metal asserts on process exit. + await llm.dispose(); await disposeDefaultLlamaCpp(); }); @@ -406,7 +439,7 @@ describe.skipIf(!!process.env.CI)("LlamaCpp Integration", () => { // The fix uses a promise guard to ensure only one context creation runs at a time. // We verify this by instrumenting createEmbeddingContext to count invocations. - const freshLlm = new LlamaCpp({}); + const freshLlm = new LlamaCpp({ embedModel: LOCAL_EMBED_MODEL }); let contextCreateCount = 0; // Instrument the model's createEmbeddingContext to count calls