Merge pull request #273 from daocoding/feature/configurable-embed-model

feat: add QMD_EMBED_MODEL env var for multilingual embedding support
2026-03-07 14:28:59 -04:00 · 2026-03-07 14:28:59 -04:00 · 7904ab9a9d
commit 7904ab9a9d
parent cb5d84ff07 b71649b12d
3 changed files with 52 additions and 9 deletions
--- a/README.md
+++ b/README.md
@ -252,12 +252,34 @@ QMD uses three local GGUF models (auto-downloaded on first use):

 | Model | Purpose | Size |
 |-------|---------|------|
-| `embeddinggemma-300M-Q8_0` | Vector embeddings | ~300MB |
+| `embeddinggemma-300M-Q8_0` | Vector embeddings (default) | ~300MB |
 | `qwen3-reranker-0.6b-q8_0` | Re-ranking | ~640MB |
 | `qmd-query-expansion-1.7B-q4_k_m` | Query expansion (fine-tuned) | ~1.1GB |

 Models are downloaded from HuggingFace and cached in `~/.cache/qmd/models/`.

+### Custom Embedding Model
+
+Override the default embedding model via the `QMD_EMBED_MODEL` environment variable.
+This is useful for multilingual corpora (e.g. Chinese, Japanese, Korean) where
+`embeddinggemma-300M` has limited coverage.
+
+```sh
+# Use Qwen3-Embedding-0.6B for better multilingual (CJK) support
+export QMD_EMBED_MODEL="hf:Qwen/Qwen3-Embedding-0.6B-GGUF/qwen3-embedding-0.6b-q8_0.gguf"
+
+# After changing the model, re-embed all collections:
+qmd embed -f
+```
+
+Supported model families:
+- **embeddinggemma** (default) — English-optimized, small footprint
+- **Qwen3-Embedding** — Multilingual (119 languages including CJK), MTEB top-ranked
+
+> **Note:** After switching embedding models, you must re-embed every collection with
+> `qmd embed -f`, because vectors produced by different models are not comparable.
+> The prompt format is selected automatically for each model family.
+
 ## Installation

 ```sh
--- a/src/llm.ts
+++ b/src/llm.ts
@ -23,18 +23,37 @@ import { existsSync, mkdirSync, statSync, unlinkSync, readdirSync, readFileSync,
 // =============================================================================

 /**
- * Format a query for embedding.
- * Uses nomic-style task prefix format for embeddinggemma.
+ * Detect whether a model URI names a Qwen embedding model: a case-insensitive match
+ * on "qwen" and "embed" in either order. Such models need a different prompt style than nomic/embeddinggemma.
 */
-export function formatQueryForEmbedding(query: string): string {
+export function isQwen3EmbeddingModel(modelUri: string): boolean {
+  return /qwen.*embed/i.test(modelUri) || /embed.*qwen/i.test(modelUri);
+}
+
+/**
+ * Format a query for embedding.
+ * Uses nomic-style task prefix format for embeddinggemma (default).
+ * Uses Qwen3-Embedding instruct format when a Qwen embedding model is active.
+ */
+export function formatQueryForEmbedding(query: string, modelUri?: string): string {
+  const uri = modelUri ?? process.env.QMD_EMBED_MODEL ?? DEFAULT_EMBED_MODEL;
+  if (isQwen3EmbeddingModel(uri)) {
+    return `Instruct: Retrieve relevant documents for the given query\nQuery: ${query}`;
+  }
  return `task: search result | query: ${query}`;
 }

 /**
 * Format a document for embedding.
- * Uses nomic-style format with title and text fields.
+ * Uses nomic-style format with title and text fields (default).
+ * For Qwen embedding models the document is passed as raw text with no task prefix; a title, if given, goes on its own first line.
 */
-export function formatDocForEmbedding(text: string, title?: string): string {
+export function formatDocForEmbedding(text: string, title?: string, modelUri?: string): string {
+  const uri = modelUri ?? process.env.QMD_EMBED_MODEL ?? DEFAULT_EMBED_MODEL;
+  if (isQwen3EmbeddingModel(uri)) {
+    // Qwen3-Embedding: documents are raw text, no task prefix
+    return title ? `${title}\n${text}` : text;
+  }
  return `title: ${title || "none"} | text: ${text}`;
 }

@ -173,7 +192,8 @@ export type RerankDocument = {

 // HuggingFace model URIs for node-llama-cpp
 // Format: hf:<user>/<repo>/<file>
-const DEFAULT_EMBED_MODEL = "hf:ggml-org/embeddinggemma-300M-GGUF/embeddinggemma-300M-Q8_0.gguf";
+// Override via QMD_EMBED_MODEL env var (e.g. hf:Qwen/Qwen3-Embedding-0.6B-GGUF/qwen3-embedding-0.6b-q8_0.gguf)
+const DEFAULT_EMBED_MODEL = process.env.QMD_EMBED_MODEL ?? "hf:ggml-org/embeddinggemma-300M-GGUF/embeddinggemma-300M-Q8_0.gguf";
 const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
 // const DEFAULT_GENERATE_MODEL = "hf:ggml-org/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf";
 const DEFAULT_GENERATE_MODEL = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";
@ -1409,7 +1429,8 @@ let defaultLlamaCpp: LlamaCpp | null = null;
 */
 export function getDefaultLlamaCpp(): LlamaCpp {
  if (!defaultLlamaCpp) {
-    defaultLlamaCpp = new LlamaCpp();
+    const embedModel = process.env.QMD_EMBED_MODEL;
+    defaultLlamaCpp = new LlamaCpp(embedModel ? { embedModel } : {});
  }
  return defaultLlamaCpp;
 }
--- a/src/store.ts
+++ b/src/store.ts
@ -2255,7 +2255,7 @@ export async function searchVec(db: Database, query: string, model: string, limi

 async function getEmbedding(text: string, model: string, isQuery: boolean, session?: ILLMSession): Promise<number[] | null> {
  // Format text using the appropriate prompt template
-  const formattedText = isQuery ? formatQueryForEmbedding(text) : formatDocForEmbedding(text);
+  const formattedText = isQuery ? formatQueryForEmbedding(text, model) : formatDocForEmbedding(text, undefined, model);
  const result = session
    ? await session.embed(formattedText, { model, isQuery })
    : await getDefaultLlamaCpp().embed(formattedText, { model, isQuery });