diff --git a/CHANGELOG.md b/CHANGELOG.md
index fedaa0f..2dd9ad2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,9 @@
 
 ### Fixes
 
+- Embedding: default to an external OpenAI-compatible embeddings API
+  (`text-embedding-3-small`) and require explicit `hf:`/`.gguf`
+  configuration to use local node-llama-cpp embedding models.
 - GPU: respect explicit `QMD_LLAMA_GPU=metal|vulkan|cuda` backend overrides instead of always using auto GPU selection. #529
 - Fix: preserve original filename case in `handelize()`. The previous
   `.toLowerCase()` call made indexed paths unreachable on case-sensitive
diff --git a/CLAUDE.md b/CLAUDE.md
index dde8e7c..2f82bf5 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -135,7 +135,7 @@ bun test --preload ./src/test-preload.ts test/
 
 - SQLite FTS5 for full-text search (BM25)
 - sqlite-vec for vector similarity search
-- node-llama-cpp for embeddings (embeddinggemma), reranking (qwen3-reranker), and query expansion (Qwen3)
+- External OpenAI-compatible API for default embeddings; node-llama-cpp for optional local embeddings, reranking (qwen3-reranker), and query expansion (Qwen3)
 - Reciprocal Rank Fusion (RRF) for combining results
 - Smart chunking: 900 tokens/chunk with 15% overlap, prefers markdown headings as boundaries
 - AST-aware chunking: use `--chunk-strategy auto` to chunk code files (.ts/.js/.py/.go/.rs) at function/class/import boundaries via tree-sitter. Default is `regex` (existing behavior). Markdown and unknown file types always use regex chunking.
diff --git a/README.md b/README.md
index 6f31844..d1c36bb 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 An on-device search engine for everything you need to remember. Index your markdown notes, meeting transcripts, documentation, and knowledge bases. Search with keywords or natural language. Ideal for your agentic flows.
 
-QMD combines BM25 full-text search, vector semantic search, and LLM re-ranking—all running locally via node-llama-cpp with GGUF models.
+QMD combines BM25 full-text search, vector semantic search, and LLM re-ranking. Embeddings use an external OpenAI-compatible API by default; local GGUF embedding models are optional.
 
 ![QMD Architecture](assets/qmd-architecture.png)
 
@@ -481,26 +481,32 @@ The `query` command uses **Reciprocal Rank Fusion (RRF)** with position-aware bl
   brew install sqlite
   ```
 
-### GGUF Models (via node-llama-cpp)
+### Models
 
-QMD uses three local GGUF models (auto-downloaded on first use):
+QMD uses `text-embedding-3-small` through an OpenAI-compatible `/embeddings` API for vector embeddings by default. Configure it with:
+
+```sh
+export QMD_EMBED_API_KEY="..."
+# Optional: point at another OpenAI-compatible gateway or model:
+export QMD_EMBED_API_BASE_URL="https://api.openai.com/v1"
+export QMD_EMBED_MODEL="text-embedding-3-small"
+```
+
+Reranking and query expansion still use local GGUF models via node-llama-cpp:
 
 | Model | Purpose | Size |
 |-------|---------|------|
-| `embeddinggemma-300M-Q8_0` | Vector embeddings (default) | ~300MB |
 | `qwen3-reranker-0.6b-q8_0` | Re-ranking | ~640MB |
 | `qmd-query-expansion-1.7B-q4_k_m` | Query expansion (fine-tuned) | ~1.1GB |
 
 Models are downloaded from HuggingFace and cached in `~/.cache/qmd/models/`.
 
-### Custom Embedding Model
+### Local Embedding Model
 
-Override the default embedding model via the `QMD_EMBED_MODEL` environment variable.
-This is useful for multilingual corpora (e.g. Chinese, Japanese, Korean) where
-`embeddinggemma-300M` has limited coverage.
+Set `QMD_EMBED_MODEL` to an `hf:` URI or `.gguf` path to opt into local node-llama-cpp embeddings.
 
 ```sh
-# Use Qwen3-Embedding-0.6B for better multilingual (CJK) support
+# Use Qwen3-Embedding-0.6B locally
 export QMD_EMBED_MODEL="hf:Qwen/Qwen3-Embedding-0.6B-GGUF/Qwen3-Embedding-0.6B-Q8_0.gguf"
 
 # After changing the model, re-embed all collections:
@@ -508,7 +514,8 @@ qmd embed -f
 ```
 
 Supported model families:
-- **embeddinggemma** (default) — English-optimized, small footprint
+- **OpenAI-compatible embedding APIs** — default path
+- **embeddinggemma** — optional local model, English-optimized, small footprint
 - **Qwen3-Embedding** — Multilingual (119 languages including CJK), MTEB top-ranked
 
 > **Note:** When switching embedding models, you must re-index with `qmd embed -f`
@@ -820,8 +827,8 @@ Collection ──► Glob Pattern ──► Markdown Files ──► Parse Title
 Documents are chunked into ~900-token pieces with 15% overlap using smart boundary detection:
 
 ```
-Document ──► Smart Chunk (~900 tokens) ──► Format each chunk ──► node-llama-cpp ──► Store Vectors
-                │                           "title | text"        embedBatch()
+Document ──► Smart Chunk (~900 tokens) ──► Format each chunk ──► Embedding API ──► Store Vectors
+                │                           "title | text"        /embeddings
                 │
                 └─► Chunks stored with:
                     - hash: document hash
@@ -913,14 +920,23 @@ Query ──► LLM Expansion ──► [Original, Variant 1, Variant 2]
 
 ## Model Configuration
 
-Models are configured in `src/llm.ts` as HuggingFace URIs:
+Models are configured in `src/llm.ts`:
 
 ```typescript
-const DEFAULT_EMBED_MODEL = "hf:ggml-org/embeddinggemma-300M-GGUF/embeddinggemma-300M-Q8_0.gguf";
+const DEFAULT_EMBED_MODEL = "text-embedding-3-small";
 const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
 const DEFAULT_GENERATE_MODEL = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";
 ```
 
+YAML configuration can override those defaults; see `example-index.yml` for a complete config file:
+
+```yaml
+models:
+  embed: text-embedding-3-small
+  rerank: hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf
+  generate: hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf
+```
+
 ### EmbeddingGemma Prompt Format
 
 ```
diff --git a/example-index.yml b/example-index.yml
index a6d2d16..d9afe1a 100644
--- a/example-index.yml
+++ b/example-index.yml
@@ -8,6 +8,16 @@
 # Use this for universal search instructions or patterns
 global_context: "If you see a relevant [[WikiWord]], you can search for that WikiWord to get more context."
 
+# Model overrides.
+# Embeddings use an external OpenAI-compatible /embeddings API by default.
+# Set QMD_EMBED_API_KEY or OPENAI_API_KEY in the environment for API auth.
+models:
+  embed: text-embedding-3-small
+  # Optional local embedding model instead of the external API:
+  # embed: hf:Qwen/Qwen3-Embedding-0.6B-GGUF/Qwen3-Embedding-0.6B-Q8_0.gguf
+  rerank: hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf
+  generate: hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf
+
 # Collection definitions
 collections:
   # Meeting notes
diff --git a/src/llm.ts b/src/llm.ts
index 7cccc3f..bd70e80 100644
--- a/src/llm.ts
+++ b/src/llm.ts
@@ -1,7 +1,8 @@
 /**
- * llm.ts - LLM abstraction layer for QMD using node-llama-cpp
+ * llm.ts - LLM abstraction layer for QMD
  *
- * Provides embeddings, text generation, and reranking using local GGUF models.
+ * Provides embeddings through an OpenAI-compatible API by default, with optional
+ * local GGUF embeddings plus local text generation and reranking via node-llama-cpp.
  */
 
 import {
@@ -32,7 +33,7 @@ export function isQwen3EmbeddingModel(modelUri: string): boolean {
 
 /**
  * Format a query for embedding.
- * Uses nomic-style task prefix format for embeddinggemma (default).
+ * Uses a generic search-task prefix format for the default external embedding model.
  * Uses Qwen3-Embedding instruct format when a Qwen embedding model is active.
  */
 export function formatQueryForEmbedding(query: string, modelUri?: string): string {
@@ -190,10 +191,9 @@ export type RerankDocument = {
 // Model Configuration
 // =============================================================================
 
-// HuggingFace model URIs for node-llama-cpp
-// Format: hf:<user>/<repo>/<file>
-// Override via QMD_EMBED_MODEL env var (e.g. hf:Qwen/Qwen3-Embedding-0.6B-GGUF/Qwen3-Embedding-0.6B-Q8_0.gguf)
-const DEFAULT_EMBED_MODEL = "hf:ggml-org/embeddinggemma-300M-GGUF/embeddinggemma-300M-Q8_0.gguf";
+// Embeddings use an OpenAI-compatible API by default.
+// Set QMD_EMBED_MODEL to an hf: URI, a .gguf file, or a filesystem path (/, ./, ../) to opt into local node-llama-cpp embeddings.
+const DEFAULT_EMBED_MODEL = "text-embedding-3-small";
 const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
 // const DEFAULT_GENERATE_MODEL = "hf:ggml-org/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf";
 const DEFAULT_GENERATE_MODEL = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";
@@ -214,6 +214,12 @@ const MODEL_CACHE_DIR = process.env.XDG_CACHE_HOME
   : join(homedir(), ".cache", "qmd", "models");
 export const DEFAULT_MODEL_CACHE_DIR = MODEL_CACHE_DIR;
 
+const DEFAULT_EMBED_API_BASE_URL = "https://api.openai.com/v1";
+
+function isLocalEmbeddingModel(model: string): boolean { // Decides whether an embedding model id is served locally (true) or by the external embeddings API (false).
+  return model.startsWith("hf:") || model.endsWith(".gguf") || model.startsWith("/") || model.startsWith("./") || model.startsWith("../"); // local = hf: URI, .gguf suffix, or a /, ./, ../ path; matching is case-sensitive and POSIX-style only
+}
+
 export type PullResult = {
   model: string;
   path: string;
@@ -406,6 +412,8 @@ export interface LLM {
 
 export type LlamaCppConfig = {
   embedModel?: string;
+  embedApiBaseUrl?: string;
+  embedApiKey?: string;
   generateModel?: string;
   rerankModel?: string;
   modelCacheDir?: string;
@@ -481,6 +489,8 @@ export class LlamaCpp implements LLM {
   private rerankContexts: Awaited<ReturnType<LlamaModel["createRankingContext"]>>[] = [];
 
   private embedModelUri: string;
+  private embedApiBaseUrl: string;
+  private embedApiKey?: string;
   private generateModelUri: string;
   private rerankModelUri: string;
   private modelCacheDir: string;
@@ -502,6 +512,8 @@ export class LlamaCpp implements LLM {
 
   constructor(config: LlamaCppConfig = {}) {
     this.embedModelUri = config.embedModel || process.env.QMD_EMBED_MODEL || DEFAULT_EMBED_MODEL;
+    this.embedApiBaseUrl = (config.embedApiBaseUrl || process.env.QMD_EMBED_API_BASE_URL || process.env.OPENAI_BASE_URL || DEFAULT_EMBED_API_BASE_URL).replace(/\/+$/, "");
+    this.embedApiKey = config.embedApiKey || process.env.QMD_EMBED_API_KEY || process.env.OPENAI_API_KEY;
     this.generateModelUri = config.generateModel || process.env.QMD_GENERATE_MODEL || DEFAULT_GENERATE_MODEL;
     this.rerankModelUri = config.rerankModel || process.env.QMD_RERANK_MODEL || DEFAULT_RERANK_MODEL;
     this.modelCacheDir = config.modelCacheDir || MODEL_CACHE_DIR;
@@ -514,6 +526,10 @@ export class LlamaCpp implements LLM {
     return this.embedModelUri;
   }
 
+  get usesLocalEmbedding(): boolean { // True when the configured embedding model is classified as local rather than external.
+    return isLocalEmbeddingModel(this.embedModelUri); // looks at the instance-level model only, not at any per-call model override
+  }
+
   /**
    * Reset the inactivity timer. Called after each model operation.
    * When timer fires, models are unloaded to free memory (if no active sessions).
@@ -670,6 +686,9 @@ export class LlamaCpp implements LLM {
    * Load embedding model (lazy)
    */
   private async ensureEmbedModel(): Promise<LlamaModel> {
+    if (!this.usesLocalEmbedding) {
+      throw new Error("Local embedding model requested while external embedding API is active");
+    }
     if (this.embedModel) {
       return this.embedModel;
     }
@@ -972,7 +991,55 @@ export class LlamaCpp implements LLM {
     return { text: truncatedText, truncated: true, limit: maxTokens };
   }
 
+  private async embedExternal(texts: string[], model: string): Promise<(EmbeddingResult | null)[]> { // Embeds texts via one POST to <embedApiBaseUrl>/embeddings; result[i] belongs to texts[i], or is null when the response has no vector for it. Throws on a missing API key or a non-OK HTTP status.
+    if (texts.length === 0) return []; // nothing to embed: return before the key check and the network call
+    if (!this.embedApiKey) {
+      throw new Error(
+        "External embedding API key is required. Set QMD_EMBED_API_KEY or OPENAI_API_KEY. " +
+        "For local embeddings, set QMD_EMBED_MODEL to an hf: or .gguf model URI."
+      );
+    }
+
+    const response = await fetch(`${this.embedApiBaseUrl}/embeddings`, { // the whole batch is sent in a single request
+      method: "POST",
+      headers: {
+        "Authorization": `Bearer ${this.embedApiKey}`,
+        "Content-Type": "application/json",
+      },
+      body: JSON.stringify({ model, input: texts }),
+    });
+
+    if (!response.ok) { // non-OK status: surface status line plus the response body when it can be read
+      const body = await response.text().catch(() => ""); // best-effort read so a body failure does not mask the HTTP error
+      throw new Error(`Embedding API request failed: ${response.status} ${response.statusText}${body ? `\n${body}` : ""}`);
+    }
+
+    const payload = await response.json() as { // unchecked cast: the payload shape is assumed here, not validated
+      data?: { index?: number; embedding?: number[] }[];
+      model?: string;
+    };
+    const byIndex = new Map<number, number[]>(); // input position -> embedding vector
+    const data = payload.data ?? []; // a missing data array yields an all-null result instead of an error
+    for (let i = 0; i < data.length; i++) {
+      const item = data[i]!;
+      if (Array.isArray(item.embedding)) { // entries without an embedding array are ignored
+        byIndex.set(typeof item.index === "number" ? item.index : i, item.embedding); // fall back to array position when the item carries no numeric index
+      }
+    }
+
+    return texts.map((_, index) => {
+      const embedding = byIndex.get(index);
+      return embedding ? { embedding, model: payload.model ?? model } : null; // prefer the model name reported in the response over the requested one
+    });
+  }
+
   async embed(text: string, options: EmbedOptions = {}): Promise<EmbeddingResult | null> {
+    const model = options.model ?? this.embedModelUri;
+    if (!isLocalEmbeddingModel(model)) {
+      const results = await this.embedExternal([text], model);
+      return results[0] ?? null;
+    }
+
     // Ping activity at start to keep models alive during this operation
     this.touchActivity();
 
@@ -989,7 +1056,7 @@ export class LlamaCpp implements LLM {
 
       return {
         embedding: Array.from(embedding.vector),
-        model: options.model ?? this.embedModelUri,
+        model,
       };
     } catch (error) {
       console.error("Embedding error:", error);
@@ -1002,6 +1069,11 @@ export class LlamaCpp implements LLM {
    * Uses Promise.all for parallel embedding - node-llama-cpp handles batching internally
    */
   async embedBatch(texts: string[], options: EmbedOptions = {}): Promise<(EmbeddingResult | null)[]> {
+    const model = options.model ?? this.embedModelUri;
+    if (!isLocalEmbeddingModel(model)) {
+      return this.embedExternal(texts, model);
+    }
+
     if (this._ciMode) throw new Error("LLM operations are disabled in CI (set CI=true)");
     // Ping activity at start to keep models alive during this operation
     this.touchActivity();
@@ -1024,7 +1096,7 @@ export class LlamaCpp implements LLM {
             }
             const embedding = await context.getEmbeddingFor(safeText);
             this.touchActivity();
-            embeddings.push({ embedding: Array.from(embedding.vector), model: options.model ?? this.embedModelUri });
+            embeddings.push({ embedding: Array.from(embedding.vector), model });
           } catch (err) {
             console.error("Embedding error for text:", err);
             embeddings.push(null);
@@ -1051,7 +1123,7 @@ export class LlamaCpp implements LLM {
               }
               const embedding = await ctx.getEmbeddingFor(safeText);
               this.touchActivity();
-              results.push({ embedding: Array.from(embedding.vector), model: options.model ?? this.embedModelUri });
+              results.push({ embedding: Array.from(embedding.vector), model });
             } catch (err) {
               console.error("Embedding error for text:", err);
               results.push(null);
diff --git a/src/store.ts b/src/store.ts
index 16a55b7..5289917 100644
--- a/src/store.ts
+++ b/src/store.ts
@@ -24,6 +24,7 @@ import {
   formatQueryForEmbedding,
   formatDocForEmbedding,
   withLLMSessionForLlm,
+  DEFAULT_EMBED_MODEL_URI,
   type RerankDocument,
   type ILLMSession,
 } from "./llm.js";
@@ -39,7 +40,7 @@ import type {
 // =============================================================================
 
 const HOME = process.env.HOME || process.env.USERPROFILE || "/tmp";
-export const DEFAULT_EMBED_MODEL = "embeddinggemma";
+export const DEFAULT_EMBED_MODEL = DEFAULT_EMBED_MODEL_URI;
 export const DEFAULT_RERANK_MODEL = "ExpedientFalcon/qwen3-reranker:0.6b-q8_0";
 export const DEFAULT_QUERY_MODEL = "Qwen/Qwen3-1.7B";
 export const DEFAULT_GLOB = "**/*.md";
diff --git a/test/llm.test.ts b/test/llm.test.ts
index 74b6430..42bce0e 100644
--- a/test/llm.test.ts
+++ b/test/llm.test.ts
@@ -1,10 +1,9 @@
 /**
- * llm.test.ts - Unit tests for the LLM abstraction layer (node-llama-cpp)
+ * llm.test.ts - Unit tests for the LLM abstraction layer
  *
  * Run with: bun test src/llm.test.ts
  *
- * These tests require the actual models to be downloaded. Run the embed or
- * rerank functions first to trigger model downloads.
+ * Integration tests require the actual local GGUF models to be downloaded.
  */
 
 import { describe, test, expect, beforeAll, afterAll, vi } from "vitest";
@@ -151,7 +150,7 @@ describe("LlamaCpp expand context size config", () => {
 });
 
 describe("LlamaCpp model resolution (config > env > default)", () => {
-  const HARDCODED_EMBED = "hf:ggml-org/embeddinggemma-300M-GGUF/embeddinggemma-300M-Q8_0.gguf";
+  const HARDCODED_EMBED = "text-embedding-3-small";
   const HARDCODED_RERANK = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
   const HARDCODED_GENERATE = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";
 
@@ -192,11 +191,44 @@ describe("LlamaCpp model resolution (config > env > default)", () => {
       else process.env.QMD_EMBED_MODEL = prev;
     }
   });
+
+  test("default embedding uses external OpenAI-compatible API", async () => { // With no embedModel configured, embed() should POST to the default endpoint and return the mocked vector.
+    const prevKey = process.env.QMD_EMBED_API_KEY;
+    process.env.QMD_EMBED_API_KEY = "test-key"; // the external path throws when no API key is configured
+    const fetchMock = vi.spyOn(globalThis, "fetch").mockResolvedValue({ // minimal Response stub: only ok and json() are provided
+      ok: true,
+      json: async () => ({
+        model: "text-embedding-3-small",
+        data: [{ index: 0, embedding: [0.1, 0.2, 0.3] }],
+      }),
+    } as Response);
+
+    try {
+      const llm = new LlamaCpp({}); // NOTE(review): the constructor also reads QMD_EMBED_MODEL, QMD_EMBED_API_BASE_URL and OPENAI_BASE_URL; if any is set the assertions below fail - confirm the test env leaves them unset
+      const result = await llm.embed("hello");
+      expect(fetchMock).toHaveBeenCalledWith("https://api.openai.com/v1/embeddings", expect.objectContaining({
+        method: "POST",
+      }));
+      expect(result).toEqual({
+        embedding: [0.1, 0.2, 0.3],
+        model: "text-embedding-3-small",
+      });
+    } finally { // always undo the fetch spy and the env mutation, even when an assertion throws
+      fetchMock.mockRestore();
+      if (prevKey === undefined) delete process.env.QMD_EMBED_API_KEY;
+      else process.env.QMD_EMBED_API_KEY = prevKey;
+    }
+  });
+
+  test("hf embedding model opts into local embedding", () => { // An hf: model URI passed via config must flip usesLocalEmbedding to true.
+    const llm = new LlamaCpp({ embedModel: "hf:custom/embed.gguf" });
+    expect(llm.usesLocalEmbedding).toBe(true);
+  });
 });
 
 describe("LlamaCpp embedding truncation", () => {
   test("truncates against the active embedding context limit, not the model train context", async () => {
-    const llm = new LlamaCpp({}) as any;
+    const llm = new LlamaCpp({ embedModel: "hf:test/embed.gguf" }) as any;
     const getEmbeddingFor = vi.fn(async (text: string) => ({
       vector: new Float32Array([0.25, 0.5]),
       text,
@@ -283,11 +315,12 @@ describe("LlamaCpp.getDeviceInfo", () => {
 // =============================================================================
 
 describe.skipIf(!!process.env.CI)("LlamaCpp Integration", () => {
-  // Use the singleton to avoid multiple Metal contexts
-  const llm = getDefaultLlamaCpp();
+  const LOCAL_EMBED_MODEL = "hf:ggml-org/embeddinggemma-300M-GGUF/embeddinggemma-300M-Q8_0.gguf";
+  const llm = new LlamaCpp({ embedModel: LOCAL_EMBED_MODEL });
 
   afterAll(async () => {
     // Ensure native resources are released to avoid ggml-metal asserts on process exit.
+    await llm.dispose();
     await disposeDefaultLlamaCpp();
   });
 
@@ -406,7 +439,7 @@ describe.skipIf(!!process.env.CI)("LlamaCpp Integration", () => {
       // The fix uses a promise guard to ensure only one context creation runs at a time.
       // We verify this by instrumenting createEmbeddingContext to count invocations.
       
-      const freshLlm = new LlamaCpp({});
+      const freshLlm = new LlamaCpp({ embedModel: LOCAL_EMBED_MODEL });
       let contextCreateCount = 0;
       
       // Instrument the model's createEmbeddingContext to count calls