diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2dd9ad2..1558305 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,8 +5,11 @@
 ### Fixes
 
 - Embedding: default to an external OpenAI-compatible embeddings API
-  (`text-embedding-3-small`) and require explicit `hf:`/`.gguf`
-  configuration to use local node-llama-cpp embedding models.
+  (`nvidia/llama-3.2-nv-embedqa-1b-v2`) and require
+  `QMD_ENABLE_LOCAL_MODELS=1` for local node-llama-cpp embedding, reranking,
+  and query expansion models.
+- Embedding: use approximate token counts in external embedding mode so
+  chunking does not load a local GGUF tokenizer.
 - GPU: respect explicit `QMD_LLAMA_GPU=metal|vulkan|cuda` backend overrides instead of always using auto GPU selection. #529
 - Fix: preserve original filename case in `handelize()`. The previous
   `.toLowerCase()` call made indexed paths unreachable on case-sensitive
diff --git a/README.md b/README.md
index 74efadd..a51d8e9 100644
--- a/README.md
+++ b/README.md
@@ -490,19 +490,20 @@ by default. Configure it with:
 export NVIDIA_API_KEY="..."
 export QMD_EMBED_API_BASE_URL="https://integrate.api.nvidia.com/v1"
 export QMD_EMBED_MODEL="nvidia/llama-3.2-nv-embedqa-1b-v2"
-export QMD_DISABLE_LOCAL_MODELS=1
 ```
 
 QMD reads `NVIDIA_API_KEY` when `QMD_EMBED_API_KEY` is not set and sends
 NVIDIA's required `input_type` automatically (`passage` while indexing, `query`
 while searching).
 
-`QMD_DISABLE_LOCAL_MODELS=1` is recommended for deployments that must not load
-local GGUF models. In that mode QMD rejects local embedding model URIs, skips
-local query expansion, and defaults search reranking off while still using the
-configured external embedding service for vector search.
+Local GGUF models are disabled by default. In the default mode QMD rejects local
+embedding model URIs, skips local query expansion, and ranks search results by
+RRF scores only (no reranking), while still using the configured external
+embedding service for vector search.
 
-Reranking and query expansion still use local GGUF models via node-llama-cpp:
+Set `QMD_ENABLE_LOCAL_MODELS=1` to opt into local GGUF model loading. The first
+query expansion or reranking call may then download and load the configured
+local model, which can take a while. The local models are:
 
 | Model | Purpose | Size |
 |-------|---------|------|
@@ -513,10 +514,12 @@ Models are downloaded from HuggingFace and cached in `~/.cache/qmd/models/`.
 
 ### Local Embedding Model
 
-Set `QMD_EMBED_MODEL` to an `hf:` URI or `.gguf` path to opt into local node-llama-cpp embeddings.
+Set `QMD_ENABLE_LOCAL_MODELS=1` and point `QMD_EMBED_MODEL` at an `hf:` URI or
+`.gguf` path to opt into local node-llama-cpp embeddings.
 
 ```sh
 # Use Qwen3-Embedding-0.6B locally
+export QMD_ENABLE_LOCAL_MODELS=1
 export QMD_EMBED_MODEL="hf:Qwen/Qwen3-Embedding-0.6B-GGUF/Qwen3-Embedding-0.6B-Q8_0.gguf"
 
 # After changing the model, re-embed all collections:
@@ -943,8 +946,9 @@ YAML configuration can override those defaults; see `example-index.yml` for a co
 ```yaml
 models:
   embed: nvidia/llama-3.2-nv-embedqa-1b-v2
-  rerank: hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf
-  generate: hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf
+  # Optional local models, used only when QMD_ENABLE_LOCAL_MODELS=1:
+  # rerank: hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf
+  # generate: hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf
 ```
 
 ### EmbeddingGemma Prompt Format
diff --git a/example-index.yml b/example-index.yml
index d9afe1a..62f8279 100644
--- a/example-index.yml
+++ b/example-index.yml
@@ -9,14 +9,16 @@
 global_context: "If you see a relevant [[WikiWord]], you can search for that WikiWord to get more context."
 
 # Model overrides.
-# Embeddings use an external OpenAI-compatible /embeddings API by default.
-# Set QMD_EMBED_API_KEY or OPENAI_API_KEY in the environment for API auth.
+# Embeddings use NVIDIA's OpenAI-compatible /embeddings API by default.
+# Set NVIDIA_API_KEY, QMD_EMBED_API_KEY, or OPENAI_API_KEY in the environment for API auth.
+# Local GGUF models are disabled unless QMD_ENABLE_LOCAL_MODELS=1 is set.
 models:
-  embed: text-embedding-3-small
+  embed: nvidia/llama-3.2-nv-embedqa-1b-v2
   # Optional local embedding model instead of the external API:
   # embed: hf:Qwen/Qwen3-Embedding-0.6B-GGUF/Qwen3-Embedding-0.6B-Q8_0.gguf
-  rerank: hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf
-  generate: hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf
+  # Optional local rerank/generation models (used only when QMD_ENABLE_LOCAL_MODELS=1):
+  # rerank: hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf
+  # generate: hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf
 
 # Collection definitions
 collections:
diff --git a/src/cli/qmd.ts b/src/cli/qmd.ts
index bfcd392..3ee10aa 100755
--- a/src/cli/qmd.ts
+++ b/src/cli/qmd.ts
@@ -463,8 +463,8 @@ async function showStatus(): Promise<void> {
     };
     console.log(`\n${c.bold}Models${c.reset}`);
     console.log(`  Embedding:   ${hfLink(DEFAULT_EMBED_MODEL_URI)}`);
-    console.log(`  Reranking:   ${hfLink(DEFAULT_RERANK_MODEL_URI)}`);
-    console.log(`  Generation:  ${hfLink(DEFAULT_GENERATE_MODEL_URI)}`);
+    console.log(`  Reranking:   ${hfLink(DEFAULT_RERANK_MODEL_URI)} ${c.dim}(optional; set QMD_ENABLE_LOCAL_MODELS=1)${c.reset}`);
+    console.log(`  Generation:  ${hfLink(DEFAULT_GENERATE_MODEL_URI)} ${c.dim}(optional; set QMD_ENABLE_LOCAL_MODELS=1)${c.reset}`);
   }
 
   // Device / GPU info
@@ -3125,11 +3125,13 @@ if (isMain) {
 
     case "pull": {
       const refresh = cli.values.refresh === undefined ? false : Boolean(cli.values.refresh);
+      const isLocalModelUri = (uri: string) =>
+        uri.startsWith("hf:") || uri.endsWith(".gguf") || uri.startsWith("/") || uri.startsWith("./") || uri.startsWith("../");
       const models = [
         DEFAULT_EMBED_MODEL_URI,
         DEFAULT_GENERATE_MODEL_URI,
         DEFAULT_RERANK_MODEL_URI,
-      ];
+      ].filter(isLocalModelUri);
       console.log(`${c.bold}Pulling models${c.reset}`);
       const results = await pullModels(models, {
         refresh,
diff --git a/src/llm.ts b/src/llm.ts
index cb8c6a2..e8be977 100644
--- a/src/llm.ts
+++ b/src/llm.ts
@@ -192,7 +192,7 @@ export type RerankDocument = {
 // =============================================================================
 
 // Embeddings use NVIDIA's OpenAI-compatible API by default.
-// Override QMD_EMBED_MODEL with hf:/path/.gguf to opt into local node-llama-cpp embeddings.
+// Set QMD_ENABLE_LOCAL_MODELS=1 before using any local node-llama-cpp GGUF models.
 const DEFAULT_EMBED_MODEL = "nvidia/llama-3.2-nv-embedqa-1b-v2";
 const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
 // const DEFAULT_GENERATE_MODEL = "hf:ggml-org/Qwen3-0.6B-GGUF/Qwen3-0.6B-Q8_0.gguf";
@@ -216,14 +216,24 @@ export const DEFAULT_MODEL_CACHE_DIR = MODEL_CACHE_DIR;
 
 const DEFAULT_EMBED_API_BASE_URL = "https://integrate.api.nvidia.com/v1";
 
-export function localModelsDisabled(): boolean {
-  return /^(1|true|yes|on)$/i.test(process.env.QMD_DISABLE_LOCAL_MODELS ?? "");
+export function localModelsEnabled(): boolean {
+  return /^(1|true|yes|on)$/i.test(process.env.QMD_ENABLE_LOCAL_MODELS ?? "");
 }
 
 function isLocalEmbeddingModel(model: string): boolean {
   return model.startsWith("hf:") || model.endsWith(".gguf") || model.startsWith("/") || model.startsWith("./") || model.startsWith("../");
 }
 
+export function approximateTokenCount(text: string): number {
+  if (text.length === 0) return 0;
+  return Math.max(1, Math.ceil(text.length / 3));
+}
+
+export function truncateByApproxTokens(text: string, maxTokens: number): string {
+  if (maxTokens <= 0) return "";
+  return text.slice(0, Math.max(1, maxTokens * 3));
+}
+
 export type PullResult = {
   model: string;
   path: string;
@@ -929,10 +939,17 @@ export class LlamaCpp implements LLM {
   // ==========================================================================
 
   /**
-   * Tokenize text using the embedding model's tokenizer
-   * Returns tokenizer tokens (opaque type from node-llama-cpp)
+   * Tokenize text using the embedding model's tokenizer when local embeddings
+   * are active. External embedding mode must not load a local tokenizer, so it
+   * returns placeholder tokens (array indices), roughly one per 3 characters.
    */
   async tokenize(text: string): Promise<readonly LlamaToken[]> {
+    if (!this.usesLocalEmbedding) {
+      return Array.from(
+        { length: approximateTokenCount(text) },
+        (_, index) => index as unknown as LlamaToken,
+      );
+    }
     await this.ensureEmbedContext();  // Ensure model is loaded
     if (!this.embedModel) {
       throw new Error("Embed model not loaded");
@@ -941,17 +958,25 @@ export class LlamaCpp implements LLM {
   }
 
   /**
-   * Count tokens in text using the embedding model's tokenizer
+   * Count tokens in text. External embedding mode uses an approximation so
+   * chunking never pulls in a local GGUF tokenizer by accident.
    */
   async countTokens(text: string): Promise<number> {
+    if (!this.usesLocalEmbedding) {
+      return approximateTokenCount(text);
+    }
     const tokens = await this.tokenize(text);
     return tokens.length;
   }
 
   /**
-   * Detokenize token IDs back to text
+   * Detokenize token IDs back to text. External embedding mode has no local
+   * tokenizer, so it returns 3 spaces per token (a width-only placeholder for guardrail paths), not the original text.
    */
   async detokenize(tokens: readonly LlamaToken[]): Promise<string> {
+    if (!this.usesLocalEmbedding) {
+      return " ".repeat(tokens.length * 3);
+    }
     await this.ensureEmbedContext();
     if (!this.embedModel) {
       throw new Error("Embed model not loaded");
@@ -1047,8 +1072,8 @@ export class LlamaCpp implements LLM {
 
   async embed(text: string, options: EmbedOptions = {}): Promise<EmbeddingResult | null> {
     const model = options.model ?? this.embedModelUri;
-    if (localModelsDisabled() && isLocalEmbeddingModel(model)) {
-      throw new Error("Local embedding models are disabled. Set QMD_EMBED_MODEL to an external API model.");
+    if (!localModelsEnabled() && isLocalEmbeddingModel(model)) {
+      throw new Error("Local embedding models are disabled. Set QMD_ENABLE_LOCAL_MODELS=1 to use local GGUF models.");
     }
     if (!isLocalEmbeddingModel(model)) {
       const results = await this.embedExternal([text], model, options);
@@ -1085,8 +1110,8 @@ export class LlamaCpp implements LLM {
    */
   async embedBatch(texts: string[], options: EmbedOptions = {}): Promise<(EmbeddingResult | null)[]> {
     const model = options.model ?? this.embedModelUri;
-    if (localModelsDisabled() && isLocalEmbeddingModel(model)) {
-      throw new Error("Local embedding models are disabled. Set QMD_EMBED_MODEL to an external API model.");
+    if (!localModelsEnabled() && isLocalEmbeddingModel(model)) {
+      throw new Error("Local embedding models are disabled. Set QMD_ENABLE_LOCAL_MODELS=1 to use local GGUF models.");
     }
     if (!isLocalEmbeddingModel(model)) {
       return this.embedExternal(texts, model, options);
@@ -1219,7 +1244,7 @@ export class LlamaCpp implements LLM {
   // ==========================================================================
 
   async expandQuery(query: string, options: { context?: string, includeLexical?: boolean, intent?: string } = {}): Promise<Queryable[]> {
-    if (localModelsDisabled()) return [];
+    if (!localModelsEnabled()) return [];
     if (this._ciMode) throw new Error("LLM operations are disabled in CI (set CI=true)");
     // Ping activity at start to keep models alive during this operation
     this.touchActivity();
@@ -1319,7 +1344,7 @@ export class LlamaCpp implements LLM {
     documents: RerankDocument[],
     options: RerankOptions = {}
   ): Promise<RerankResult> {
-    if (localModelsDisabled()) {
+    if (!localModelsEnabled()) {
       return {
         model: "disabled",
         results: documents.map((doc, index) => ({ file: doc.file, score: 0, index })),
diff --git a/src/store.ts b/src/store.ts
index f7c853a..b0f4ee3 100644
--- a/src/store.ts
+++ b/src/store.ts
@@ -21,11 +21,13 @@ import fastGlob from "fast-glob";
 import {
   LlamaCpp,
   getDefaultLlamaCpp,
+  approximateTokenCount,
+  truncateByApproxTokens,
   formatQueryForEmbedding,
   formatDocForEmbedding,
   withLLMSessionForLlm,
   DEFAULT_EMBED_MODEL_URI,
-  localModelsDisabled,
+  localModelsEnabled,
   type RerankDocument,
   type ILLMSession,
 } from "./llm.js";
@@ -2279,6 +2281,15 @@ export async function chunkDocumentByTokens(
   signal?: AbortSignal
 ): Promise<{ text: string; pos: number; tokens: number }[]> {
   const llm = getDefaultLlamaCpp();
+  const useLocalTokenizer = typeof (llm as any).usesLocalEmbedding === "boolean"
+    ? Boolean((llm as any).usesLocalEmbedding)
+    : true;
+
+  const countTokens = async (text: string): Promise<number> => {
+    if (!useLocalTokenizer) return approximateTokenCount(text);
+    const tokens = await llm.tokenize(text);
+    return tokens.length;
+  };
 
   // Use moderate chars/token estimate (prose ~4, code ~2, mixed ~3)
   // If chunks exceed limit, they'll be re-split with actual ratio
@@ -2301,13 +2312,13 @@ export async function chunkDocumentByTokens(
   const pushChunkWithinTokenLimit = async (text: string, pos: number): Promise<void> => {
     if (signal?.aborted) return;
 
-    const tokens = await llm.tokenize(text);
-    if (tokens.length <= maxTokens || text.length <= 1) {
-      results.push({ text, pos, tokens: tokens.length });
+    const tokenCount = await countTokens(text);
+    if (tokenCount <= maxTokens || text.length <= 1) {
+      results.push({ text, pos, tokens: tokenCount });
       return;
     }
 
-    const actualCharsPerToken = text.length / tokens.length;
+    const actualCharsPerToken = text.length / tokenCount;
     let safeMaxChars = Math.floor(maxTokens * actualCharsPerToken * 0.95);
     if (!Number.isFinite(safeMaxChars) || safeMaxChars < 1) {
       safeMaxChars = Math.floor(text.length / 2);
@@ -2337,12 +2348,14 @@ export async function chunkDocumentByTokens(
       subChunks.length <= 1
       || subChunks[0]?.text.length === text.length
     ) {
-      const fallbackTokens = tokens.slice(0, Math.max(1, maxTokens));
-      const truncatedText = await llm.detokenize(fallbackTokens);
+      const tokenLimit = Math.max(1, maxTokens);
+      const truncatedText = useLocalTokenizer
+        ? await llm.detokenize((await llm.tokenize(text)).slice(0, tokenLimit))
+        : truncateByApproxTokens(text, tokenLimit);
       results.push({
         text: truncatedText,
         pos,
-        tokens: fallbackTokens.length,
+        tokens: tokenLimit,
       });
       return;
     }
@@ -4013,7 +4026,7 @@ export async function hybridQuery(
   const collection = options?.collection;
   const explain = options?.explain ?? false;
   const intent = options?.intent;
-  const skipRerank = options?.skipRerank ?? localModelsDisabled();
+  const skipRerank = options?.skipRerank ?? !localModelsEnabled();
   const hooks = options?.hooks;
 
   const rankedLists: RankedResult[][] = [];
@@ -4408,7 +4421,7 @@ export async function structuredSearch(
   const candidateLimit = options?.candidateLimit ?? RERANK_CANDIDATE_LIMIT;
   const explain = options?.explain ?? false;
   const intent = options?.intent;
-  const skipRerank = options?.skipRerank ?? localModelsDisabled();
+  const skipRerank = options?.skipRerank ?? !localModelsEnabled();
   const hooks = options?.hooks;
 
   const collections = options?.collections;
diff --git a/test/llm.test.ts b/test/llm.test.ts
index f1b09ae..8ee11a2 100644
--- a/test/llm.test.ts
+++ b/test/llm.test.ts
@@ -193,9 +193,11 @@ describe("LlamaCpp model resolution (config > env > default)", () => {
   });
 
   test("default embedding uses NVIDIA OpenAI-compatible API", async () => {
+    const prevModel = process.env.QMD_EMBED_MODEL;
     const prevKey = process.env.QMD_EMBED_API_KEY;
     const prevNvidiaKey = process.env.NVIDIA_API_KEY;
     const prevBaseUrl = process.env.QMD_EMBED_API_BASE_URL;
+    delete process.env.QMD_EMBED_MODEL;
     process.env.QMD_EMBED_API_KEY = "test-key";
     delete process.env.NVIDIA_API_KEY;
     delete process.env.QMD_EMBED_API_BASE_URL;
@@ -225,6 +227,8 @@ describe("LlamaCpp model resolution (config > env > default)", () => {
       });
     } finally {
       fetchMock.mockRestore();
+      if (prevModel === undefined) delete process.env.QMD_EMBED_MODEL;
+      else process.env.QMD_EMBED_MODEL = prevModel;
       if (prevKey === undefined) delete process.env.QMD_EMBED_API_KEY;
       else process.env.QMD_EMBED_API_KEY = prevKey;
       if (prevNvidiaKey === undefined) delete process.env.NVIDIA_API_KEY;
@@ -274,9 +278,9 @@ describe("LlamaCpp model resolution (config > env > default)", () => {
     expect(llm.usesLocalEmbedding).toBe(true);
   });
 
-  test("QMD_DISABLE_LOCAL_MODELS rejects local embedding models and bypasses local LLMs", async () => {
-    const prev = process.env.QMD_DISABLE_LOCAL_MODELS;
-    process.env.QMD_DISABLE_LOCAL_MODELS = "1";
+  test("local models are disabled by default", async () => {
+    const prev = process.env.QMD_ENABLE_LOCAL_MODELS;
+    delete process.env.QMD_ENABLE_LOCAL_MODELS;
     try {
       const llm = new LlamaCpp({ embedModel: "hf:custom/embed.gguf" });
       await expect(llm.embed("hello")).rejects.toThrow("Local embedding models are disabled");
@@ -286,14 +290,54 @@ describe("LlamaCpp model resolution (config > env > default)", () => {
         results: [{ file: "doc.md", score: 0, index: 0 }],
       });
     } finally {
-      if (prev === undefined) delete process.env.QMD_DISABLE_LOCAL_MODELS;
-      else process.env.QMD_DISABLE_LOCAL_MODELS = prev;
+      if (prev === undefined) delete process.env.QMD_ENABLE_LOCAL_MODELS;
+      else process.env.QMD_ENABLE_LOCAL_MODELS = prev;
     }
   });
+
+  test("QMD_ENABLE_LOCAL_MODELS allows explicit local embedding models", async () => {
+    const prev = process.env.QMD_ENABLE_LOCAL_MODELS;
+    process.env.QMD_ENABLE_LOCAL_MODELS = "1";
+    try {
+      const llm = new LlamaCpp({ embedModel: "hf:custom/embed.gguf" }) as any;
+      llm._ciMode = false;
+      llm.touchActivity = vi.fn();
+      llm.ensureEmbedContext = vi.fn().mockResolvedValue({
+        getEmbeddingFor: vi.fn(async () => ({ vector: new Float32Array([0.1, 0.2]) })),
+      });
+      llm.truncateToContextSize = vi.fn(async (text: string) => ({
+        text,
+        truncated: false,
+        limit: 2048,
+      }));
+
+      await expect(llm.embed("hello")).resolves.toEqual({
+        embedding: [expect.closeTo(0.1), expect.closeTo(0.2)],
+        model: "hf:custom/embed.gguf",
+      });
+      expect(llm.ensureEmbedContext).toHaveBeenCalled();
+    } finally {
+      if (prev === undefined) delete process.env.QMD_ENABLE_LOCAL_MODELS;
+      else process.env.QMD_ENABLE_LOCAL_MODELS = prev;
+    }
+  });
+
+  test("external embedding token counting does not load a local tokenizer", async () => {
+    const llm = new LlamaCpp({ embedModel: "nvidia/llama-3.2-nv-embedqa-1b-v2" }) as any;
+    llm.ensureEmbedContext = vi.fn(async () => {
+      throw new Error("should not load local tokenizer");
+    });
+
+    await expect(llm.countTokens("abcdef")).resolves.toBe(2);
+    await expect(llm.tokenize("abcdef")).resolves.toHaveLength(2);
+    expect(llm.ensureEmbedContext).not.toHaveBeenCalled();
+  });
 });
 
 describe("LlamaCpp embedding truncation", () => {
   test("truncates against the active embedding context limit, not the model train context", async () => {
+    const prev = process.env.QMD_ENABLE_LOCAL_MODELS;
+    process.env.QMD_ENABLE_LOCAL_MODELS = "1";
     const llm = new LlamaCpp({ embedModel: "hf:test/embed.gguf" }) as any;
     const getEmbeddingFor = vi.fn(async (text: string) => ({
       vector: new Float32Array([0.25, 0.5]),
@@ -308,18 +352,25 @@ describe("LlamaCpp embedding truncation", () => {
     };
     llm.ensureEmbedContext = vi.fn().mockResolvedValue({ getEmbeddingFor });
 
-    const result = await llm.embed("x".repeat(3000));
+    try {
+      const result = await llm.embed("x".repeat(3000));
 
-    expect(getEmbeddingFor).toHaveBeenCalledWith("x".repeat(2044));
-    expect(result).toEqual({
-      embedding: [0.25, 0.5],
-      model: llm.embedModelUri,
-    });
+      expect(getEmbeddingFor).toHaveBeenCalledWith("x".repeat(2044));
+      expect(result).toEqual({
+        embedding: [0.25, 0.5],
+        model: llm.embedModelUri,
+      });
+    } finally {
+      if (prev === undefined) delete process.env.QMD_ENABLE_LOCAL_MODELS;
+      else process.env.QMD_ENABLE_LOCAL_MODELS = prev;
+    }
   });
 });
 
 describe("LlamaCpp rerank deduping", () => {
   test("deduplicates identical document texts before scoring", async () => {
+    const prev = process.env.QMD_ENABLE_LOCAL_MODELS;
+    process.env.QMD_ENABLE_LOCAL_MODELS = "1";
     const llm = new LlamaCpp({}) as any;
     llm._ciMode = false; // allow unit test even in CI (mocked, no real models)
     const rankAll = vi.fn(async (_query: string, docs: string[]) =>
@@ -333,20 +384,25 @@ describe("LlamaCpp rerank deduping", () => {
       detokenize: (tokens: string[]) => tokens.join(""),
     });
 
-    const result = await llm.rerank("query", [
-      { file: "a.md", text: "shared chunk" },
-      { file: "b.md", text: "shared chunk" },
-      { file: "c.md", text: "different chunk" },
-    ]);
+    try {
+      const result = await llm.rerank("query", [
+        { file: "a.md", text: "shared chunk" },
+        { file: "b.md", text: "shared chunk" },
+        { file: "c.md", text: "different chunk" },
+      ]);
 
-    expect(rankAll).toHaveBeenCalledTimes(1);
-    expect(rankAll).toHaveBeenCalledWith("query", ["shared chunk", "different chunk"]);
-    expect(result.results).toHaveLength(3);
+      expect(rankAll).toHaveBeenCalledTimes(1);
+      expect(rankAll).toHaveBeenCalledWith("query", ["shared chunk", "different chunk"]);
+      expect(result.results).toHaveLength(3);
 
-    const scoreByFile = new Map(result.results.map((item) => [item.file, item.score]));
-    expect(scoreByFile.get("a.md")).toBe(0.9);
-    expect(scoreByFile.get("b.md")).toBe(0.9);
-    expect(scoreByFile.get("c.md")).toBe(0.2);
+      const scoreByFile = new Map(result.results.map((item) => [item.file, item.score]));
+      expect(scoreByFile.get("a.md")).toBe(0.9);
+      expect(scoreByFile.get("b.md")).toBe(0.9);
+      expect(scoreByFile.get("c.md")).toBe(0.2);
+    } finally {
+      if (prev === undefined) delete process.env.QMD_ENABLE_LOCAL_MODELS;
+      else process.env.QMD_ENABLE_LOCAL_MODELS = prev;
+    }
   });
 });
 
diff --git a/test/store.test.ts b/test/store.test.ts
index 848ec96..9fe57ff 100644
--- a/test/store.test.ts
+++ b/test/store.test.ts
@@ -2820,6 +2820,33 @@ describe("Embedding batching", () => {
 });
 
 describe("Token chunking guardrails", () => {
+  test("chunkDocumentByTokens uses approximate counts for external embeddings without tokenizer load", async () => {
+    const tokenize = vi.fn(async () => {
+      throw new Error("should not tokenize through local GGUF");
+    });
+    const detokenize = vi.fn(async () => {
+      throw new Error("should not detokenize through local GGUF");
+    });
+
+    setDefaultLlamaCpp({
+      usesLocalEmbedding: false,
+      tokenize,
+      detokenize,
+    } as any);
+
+    try {
+      const chunks = await chunkDocumentByTokens("x".repeat(1200), 100, 15, 20);
+
+      expect(chunks.length).toBeGreaterThan(1);
+      expect(chunks.every((chunk) => chunk.tokens <= 100)).toBe(true);
+      expect(chunks[0]!.text.length).toBeLessThanOrEqual(300);
+      expect(tokenize).not.toHaveBeenCalled();
+      expect(detokenize).not.toHaveBeenCalled();
+    } finally {
+      setDefaultLlamaCpp(null);
+    }
+  });
+
   test("chunkDocumentByTokens keeps pathological single-line blobs under the token limit", async () => {
     setDefaultLlamaCpp({
       async tokenize(text: string) {