fix: use Qwen3 recommended sampling params to prevent repetition loops

- Changed temperature from 0/0.1 to 0.7 (Qwen3 non-thinking mode default)
- Added topK=20, topP=0.8 per Qwen3 docs
- Added repeatPenalty with presencePenalty=0.5 for query expansion
- Fixes infinite loop on acronyms like DHH, BFCM

Qwen3 docs explicitly warn: 'DO NOT use greedy decoding, as it can lead to performance degradation and endless repetitions'
2026-02-01 03:23:14 +00:00 · 2026-02-01 03:23:14 +00:00 · 102ff861d3
commit 102ff861d3
parent 479b68bbf1
5 changed files with 85 additions and 14 deletions
--- a/finetune/Justfile
+++ b/finetune/Justfile
@@ -29,3 +29,15 @@ train-local:
 grpo-local:
    CUDA_VISIBLE_DEVICES=1,2,3 HF_TOKEN=${HF_TOKEN} uv run torchrun --standalone --nproc_per_node 3 \
      train.py grpo --config configs/grpo.yaml |& tee /tmp/qmd-grpo-train.log
+
+gepa-local:
+    UV_CACHE_DIR=/tmp/uv-cache LITELLM_CACHE_DIR=/tmp/litellm-cache OLLAMA_API_BASE=http://localhost:11434 \
+      uv run python gepa/dspy_gepa.py \
+      --input data/qmd_expansion_v2.jsonl \
+      --model ollama/glm-4.7-flash:Q8_0 \
+      --reflection-model ollama/glm-4.7-flash:Q8_0 \
+      --max-metric-calls 100 --limit 20 \
+      --valset data/qmd_expansion_handcrafted.jsonl --val-limit 20 \
+      --max-tokens 512 --reflection-max-tokens 512 \
+      --emit gepa/gepa_outputs_glm.jsonl \
+      --save-prompt gepa/best_prompt_glm.txt
--- a/finetune/gepa/best_prompt_glm.txt
+++ b/finetune/gepa/best_prompt_glm.txt
@@ -0,0 +1 @@
+Expand a search query into lex/vec/hyde lines.
--- a/finetune/gepa/dspy_gepa.py
+++ b/finetune/gepa/dspy_gepa.py
@@ -27,7 +27,7 @@ repo_root = Path(__file__).parent.parent
 if str(repo_root) not in sys.path:
    sys.path.insert(0, str(repo_root))

-from dataset.schema import parse_output_text
+from dataset.schema import normalize_output_items, output_items_to_text, parse_output_text
 from reward import score_expansion_detailed


@@ -35,11 +35,12 @@ class ExpandSignature(dspy.Signature):
    """Expand a search query into lex/vec/hyde lines."""

    query = dspy.InputField(desc="User search query")
-    expansion = dspy.OutputField(
+    output = dspy.OutputField(
        desc=(
-            "Multi-line text with prefixes: 2-3 lex:, 2-3 vec:, optional 0-1 hyde:. "
-            "Lex lines are short keywords and must not echo the query. "
-            "Vec lines are natural language search phrases. "
+            "JSON array of [kind, text] pairs. kind is lex|vec|hyde. "
+            "Return 2-3 lex, 2-3 vec, optional 0-1 hyde. "
+            "Lex items are short keywords and must not echo the query. "
+            "Vec items are natural language search phrases. "
            "Hyde is 50-200 chars, single line."
        )
    )
@@ -55,7 +56,7 @@ class Expander(dspy.Module):


 def reward_metric(gold, pred, trace=None, pred_name=None, pred_trace=None):
-    expansion = getattr(pred, "expansion", "") or ""
+    expansion = output_items_to_text(_coerce_output_items(pred))
    detail = score_expansion_detailed(gold.query, expansion)
    score = detail["percentage"] / 100.0
    feedback = "; ".join(detail.get("deductions", [])) or f"score={detail['percentage']:.1f}"
@@ -80,10 +81,31 @@ def to_examples(queries: list[str]) -> list[dspy.Example]:
    return [dspy.Example(query=q).with_inputs("query") for q in queries]


-def write_jsonl(path: Path, queries: list[str], outputs: list[str]) -> None:
+def _coerce_output_items(pred) -> list[list[str]]:
+    raw_output = getattr(pred, "output", None)
+    if isinstance(raw_output, (list, tuple)):
+        return normalize_output_items(raw_output)
+
+    raw_text = str(raw_output or getattr(pred, "expansion", "") or "").strip()
+    if not raw_text:
+        return []
+
+    if raw_text[0] in ("[", "{"):
+        try:
+            obj = json.loads(raw_text)
+            if isinstance(obj, dict) and "output" in obj:
+                obj = obj["output"]
+            if isinstance(obj, (list, tuple)):
+                return normalize_output_items(obj)
+        except Exception:
+            pass
+
+    return parse_output_text(raw_text)
+
+
+def write_jsonl(path: Path, queries: list[str], outputs: list[list[list[str]]]) -> None:
    with path.open("w", encoding="utf-8") as f:
-        for query, output_text in zip(queries, outputs, strict=True):
-            output = parse_output_text(output_text)
+        for query, output in zip(queries, outputs, strict=True):
            f.write(json.dumps({"query": query, "output": output}, ensure_ascii=False) + "\n")


@@ -102,6 +124,8 @@ def main() -> int:
        default="grok-4-1-fast-reasoning",
        help="LM string in provider/model format (e.g., openai/gpt-4o)",
    )
+    parser.add_argument("--max-tokens", type=int, default=512, help="Max tokens for student LM")
+    parser.add_argument("--reflection-max-tokens", type=int, default=512, help="Max tokens for reflection LM")
    parser.add_argument("--auto", type=str, default="light", choices=["light", "medium", "heavy"])
    parser.add_argument("--max-full-evals", type=int, default=None)
    parser.add_argument("--max-metric-calls", type=int, default=None)
@@ -134,8 +158,8 @@ def main() -> int:
            val_queries = val_queries[: args.val_limit]
        valset = to_examples(val_queries)

-    lm = dspy.LM(model=args.model)
-    reflection_lm = dspy.LM(model=args.reflection_model)
+    lm = dspy.LM(model=args.model, max_tokens=args.max_tokens)
+    reflection_lm = dspy.LM(model=args.reflection_model, max_tokens=args.reflection_max_tokens)

    student = Expander()
    student.set_lm(lm)
@@ -163,7 +187,8 @@ def main() -> int:
        outputs = []
        for q in queries:
            pred = optimized(query=q)
-            outputs.append(getattr(pred, "expansion", "") or "")
+            items = _coerce_output_items(pred)
+            outputs.append(items)
        write_jsonl(Path(args.emit), queries, outputs)
        print(f"Wrote {args.emit}")

--- a/finetune/gepa/gepa_outputs_glm.jsonl
+++ b/finetune/gepa/gepa_outputs_glm.jsonl
@@ -0,0 +1,20 @@
+{"query": "how tourism affects local cultures", "output": []}
+{"query": "how to ferment foods at home", "output": []}
+{"query": "how to mix modern and vintage decor", "output": []}
+{"query": "how to perform a scientific experiment", "output": []}
+{"query": "web mail", "output": []}
+{"query": "what does the quran cover", "output": []}
+{"query": "web config", "output": []}
+{"query": "how to choose farm equipment", "output": []}
+{"query": "how do thought experiments aid philosophical reasoning", "output": []}
+{"query": "what is the significance of logic in philosophy", "output": []}
+{"query": "how to train for a 5k run", "output": []}
+{"query": "how to engage with political dialogues", "output": []}
+{"query": "what is competitive analysis", "output": []}
+{"query": "how does the united nations operate", "output": []}
+{"query": "what are the crusades?", "output": []}
+{"query": "what is a literary theme?", "output": []}
+{"query": "what is the ethical significance of consent", "output": []}
+{"query": "paint mix", "output": []}
+{"query": "how to conserve energy in the office?", "output": []}
+{"query": "how to test soil ph?", "output": []}
--- a/src/llm.ts
+++ b/src/llm.ts
@@ -742,13 +742,17 @@ export class LlamaCpp implements LLM {
    const session = new LlamaChatSession({ contextSequence: sequence });

    const maxTokens = options.maxTokens ?? 150;
-    const temperature = options.temperature ?? 0;
+    // Qwen3 recommends temp=0.7, topP=0.8, topK=20 for non-thinking mode
+    // DO NOT use greedy decoding (temp=0) - causes repetition loops
+    const temperature = options.temperature ?? 0.7;

    let result = "";
    try {
      await session.prompt(prompt, {
        maxTokens,
        temperature,
+        topK: 20,
+        topP: 0.8,
        onTextChunk: (text) => {
          result += text;
        },
@@ -811,10 +815,19 @@ export class LlamaCpp implements LLM {
    const session = new LlamaChatSession({ contextSequence: sequence });

    try {
+      // Qwen3 recommended settings for non-thinking mode:
+      // temp=0.7, topP=0.8, topK=20, presence_penalty for repetition
+      // DO NOT use greedy decoding (temp=0) - causes infinite loops
      const result = await session.prompt(prompt, {
        grammar,
        maxTokens: 600,
-        temperature: 0.1,
+        temperature: 0.7,
+        topK: 20,
+        topP: 0.8,
+        repeatPenalty: {
+          lastTokens: 64,
+          presencePenalty: 0.5,
+        },
      });

      const lines = result.trim().split("\n");
				`@ -0,0 +1 @@`
				`Expand a search query into lex/vec/hyde lines.`