#!/usr/bin/env python3
# ---------------------------------------------------------------------------
# This script was recovered from a whitespace-collapsed git patch.  The patch
# hunks that preceded it are kept here as comments (original line breaks were
# lost in extraction, so the layout below is a best-effort reconstruction).
#
# diff --git a/CHANGELOG.md b/CHANGELOG.md
# index 7731bc3..3e56c27 100644
# --- a/CHANGELOG.md
# +++ b/CHANGELOG.md
# @@ -16,6 +16,7 @@
#  ### Fixes
# +- Fix paths in nix flake
#  - Sync stale `bun.lock` (`better-sqlite3` 11.x → 12.x). CI and release script now use `--frozen-lockfile` to prevent recurrence. #386 (thanks @Mic92)
#
# diff --git a/bun.lock b/bun.lock
# index 74cf1cb..de2be8c 100644
# --- a/bun.lock
# +++ b/bun.lock
# @@ -12,7 +12,7 @@
#  "picomatch": "^4.0.0",
#  "sqlite-vec": "^0.1.7-alpha.2",
#  "yaml": "^2.8.2",
# - "zod": "^4.2.1",
# + "zod": "4.2.1",
#  },
#  "devDependencies": {
#  "@types/better-sqlite3": "^7.6.0",
#
# diff --git a/finetune/benchmark.py b/finetune/benchmark.py
# new file mode 100644
# index 0000000..c0a28bf
# --- /dev/null
# +++ b/finetune/benchmark.py
# @@ -0,0 +1,182 @@
# ---------------------------------------------------------------------------
"""Benchmark QMD query expansion: LFM2.5 vs Qwen3 finetuned models.

For every entry in ``MODELS`` the script merges the LoRA adapter into its base
model, generates an expansion for each query in ``QUERIES``, scores the text
with :func:`score_output`, prints per-query and per-model statistics, and
writes the full results to ``RESULTS_PATH``.

Adapter and result paths are relative to the current working directory
(presumably ``finetune/`` -- confirm against how the training script is run).

The heavy ML dependencies (``torch``, ``transformers``, ``peft``) are imported
inside the functions that need them so that the pure helpers
(:func:`score_output`, :func:`save_results`) can be imported and tested on a
machine without the GPU stack installed.
"""

import gc
import json
import time
from pathlib import Path

QUERIES = [
    "kubernetes pod networking",
    "best practices for React server components",
    "how to optimize PostgreSQL queries for large tables",
    "what is retrieval augmented generation",
    "python async await concurrency patterns",
    "nginx reverse proxy load balancing",
    "git rebase vs merge workflow",
    "rust ownership and borrowing explained",
    "docker compose multi-stage builds",
    "elasticsearch full text search performance",
    "shopify liquid template customization",
    "machine learning feature engineering techniques",
    "aws lambda cold start optimization",
    "typescript generics and utility types",
    "redis caching strategies for web apps",
]

# Models under comparison: base checkpoint, local LoRA adapter directory, and
# whether the base model needs ``trust_remote_code``.
MODELS = {
    "LFM2.5-1.2B (finetuned)": {
        "base": "LiquidAI/LFM2.5-1.2B-Instruct",
        "adapter": "outputs/sft-lfm2",
        "trust_remote": True,
    },
    "Qwen3-1.7B (finetuned)": {
        "base": "Qwen/Qwen3-1.7B",
        "adapter": "outputs/sft",
        "trust_remote": False,
    },
}

# Single source of truth for the generation length cap (used both by the
# fallback generation config and by the ``generate`` call).
DEFAULT_MAX_NEW_TOKENS = 300

# Approximate per-query ceiling used only for the "x / y" line in the final
# comparison; score_output() itself does not clamp to this value.
MAX_SCORE_PER_QUERY = 8

# Sampling is enabled in the fallback generation config, so seed the RNG to
# make runs repeatable.
SEED = 0

RESULTS_PATH = Path("outputs/benchmark_results.json")

# Stock phrases that mark a hyde passage as generic/template text.
GENERIC_HYDE_PHRASES = (
    "comprehensive guide",
    "everything you need to know",
    "beginners and advanced users",
)


def load_model(base_name, adapter_dir, device, trust_remote=False):
    """Load a base model, merge its LoRA adapter, and choose a generation config.

    Args:
        base_name: Model id or path passed to ``from_pretrained`` for both the
            tokenizer and the base model.
        adapter_dir: Local directory containing the PEFT adapter; loaded with
            ``local_files_only=True``.  If it also holds a
            ``generation_config.json`` that file is used for generation.
        device: Passed as ``device_map`` when loading the base model.
        trust_remote: Forwarded as ``trust_remote_code``.

    Returns:
        ``(model, tokenizer, gen_config)`` where ``model`` is the merged model
        in eval mode.
    """
    import torch
    from peft import PeftModel
    from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

    tokenizer = AutoTokenizer.from_pretrained(base_name, trust_remote_code=trust_remote)
    base = AutoModelForCausalLM.from_pretrained(
        base_name, dtype=torch.bfloat16, device_map=device, trust_remote_code=trust_remote
    )
    model = PeftModel.from_pretrained(base, adapter_dir, local_files_only=True)
    # Fold the adapter weights into the base model so inference runs on a
    # plain model without the PEFT wrapper.
    model = model.merge_and_unload()
    model.eval()

    if (Path(adapter_dir) / "generation_config.json").exists():
        gen_config = GenerationConfig.from_pretrained(adapter_dir)
    else:
        # No config saved next to the adapter: fall back to fixed defaults.
        gen_config = GenerationConfig(
            temperature=0.1, top_k=50, top_p=0.1,
            repetition_penalty=1.05, do_sample=True,
            max_new_tokens=DEFAULT_MAX_NEW_TOKENS,
        )
    return model, tokenizer, gen_config


def run_inference(model, tokenizer, gen_config, query, device,
                  max_new_tokens=DEFAULT_MAX_NEW_TOKENS):
    """Generate one query expansion and time the ``generate`` call.

    Args:
        model: Model returned by :func:`load_model`.
        tokenizer: Matching tokenizer.
        gen_config: Generation config passed to ``model.generate``.
        query: Raw search query to expand.
        device: Device the tokenized inputs are moved to.
        max_new_tokens: Cap passed explicitly to ``generate``; it takes
            precedence over the value stored in ``gen_config``.

    Returns:
        ``(text, elapsed_seconds, new_token_count)``; ``text`` is the decoded
        continuation only (prompt tokens are sliced off).
    """
    import torch

    # NOTE(review): the same prompt is used for every model.  The LFM2 config
    # comment says "/no_think" is not needed for LFM2, which suggests the Qwen3
    # training prompt may include it -- confirm the prompt matches training.
    messages = [{"role": "user", "content": f"Expand this search query: {query}"}]
    text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
    # NOTE(review): the templated text is tokenized with default special-token
    # handling; if the template already emits BOS this may duplicate it.
    # Left unchanged to stay consistent with however training tokenized.
    inputs = tokenizer(text, return_tensors="pt").to(device)
    prompt_len = inputs["input_ids"].shape[-1]

    # CUDA work is queued asynchronously; synchronise around the timed region
    # so the wall-clock reading covers completed GPU work only.
    on_cuda = str(device).startswith("cuda") and torch.cuda.is_available()
    if on_cuda:
        torch.cuda.synchronize(torch.device(device))
    start = time.perf_counter()
    with torch.no_grad():
        out = model.generate(**inputs, generation_config=gen_config, max_new_tokens=max_new_tokens)
    if on_cuda:
        torch.cuda.synchronize(torch.device(device))
    elapsed = time.perf_counter() - start

    new_tokens = out.shape[-1] - prompt_len
    result = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)
    return result, elapsed, new_tokens


def score_output(output):
    """Simple quality scoring: check for lex/vec/hyde presence and specificity.

    Scoring rules (applied per line after stripping whitespace):
      * each ``lex:`` line   -> +1
      * each ``vec:`` line   -> +1
      * each ``hyde:`` line  -> +2 (the last one supplies the hyde text)
      * hyde text of 80-200 chars -> +2, otherwise 50-250 chars -> +1
      * each phrase from ``GENERIC_HYDE_PHRASES`` found in the hyde text -> -1

    Args:
        output: Raw model output.

    Returns:
        ``(score, details)`` where ``details`` has the keys ``has_lex``,
        ``has_vec``, ``has_hyde`` and ``hyde_len``.
    """
    score = 0
    has_lex = has_vec = has_hyde = False
    hyde_text = ""

    for raw_line in output.strip().split("\n"):
        line = raw_line.strip()
        if line.startswith("lex:"):
            has_lex = True
            score += 1
        elif line.startswith("vec:"):
            has_vec = True
            score += 1
        elif line.startswith("hyde:"):
            has_hyde = True
            hyde_text = line[5:].strip()
            score += 2  # hyde is worth more

    if hyde_text:
        # Bonus for hyde length in sweet spot (80-200 chars)
        hyde_len = len(hyde_text)
        if 80 <= hyde_len <= 200:
            score += 2
        elif 50 <= hyde_len <= 250:
            score += 1

        # Penalty for generic/template hyde
        lowered = hyde_text.lower()
        score -= sum(1 for phrase in GENERIC_HYDE_PHRASES if phrase in lowered)

    details = {
        "has_lex": has_lex,
        "has_vec": has_vec,
        "has_hyde": has_hyde,
        "hyde_len": len(hyde_text),
    }
    return score, details


def benchmark_model(cfg, device, queries=QUERIES):
    """Run every query through one model and return its result summary.

    Args:
        cfg: Mapping with ``base``, ``adapter`` and ``trust_remote`` keys.
        device: Device string used for loading and inference.
        queries: Queries to expand; defaults to ``QUERIES``.

    Returns:
        Dict with ``queries`` (per-query records), ``avg_time_s``,
        ``avg_score``, ``avg_tok_s`` and ``total_score``.
    """
    import torch

    model, tokenizer, gen_config = load_model(
        cfg["base"], cfg["adapter"], device, cfg["trust_remote"]
    )
    # Same RNG starting point for every model so sampled outputs are repeatable.
    torch.manual_seed(SEED)

    query_results = []
    total_time = 0.0
    total_tokens = 0
    total_score = 0

    for query in queries:
        output, elapsed, n_tokens = run_inference(model, tokenizer, gen_config, query, device)
        score, details = score_output(output)

        query_results.append({
            "query": query,
            "output": output,
            "time_s": round(elapsed, 3),
            "tokens": n_tokens,
            "score": score,
            "details": details,
        })
        total_time += elapsed
        total_tokens += n_tokens
        total_score += score

        tok_s = n_tokens / elapsed if elapsed > 0 else 0
        print(f"  [{score:2d}] {query[:40]:<40} {elapsed:.2f}s {n_tokens:3d}tok {tok_s:.0f}tok/s")

    n_queries = len(query_results)
    # Guard the averages so an empty query list reports zeros instead of raising.
    avg_time = total_time / n_queries if n_queries else 0.0
    avg_score = total_score / n_queries if n_queries else 0.0
    avg_toks = total_tokens / total_time if total_time > 0 else 0

    print(f"\n  Summary: avg_score={avg_score:.2f} avg_time={avg_time:.2f}s avg_tok/s={avg_toks:.0f}")

    return {
        "queries": query_results,
        "avg_time_s": round(avg_time, 3),
        "avg_score": round(avg_score, 2),
        "avg_tok_s": round(avg_toks, 1),
        "total_score": total_score,
    }


def print_comparison(results):
    """Print the side-by-side summary for all benchmarked models."""
    print(f"\n{'='*60}")
    print("COMPARISON")
    print(f"{'='*60}")
    for name, r in results.items():
        print(f"\n{name}:")
        print(f"  Total Score: {r['total_score']} / {len(QUERIES) * MAX_SCORE_PER_QUERY}")  # max ~8 per query
        print(f"  Avg Score:   {r['avg_score']}")
        print(f"  Avg Time:    {r['avg_time_s']}s")
        print(f"  Throughput:  {r['avg_tok_s']} tok/s")


def save_results(results, path=RESULTS_PATH):
    """Write ``results`` as indented JSON to ``path``.

    The parent directory is created if it does not exist, so the benchmark
    output is not lost to a missing ``outputs/`` directory after a long run.
    """
    out_path = Path(path)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)


def main():
    """Benchmark every model in ``MODELS`` and save the combined results."""
    import torch

    if torch.cuda.is_available():
        device = "cuda:0"
    else:
        # Keep the script usable on machines without a GPU instead of failing
        # deep inside model loading.
        device = "cpu"
        print("CUDA is not available; running on CPU (timings are not comparable to GPU runs).")

    results = {}

    for name, cfg in MODELS.items():
        print(f"\n{'='*60}")
        print(f"Loading {name}...")
        results[name] = benchmark_model(cfg, device)

        # Free GPU memory: the model was local to benchmark_model(), so collect
        # any leftover references before releasing cached blocks.
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    print_comparison(results)

    # Save full results
    save_results(results, RESULTS_PATH)
    print("\nFull results saved to outputs/benchmark_results.json")


if __name__ == "__main__":
    main()

# ---------------------------------------------------------------------------
# Patch content that followed finetune/benchmark.py, kept as comments
# (layout reconstructed; original line breaks were lost in extraction).
#
# diff --git a/finetune/configs/sft-lfm2.yaml b/finetune/configs/sft-lfm2.yaml
# new file mode 100644
# index 0000000..e37b6f1
# --- /dev/null
# +++ b/finetune/configs/sft-lfm2.yaml
# @@ -0,0 +1,58 @@
# +# SFT Training Config for QMD Query Expansion
# +# Target: LiquidAI LFM2.5-1.2B-Instruct with LoRA
# +#
# +# LFM2.5 is a hybrid model: 10 conv blocks + 6 GQA attention blocks
# +# Uses ChatML template: <|im_start|>user\n...<|im_end|>\n<|im_start|>assistant\n
# +# No /no_think needed (not Qwen3)
# +#
# +# Usage: uv run train.py sft --config configs/sft-lfm2.yaml
# +
# +model:
# +  base: "LiquidAI/LFM2.5-1.2B-Instruct"
# +  output: "outputs/sft-lfm2"
# +  trust_remote_code: true
# +
# +dataset:
# +  name: "data/train-lfm2/"
# +  text_field: "text"
# +  split: "train"
# +  eval_split: 0.1
# +
# +training:
# +  epochs: 5
# +  batch_size: 4
# +  gradient_accumulation_steps: 4
# +  learning_rate: 2e-4
# +  max_length: 512
# +  warmup_ratio: 0.03
# +  lr_scheduler: "cosine"
# +
# +lora:
# +  rank: 16
# +  alpha: 32
# +  dropout: 0.0
# +  target_modules:
# +    # Convolution blocks (layers 0,1,3,4,6,7,9,11,13,15)
# +    - "conv.in_proj"
# +    - "conv.out_proj"
# +    # Attention blocks (layers 2,5,8,10,12,14)
# +    - "q_proj"
# +    - "k_proj"
# +    - "v_proj"
# +    - "out_proj"
# +    # FFN (all 16 layers)
# +    - "feed_forward.w1"
# +    - "feed_forward.w2"
# +    - "feed_forward.w3"
# +
# +generation:
# +  temperature: 0.1
# +  top_k: 50
# +  top_p: 0.1
# +  repetition_penalty: 1.05
# +
# +gguf: false # LFM2.5 hybrid arch not supported by llama.cpp
# +
# +tracking:
# +  project: "qmd-query-expansion"
# +  run_name: "sft-lfm2-1.2B"
#
# diff --git a/finetune/data/fix_hyde_checkpoint.json b/finetune/data/fix_hyde_checkpoint.json
# new file mode 100644
# index 0000000..c221983
# --- /dev/null
# +++ (path and JSON payload continue on the following lines)
# ---------------------------------------------------------------------------
b/finetune/data/fix_hyde_checkpoint.json @@ -0,0 +1 @@ +{"processed_queries": {"1000": "Capitals quiz: Paris (France), Tokyo (Japan), Canberra (Australia), Bras\u00edlia (Brazil), Ottawa (Canada). Quiz includes 50+ capitals.", "1001": "Trivia: The universe is 13.8 billion years old. There are an estimated 100 billion galaxies. The Milky Way is about 100,000 light-years wide.", "1002": "Did you know? The Great Wall of China is over 13,000 miles long. Cleopatra lived closer to the moon landing than the building of the pyramids.", "1003": "Science fact: Water can boil and freeze at the same time at 0.01\u00b0C. This phenomenon is called the triple point of water.", "1004": "Famous inventions timeline: 1440 - Printing Press by Gutenberg, 1876 - Telephone by Bell, 1903 - Airplane by Wright Brothers, 1971 - Microprocessor.", "1005": "World records: Longest human tunnel traveled through by a skateboarding dog: 30.1 m (98 ft). Fastest 100m sprint: Usain Bolt, 9.58 seconds.", "1006": "Geography fact: Russia is the largest country at 17.1 million km\u00b2. Canada follows at 9.98 million km\u00b2. There are 195 countries worldwide.", "1007": "Historical trivia: Did you know that the first Olympic Games were held in 776 BC in Olympia, Greece? The games lasted for nearly 12 centuries.", "1008": "Animal trivia: A group of flamingos is called a 'flamboyance.' Octopuses have three hearts and blue blood. Elephants are the largest land mammals.", "1009": "Sports records: Michael Phelps holds 23 Olympic gold medals in swimming. The fastest recorded serve in tennis was 263 km/h by Sam Groth.", "1010": "Largest countries by area: Russia (17.1 million km\u00b2), Canada (9.98 million km\u00b2), China (9.6 million km\u00b2), USA (9.83 million km\u00b2).", "1011": "Rivers crossing countries: The Danube flows through 10 countries, including Germany and Romania. 
The Nile passes through 11 countries in Africa.", "1012": "Highest peaks: Mount Everest (8,848 m) in Nepal, K2 (8,611 m) in Pakistan, Kangchenjunga (8,586 m) on the India-Nepal border.", "1013": "Desert climate zones: Hot deserts like the Sahara average 40\u00b0C in summer. Cold deserts like Antarctica can drop to -60\u00b0C in winter.", "1014": "Island nations list: Japan, Madagascar, Iceland, the Philippines, and New Zealand are prominent island nations, each with unique ecosystems.", "1015": "European capitals: Berlin (Germany), Madrid (Spain), Rome (Italy), Vienna (Austria), and Budapest (Hungary) are key capitals in Europe.", "1016": "Population by continent: Asia (4.7 billion), Africa (1.3 billion), Europe (748 million), North America (579 million), South America (430 million).", "1017": "Time zones: The Earth has 24 time zones. UTC+0 is Greenwich Mean Time; UTC+14 includes parts of Kiribati, the earliest timezone.", "1018": "Latitude/Longitude: The coordinates for the Eiffel Tower are 48.8584\u00b0 N, 2.2945\u00b0 E. The exact point can pinpoint any location globally.", "1019": "Country borders: The longest border is between the USA and Canada (8,891 km). The shortest is between Spain and Portugal (1,214 km).", "1020": "Ocean currents: The Gulf Stream carries warm water from the Gulf of Mexico to the North Atlantic. 
The Antarctic Circumpolar Current is the largest.", "1021": "Tectonic plates: There are seven major plates: Pacific, North American, Eurasian, African, South American, Antarctic, and Indo-Australian.", "1022": "Climate zones: The Earth has five main climate zones: Tropical, Dry, Temperate, Continental, and Polar, each affecting ecosystems differently.", "1023": "Stoicism daily practice: Key practices include negative visualization, focusing on what\u2019s within your control, and maintaining a gratitude journal.", "1024": "Existentialism: A philosophical theory emphasizing individual existence, freedom, and choice, suggesting meaning in life is self-created.", "1025": "Utilitarianism posits that actions are right if they promote happiness. Jeremy Bentham's principle of utility focuses on maximizing pleasure for the greatest number.", "1026": "Kant's Categorical Imperative asserts that one should act only according to maxims that can be universalized. It emphasizes duty and moral law over consequences.", "1027": "The free will vs determinism debate questions whether human actions are determined by external factors or if individuals possess genuine choice in their decisions.", "1028": "Nietzsche's 'will to power' refers to an intrinsic drive to assert and enhance one's influence and creativity, transcending traditional moral values and societal norms.", "1029": "Socrates employed the elenchus method, a form of cooperative argumentative dialogue, to stimulate critical thinking and illuminate ideas through questioning and refutation.", "1030": "Plato's Theory of Forms posits that non-material abstract forms, rather than material objects, represent the most accurate reality, influencing his views on knowledge and truth.", "1031": "Aristotle's virtue ethics emphasizes character and the importance of developing virtuous habits. 
The 'Golden Mean' represents moderation between extremes of behavior.", "1032": "Descartes' 'Cogito, ergo sum' ('I think, therefore I am') establishes self-awareness as the foundational element of knowledge and existence, emphasizing rational thought.", "1033": "Propositional calculus studies logical relationships between propositions, using connectives like AND, OR, NOT. It's foundational for modern logic and computation.", "1034": "Epistemology investigates the nature of knowledge, addressing questions of belief, truth, and justification. Key figures include Plato, Descartes, and Kant.", "1035": "Metaphysics explores fundamental questions about existence and reality, including the nature of objects, causality, and the relationship between mind and matter.", "1036": "Ancient civilizations timeline: Sumerians (c. 3500 BCE), Egyptians (c. 3100 BCE), Indus Valley (c. 2500 BCE), Greeks (c. 800 BCE), Romans (c. 500 BCE).", "1037": "The fall of the Roman Empire is attributed to economic troubles, military defeats, political corruption, and invasions by barbarian tribes, culminating in 476 CE.", "1038": "Key medieval events include the rise of feudalism (9th century), the Crusades (1096-1291), the Black Death (1347-1351), and the Hundred Years' War (1337-1453).", "1039": "The Renaissance art movement (14th-17th centuries) emphasized realism, perspective, and humanism, with figures like da Vinci, Michelangelo, and Raphael leading innovations.", "1040": "The Industrial Revolution (1760-1840) introduced inventions such as the steam engine (James Watt), power loom (Edmund Cartwright), and spinning jenny (James Hargreaves).", "1041": "World War I was triggered by the assassination of Archduke Franz Ferdinand in 1914, leading to complex alliances and militarism among European powers.", "1042": "Key Cold War events include the Berlin Airlift (1948), Cuban Missile Crisis (1962), Vietnam War (1955-1975), and the fall of the Berlin Wall (1989).", "1043": "The French Revolution 
timeline: Estates-General convened (1789), Storming of the Bastille (July 14, 1789), Declaration of the Rights of Man (August 1789), Reign of Terror (1793-1794).", "1044": "American Civil War battles include Fort Sumter (1861), Gettysburg (1863), and Appomattox Court House (1865), marking pivotal moments in the conflict's progression.", "1045": "Egyptian pharaohs dynasty lasted over 3,000 years, beginning with Narmer (c. 3100 BCE) and ending with Cleopatra VII (30 BCE), showcasing significant cultural achievements.", "1046": "The Bronze Age collapse (c. 1200 BCE) saw the fall of several civilizations due to factors like climate change, invasions, and trade disruptions, affecting the Eastern Mediterranean.", "1047": "The Byzantine Empire's history spans from 330 CE with Byzantium's founding to 1453 CE, marked by the preservation of Greek and Roman culture amid Islamic conquests.", "1048": "The Vietnam War timeline includes the Gulf of Tonkin Incident (1964), Tet Offensive (1968), and the fall of Saigon (1975), reflecting U.S. involvement and eventual withdrawal.", "1049": "Quantum mechanics basics include wave-particle duality, Heisenberg's uncertainty principle, and quantum entanglement, fundamentally altering our understanding of physics.", "1050": "Einstein's theory posits that space and time are interwoven, with mass influencing curvature. Notably, E=mc\u00b2 links mass and energy equivalence.", "1051": "James Watson and Francis Crick elucidated DNA's double helix structure in 1953, revealing its base pairing of adenine with thymine, and cytosine with guanine.", "1052": "Photosynthesis occurs in chloroplasts, involving light absorption, water splitting, and CO2 fixation. Key steps: light-dependent reactions and Calvin cycle.", "1053": "Black holes form from collapsing stars, exhibiting extreme gravitational pull. The event horizon marks the boundary beyond which nothing escapes.", "1054": "Plate tectonics theory explains Earth's lithosphere's movement. 
It describes continental drift, seafloor spreading, and the creation of mountain ranges.", "1055": "Natural selection, proposed by Charles Darwin, drives evolution. Traits enhancing survival and reproduction become more common in successive generations.", "1056": "The periodic table contains 118 elements, organized by atomic number. Notable groups include alkali metals (Group 1) and noble gases (Group 18).", "1057": "Cell biology studies the structure and function of cells. Key components include the nucleus, mitochondria, and the plasma membrane.", "1058": "Evidence for climate change includes rising global temperatures, with a 1.2\u00b0C increase since the late 19th century, and increased atmospheric CO2 levels.", "1059": "Notable Impressionist painters include Claude Monet, Edgar Degas, and Pierre-Auguste Renoir, who emphasized light and color in their works.", "1060": "Shakespeare's plays include tragedies like 'Hamlet' and 'Macbeth', comedies such as 'A Midsummer Night's Dream', and historical plays like 'Henry V'.", "1061": "Influential classical music composers include Johann Sebastian Bach, Ludwig van Beethoven, and Wolfgang Amadeus Mozart, each shaping the genre profoundly.", "1062": "Modern art movements include Abstract Expressionism, Surrealism, and Cubism, with key figures like Jackson Pollock, Salvador Dal\u00ed, and Pablo Picasso.", "1063": "Film noir is characterized by its moral ambiguity, femme fatales, and stark lighting. 
Notable films include 'Double Indemnity' and 'The Maltese Falcon'.", "1064": "Jazz originated in the early 20th century in New Orleans, blending African rhythms with blues and ragtime, leading to styles like bebop and smooth jazz.", "1065": "Renaissance sculpture techniques included contrapposto for dynamic poses and lost-wax casting for bronze works, exemplified by Michelangelo's David.", "1066": "Photography composition rules include the rule of thirds, leading lines, and framing, which enhance visual storytelling and engagement in images.", "1067": "Haiku, a traditional Japanese form, consists of three lines with a 5-7-5 syllable structure, capturing nature and emotions in a concise format.", "1068": "Baroque art features dramatic use of light and shadow (chiaroscuro), emotional intensity, and grandeur, seen in works by Caravaggio and Bernini.", "1069": "Street art and graffiti emerged in the late 20th century, with artists like Banksy gaining prominence. It often serves as social and political commentary.", "1070": "Symptoms of vitamin deficiency vary; for example, Vitamin D deficiency can cause bone pain, while Vitamin C deficiency may lead to scurvy and fatigue.", "1071": "Vaccines stimulate the immune system by introducing antigens. They promote antibody production, enabling the body to recognize and fight pathogens effectively.", "1072": "Normal blood pressure ranges from 90/60 mmHg to 120/80 mmHg. Readings above this may indicate hypertension, requiring lifestyle or medical intervention.", "1073": "Sleep hygiene tips include maintaining a consistent sleep schedule, creating a restful environment, and limiting screen time before bed for better quality sleep.", "1074": "Intermittent fasting can enhance metabolic health, promoting weight loss, improved insulin sensitivity, and cellular repair processes through autophagy.", "1075": "Practice deep breathing for 5-10 minutes to calm the mind. Engage in regular physical activity; aim for 30 minutes most days. 
Use cognitive-behavioral techniques to challenge anxious thoughts.", "1076": "Try the cat-cow stretch for spinal flexibility. Perform child's pose for lower back relief. Incorporate hamstring stretches, holding each for 20-30 seconds, to alleviate tension.", "1077": "Reduce saturated fats to less than 7% of total calories. Increase fiber intake to 25-30 grams daily. Aim for regular physical activity, targeting at least 150 minutes weekly.", "1078": "Monitor blood glucose levels regularly. Adhere to a balanced diet with a focus on whole grains, vegetables, and lean proteins. Aim for 150 minutes of exercise per week.", "1079": "Consider practicing mindfulness meditation for 10-20 minutes daily. Research shows it can reduce anxiety and improve emotional well-being. Focus on breath awareness to enhance concentration.", "1080": "Macronutrients include carbohydrates (45-65%), proteins (10-35%), and fats (20-35%). Calculate your daily needs based on total caloric intake to maintain balanced nutrition.", "1081": "Basic first aid includes assessing the scene, calling emergency services if needed, and performing CPR if the person is unresponsive. Apply pressure to stop bleeding effectively.", "1082": "Use the formula A = P(1 + r/n)^(nt) to calculate compound interest. For example, investing $1,000 at 5% for 10 years yields approximately $1,628.89.", "1083": "Begin by understanding stocks, bonds, and mutual funds. The S&P 500 is a common index; track it to gauge market performance. Diversification is key to reducing risk.", "1084": "Startup funding stages include seed funding, Series A, Series B, and Series C. Each stage focuses on scaling growth, requiring increasing amounts of capital, often starting with $500,000.", "1085": "Eligible tax deductions for small businesses include home office expenses, vehicle use, and business travel costs. 
Keep detailed receipts to substantiate claims during audits.", "1086": "The 50/30/20 budgeting method allocates 50% of income to needs, 30% to wants, and 20% to savings. Adjust percentages based on personal financial goals and obligations.", "1087": "Cryptocurrency is a digital currency secured by cryptography. Bitcoin, the first, launched in 2009. Transactions are recorded on decentralized ledgers called blockchains.", "1088": "Inflation erodes purchasing power; a 3% inflation rate means $1,000 today will only buy $970 next year. Diversifying investments can help mitigate these effects on savings.", "1089": "Effective retirement planning includes contributing to a 401(k) or IRA. Aim to save at least 15% of your income annually; consider increasing contributions as income rises.", "1090": "Passive income ideas include rental properties, dividend stocks, and creating online courses. Each can generate revenue with minimal ongoing effort once established.", "1091": "Venture capitalists typically invest larger sums and seek high-growth startups, while angel investors often provide smaller amounts and focus on early-stage companies.", "1092": "A balance sheet consists of assets, liabilities, and equity. Total assets must equal total liabilities plus equity, providing a snapshot of financial health at a specific date.", "1093": "Supply chain management involves overseeing the flow of goods from suppliers to customers. Key components include procurement, production, inventory management, and logistics.", "1094": "A marathon training schedule generally spans 16-20 weeks. Long runs increase weekly, peaking at 20 miles, with tapering in the last few weeks before race day.", "1095": "Maintain a neutral spine during weightlifting. Use a grip that is shoulder-width apart for bench presses, and ensure knees do not extend beyond toes during squats.", "1096": "Focus on a streamlined body position and proper arm pull in freestyle swimming. 
Practice the catch phase with an extended hand and a high elbow to maximize propulsion.", "1097": "For a proper tennis serve, start with a continental grip. Toss the ball slightly in front and above your head to enable a powerful upward swing and follow-through.", "1098": "Incorporate drills like zig-zag dribbling and crossover moves. Focus on keeping the ball low and using both hands to enhance ball control and agility.", "1099": "Common soccer formations include 4-4-2 and 4-3-3. The 4-4-2 provides a balanced defense and midfield, while the 4-3-3 enhances attacking options with three forwards.", "1100": "Focus on grip, stance, and posture. A proper backswing, downswing, and follow-through can improve accuracy by 30%. Weight transfer is crucial.", "1101": "Begin with Mountain Pose for grounding, then try Downward Dog for stretching. Child's Pose helps beginners relax and focus on breathing.", "1102": "Incorporate strength training, proper warm-ups, and cooldowns. 70% of runners experience injuries; addressing form can reduce risk significantly.", "1103": "Common ratios include 50/34 for compact gearing or 53/39 for road bikes. A 11-28 cassette gives a good balance for climbing and flat terrains.", "1104": "Climbing grades range from 5.0 (easy) to 5.15 (extremely hard). The Yosemite Decimal System is commonly used in the USA for rock climbing.", "1105": "Types of waves include beach breaks, point breaks, and reef breaks. Each offers different ride characteristics based on wind and tide conditions.", "1106": "Best time to visit Japan is during spring (March to May) for cherry blossoms or fall (September to November) for autumn foliage.", "1107": "Checklist: Passport, travel insurance, clothing layers, toiletries, chargers, and snacks. Verify weight limits for carry-ons before packing.", "1108": "Budget travelers can consider Eastern Europe; countries like Poland and Hungary offer accommodation from \u20ac10/night. 
Use public transport to save.", "1109": "Visa requirements for the USA vary by nationality. ESTA is needed for visa waiver countries; others must apply for a B1/B2 visa at a consulate.", "1110": "Jet lag remedies include adjusting sleep schedule before travel, staying hydrated, and exposure to natural light upon arrival.", "1111": "Plan routes with apps like Roadtrippers, check for rest stops every 2-3 hours, and keep a first-aid kit for emergencies while driving.", "1112": "Research destinations, stay in well-reviewed accommodations, share itineraries with friends, and use apps to stay connected while abroad.", "1113": "Security rules include removing shoes, belts, and laptops from bags. Liquids must be in containers of 3.4 oz or less and placed in a quart-sized bag.", "1114": "Travel insurance coverage typically includes trip cancellations, medical emergencies, and lost baggage. Check policy limits for medical expenses.", "1115": "Popular language apps include Duolingo for vocabulary, Babbel for conversation skills, and Memrise for immersive learning experiences.", "1116": "Hostels offer shared dorms starting around \u20ac15/night, fostering social interactions, while hotels provide privacy but typically cost \u20ac70+ per night.", "1117": "Utilize natural light for best results, use a tripod for stability, and focus on composition; the golden hour enhances colors and shadows.", "1118": "Techniques include using a starter for flavor, kneading dough for gluten development, and monitoring proofing times for optimal rise.", "1119": "Basic knife skills include the claw grip for safety, rocking motion for chopping, and using a sharp knife to enhance efficiency and precision.", "1120": "Ferment vegetables at home by submerging in brine, using weights to keep them submerged, and storing at room temperature for 1-4 weeks.", "1121": "Plan meals around perishable items first, batch cook grains and proteins, and use airtight containers to maintain freshness throughout the 
week.", "1122": "Common spice combinations include cumin and coriander for Latin dishes, rosemary and thyme for Mediterranean, and paprika with garlic for BBQ.", "1123": "Start with a well-floured surface, mix flour and eggs, knead for 10 minutes, then rest dough for 30 minutes before rolling and cutting.", "1124": "Brewing methods include pour-over for clarity, French press for richness, and espresso for intensity. Adjust grind size for desired extraction.", "1125": "Pair light-bodied wines like Sauvignon Blanc with seafood. Pair full-bodied reds like Cabernet Sauvignon with grilled meats for optimal flavor balance.", "1126": "Top vegetarian protein sources include lentils (18g per cup), chickpeas (15g per cup), quinoa (8g per cup), and edamame (17g per cup).", "1127": "Store raw meat at 28\u00b0F (-2\u00b0C) to 32\u00b0F (0\u00b0C). Refrigerate leftovers within 2 hours; consume within 3-4 days. Freeze meats for up to 12 months.", "1128": "Feed your sourdough starter with equal parts flour and water weekly. Maintain at room temperature for active fermentation; refrigerate for slower growth.", "1129": "For beef, grill at 450\u00b0F to 500\u00b0F for medium-rare (135\u00b0F). Chicken should reach 165\u00b0F, grilled at medium heat (350\u00b0F to 400\u00b0F) for even cooking.", "1130": "Cognitive biases include confirmation bias, anchoring bias, and availability heuristic. 
Each influences decision-making and perception of reality.", "1131": "Attachment styles: secure (positive relationships), anxious (fear of abandonment), avoidant (emotional distance), and disorganized (fear-driven behavior).", "1132": "Maslow's hierarchy of needs: physiological, safety, love/belonging, esteem, and self-actualization, arranged in a pyramid from basic to complex needs.", "1133": "A growth mindset embraces challenges and sees failure as a learning opportunity, while a fixed mindset views abilities as static and unchangeable.", "1134": "Emotional intelligence components include self-awareness, self-regulation, motivation, empathy, and social skills, crucial for effective interpersonal relations.", "1135": "Memory techniques include the method of loci, acronyms, and chunking. For instance, use 'HOMES' to remember the Great Lakes: Huron, Ontario, Michigan, Erie, Superior.", "1136": "Habit formation involves cue, routine, and reward. Research shows it takes an average of 66 days to form a new habit, varying by individual and behavior.", "1137": "The stress response triggers fight or flight: heart rate increases, adrenaline surges, and cortisol levels rise, preparing the body for immediate action.", "1138": "Myers-Briggs types include 16 combinations like INTJ (Introverted, Intuitive, Thinking, Judging) and ESFP (Extraverted, Sensing, Feeling, Perceiving).", "1139": "Intrinsic motivation arises from internal rewards (personal growth), while extrinsic motivation is driven by external rewards (money, recognition).", "1140": "Decision-making psychology explores heuristics, biases, and the dual-process theory: System 1 (fast, intuitive) vs. System 2 (slow, deliberative).", "1141": "Procrastination can stem from fear of failure, perfectionism, or lack of motivation. Solutions include setting smaller tasks and using time management techniques.", "1142": "Renewable energy types include solar, wind, hydroelectric, geothermal, and biomass. 
Solar energy capacity reached 250 GW globally in 2020.", "1143": "To reduce carbon footprint: use public transport, reduce meat consumption (beef has the highest emissions), and increase energy efficiency in homes.", "1144": "Composting basics: use a mix of green materials (nitrogen-rich) and brown materials (carbon-rich). Maintain moisture and aeration for decomposition.", "1145": "Endangered species include the Amur leopard, Javan rhinoceros, and Sumatra orangutan, all facing threats from habitat loss and poaching.", "1146": "Recycling symbols: 1 (PETE), 2 (HDPE), 3 (PVC), 4 (LDPE), 5 (PP), 6 (PS), 7 (other). Each indicates the type of plastic for appropriate recycling.", "1147": "Ocean plastic pollution exceeded 150 million tons in 2020, harming marine life. Microplastics are particularly concerning, affecting food chains.", "1148": "Deforestation effects include loss of biodiversity, increased carbon emissions, and disruption of water cycles, threatening ecosystems and human livelihoods.", "1149": "Sustainable living tips: reduce single-use plastics, support local agriculture, conserve water, and choose energy-efficient appliances to lessen your impact.", "1150": "In 2021, the WWF reported a 68% decline in wildlife populations since 1970, emphasizing the need for habitat protection and anti-poaching laws.", "1151": "The average cost of solar panel installation in the U.S. is around $3 per watt, with a typical system size of 5 kW costing approximately $15,000 before incentives.", "1152": "Implementing drip irrigation can reduce water usage by up to 60% compared to traditional methods, significantly conserving water in agricultural practices.", "1153": "Biodiversity boosts ecosystem productivity, resilience, and stability. 
Healthy ecosystems with diverse species can provide humans with food, clean air, and water.", "1154": "The derivative of a function f(x) at a point x=a is defined as the limit of the difference quotient as h approaches 0: f'(a) = lim(h->0) [f(a+h) - f(a)]/h.", "1155": "Probability basics include the concept that the probability of an event A is P(A) = Number of favorable outcomes / Total number of outcomes.", "1156": "In linear algebra, a matrix can represent a system of equations. The product of matrices A (m x n) and B (n x p) results in a new matrix C (m x p).", "1157": "A common geometry proof is the Pythagorean theorem: For a right triangle with legs a and b, and hypotenuse c, a\u00b2 + b\u00b2 = c\u00b2 holds true.", "1158": "Logarithm properties include: log_b(m * n) = log_b(m) + log_b(n) and log_b(m/n) = log_b(m) - log_b(n). Base changes are done via log_b(m) = log_k(m)/log_k(b).", "1159": "Key trigonometric identities include sin\u00b2(x) + cos\u00b2(x) = 1 and tan(x) = sin(x)/cos(x), essential for solving various trigonometric equations.", "1160": "Set theory basics include operations like union (A \u222a B), intersection (A \u2229 B), and difference (A - B), defining relationships between sets.", "1161": "Prime numbers are defined as having only two distinct positive divisors: 1 and itself. The first five primes are 2, 3, 5, 7, and 11.", "1162": "To convert fractions to decimals, divide the numerator by the denominator. For example, 1/4 equals 0.25, while 3/8 equals 0.375.", "1163": "To solve equations like 2x + 3 = 7, isolate x by subtracting 3 from both sides, then divide by 2, yielding x = 2.", "1164": "Graph theory fundamentals include vertices (nodes) and edges (connections). A simple graph contains no loops or multiple edges between vertices.", "1165": "In combinatorics, the formula for permutations is P(n, r) = n! 
/ (n-r)!, representing the number of ways to arrange r objects from n.", "1166": "In Spanish, regular -ar verbs conjugate by dropping the -ar and adding endings: -o, -as, -a, -amos, -\u00e1is, -an for present tense.", "1167": "Japanese hiragana consists of 46 characters representing syllables, while katakana also has 46 characters used mainly for foreign words.", "1168": "French pronunciation rules include nasal sounds in words like 'pain' and liaisons where final consonants are pronounced if followed by a vowel.", "1169": "German grammar includes four cases: nominative (subject), accusative (direct object), dative (indirect object), and genitive (possession).", "1170": "Mandarin tones are crucial for meaning; there are four tones: first (high), second (rising), third (dipping), and fourth (falling).", "1171": "Common Latin phrases include 'Carpe Diem' (Seize the day) and 'Et cetera' (And the rest), often used in modern contexts.", "1172": "The Arabic alphabet consists of 28 letters, written from right to left, with letters changing shape depending on their position in a word.", "1173": "Common English idioms include 'Break the ice' (to initiate conversation) and 'Bite the bullet' (to face a difficult situation).", "1174": "Basics of sign language include the manual alphabet, commonly fingerspelling names, and essential signs like 'thank you' and 'please'.", "1175": "The word 'etymology' derives from the Greek 'etymon' meaning 'true sense'. It dates back to the 14th century, reflecting the study of word origins.", "1176": "Use a comma to separate items in a list. An apostrophe indicates possession, e.g., 'the dog's leash'. A semicolon links closely related independent clauses.", "1177": "The Chicago Manual of Style recommends using the Oxford comma for clarity. APA style prefers in-text citations with author-date format: (Smith, 2020).", "1178": "Common woodworking joints include butt joints, miter joints, dovetail joints, and mortise and tenon. 
Each has distinct strength and aesthetic characteristics.", "1179": "Beginner knitting patterns often include simple projects like scarves or dishcloths. Look for patterns that use basic stitches like knit and purl.", "1180": "Basic home repair skills include fixing leaky faucets, patching drywall, and unclogging drains. Essential tools include a hammer, screwdriver, and pliers.", "1181": "To thread a sewing machine, first raise the presser foot, then follow the threading diagram. Ensure the needle is correctly inserted and facing down.", "1182": "Acrylic painting techniques include layering, glazing, and dry brushing. Use a palette knife for texture and experiment with water for different effects.", "1183": "On a pottery wheel, start with centered clay. Press down and pull up to shape your piece. Keep hands wet for smoother results and avoid excessive pressure.", "1184": "For soldering electronics, use a soldering iron at 350\u00b0C. Clean surfaces with flux, apply solder evenly, and ensure joints are solid and shiny.", "1185": "Prepare garden soil by testing pH levels; ideally, it should be between 6.0 and 7.0. Amend with compost and organic matter to enhance fertility.", "1186": "Essential candle making supplies include wax (soy or paraffin), wicks, fragrance oils, and a double boiler. Safety gear is also recommended.", "1187": "Basic leather crafting tools include a rotary cutter, edge tools, and a stitching awl. A cutting mat protects surfaces while working on projects.", "1188": "Origami folding instructions often start with a square piece of paper. 
Common folds include valley folds, mountain folds, and reverse folds for structure.", "1189": "For furniture restoration, clean surfaces with a gentle solvent, repair joints with wood glue, and finish with varnish or oil for protection.", "1190": "As of 2026, GitHub introduced features like 'Code Suggestions' using AI, and enhanced security measures for repository management.", "1191": "In 2025, Kubernetes added 'Ephemeral Containers' for debugging, and 'Volume Snapshot' support for persistent storage management improvements.", "1192": "November 2023 saw an increase in climate tech investments, with $1.2 billion in funding directed toward renewable energy startups and carbon capture technologies.", "1193": "The latest release of React, version 18.2.0, features automatic batching and improved SSR support, enhancing performance and user experience.", "1194": "In October 2023, AI advancements included OpenAI's GPT-4.5 release, focusing on multimodal capabilities and improved contextual understanding.", "1195": "Kubernetes 2026 introduced 'Kubelet Configuration' for better node management and 'API Aggregation Layer' enhancements for custom resource handling.", "1196": "GitHub's latest version, released December 2023, includes streamlined pull request reviews and enhanced project management tools.", "1197": "Latest Python updates (3.11) emphasize performance improvements, with a 10-60% speed increase in major libraries and syntax enhancements.", "1198": "Shopify's December 2023 updates included new payment processing options and enhanced analytics tools for better sales tracking and inventory management.", "1199": "November 2023 saw Vue.js release version 3.2.0, introducing the Composition API and improved TypeScript support for developers.", "1200": "Next.js 2025 changelog highlights include improved SSR, enhanced image optimization, and the addition of middleware support for better routing.", "1201": "Docker's latest version, 24.0, released in March 2025, introduces 
support for multi-platform images and enhanced security features with new scanning tools.", "1202": "Kubernetes 2025 changelog reveals v1.27 introduced PodSecurity admission, enhanced scheduler performance, and new API for custom resource metrics.", "1203": "New features in Docker 2025 include BuildKit improvements, automatic layer caching, and integration of container logging with external services.", "1204": "Vue 2025 changes include Composition API enhancements, Vue Router v5 with improved lazy loading, and better TypeScript support for seamless development.", "1205": "AI advancements in 2025 feature GPT-5 release with 10 trillion parameters, improved multimodal capabilities, and enhanced ethical guidelines for AI usage.", "1206": "Vue 2026 updates focus on better performance optimizations, new CLI features for easier project scaffolding, and support for Suspense in SSR.", "1207": "Recent AI changes in 2025 involve the development of explainable AI frameworks and regulations for AI-generated content to ensure consumer protection.", "1208": "October 2025 Vue news includes the announcement of Vue 3.3 with improved reactivity performance and community initiatives for better documentation.", "1209": "Next.js 2026 introduces React Server Components, native ES modules support, and enhanced analytics for performance tracking and optimization.", "1210": "Docker changelog 2026 highlights include introduction of Docker Compose v2.5, improved networking features, and optimizations for resource usage.", "1211": "November 2025 Python news features the release of Python 3.12 with performance improvements and new syntax for type hinting for enhanced readability.", "1212": "Recent Python changes in 2026 include async improvements, new pattern matching capabilities, and the deprecation of older libraries like urllib.", "1213": "Climate tech changelog 2026 highlights include advancements in carbon capture technologies, renewable energy innovations, and new funding initiatives.", 
"1214": "GitHub changelog 2026 reveals new features such as enhanced code review tools, automatic security updates, and improved CI/CD integrations.", "1215": "Shopify's latest version, 2.5, released in April 2025, introduced improved payment processing features and new tools for inventory management.", "1216": "Recent Python changes in 2025 include the introduction of f-string debugging, better performance with PEP 572, and new async IO utilities.", "1217": "AWS changes in 2025 include the launch of Graviton3 processors, enhanced AI/ML services with SageMaker updates, and new serverless offerings.", "1218": "October 2025 climate tech news showcases the launch of three new solar projects, advancements in battery storage technology, and funding announcements.", "1219": "Python changelog 2025 details the release of Python 3.11 with performance boosts, new error messages, and enhanced typing features for developers.", "1220": "Latest AI updates include breakthroughs in natural language understanding, advancements in reinforcement learning, and expanded ethical AI frameworks.", "1221": "Vue recent news December 2025 covers the upcoming Vue 3.4 release, new plugins for state management, and community-driven enhancements.", "1222": "React news in October 2025 highlights the release of React 18.2 with improved hydration techniques and updates to the new Concurrent features.", "1223": "Recent space exploration changes in 2025 include Artemis II crew selection, Mars Sample Return mission prep, and advancements in satellite technology.", "1224": "Latest space exploration release includes NASA\u2019s Artemis III mission scheduled for 2026, featuring new lunar lander designs and crew training updates.", "1225": "In 2026, ML frameworks like TensorFlow 3.0 and PyTorch 2.2 introduced enhanced support for large language models and improved GPU utilization.", "1226": "December 2025 saw the launch of OpenAI's Codex 2.0, significantly improving code generation and debugging 
capabilities for developers.", "1227": "GitHub unveiled a new AI-powered code review feature in 2026, enhancing pull request suggestions using machine learning algorithms.", "1228": "Vue 3.3 released in 2026, introducing Composition API enhancements and new directives for improved reactivity and component organization.", "1229": "Docker 20.10.14 in 2025 added support for multi-architecture images and improved performance for build caching and layer management.", "1230": "In 2026, GitHub launched Copilot Labs, introducing experimental features for collaborative coding and enhanced documentation generation.", "1231": "Shopify reported a 25% increase in Q3 2026 revenue, driven by enhanced AI tools for personalized shopping experiences and inventory management.", "1232": "GitHub's 2025 updates included improved issue tracking and the rollout of Discussions, allowing teams to communicate more effectively.", "1233": "Next.js 13 released in 2026 with new features like middleware support and improved image optimization, enhancing performance on server-side rendering.", "1234": "TypeScript 5.0 in 2026 introduced new syntax for type aliases and improved inference, increasing developer productivity and code clarity.", "1235": "Python 3.11 added structural pattern matching and performance improvements, with benchmarks showing up to 30% faster execution in certain cases.", "1236": "Climate tech updates in 2025 included breakthroughs in carbon capture technology, with several startups reporting efficiencies over 90% in CO2 removal.", "1237": "In December 2025, GitHub reported reaching 100 million repositories, highlighting a 15% increase in open-source contributions year-over-year.", "1238": "Kubernetes 1.27 launched in 2026, featuring enhanced security with PodSecurity admission and improved scheduling algorithms for resource optimization.", "1239": "October 2025 saw Kubernetes releasing its new multi-cluster management capabilities, simplifying operations across various 
environments.", "1240": "TypeScript's October 2025 updates included support for decorators and a new compiler API, aimed at improving the development experience.", "1241": "Docker's October 2025 news highlighted partnerships with cloud providers to streamline container orchestration and deployment for enterprise solutions.", "1242": "In 2025, significant milestones in space exploration included the successful Mars Sample Return mission planning by NASA, targeting launch in 2031.", "1243": "Vue 3.2 was released in 2026, featuring improved TypeScript support and a new plugin system aimed at enhancing modular development.", "1244": "Next.js 12 introduced in 2025 featured automatic static optimization and a revamped API for handling serverless functions more efficiently.", "1245": "In 2025, climate tech innovations included AI-driven energy management systems, reducing operational costs by up to 40% for large enterprises.", "1246": "2026 saw climate tech advancements in renewable energy storage, with new battery technologies achieving 20% greater efficiency over previous models.", "1247": "Space exploration updates in 2026 included the Artemis II mission's successful crewed flight test, paving the way for lunar landings by 2028.", "1248": "Shopify's 2025 features included an enhanced AR shopping experience and a new subscription management tool for recurring billing solutions.", "1249": "In 2026, climate tech focused on sustainable agriculture innovations, with vertical farming techniques reducing water usage by 60% compared to traditional methods.", "1250": "In October 2023, researchers unveiled a new ML model achieving 95% accuracy in image recognition, leveraging self-supervised learning techniques.", "1251": "React 18.3 introduced features like automatic batching, improved SSR support, and new hooks for better state management, enhancing performance and developer experience.", "1252": "TypeScript 5.4 released on October 12, 2023, featuring improved inference for 
`const` assertions and new utility types, boosting type safety and developer productivity.", "1253": "Next.js 13.5 released on October 15, 2023, includes enhanced image optimization, middleware support, and improved build performance for static exports.", "1254": "Kubernetes 1.28, releasing in August 2026, includes the new PodSecurity admission, enhanced resource quotas, and improved stateful set scaling capabilities.", "1255": "In 2026, React introduced Concurrent Features by default, improving rendering performance and user experience, along with new SSR capabilities.", "1256": "2025 saw the launch of a $500 million fund for climate tech startups, focusing on carbon capture and renewable energy innovations to mitigate climate change.", "1257": "Shopify 2026 introduced AI-driven product recommendations and one-click checkout, significantly increasing conversion rates for merchants by 30% on average.", "1258": "Kubernetes 2026 changelog highlights include enhancements to the Container Storage Interface and more robust support for multi-cluster management tools.", "1259": "In November 2025, Shopify reported a 25% increase in merchant sales, attributed to new analytics tools and improved integration with social media platforms.", "1260": "GitHub announced a new Copilot feature in October 2023 that generates code snippets in multiple programming languages, streamlining the development process.", "1261": "Kubernetes news in December 2023 highlights the upcoming 1.29 release, featuring better support for ephemeral containers and improved security policies.", "1262": "Docker 2025 introduced BuildKit enhancements, reducing build times by 40%, and added native support for multi-platform builds in the Docker CLI.", "1263": "React 2025 updates focused on performance optimizations, including tree-shaking improvements and better integration with TypeScript for type safety.", "1264": "Kubernetes 2025 changed the default storage class to support volume snapshots, improving data 
resilience and backup strategies across clusters.", "1265": "TypeScript 2026 introduced the `satisfies` operator for better type inference, streamlining the process of ensuring types align with expected interfaces.", "1266": "Shopify's 2025 changelog includes the introduction of Shopify Fulfillment Network, enabling faster shipping options for merchants across North America.", "1267": "Docker's latest updates in October 2023 include enhanced security scanning features and improved integration with Kubernetes for streamlined deployments.", "1268": "In 2025, new ML frameworks emerged, like PyTorch 2.0, emphasizing GPU acceleration and modularity, significantly improving model training times.", "1269": "2026 AI updates include breakthroughs in natural language processing, with models achieving human-like conversational abilities and context awareness improvements.", "1270": "Docker 2026 updates focus on enhanced support for serverless functions, allowing developers to deploy functions directly from the Docker CLI efficiently.", "1271": "AWS 2026 introduced new AI services like SageMaker Studio Lab, providing free compute resources for ML model experimentation and training.", "1272": "Shopify 2025 updated its API to include advanced analytics features, allowing merchants to track user behavior and optimize sales strategies effectively.", "1273": "AI changelog 2026 highlights include the release of GPT-5, which boasts improved contextual understanding and a 50% reduction in response time.", "1274": "Kubernetes 2023 updates include improved scheduling algorithms and enhanced observability features, enabling better monitoring of cluster performance.", "1275": "In 2025, climate tech saw a 30% increase in solar panel efficiency, with new perovskite materials. 
Carbon capture technology also advanced, reducing costs by 40%.", "1276": "Latest updates in machine learning include the introduction of GPT-5, boasting 175 billion parameters, and advancements in self-supervised learning techniques.", "1277": "Next.js 2025 introduced middleware support, enabling server-side logic without API routes, and improved image optimization with the new 'next/image' component.", "1278": "TypeScript 2025 added support for 'override' and 'override declaration' keywords, improved type inference, and introduced the 'satisfies' operator for type-checking.", "1279": "In 2026, AWS announced the launch of Graviton3 instances, offering 25% better price-performance, and the introduction of SageMaker Canvas for no-code ML.", "1280": "Vue 2025 added the Composition API enhancements, improved reactivity model, and introduced a new CLI tool for project scaffolding and dependency management.", "1281": "New features in TypeScript 2025 include 'template literal types', 'const assertions', and improved support for 'readonly' and 'writeonce' modifier types.", "1282": "React's December news highlights the release of React 18.2, focusing on performance optimizations and the introduction of the 'useId' hook for unique IDs.", "1283": "AWS changelog 2026 features the introduction of Amazon RDS Proxy for serverless applications and enhanced security with IAM roles for service accounts.", "1284": "AI recent news in December includes the unveiling of ChatGPT 4.5, which features enhanced reasoning capabilities and real-time web browsing integration.", "1285": "TypeScript's December updates include a new compiler option for 'useDefineForClassFields' and improvements in performance for large project builds.", "1286": "In December, climate tech reports highlighted a 50% rise in investments in renewable energy projects, with significant advancements in battery storage technologies.", "1287": "Next.js recent news in October covered the beta release of the new 
'next/future' experimental features, focusing on improved developer experience and performance.", "1288": "The latest AI version release is GPT-5, launched in December 2025, featuring multi-modal capabilities and an expanded knowledge base up to 2026.", "1289": "Latest Next.js updates include automatic static optimization improvements and new support for React Server Components, enhancing SSR capabilities.", "1290": "Vue's new features in 2026 include improved TypeScript integration, enhanced routing capabilities, and a new state management library for simplified state handling.", "1291": "Space exploration updates in 2026 include the Artemis III mission planned for 2027, aiming to establish a sustainable lunar presence by 2030.", "1292": "Recent Shopify changes in 2026 include the release of Shopify Plus 3.0 with improved analytics tools and AI-driven product recommendations for merchants.", "1293": "The latest version release in machine learning is TensorFlow 3.0, which emphasizes modularity and performance improvements for distributed training.", "1294": "Docker's new features in 2026 include support for multi-platform builds and enhanced security features with built-in vulnerability scanning for images.", "1295": "Python's recent news in December includes the release of Python 3.12, featuring performance enhancements and pattern matching for cleaner syntax.", "1296": "React 2026 changes include the introduction of concurrent rendering improvements and the new 'useDeferredValue' hook for managing rendering priorities.", "1297": "Docker changelog 2025 highlighted the addition of BuildKit enhancements and support for Docker Compose v2, improving multi-container orchestration.", "1298": "Changes in Docker 2026 include the introduction of containerd support for enhanced runtime performance and a new integrated CLI for easier management.", "1299": "Recent Next.js changes in 2026 include a new plugin system for easier customization and improved static site generation 
capabilities.", "1300": "In 2023, breakthroughs in carbon capture tech have emerged, with companies like Climeworks achieving over 1,000 tons of CO2 captured monthly.", "1301": "The 2026 ML changelog highlights the introduction of TensorFlow 3.0, which features enhanced model optimization and expanded support for quantum computing.", "1302": "AWS 2025 updates include the launch of new Graviton3 processors, promising up to 25% better performance for EC2 instances compared to Graviton2.", "1303": "November 2023 saw Kubernetes 1.27 release, introducing improved support for Windows workloads and enhanced security features with PodSecurity admission.", "1304": "In 2025, AI advancements included OpenAI's release of GPT-5, boasting capabilities for multi-modal inputs and improved context understanding.", "1305": "Next.js 2025 introduced support for React Server Components and a new image optimization API, enhancing performance for dynamic websites.", "1306": "October 2023 saw Python 3.12 release, which includes type parameters in collections and performance improvements, with benchmarks showing 5-10% speedup.", "1307": "Vue 3.3 released in early 2025, offering improved TypeScript support and the new 'Teleport' feature for efficient DOM manipulation.", "1308": "AI features in 2026 include real-time language translation by Google AI and enhanced ethical guidelines for AI deployment across industries.", "1309": "React 18 introduced a new concurrent rendering feature, allowing developers to create smoother user experiences by prioritizing updates in 2026.", "1310": "Vue 3.2 released in 2025, featuring Composition API enhancements and better reactivity performance, with an emphasis on developer experience.", "1311": "The latest climate tech version, ClimateTech 2.1, released in November 2023, includes updates to renewable energy tracking and emissions reporting tools.", "1312": "Python 3.12 was released in October 2023, bringing new features like the 'match' statement 
enhancements and more robust error messages.", "1313": "AWS December 2023 news includes the introduction of new SageMaker features for automated machine learning workflows and model tuning capabilities.", "1314": "GitHub's 2025 changelog highlights the introduction of 'Projects v3', enabling enhanced project management with Kanban boards and automation.", "1315": "Machine learning in 2026 will see the rise of self-supervised learning techniques, reducing the need for labeled data and improving model accuracy.", "1316": "Recent space exploration news from October 2023 includes NASA's Artemis II mission, set to launch in 2024, aiming to return humans to the Moon.", "1317": "React 2026 changelog features the introduction of 'Suspense for Data Fetching', optimizing loading states in applications, enhancing user experience.", "1318": "The React 2025 changelog highlights the introduction of server-side rendering improvements and automatic static optimization features.", "1319": "Machine learning updates in November 2023 include new frameworks that simplify deep learning model training, reducing setup time by 30%.", "1320": "GitHub new features in 2025 include enhanced code review tools and the introduction of 'Discussions', fostering community engagement on projects.", "1321": "New features in machine learning for 2025 include automated feature engineering tools and improved support for federated learning frameworks.", "1322": "In November 2023, AI news reported a breakthrough in explainable AI, with researchers developing models that can articulate decision-making processes.", "1323": "Python 3.11 introduced in 2025 brings 'frozen' dataclasses and performance optimizations, with benchmarks showing up to 20% faster execution.", "1324": "Latest Shopify updates include the launch of Shopify Markets for global selling and enhanced analytics features for better sales insights.", "1325": "Kubernetes 1.26 introduces 'PodSecurity Admission' for better security policies, 
'Immutable Secrets' for configuration stability, and improved 'HPA' scaling capabilities.", "1326": "In 2026, AI advancements include GPT-4's release, improved multimodal capabilities, and new ethical frameworks for AI deployment in industries like healthcare.", "1327": "Machine learning in 2026 sees the introduction of 'AutoML 2.0', enhanced model interpretability tools, and breakthroughs in federated learning for privacy-preserving AI.", "1328": "Shopify's 2025 updates include 'Shopify Markets' for global selling, 'Shopify Flow' for automated workflows, and a revamped 'Shopify POS' for retail integration.", "1329": "In 2025, machine learning focuses on 'explainable AI' with frameworks like LIME, and the integration of 'reinforcement learning' in real-time applications.", "1330": "Shopify's 2026 features include 'AI-driven product recommendations', 'Augmented Reality' for product previews, and enhanced 'in-app messaging' for customer support.", "1331": "In November, Docker released version 24.0 with improved build performance, support for multi-platform images, and enhanced security features with 'Docker Bench'.", "1332": "Latest Vue updates include Vue 3.2's Composition API enhancements, improved TypeScript support, and the introduction of 'Suspense' for better async component handling.", "1333": "Next.js 13.0 introduces 'app directory' for routing, 'React Server Components' for improved performance, and 'image optimization' using the new 'next/image' component.", "1334": "GitHub's 2026 updates include 'GitHub Codespaces' enhancements, 'Advanced Security' with secret scanning, and 'Discussion' features for better community engagement.", "1335": "AWS 2025 introduces 'Graviton3' instances for better performance, 'AWS CloudFormation' for simplified resource management, and 'SageMaker Canvas' for no-code ML.", "1336": "Python 3.11, released in 2026, introduces 'match' statements for structural pattern matching and significant performance improvements with benchmarks 
showing 30% faster execution.", "1337": "TypeScript 4.7 (2025) includes 'template literal types', 'key remapping' in mapped types, and improved type inference for better developer experience.", "1338": "2026 milestones in space exploration include Artemis II's crewed lunar flyby, Mars Sample Return mission planning, and the launch of the James Webb Space Telescope's successor.", "1339": "AWS 2026 unveils 'Lambda SnapStart' for quicker cold start times, 'App Runner' for simplified app deployments, and expanded 'S3 Object Lambda' capabilities.", "1340": "TypeScript 4.6 (2025) brings 'ESM support' improvements, 'exact optional property types', and 'control flow analysis' enhancements for better type checking.", "1341": "Latest TypeScript updates include improved type-checking speed, support for 'type-only imports', and 'declaration emit' optimizations in version 4.9.", "1342": "React 18 introduces 'Concurrent Mode' for better rendering capabilities, 'automatic batching' of updates, and the new 'Suspense' feature for data fetching.", "1343": "AWS changelog 2025 highlights include 'EC2 Auto Scaling' enhancements, introduction of 'AWS CDK v2', and new 'RDS' features for better database management.", "1344": "Space exploration changelog 2026 highlights include the successful Mars Sample Return mission planning, the launch of the Lunar Gateway, and ongoing updates from the Artemis program.", "1345": "React 2025 features include 'automatic hydration', new hooks for performance optimization, and enhancements to the 'React DevTools' for better debugging.", "1346": "AWS latest version release includes 'Amazon RDS' with Multi-AZ deployments for SQL databases, enhanced 'EKS' features for Kubernetes management, and 'S3' lifecycle policies.", "1347": "Latest space exploration updates highlight the Perseverance rover's ongoing Mars exploration, successful ISS missions, and developments in lunar base planning.", "1348": "Kubernetes latest version release 1.27 includes 'Kubelet 
Configuration' improvements, 'enhanced metrics server', and 'custom metrics' for better workload management.", "1349": "React recent news in November 2025 includes the release of 'React 18.1', improved server-side rendering capabilities, and community updates from the React Conf.", "1350": "TypeScript 5.2 was released on November 15, 2023, introducing new decorators and improved type inference for JSX. Enhancements focus on performance and developer experience.", "1351": "By 2025, AI has integrated into everyday applications with a focus on explainability. Notable advancements include GPT-4's contextual awareness and real-time language translation.", "1352": "In December 2023, Docker announced version 24.0, featuring improved security in container images and support for multi-architecture builds, enhancing deployment flexibility.", "1353": "The TypeScript changelog for 2026 notes the introduction of type-only imports and exports, improving module performance and clarity, set for release in Q2 2026.", "1354": "Space exploration in 2025 includes the Artemis III mission aiming for a lunar landing in late 2025, alongside advancements in Mars sample return missions and asteroid mining.", "1355": "Recent news in December 2023 highlights NASA's successful test of the Space Launch System, paving the way for upcoming lunar missions and interplanetary exploration.", "1356": "Shopify's 2026 changelog includes new features like augmented reality product displays, a revamped checkout process, and enhanced integration with social media platforms.", "1357": "AWS announced significant updates in November 2023, including the launch of Amazon SageMaker Canvas for no-code ML and enhanced security features for AWS Lambda.", "1358": "October 2023 saw AWS release new capabilities for Amazon RDS, including cross-region read replicas and automated backups for PostgreSQL, enhancing database resilience.", "1359": "Next.js 14 was released in December 2023, introducing native support for 
React Server Components and improved data fetching methods for optimized performance.", "1360": "November 2023 features news on the James Webb Telescope's first exoplanet imaging results, marking a milestone in astronomical research and deep space exploration.", "1361": "Python 3.12, set for release in 2025, will include structural pattern matching enhancements and performance improvements for integer operations, increasing execution speed.", "1362": "GitHub's November 2023 updates include new project management features, enhanced dependency graphs, and the introduction of AI-powered code review suggestions.", "1363": "Machine learning changelog for 2025 highlights the mainstream adoption of federated learning frameworks and enhanced model interpretability tools in major ML libraries.", "1364": "Next.js updates for November 2023 include improved static generation features and the introduction of a new image optimization API for faster load times.", "1365": "Latest AWS updates include the introduction of Amazon Bedrock for generative AI, expanded capabilities of AWS Lambda, and enhancements to AWS CloudFormation.", "1366": "Vue 3.3 changes in 2026 focus on improved reactivity APIs, TypeScript support enhancements, and integration with Vite for faster build times and improved performance.", "1367": "2025's space exploration changes include successful Mars colonization simulations, advancements in reusable rockets, and increased international collaboration in lunar missions.", "1368": "TypeScript 2026 introduces new features like `satisfies` operator for type assertions and improved support for ECMAScript modules, enhancing code maintainability.", "1369": "GitHub's 2025 updates include revamped project boards, enhanced repository insights, and the introduction of built-in code review automation using AI tools.", "1370": "Recent climate tech changes in 2026 focus on carbon capture innovations, widespread adoption of renewable energy technologies, and regulatory 
frameworks for green tech.", "1371": "Python 2026 changelog includes introduction of new syntax for data classes, performance enhancements, and expanded support for asynchronous programming paradigms.", "1372": "TDS Motorsports specializes in high-performance motorsport vehicles, focusing on customization and engineering excellence for racing applications and automotive enthusiasts.", "1373": "React Hooks tutorial covers useState and useEffect hooks, guiding users through state management and side effects in functional components for optimal performance.", "1374": "Docker container networking now supports IPv6 and improved service mesh integration, allowing seamless communication between services in multi-container applications.", "1375": "Use 'kubectl apply -f deployment.yaml' to deploy a pod. Specify replicas, selectors, and container specs in the YAML file. Monitor with 'kubectl get pods'.", "1376": "Set up AWS Lambda via the console or CLI. Choose a runtime (e.g., Node.js 14.x), configure triggers, and set the execution role for permissions.", "1377": "Integrate Stripe by installing the Stripe SDK. Use 'stripe.charges.create' to process payments. Ensure to set up webhooks for asynchronous events.", "1378": "Create a .github/workflows directory. Define a YAML file with triggers, jobs, and steps. Use 'runs-on: ubuntu-latest' for environment setup.", "1379": "Deploy to Vercel by connecting your GitHub repo. Configure build settings in 'vercel.json'. Run 'vercel' in the terminal for CLI deployment.", "1380": "Configure Supabase Auth by enabling providers in the dashboard. Use 'supabase.auth.signIn()' for user login and 'supabase.auth.onAuthStateChange()' for state tracking.", "1381": "Utilize Twilio SMS API with 'twilio.messages.create()' method. Set 'from' and 'to' numbers. Ensure to handle responses for successful delivery status.", "1382": "Set up Datadog monitoring by installing the agent on your servers. 
Configure integrations for AWS, Kubernetes, or any services you want to monitor.", "1383": "Integrate Sentry by adding the SDK to your application. Use 'Sentry.init()' with your DSN. Capture errors with 'Sentry.captureException()' in your code.", "1384": "Configure the Terraform AWS provider using 'provider \"aws\" { region = \"us-east-1\" }'. Use 'terraform init' and 'terraform apply' for deployment.", "1385": "Example playbook: - name: Install nginx tasks: - name: Install nginx apt: pkg=nginx state=present. Use 'ansible-playbook playbook.yml' to execute."}, "completed_indices": [1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047, 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055, 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063, 1064, 1065, 1066, 1067, 1068, 1069, 1070, 1071, 1072, 1073, 1074, 1075, 1076, 1077, 1078, 1079, 1080, 1081, 1082, 1083, 1084, 1085, 1086, 1087, 1088, 1089, 1090, 1091, 1092, 1093, 1094, 1095, 1096, 1097, 1098, 1099, 1100, 1101, 1102, 1103, 1104, 1105, 1106, 1107, 1108, 1109, 1110, 1111, 1112, 1113, 1114, 1115, 1116, 1117, 1118, 1119, 1120, 1121, 1122, 1123, 1124, 1125, 1126, 1127, 1128, 1129, 1130, 1131, 1132, 1133, 1134, 1135, 1136, 1137, 1138, 1139, 1140, 1141, 1142, 1143, 1144, 1145, 1146, 1147, 1148, 1149, 1150, 1151, 1152, 1153, 1154, 1155, 1156, 1157, 1158, 1159, 1160, 1161, 1162, 1163, 1164, 1165, 1166, 1167, 1168, 1169, 1170, 1171, 1172, 1173, 1174, 1175, 1176, 1177, 1178, 1179, 1180, 1181, 1182, 1183, 1184, 1185, 1186, 1187, 1188, 1189, 1190, 1191, 1192, 1193, 1194, 1195, 1196, 1197, 1198, 1199, 1200, 1201, 1202, 1203, 1204, 1205, 1206, 1207, 1208, 1209, 1210, 1211, 1212, 1213, 1214, 1215, 1216, 1217, 1218, 1219, 1220, 1221, 1222, 1223, 1224, 1225, 1226, 1227, 1228, 1229, 1230, 1231, 1232, 1233, 1234, 
1235, 1236, 1237, 1238, 1239, 1240, 1241, 1242, 1243, 1244, 1245, 1246, 1247, 1248, 1249, 1250, 1251, 1252, 1253, 1254, 1255, 1256, 1257, 1258, 1259, 1260, 1261, 1262, 1263, 1264, 1265, 1266, 1267, 1268, 1269, 1270, 1271, 1272, 1273, 1274, 1275, 1276, 1277, 1278, 1279, 1280, 1281, 1282, 1283, 1284, 1285, 1286, 1287, 1288, 1289, 1290, 1291, 1292, 1293, 1294, 1295, 1296, 1297, 1298, 1299, 1300, 1301, 1302, 1303, 1304, 1305, 1306, 1307, 1308, 1309, 1310, 1311, 1312, 1313, 1314, 1315, 1316, 1317, 1318, 1319, 1320, 1321, 1322, 1323, 1324, 1325, 1326, 1327, 1328, 1329, 1330, 1331, 1332, 1333, 1334, 1335, 1336, 1337, 1338, 1339, 1340, 1341, 1342, 1343, 1344, 1345, 1346, 1347, 1348, 1349, 1350, 1351, 1352, 1353, 1354, 1355, 1356, 1357, 1358, 1359, 1360, 1361, 1362, 1363, 1364, 1365, 1366, 1367, 1368, 1369, 1370, 1371, 1372, 1373, 1374, 1375, 1376, 1377, 1378, 1379, 1380, 1381, 1382, 1383, 1384, 1385]} \ No newline at end of file diff --git a/finetune/dataset/prepare_data.py b/finetune/dataset/prepare_data.py index 3006be6..ad7b28c 100644 --- a/finetune/dataset/prepare_data.py +++ b/finetune/dataset/prepare_data.py @@ -73,6 +73,8 @@ def format_for_training(ex: TrainingExample) -> dict: text = text.replace("\n\n\n\n", "") return { + "query": ex.query, + "output": ex.output_as_lists(), "text": text, "messages": messages, } diff --git a/finetune/dataset/prepare_data_lfm2.py b/finetune/dataset/prepare_data_lfm2.py new file mode 100644 index 0000000..8cae6a5 --- /dev/null +++ b/finetune/dataset/prepare_data_lfm2.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python3 +"""Prepare QMD query expansion data for LFM2.5-1.2B-Instruct training. + +LFM2.5 uses ChatML format: + <|startoftext|><|im_start|>user + Expand this search query: {query}<|im_end|> + <|im_start|>assistant + {output}<|im_end|> + +No /no_think needed (that's Qwen3-specific). 
def format_for_training(query_text: str, output_items: list[list[str]], tokenizer) -> dict:
    """Render one (query, expansions) pair as a chat-templated training string.

    Args:
        query_text: Search query placed in the user turn.
        output_items: Expansion items; serialized with ``output_items_to_text``
            to form the assistant turn.
        tokenizer: Object providing ``apply_chat_template``; it determines the
            exact markup of the rendered conversation.

    Returns:
        A dict with a single ``"text"`` key holding the rendered conversation.
    """
    messages = [
        {"role": "user", "content": f"Expand this search query: {query_text}"},
        {"role": "assistant", "content": output_items_to_text(output_items)},
    ]
    # The assistant turn is already part of `messages`, so no open generation
    # prompt must be appended after it.
    text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=False
    )
    return {"text": text}


def main():
    """Build shuffled 90/10 train/val JSONL files for LFM2.5 fine-tuning.

    Reads ``data/qmd_expansion_v2.jsonl`` and writes ``train.jsonl`` and
    ``val.jsonl`` into ``data/train-lfm2`` (both paths are relative to the
    current working directory). Blank input lines are skipped, and the sample
    preview is only printed when the training split is non-empty, so tiny or
    empty inputs no longer end in an ``IndexError``.
    """
    input_path = Path("data/qmd_expansion_v2.jsonl")
    output_dir = Path("data/train-lfm2")
    output_dir.mkdir(parents=True, exist_ok=True)

    print("Loading LFM2.5 tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(
        "LiquidAI/LFM2.5-1.2B-Instruct", trust_remote_code=True
    )

    examples = []
    with open(input_path, encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            row = json.loads(line)
            items = normalize_output_items(row["output"])
            examples.append(format_for_training(row["query"], items, tokenizer))

    # Fixed seed keeps the train/val split reproducible across runs.
    random.seed(42)
    random.shuffle(examples)

    split_idx = int(len(examples) * 0.9)
    train = examples[:split_idx]
    val = examples[split_idx:]

    # Write each split as JSONL, one example per line.
    for filename, rows in (("train.jsonl", train), ("val.jsonl", val)):
        with open(output_dir / filename, "w", encoding="utf-8") as f:
            f.writelines(json.dumps(ex) + "\n" for ex in rows)

    print(f"Written {len(train)} train, {len(val)} val examples to {output_dir}")
    if train:
        print(f"\nSample formatted text:")
        print(train[0]["text"][:500])


if __name__ == "__main__":
    main()
mode 100644 index 0000000..345b5a3 --- /dev/null +++ b/finetune/eval_retrieval.py @@ -0,0 +1,488 @@ +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "transformers>=4.45.0", +# "peft>=0.7.0", +# "torch", +# "accelerate", +# ] +# /// +""" +QMD Retrieval-Based Evaluation with Precision & Recall + +Evaluates model outputs against golden data (training set). +Measures how well the model reproduces the expected expansions. + +Metrics: +- Precision: Of model-generated expansions, how many match golden? +- Recall: Of golden expansions, how many did the model generate? +- F1: Harmonic mean of precision and recall + +Matching is done via token overlap (Jaccard similarity) with a threshold. + +Usage: + uv run eval_retrieval.py ./outputs/sft + uv run eval_retrieval.py tobil/qmd-query-expansion-1.7B --golden data/qmd_expansion_v3_structured.jsonl + uv run eval_retrieval.py ./outputs/sft --threshold 0.5 --sample 100 +""" + +import argparse +import json +import random +import re +import sys +from collections import defaultdict +from pathlib import Path + +# ============================================================================= +# Matching Functions +# ============================================================================= + +def tokenize(text: str) -> set[str]: + """Tokenize text into lowercase word set, removing stopwords.""" + stopwords = {'the', 'a', 'an', 'is', 'are', 'to', 'for', 'of', 'in', 'and', + 'or', 'it', 'this', 'that', 'be', 'with', 'as', 'on', 'by', + 'how', 'what', 'do', 'does', 'can', 'you', 'your', 'i'} + words = re.findall(r'\b\w+\b', text.lower()) + return {w for w in words if w not in stopwords and len(w) > 1} + + +def jaccard_similarity(a: str, b: str) -> float: + """Jaccard similarity between two strings based on token overlap.""" + tokens_a = tokenize(a) + tokens_b = tokenize(b) + if not tokens_a or not tokens_b: + return 0.0 + intersection = len(tokens_a & tokens_b) + union = len(tokens_a | tokens_b) + return intersection / 
union if union > 0 else 0.0 + + +def find_best_match(pred: str, golden_list: list[str], threshold: float) -> tuple[str | None, float]: + """Find best matching golden expansion for a prediction.""" + best_match = None + best_score = 0.0 + for golden in golden_list: + score = jaccard_similarity(pred, golden) + if score > best_score: + best_score = score + best_match = golden + if best_score >= threshold: + return best_match, best_score + return None, best_score + + +# ============================================================================= +# Parsing +# ============================================================================= + +def parse_model_output(text: str) -> dict[str, list[str]]: + """Parse model output into {lex: [...], vec: [...], hyde: [...]}.""" + # Clean thinking tags + text = re.sub(r'.*?', '', text, flags=re.DOTALL) + text = text.replace('<|im_end|>', '').strip() + + result = {"lex": [], "vec": [], "hyde": []} + for line in text.strip().split("\n"): + line = line.strip() + if not line: + continue + if line.startswith("lex:"): + result["lex"].append(line[4:].strip()) + elif line.startswith("vec:"): + result["vec"].append(line[4:].strip()) + elif line.startswith("hyde:"): + result["hyde"].append(line[5:].strip()) + return result + + +def parse_golden_data(searches: list[dict] | str) -> dict[str, list[str]]: + """Parse golden data format into {lex: [...], vec: [...], hyde: [...]}.""" + # If it's a string (from messages format), parse it + if isinstance(searches, str): + return parse_model_output(searches) + + # Otherwise it's the structured format [{type, query}, ...] 
+ result = {"lex": [], "vec": [], "hyde": []} + for item in searches: + exp_type = item.get("type", "") + value = item.get("query", "") or item.get("value", "") + if exp_type in result: + result[exp_type].append(value) + return result + + +def load_golden_data(filepath: Path) -> list[dict]: + """Load golden data from JSONL, supporting both structured and messages formats.""" + data = [] + with open(filepath) as f: + for line in f: + if not line.strip(): + continue + item = json.loads(line) + + # Structured format: {query, searches} + if "query" in item and "searches" in item: + data.append({ + "query": item["query"], + "searches": item["searches"] + }) + # Messages format: {messages: [{role, content}, ...]} + elif "messages" in item: + messages = item["messages"] + query = None + searches = None + for msg in messages: + if msg["role"] == "user": + # Extract query from "/no_think Expand this search query: ..." + content = msg["content"] + if "Expand this search query:" in content: + query = content.split("Expand this search query:")[-1].strip() + else: + query = content.strip() + elif msg["role"] == "assistant": + # The assistant content IS the expected output + searches = msg["content"] + if query and searches: + data.append({ + "query": query, + "searches": searches # Will be parsed as string + }) + return data + + +# ============================================================================= +# Metrics Calculation +# ============================================================================= + +# Different thresholds by type - lex needs strict matching, hyde is more flexible +DEFAULT_THRESHOLDS = { + "lex": 0.5, # Keywords should overlap well + "vec": 0.35, # Semantic sentences have more variation + "hyde": 0.25, # Passages have the most variation +} + + +def calculate_metrics( + predictions: dict[str, list[str]], + golden: dict[str, list[str]], + threshold: float | dict[str, float] = 0.4, + return_mismatches: bool = False +) -> dict: + """Calculate 
precision, recall, F1 per type and overall. + + Args: + threshold: Either a single float, or dict mapping type -> threshold + return_mismatches: If True, include lists of unmatched predictions/golden + """ + if isinstance(threshold, (int, float)): + thresholds = {"lex": threshold, "vec": threshold, "hyde": threshold} + else: + thresholds = threshold + + metrics = {} + mismatches = {} + total_tp = 0 + total_pred = 0 + total_golden = 0 + + for exp_type in ["lex", "vec", "hyde"]: + preds = predictions.get(exp_type, []) + golds = golden.get(exp_type, []) + type_threshold = thresholds.get(exp_type, 0.4) + + if not preds and not golds: + continue + + # Track which golden items were matched + matched_golden = set() + unmatched_preds = [] + tp = 0 + + for pred in preds: + match, score = find_best_match(pred, golds, type_threshold) + if match is not None: + tp += 1 + matched_golden.add(match) + else: + unmatched_preds.append((pred, score)) + + unmatched_golden = [g for g in golds if g not in matched_golden] + + precision = tp / len(preds) if preds else 0.0 + recall = len(matched_golden) / len(golds) if golds else 0.0 + f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0 + + metrics[exp_type] = { + "precision": precision, + "recall": recall, + "f1": f1, + "pred_count": len(preds), + "golden_count": len(golds), + "matched": tp, + } + + if return_mismatches: + mismatches[exp_type] = { + "unmatched_preds": unmatched_preds, + "unmatched_golden": unmatched_golden, + } + + total_tp += tp + total_pred += len(preds) + total_golden += len(golds) + + # Overall metrics (micro-averaged) + overall_precision = total_tp / total_pred if total_pred > 0 else 0.0 + overall_recall = total_tp / total_golden if total_golden > 0 else 0.0 + overall_f1 = 2 * overall_precision * overall_recall / (overall_precision + overall_recall) if (overall_precision + overall_recall) > 0 else 0.0 + + metrics["overall"] = { + "precision": overall_precision, + "recall": 
def load_model(model_path: str):
    """Load a model for evaluation, either a LoRA adapter or a full model.

    * If ``model_path`` is a local directory containing ``adapter_config.json``,
      the base model named there (default ``Qwen/Qwen3-1.7B``) is loaded and
      the adapter is applied on top of it.
    * Otherwise ``model_path`` itself is loaded as the model. Previously this
      case silently loaded the default base model and ignored ``model_path``,
      so merged checkpoints and hub IDs were never actually evaluated.

    NOTE(review): a hub repo that contains only adapter weights is not
    resolved here, since only a local ``adapter_config.json`` is checked.
    Confirm whether that case needs support.

    Args:
        model_path: Local directory or model identifier.

    Returns:
        ``(model, tokenizer)`` with the model in eval mode and the tokenizer
        configured for left padding.
    """
    import torch
    from peft import PeftModel
    from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

    model_ref = str(model_path)  # keep the caller's spelling for hub IDs
    model_dir = Path(model_path)
    adapter_config = model_dir / "adapter_config.json"
    has_adapter = adapter_config.exists()

    if has_adapter:
        # Adapter checkpoint: the weights to load first are the adapter's base.
        with open(adapter_config) as f:
            cfg = json.load(f)
        base_model = cfg.get("base_model_name_or_path", "Qwen/Qwen3-1.7B")
    else:
        # No local adapter: the given path/ID is the model to evaluate.
        base_model = model_ref

    print(f"Loading base: {base_model}", file=sys.stderr)
    tokenizer = AutoTokenizer.from_pretrained(base_model)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"

    config = AutoConfig.from_pretrained(base_model)
    config.tie_word_embeddings = False
    model = AutoModelForCausalLM.from_pretrained(
        base_model, dtype=torch.bfloat16, device_map={"": 0}, config=config
    )
    gen_cfg = model.generation_config
    if gen_cfg is not None:
        # Greedy decoding: clear the sampling knobs so they do not conflict
        # with do_sample=False.
        gen_cfg.do_sample = False
        gen_cfg.temperature = None
        gen_cfg.top_p = None
        gen_cfg.top_k = None

    # Load adapter if present
    if has_adapter:
        print(f"Loading adapter: {model_dir}", file=sys.stderr)
        model = PeftModel.from_pretrained(model, str(model_dir))

    model.eval()
    return model, tokenizer


def generate_expansion(model, tokenizer, query: str, max_new_tokens: int = 400) -> str:
    """Generate the expansion text for a single query with greedy decoding.

    The prompt is the chat-templated user message
    ``"/no_think Expand this search query: {query}"``. Only the newly
    generated tokens are decoded, with special tokens skipped.
    """
    import torch

    chat = [{"role": "user", "content": f"/no_think Expand this search query: {query}"}]
    prompt = tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=True,
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_len = inputs["input_ids"].shape[1]

    with torch.inference_mode():
        out = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            num_beams=1,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            use_cache=True,
        )

    # Slice off the prompt so only the completion is returned.
    return tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)
def _describe_thresholds(thresholds) -> str:
    """Return a printable description of a single or per-type threshold."""
    if isinstance(thresholds, (int, float)):
        return str(thresholds)
    return f"lex={thresholds['lex']}, vec={thresholds['vec']}, hyde={thresholds['hyde']}"


def _print_mismatch_examples(all_mismatches: list[dict]) -> None:
    """Print up to three queries per type with unmatched predictions/golden."""
    print(f"\n{'='*60}")
    print("MISMATCH EXAMPLES")
    print(f"{'='*60}")

    for exp_type in ["lex", "vec", "hyde"]:
        type_mismatches = []
        for item in all_mismatches:
            mm = item["mismatches"].get(exp_type)
            if mm and (mm["unmatched_preds"] or mm["unmatched_golden"]):
                type_mismatches.append({"query": item["query"], **mm})

        if not type_mismatches:
            continue

        print(f"\n--- {exp_type.upper()} mismatches ({len(type_mismatches)} queries) ---")
        for example in type_mismatches[:3]:
            print(f"\nQuery: {example['query'][:60]}")
            if example["unmatched_preds"]:
                print(f" Unmatched predictions:")
                for pred, score in example["unmatched_preds"][:2]:
                    print(f" - [{score:.2f}] {pred[:80]}{'...' if len(pred) > 80 else ''}")
            if example["unmatched_golden"]:
                print(f" Missing golden:")
                for g in example["unmatched_golden"][:2]:
                    print(f" - {g[:80]}{'...' if len(g) > 80 else ''}")


def main():
    """Evaluate a model against golden expansions and print P/R/F1 tables.

    Progress goes to stderr and results to stdout. The summary prints the
    thresholds actually used; previously it printed the raw ``--threshold``
    argument, which is ``None`` unless that flag was given.

    Returns:
        0 on success. Exits with status 1 if the golden file cannot be found
        either as given or relative to this script's directory.
    """
    parser = argparse.ArgumentParser(description="QMD Retrieval-Based Evaluation")
    parser.add_argument("model", help="Model path (local or HF)")
    parser.add_argument("--golden", default="data/qmd_expansion_v3_structured.jsonl",
                        help="Golden data JSONL file")
    parser.add_argument("--threshold", type=float, default=None,
                        help="Jaccard similarity threshold for all types (overrides --type-thresholds)")
    parser.add_argument("--type-thresholds", action="store_true",
                        help="Use type-specific thresholds (lex=0.5, vec=0.35, hyde=0.25)")
    parser.add_argument("--sample", type=int, default=0,
                        help="Sample N queries (0 = all)")
    parser.add_argument("--seed", type=int, default=42,
                        help="Random seed for sampling")
    parser.add_argument("--max-new-tokens", type=int, default=400,
                        help="Max new tokens to generate")
    parser.add_argument("--verbose", "-v", action="store_true",
                        help="Show per-query details")
    parser.add_argument("--show-mismatches", action="store_true",
                        help="Show examples of mismatched predictions")
    args = parser.parse_args()

    # Determine thresholds: an explicit --threshold wins over --type-thresholds.
    if args.threshold is not None:
        thresholds = args.threshold
    elif args.type_thresholds:
        thresholds = DEFAULT_THRESHOLDS.copy()
    else:
        thresholds = 0.4  # Default single threshold
    threshold_desc = _describe_thresholds(thresholds)

    # Load golden data, falling back to a path relative to the script directory.
    golden_path = Path(args.golden)
    if not golden_path.exists():
        golden_path = Path(__file__).parent / args.golden
    if not golden_path.exists():
        print(f"Error: Golden data file not found: {args.golden}", file=sys.stderr)
        sys.exit(1)

    print(f"Loading golden data from {golden_path}...", file=sys.stderr)
    golden_data = load_golden_data(golden_path)
    print(f"Loaded {len(golden_data)} golden examples", file=sys.stderr)

    # Sample if requested
    if 0 < args.sample < len(golden_data):
        random.seed(args.seed)
        golden_data = random.sample(golden_data, args.sample)
        print(f"Sampled {len(golden_data)} examples", file=sys.stderr)

    model, tokenizer = load_model(args.model)

    all_metrics = []
    all_mismatches = []
    type_aggregates = defaultdict(lambda: {"precision": [], "recall": [], "f1": []})

    print(f"\nEvaluating {len(golden_data)} queries (thresholds: {threshold_desc})...\n")

    for i, item in enumerate(golden_data, 1):
        query = item["query"]
        golden_parsed = parse_golden_data(item["searches"])

        output = generate_expansion(model, tokenizer, query, args.max_new_tokens)
        pred_parsed = parse_model_output(output)

        metrics = calculate_metrics(pred_parsed, golden_parsed, thresholds,
                                    return_mismatches=args.show_mismatches)
        all_metrics.append({"query": query, "metrics": metrics,
                            "pred": pred_parsed, "golden": golden_parsed})

        if args.show_mismatches and "_mismatches" in metrics:
            all_mismatches.append({"query": query, "mismatches": metrics.pop("_mismatches")})

        # Aggregate by type; a type absent from this query is simply skipped.
        for exp_type in ["lex", "vec", "hyde", "overall"]:
            if exp_type in metrics:
                for key in ("precision", "recall", "f1"):
                    type_aggregates[exp_type][key].append(metrics[exp_type][key])

        overall = metrics.get("overall", {})
        p = overall.get("precision", 0) * 100
        r = overall.get("recall", 0) * 100
        f = overall.get("f1", 0) * 100

        if args.verbose:
            print(f"[{i:3d}/{len(golden_data)}] P={p:5.1f}% R={r:5.1f}% F1={f:5.1f}% {query[:50]}")
        elif i % 50 == 0 or i == len(golden_data):
            print(f" Processed {i}/{len(golden_data)}...", file=sys.stderr)

    # Summary
    print(f"\n{'='*60}")
    print(f"RESULTS: {args.model}")
    print(f"{'='*60}")
    print(f"Threshold: {threshold_desc} | Samples: {len(golden_data)}")
    print()

    print(f"{'Type':<10} {'Precision':>10} {'Recall':>10} {'F1':>10}")
    print("-" * 42)

    for exp_type in ["lex", "vec", "hyde", "overall"]:
        if exp_type in type_aggregates:
            agg = type_aggregates[exp_type]
            # Macro average over the queries that had this type.
            avg_p = sum(agg["precision"]) / len(agg["precision"]) * 100 if agg["precision"] else 0
            avg_r = sum(agg["recall"]) / len(agg["recall"]) * 100 if agg["recall"] else 0
            avg_f = sum(agg["f1"]) / len(agg["f1"]) * 100 if agg["f1"] else 0
            label = exp_type.upper() if exp_type != "overall" else "OVERALL"
            print(f"{label:<10} {avg_p:>9.1f}% {avg_r:>9.1f}% {avg_f:>9.1f}%")

    print(f"{'='*60}")

    # Show worst examples
    print("\nBottom 5 by F1:")
    sorted_by_f1 = sorted(all_metrics, key=lambda x: x["metrics"].get("overall", {}).get("f1", 0))
    for item in sorted_by_f1[:5]:
        f1 = item["metrics"].get("overall", {}).get("f1", 0) * 100
        print(f" {f1:5.1f}% {item['query'][:60]}")

    if args.show_mismatches and all_mismatches:
        _print_mismatch_examples(all_mismatches)

    return 0


if __name__ == "__main__":
    sys.exit(main())
MODEL_MAP = {
    "0.8B": "unsloth/Qwen3.5-0.8B",
    "2B": "unsloth/Qwen3.5-2B",
    "4B": "unsloth/Qwen3.5-4B",
    "9B": "unsloth/Qwen3.5-9B",
    "27B": "unsloth/Qwen3.5-27B",
}


def main():
    """Run LoRA SFT for one Qwen3.5 size, then optionally export, push and eval.

    Steps, in order: parse arguments, print the run banner, stop here on
    ``--dry-run``, import the heavy libraries, load the model and attach LoRA,
    split the dataset 90/10, train, save the adapter, export GGUF (unless
    ``--no-gguf``), push to the hub (if ``--push-hub``), and launch ``eval.py``
    from this script's directory (unless ``--no-eval``).
    """
    cli = argparse.ArgumentParser(description="QMD fine-tuning with Unsloth")
    cli.add_argument("--model", required=True, choices=list(MODEL_MAP),
                     help="Model size to train")
    cli.add_argument("--epochs", type=int, default=5)
    cli.add_argument("--batch-size", type=int, default=4)
    cli.add_argument("--grad-accum", type=int, default=4)
    cli.add_argument("--lr", type=float, default=2e-4)
    cli.add_argument("--max-seq-len", type=int, default=512)
    cli.add_argument("--lora-rank", type=int, default=16)
    cli.add_argument("--data", type=str, default="data/train/train.jsonl")
    cli.add_argument("--output", type=str, default=None,
                     help="Output directory (default: outputs/qwen3.5-{size})")
    cli.add_argument("--push-hub", type=str, default=None,
                     help="Push to HF hub (e.g. tobil/qmd-query-expansion-qwen3.5-0.8B)")
    cli.add_argument("--no-gguf", action="store_true")
    cli.add_argument("--no-eval", action="store_true")
    cli.add_argument("--dry-run", action="store_true")
    args = cli.parse_args()

    model_name = MODEL_MAP[args.model]
    output_dir = args.output if args.output else f"outputs/qwen3.5-{args.model}"
    rule = "=" * 60
    quant_methods = ["q4_k_m", "q8_0"]

    # Run banner
    banner = [
        rule,
        "QMD Query Expansion — Unsloth SFT",
        f" Base model: {model_name}",
        f" Output: {output_dir}",
        f" Data: {args.data}",
        f" Epochs: {args.epochs}",
        f" Batch: {args.batch_size} x {args.grad_accum} accum",
        f" LR: {args.lr}",
        f" LoRA rank: {args.lora_rank}",
        f" Max seq len: {args.max_seq_len}",
        rule,
    ]
    for banner_line in banner:
        print(banner_line)

    if args.dry_run:
        print("Dry run — exiting.")
        return

    # --- Imports (heavy) ---
    # Deferred until after the dry-run exit so the configuration can be
    # inspected without these packages being importable.
    import os
    import torch
    from unsloth import FastLanguageModel
    from datasets import load_dataset
    from trl import SFTTrainer, SFTConfig

    # --- Load model ---
    print(f"\nLoading {model_name}...")
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=args.max_seq_len,
        load_in_4bit=False,
        load_in_16bit=True,
        full_finetuning=False,
    )

    # --- LoRA ---
    lora_targets = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ]
    model = FastLanguageModel.get_peft_model(
        model,
        r=args.lora_rank,
        target_modules=lora_targets,
        lora_alpha=args.lora_rank,
        lora_dropout=0,
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=3407,
        max_seq_length=args.max_seq_len,
    )

    # --- Dataset ---
    print(f"Loading dataset from {args.data}...")
    full_ds = load_dataset("json", data_files=args.data, split="train")
    full_ds = full_ds.shuffle(seed=42)
    parts = full_ds.train_test_split(test_size=0.1, seed=42)
    train_ds, eval_ds = parts["train"], parts["test"]
    print(f" Train: {len(train_ds)}, Eval: {len(eval_ds)}")

    # --- Tracking ---
    # trackio is only used when HF_TOKEN is set and the package is importable.
    report_to = "none"
    if os.environ.get("HF_TOKEN"):
        try:
            import trackio
        except ImportError:
            pass
        else:
            report_to = "trackio"
            os.environ.setdefault("TRACKIO_PROJECT", "qmd-query-expansion")

    # --- Trainer ---
    sft_config = SFTConfig(
        output_dir=output_dir,
        max_seq_length=args.max_seq_len,
        num_train_epochs=args.epochs,
        per_device_train_batch_size=args.batch_size,
        gradient_accumulation_steps=args.grad_accum,
        learning_rate=args.lr,
        warmup_ratio=0.03,
        lr_scheduler_type="cosine",
        logging_steps=10,
        save_strategy="steps",
        save_steps=200,
        save_total_limit=3,
        eval_strategy="steps",
        eval_steps=200,
        bf16=True,
        optim="adamw_8bit",
        seed=3407,
        dataset_num_proc=4,
        report_to=report_to,
        run_name=f"sft-qwen3.5-{args.model}",
    )
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_ds,
        eval_dataset=eval_ds,
        args=sft_config,
    )

    print("\nStarting training...")
    stats = trainer.train()
    print(f"\nTraining complete!")
    print(f" Total steps: {stats.global_step}")
    print(f" Final loss: {stats.training_loss:.4f}")

    # --- Save ---
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"Adapter saved to {output_dir}")

    # --- GGUF export ---
    export_gguf = not args.no_gguf
    if export_gguf:
        print("\nExporting GGUF quantizations...")
        gguf_dir = f"{output_dir}/gguf"
        for quant in quant_methods:
            print(f" {quant}...")
            # Best effort: a failed quantization is reported, not fatal.
            try:
                model.save_pretrained_gguf(
                    gguf_dir, tokenizer, quantization_method=quant
                )
                print(f" ✓ {quant} saved")
            except Exception as e:
                print(f" ✗ {quant} failed: {e}")

    # --- Push to Hub ---
    if args.push_hub:
        print(f"\nPushing to {args.push_hub}...")
        model.push_to_hub_merged(args.push_hub, tokenizer, save_method="lora")
        if export_gguf:
            for quant in quant_methods:
                try:
                    model.push_to_hub_gguf(args.push_hub, tokenizer, quantization_method=quant)
                except Exception as e:
                    print(f" GGUF push {quant} failed: {e}")

    # --- Eval ---
    if not args.no_eval:
        print("\nRunning evaluation...")
        import subprocess
        script_dir = str(Path(__file__).parent)
        subprocess.run(
            [sys.executable, "eval.py", output_dir],
            cwd=script_dir,
        )

    print("\n" + rule)
    print(f"Done! Model at: {output_dir}")
    print(rule)


if __name__ == "__main__":
    main()
session transcripts, + // CJK text, or large markdown files) — callers hit "input lengths exceed + // context size" errors even after truncation because the overhead estimate + // was insufficient. 4096 comfortably fits the largest real-world chunks + // while staying well below the 40 960-token auto size. + // Override with QMD_RERANK_CONTEXT_SIZE env var if you need more headroom. + private static readonly RERANK_CONTEXT_SIZE: number = (() => { + const v = parseInt(process.env.QMD_RERANK_CONTEXT_SIZE ?? "", 10); + return Number.isFinite(v) && v > 0 ? v : 4096; + })(); private async ensureRerankContexts(): Promise>[]> { if (this.rerankContexts.length === 0) { const model = await this.ensureRerankModel(); @@ -1099,8 +1108,10 @@ export class LlamaCpp implements LLM { } } - // Qwen3 reranker chat template overhead (system prompt, tags, separators) - private static readonly RERANK_TEMPLATE_OVERHEAD = 200; + // Qwen3 reranker chat template overhead (system prompt, tags, separators). + // Measured at ~350 tokens on real queries; use 512 as a safe upper bound so + // the truncation budget never lets a document slip past the context limit. + private static readonly RERANK_TEMPLATE_OVERHEAD = 512; private static readonly RERANK_TARGET_DOCS_PER_CONTEXT = 10; async rerank( diff --git a/src/mcp/server.ts b/src/mcp/server.ts index f1cc2a9..b7fada7 100644 --- a/src/mcp/server.ts +++ b/src/mcp/server.ts @@ -296,9 +296,12 @@ Intent-aware lex (C++ performance, not sports): intent: z.string().optional().describe( "Background context to disambiguate the query. Example: query='performance', intent='web page load times and Core Web Vitals'. Does not search on its own." ), + rerank: z.boolean().optional().default(true).describe( + "Rerank results using LLM (default: true). Set to false for faster results on CPU-only machines." 
+ ), }, }, - async ({ searches, limit, minScore, candidateLimit, collections, intent }) => { + async ({ searches, limit, minScore, candidateLimit, collections, intent, rerank }) => { // Map to internal format const queries: ExpandedQuery[] = searches.map(s => ({ type: s.type, @@ -313,6 +316,7 @@ Intent-aware lex (C++ performance, not sports): collections: effectiveCollections.length > 0 ? effectiveCollections : undefined, limit, minScore, + rerank, intent, }); diff --git a/src/store.ts b/src/store.ts index bcc9b9f..d1b24eb 100644 --- a/src/store.ts +++ b/src/store.ts @@ -1421,6 +1421,12 @@ export async function generateEmbeddings( const batches = buildEmbeddingBatches(docsToEmbed, maxDocsPerBatch, maxBatchBytes); for (const batchMeta of batches) { + // Abort early if session has been invalidated + if (!session.isValid) { + console.warn(`⚠ Session expired — skipping remaining document batches`); + break; + } + const batchDocs = getEmbeddingDocsForBatch(db, batchMeta); const batchChunks: ChunkItem[] = []; const batchBytes = batchMeta.reduce((sum, doc) => sum + Math.max(0, doc.bytes), 0); @@ -1434,6 +1440,7 @@ export async function generateEmbeddings( undefined, undefined, undefined, doc.path, options?.chunkStrategy, + session.signal, ); for (let seq = 0; seq < chunks.length; seq++) { @@ -1472,6 +1479,23 @@ export async function generateEmbeddings( let batchChunkBytesProcessed = 0; for (let batchStart = 0; batchStart < batchChunks.length; batchStart += BATCH_SIZE) { + // Abort early if session has been invalidated (e.g. 
max duration exceeded) + if (!session.isValid) { + const remaining = batchChunks.length - batchStart; + errors += remaining; + console.warn(`⚠ Session expired — skipping ${remaining} remaining chunks`); + break; + } + + // Abort early if error rate is too high (>80% of processed chunks failed) + const processed = chunksEmbedded + errors; + if (processed >= BATCH_SIZE && errors > processed * 0.8) { + const remaining = batchChunks.length - batchStart; + errors += remaining; + console.warn(`⚠ Error rate too high (${errors}/${processed}) — aborting embedding`); + break; + } + const batchEnd = Math.min(batchStart + BATCH_SIZE, batchChunks.length); const chunkBatch = batchChunks.slice(batchStart, batchEnd); const texts = chunkBatch.map(chunk => formatDocForEmbedding(chunk.text, chunk.title)); @@ -1491,20 +1515,26 @@ export async function generateEmbeddings( } } catch { // Batch failed — try individual embeddings as fallback - for (const chunk of chunkBatch) { - try { - const text = formatDocForEmbedding(chunk.text, chunk.title); - const result = await session.embed(text); - if (result) { - insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), model, now); - chunksEmbedded++; - } else { + // But skip if session is already invalid (avoids N doomed retries) + if (!session.isValid) { + errors += chunkBatch.length; + batchChunkBytesProcessed += chunkBatch.reduce((sum, c) => sum + c.bytes, 0); + } else { + for (const chunk of chunkBatch) { + try { + const text = formatDocForEmbedding(chunk.text, chunk.title); + const result = await session.embed(text); + if (result) { + insertEmbedding(db, chunk.hash, chunk.seq, chunk.pos, new Float32Array(result.embedding), model, now); + chunksEmbedded++; + } else { + errors++; + } + } catch { errors++; } - } catch { - errors++; + batchChunkBytesProcessed += chunk.bytes; } - batchChunkBytesProcessed += chunk.bytes; } } @@ -1684,7 +1714,6 @@ export function handelize(path: string): string { const result = 
path .replace(/___/g, '/') // Triple underscore becomes folder separator - .toLowerCase() .split('/') .map((segment, idx, arr) => { const isLastSegment = idx === arr.length - 1; @@ -1699,7 +1728,7 @@ export function handelize(path: string): string { const nameWithoutExt = ext ? segment.slice(0, -ext.length) : segment; const cleanedName = nameWithoutExt - .replace(/[^\p{L}\p{N}$]+/gu, '-') // Keep route marker "$", dash-separate other chars + .replace(/[^\p{L}\p{N}.$]+/gu, '-') // Keep letters, numbers, dots, "$"; dash-separate rest .replace(/^-+|-+$/g, ''); // Remove leading/trailing dashes return cleanedName + ext; @@ -2170,6 +2199,7 @@ export async function chunkDocumentByTokens( windowTokens: number = CHUNK_WINDOW_TOKENS, filepath?: string, chunkStrategy: ChunkStrategy = "regex", + signal?: AbortSignal ): Promise<{ text: string; pos: number; tokens: number }[]> { const llm = getDefaultLlamaCpp(); @@ -2188,6 +2218,9 @@ export async function chunkDocumentByTokens( const results: { text: string; pos: number; tokens: number }[] = []; for (const chunk of charChunks) { + // Respect abort signal to avoid runaway tokenization + if (signal?.aborted) break; + const tokens = await llm.tokenize(chunk.text); if (tokens.length <= maxTokens) { @@ -2201,6 +2234,7 @@ export async function chunkDocumentByTokens( const subChunks = chunkDocument(chunk.text, safeMaxChars, Math.floor(overlapChars * actualCharsPerToken / 2), Math.floor(windowChars * actualCharsPerToken / 2)); for (const subChunk of subChunks) { + if (signal?.aborted) break; const subTokens = await llm.tokenize(subChunk.text); results.push({ text: subChunk.text, @@ -2732,20 +2766,46 @@ function sanitizeFTS5Term(term: string): string { return term.replace(/[^\p{L}\p{N}']/gu, '').toLowerCase(); } +/** + * Check if a token is a hyphenated compound word (e.g., multi-agent, DEC-0054, gpt-4). + * Returns true if the token contains internal hyphens between word/digit characters. 
+ */ +function isHyphenatedToken(token: string): boolean { + return /^[\p{L}\p{N}][\p{L}\p{N}'-]*-[\p{L}\p{N}][\p{L}\p{N}'-]*$/u.test(token); +} + +/** + * Sanitize a hyphenated term into an FTS5 phrase by splitting on hyphens + * and sanitizing each part. Returns the parts joined by spaces for use + * inside FTS5 quotes: "multi agent" matches "multi-agent" in porter tokenizer. + */ +function sanitizeHyphenatedTerm(term: string): string { + return term.split('-').map(t => sanitizeFTS5Term(t)).filter(t => t).join(' '); +} + /** * Parse lex query syntax into FTS5 query. * * Supports: * - Quoted phrases: "exact phrase" → "exact phrase" (exact match) * - Negation: -term or -"phrase" → uses FTS5 NOT operator + * - Hyphenated tokens: multi-agent, DEC-0054, gpt-4 → treated as phrases * - Plain terms: term → "term"* (prefix match) * * FTS5 NOT is a binary operator: `term1 NOT term2` means "match term1 but not term2". * So `-term` only works when there are also positive terms. * + * Hyphen disambiguation: `-sports` at a word boundary is negation, but `multi-agent` + * (where `-` is between word characters) is treated as a hyphenated phrase. + * When a leading `-` is followed by what looks like a hyphenated compound word + * (e.g., `-multi-agent`), the entire token is treated as a negated phrase. 
+ * * Examples: * performance -sports → "performance"* NOT "sports"* * "machine learning" → "machine learning" + * multi-agent memory → "multi agent" AND "memory"* + * DEC-0054 → "dec 0054" + * -multi-agent → NOT "multi agent" */ function buildFTS5Query(query: string): string | null { const positive: string[] = []; @@ -2787,13 +2847,27 @@ function buildFTS5Query(query: string): string | null { while (i < s.length && !/[\s"]/.test(s[i]!)) i++; const term = s.slice(start, i); - const sanitized = sanitizeFTS5Term(term); - if (sanitized) { - const ftsTerm = `"${sanitized}"*`; // Prefix match - if (negated) { - negative.push(ftsTerm); - } else { - positive.push(ftsTerm); + // Handle hyphenated tokens: multi-agent, DEC-0054, gpt-4 + // These get split into phrase queries so FTS5 porter tokenizer matches them. + if (isHyphenatedToken(term)) { + const sanitized = sanitizeHyphenatedTerm(term); + if (sanitized) { + const ftsPhrase = `"${sanitized}"`; // Phrase match (no prefix) + if (negated) { + negative.push(ftsPhrase); + } else { + positive.push(ftsPhrase); + } + } + } else { + const sanitized = sanitizeFTS5Term(term); + if (sanitized) { + const ftsTerm = `"${sanitized}"*`; // Prefix match + if (negated) { + negative.push(ftsTerm); + } else { + positive.push(ftsTerm); + } } } } @@ -2842,20 +2916,38 @@ export function searchFTS(db: Database, query: string, limit: number = 20, colle const ftsQuery = buildFTS5Query(query); if (!ftsQuery) return []; + // Use a CTE to force FTS5 to run first, then filter by collection. + // Without the CTE, SQLite's query planner combines FTS5 MATCH with the + // collection filter in a single WHERE clause, which can cause it to + // abandon the FTS5 index and fall back to a full scan — turning an 8ms + // query into a 17-second query on large collections. + const params: (string | number)[] = [ftsQuery]; + + // When filtering by collection, fetch extra candidates from the FTS index + // since some will be filtered out. 
Without a collection filter we can + // fetch exactly the requested limit. + const ftsLimit = collectionName ? limit * 10 : limit; + let sql = ` + WITH fts_matches AS ( + SELECT rowid, bm25(documents_fts, 1.5, 4.0, 1.0) as bm25_score + FROM documents_fts + WHERE documents_fts MATCH ? + ORDER BY bm25_score ASC + LIMIT ${ftsLimit} + ) SELECT 'qmd://' || d.collection || '/' || d.path as filepath, d.collection || '/' || d.path as display_path, d.title, content.doc as body, d.hash, - bm25(documents_fts, 10.0, 1.0) as bm25_score - FROM documents_fts f - JOIN documents d ON d.id = f.rowid + fm.bm25_score + FROM fts_matches fm + JOIN documents d ON d.id = fm.rowid JOIN content ON content.hash = d.hash - WHERE documents_fts MATCH ? AND d.active = 1 + WHERE d.active = 1 `; - const params: (string | number)[] = [ftsQuery]; if (collectionName) { sql += ` AND d.collection = ?`; @@ -2863,7 +2955,7 @@ export function searchFTS(db: Database, query: string, limit: number = 20, colle } // bm25 lower is better; sort ascending. - sql += ` ORDER BY bm25_score ASC LIMIT ?`; + sql += ` ORDER BY fm.bm25_score ASC LIMIT ?`; params.push(limit); const rows = db.prepare(sql).all(...params) as { filepath: string; display_path: string; title: string; body: string; hash: string; bm25_score: number }[]; @@ -3021,6 +3113,12 @@ export function clearAllEmbeddings(db: Database): void { /** * Insert a single embedding into both content_vectors and vectors_vec tables. * The hash_seq key is formatted as "hash_seq" for the vectors_vec table. + * + * content_vectors is inserted first so that getHashesForEmbedding (which checks + * only content_vectors) won't re-select the hash on a crash between the two inserts. + * + * vectors_vec uses DELETE + INSERT instead of INSERT OR REPLACE because sqlite-vec's + * vec0 virtual tables silently ignore the OR REPLACE conflict clause. 
*/ export function insertEmbedding( db: Database, @@ -3032,11 +3130,16 @@ export function insertEmbedding( embeddedAt: string ): void { const hashSeq = `${hash}_${seq}`; - const insertVecStmt = db.prepare(`INSERT OR REPLACE INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`); - const insertContentVectorStmt = db.prepare(`INSERT OR REPLACE INTO content_vectors (hash, seq, pos, model, embedded_at) VALUES (?, ?, ?, ?, ?)`); - insertVecStmt.run(hashSeq, embedding); + // Insert content_vectors first — crash-safe ordering (see getHashesForEmbedding) + const insertContentVectorStmt = db.prepare(`INSERT OR REPLACE INTO content_vectors (hash, seq, pos, model, embedded_at) VALUES (?, ?, ?, ?, ?)`); insertContentVectorStmt.run(hash, seq, pos, model, embeddedAt); + + // vec0 virtual tables don't support OR REPLACE — use DELETE + INSERT + const deleteVecStmt = db.prepare(`DELETE FROM vectors_vec WHERE hash_seq = ?`); + const insertVecStmt = db.prepare(`INSERT INTO vectors_vec (hash_seq, embedding) VALUES (?, ?)`); + deleteVecStmt.run(hashSeq); + insertVecStmt.run(hashSeq, embedding); } // ============================================================================= diff --git a/test/eval-deep-research.jsonl b/test/eval-deep-research.jsonl new file mode 100644 index 0000000..060d524 --- /dev/null +++ b/test/eval-deep-research.jsonl @@ -0,0 +1,25 @@ +{"query": "that tradeoff between data correctness and always being up", "expected_doc": "distributed-systems", "difficulty": "hard", "intent": "distributed systems architecture", "notes": "CAP theorem - no keywords match"} +{"query": "what we learned from the dashboard thing", "expected_doc": "product-launch", "difficulty": "hard", "intent": "project retrospectives", "notes": "Project Phoenix retrospective - vague reference"} +{"query": "how much we're burning through each month", "expected_doc": "fundraising", "difficulty": "hard", "intent": "startup finances", "notes": "burn rate - colloquial phrasing"} +{"query": "when do I 
need to be online", "expected_doc": "remote-work", "difficulty": "hard", "intent": "work schedule policies", "notes": "core hours policy - no exact terms"} +{"query": "that algorithm for getting nodes to agree", "expected_doc": "distributed-systems", "difficulty": "hard", "intent": "distributed consensus", "notes": "consensus/Raft/Paxos - conceptual reference"} +{"query": "why we pushed back the release date", "expected_doc": "product-launch", "difficulty": "hard", "intent": "project timeline decisions", "notes": "timeline pressure - implied from retrospective"} +{"query": "how to structure URLs for our service", "expected_doc": "api-design", "difficulty": "hard", "intent": "API design patterns", "notes": "REST endpoints - no exact match"} +{"query": "preventing the model from just memorizing", "expected_doc": "machine-learning", "difficulty": "hard", "intent": "ML model training", "notes": "overfitting - conceptual synonym"} +{"query": "who we're pitching to first", "expected_doc": "fundraising", "difficulty": "hard", "intent": "investor outreach strategy", "notes": "tier 1 investors - colloquial"} +{"query": "can I work from another country", "expected_doc": "remote-work", "difficulty": "hard", "intent": "remote work eligibility", "notes": "remote eligibility - implied question"} +{"query": "how the beta users found problems", "expected_doc": "product-launch", "difficulty": "hard", "intent": "product testing feedback", "notes": "beta program bugs - indirect reference"} +{"query": "that thing Leslie Lamport invented", "expected_doc": "distributed-systems", "difficulty": "hard", "intent": "distributed systems history", "notes": "Paxos - person reference only"} +{"query": "what happens when the network splits", "expected_doc": "distributed-systems", "difficulty": "hard", "intent": "network failure handling", "notes": "partition tolerance - rephrased concept"} +{"query": "teaching computers to find patterns", "expected_doc": "machine-learning", "difficulty": "hard", 
"intent": "machine learning fundamentals", "notes": "ML definition - abstract description"} +{"query": "how much runway before we're out of cash", "expected_doc": "fundraising", "difficulty": "hard", "intent": "startup financial planning", "notes": "runway months - colloquial finance term"} +{"query": "the 47 issues we found before shipping", "expected_doc": "product-launch", "difficulty": "hard", "intent": "pre-launch QA", "notes": "beta bugs - specific number, no keywords"} +{"query": "grouping customers by behavior", "expected_doc": "machine-learning", "difficulty": "hard", "intent": "customer analytics", "notes": "clustering/segmentation - conceptual"} +{"query": "why URLs should be things not actions", "expected_doc": "api-design", "difficulty": "hard", "intent": "RESTful design principles", "notes": "nouns not verbs - conceptual inversion"} +{"query": "what Eric Brewer proved you can't have", "expected_doc": "distributed-systems", "difficulty": "hard", "intent": "distributed systems theory", "notes": "CAP theorem - person + concept"} +{"query": "how fast the new feature loaded", "expected_doc": "product-launch", "difficulty": "hard", "intent": "performance metrics", "notes": "performance 4.2s - indirect reference"} +{"query": "days everyone needs to be in the office", "expected_doc": "remote-work", "difficulty": "hard", "intent": "hybrid work schedule", "notes": "collaboration days - rephrased"} +{"query": "the number that shows customers are expanding", "expected_doc": "fundraising", "difficulty": "hard", "intent": "SaaS growth metrics", "notes": "NRR 124% - metric description"} +{"query": "telling spam from real email", "expected_doc": "machine-learning", "difficulty": "hard", "intent": "classification use cases", "notes": "classification example - specific use case"} +{"query": "how to get user 123's purchases", "expected_doc": "api-design", "difficulty": "hard", "intent": "API endpoint design", "notes": "hierarchical URLs - example-based query"} 
+{"query": "zookeeper etcd consul what they have in common", "expected_doc": "distributed-systems", "difficulty": "hard", "intent": "distributed coordination tools", "notes": "CP systems - asking about category"} diff --git a/test/eval-deep-research.ts b/test/eval-deep-research.ts new file mode 100644 index 0000000..fdee461 --- /dev/null +++ b/test/eval-deep-research.ts @@ -0,0 +1,209 @@ +/** + * Deep Research Evaluation for QMD + * + * Tests end-to-end retrieval quality: query → expansion → reranking → results + * + * These are HARD queries with NO exact keyword matches - they require + * semantic understanding via query expansion and reranking to succeed. + * + * Run: bun test/eval-deep-research.ts + */ + +import { execSync } from "child_process"; +import { readFileSync, existsSync } from "fs"; +import { join, dirname } from "path"; +import { fileURLToPath } from "url"; + +const __dirname = dirname(fileURLToPath(import.meta.url)); + +interface EvalQuery { + query: string; + expected_doc: string; + difficulty: string; + intent: string; // Domain context hint for future intent-aware retrieval + notes: string; +} + +interface SearchResult { + file: string; + score: number; + title?: string; +} + +function loadQueries(): EvalQuery[] { + const path = join(__dirname, "eval-deep-research.jsonl"); + const content = readFileSync(path, "utf-8"); + return content + .split("\n") + .filter((line) => line.trim()) + .map((line) => JSON.parse(line)); +} + +function runBM25Search(query: string): SearchResult[] { + try { + const output = execSync( + `bun src/qmd.ts search "${query.replace(/"/g, '\\"')}" -c eval-docs --json -n 5 2>/dev/null`, + { encoding: "utf-8", timeout: 30000 } + ); + return JSON.parse(output); + } catch { + return []; + } +} + +function runDeepResearch(query: string): SearchResult[] { + try { + const output = execSync( + `bun src/qmd.ts query "${query.replace(/"/g, '\\"')}" -c eval-docs --json -n 5 2>/dev/null`, + { encoding: "utf-8", timeout: 120000 } + ); + 
return JSON.parse(output); + } catch { + return []; + } +} + +function matchesExpected(filepath: string, expectedDoc: string): boolean { + return filepath.toLowerCase().includes(expectedDoc.toLowerCase()); +} + +function findRank(results: SearchResult[], expectedDoc: string): number { + for (let i = 0; i < results.length; i++) { + if (matchesExpected(results[i]!.file, expectedDoc)) { + return i + 1; + } + } + return -1; // Not found +} + +interface MethodResults { + hit1: number; + hit3: number; + hit5: number; + total: number; + details: { query: string; rank: number; expected: string; intent?: string }[]; +} + +function evaluate( + queries: EvalQuery[], + searchFn: (q: string) => SearchResult[], + label: string +): MethodResults { + const results: MethodResults = { + hit1: 0, + hit3: 0, + hit5: 0, + total: queries.length, + details: [], + }; + + console.log(`\n${"=".repeat(60)}`); + console.log(` ${label}`); + console.log(`${"=".repeat(60)}\n`); + + for (const { query, expected_doc, intent, notes } of queries) { + const searchResults = searchFn(query); + const rank = findRank(searchResults, expected_doc); + + results.details.push({ query, rank, expected: expected_doc, intent }); + + if (rank === 1) results.hit1++; + if (rank >= 1 && rank <= 3) results.hit3++; + if (rank >= 1 && rank <= 5) results.hit5++; + + const status = + rank === 1 ? "✓" : rank > 0 && rank <= 3 ? `@${rank}` : rank > 0 ? 
`@${rank}` : "✗"; + const statusPad = status.padEnd(4); + console.log(` ${statusPad} "${query.slice(0, 45).padEnd(45)}" → ${expected_doc}`); + if (rank === -1) { + console.log(` intent: ${intent} | ${notes}`); + } + } + + const hit1Pct = ((results.hit1 / results.total) * 100).toFixed(0); + const hit3Pct = ((results.hit3 / results.total) * 100).toFixed(0); + const hit5Pct = ((results.hit5 / results.total) * 100).toFixed(0); + + console.log(`\n ${"─".repeat(50)}`); + console.log(` Hit@1: ${hit1Pct}% (${results.hit1}/${results.total})`); + console.log(` Hit@3: ${hit3Pct}% (${results.hit3}/${results.total})`); + console.log(` Hit@5: ${hit5Pct}% (${results.hit5}/${results.total})`); + + return results; +} + +async function main() { + console.log("QMD Deep Research Evaluation"); + console.log("=".repeat(60)); + console.log("Testing hard queries that require semantic understanding."); + console.log("These have NO exact keyword matches in documents."); + + // Check if eval-docs collection exists + try { + const status = execSync("bun src/qmd.ts status --json 2>/dev/null", { + encoding: "utf-8", + }); + if (!status.includes("eval-docs")) { + console.log("\n⚠️ eval-docs collection not found. Run:"); + console.log(" qmd collection add test/eval-docs --name eval-docs"); + console.log(" qmd embed"); + process.exit(1); + } + } catch { + console.log("\n⚠️ Could not check status. 
Make sure qmd is working."); + } + + const queries = loadQueries(); + console.log(`\nLoaded ${queries.length} hard queries.`); + + // Run BM25 baseline (expected to fail on most) + const bm25Results = evaluate(queries, runBM25Search, "BM25 BASELINE (keyword search)"); + + // Run deep research (expected to succeed via expansion + reranking) + const deepResults = evaluate(queries, runDeepResearch, "DEEP RESEARCH (expansion + reranking)"); + + // Comparison + console.log(`\n${"=".repeat(60)}`); + console.log(" COMPARISON"); + console.log(`${"=".repeat(60)}`); + console.log(`\n Method Hit@1 Hit@3 Hit@5`); + console.log(` ${"─".repeat(45)}`); + console.log( + ` BM25 (baseline) ${((bm25Results.hit1 / bm25Results.total) * 100).toFixed(0).padStart(3)}% ${((bm25Results.hit3 / bm25Results.total) * 100).toFixed(0).padStart(3)}% ${((bm25Results.hit5 / bm25Results.total) * 100).toFixed(0).padStart(3)}%` + ); + console.log( + ` Deep Research ${((deepResults.hit1 / deepResults.total) * 100).toFixed(0).padStart(3)}% ${((deepResults.hit3 / deepResults.total) * 100).toFixed(0).padStart(3)}% ${((deepResults.hit5 / deepResults.total) * 100).toFixed(0).padStart(3)}%` + ); + + const improvement = deepResults.hit3 - bm25Results.hit3; + console.log(`\n Improvement (Hit@3): +${improvement} queries (${((improvement / bm25Results.total) * 100).toFixed(0)}%)`); + + // Show queries where deep research recovered failures + const recovered = deepResults.details.filter( + (d) => + d.rank >= 1 && + d.rank <= 3 && + bm25Results.details.find((b) => b.query === d.query)?.rank === -1 + ); + + if (recovered.length > 0) { + console.log(`\n Recovered by expansion + reranking (${recovered.length}):`); + for (const { query, rank, expected } of recovered.slice(0, 5)) { + console.log(` @${rank} "${query.slice(0, 40)}..." → ${expected}`); + } + if (recovered.length > 5) { + console.log(` ... 
and ${recovered.length - 5} more`); + } + } + + // Exit with error if deep research performs poorly + const deepHit3Pct = (deepResults.hit3 / deepResults.total) * 100; + if (deepHit3Pct < 60) { + console.log(`\n❌ Deep research Hit@3 < 60% (${deepHit3Pct.toFixed(0)}%)`); + process.exit(1); + } else { + console.log(`\n✓ Deep research Hit@3 >= 60% (${deepHit3Pct.toFixed(0)}%)`); + } +} + +main(); diff --git a/test/store.helpers.unit.test.ts b/test/store.helpers.unit.test.ts index eb7f8a6..e3c2373 100644 --- a/test/store.helpers.unit.test.ts +++ b/test/store.helpers.unit.test.ts @@ -114,14 +114,14 @@ describe("cleanupOrphanedVectors", () => { // ============================================================================= describe("handelize", () => { - test("converts to lowercase", () => { - expect(handelize("README.md")).toBe("readme.md"); - expect(handelize("MyFile.MD")).toBe("myfile.md"); + test("preserves original case", () => { + expect(handelize("README.md")).toBe("README.md"); + expect(handelize("MyFile.MD")).toBe("MyFile.MD"); }); test("preserves folder structure", () => { expect(handelize("a/b/c/d.md")).toBe("a/b/c/d.md"); - expect(handelize("docs/api/README.md")).toBe("docs/api/readme.md"); + expect(handelize("docs/api/README.md")).toBe("docs/api/README.md"); }); test("replaces non-word characters with dash", () => { @@ -151,7 +151,7 @@ describe("handelize", () => { test("handles complex real-world meeting notes", () => { const complexName = "Money Movement Licensing Review - 2025/11/19 10:25 EST - Notes by Gemini.md"; const result = handelize(complexName); - expect(result).toBe("money-movement-licensing-review-2025-11-19-10-25-est-notes-by-gemini.md"); + expect(result).toBe("Money-Movement-Licensing-Review-2025-11-19-10-25-EST-Notes-by-Gemini.md"); expect(result).not.toContain(" "); expect(result).not.toContain("/"); expect(result).not.toContain(":"); @@ -159,7 +159,7 @@ describe("handelize", () => { test("handles unicode characters", () => { 
expect(handelize("日本語.md")).toBe("日本語.md"); - expect(handelize("Зоны и проекты.md")).toBe("зоны-и-проекты.md"); + expect(handelize("Зоны и проекты.md")).toBe("Зоны-и-проекты.md"); expect(handelize("café-notes.md")).toBe("café-notes.md"); expect(handelize("naïve.md")).toBe("naïve.md"); expect(handelize("日本語-notes.md")).toBe("日本語-notes.md"); @@ -181,13 +181,13 @@ describe("handelize", () => { test("handles dates and times in filenames", () => { expect(handelize("meeting-2025-01-15.md")).toBe("meeting-2025-01-15.md"); expect(handelize("notes 2025/01/15.md")).toBe("notes-2025/01/15.md"); - expect(handelize("call_10:30_AM.md")).toBe("call-10-30-am.md"); + expect(handelize("call_10:30_AM.md")).toBe("call-10-30-AM.md"); }); test("handles special project naming patterns", () => { - expect(handelize("PROJECT_ABC_v2.0.md")).toBe("project-abc-v2-0.md"); - expect(handelize("[WIP] Feature Request.md")).toBe("wip-feature-request.md"); - expect(handelize("(DRAFT) Proposal v1.md")).toBe("draft-proposal-v1.md"); + expect(handelize("PROJECT_ABC_v2.0.md")).toBe("PROJECT-ABC-v2.0.md"); + expect(handelize("[WIP] Feature Request.md")).toBe("WIP-Feature-Request.md"); + expect(handelize("(DRAFT) Proposal v1.md")).toBe("DRAFT-Proposal-v1.md"); }); test("handles symbol-only route filenames", () => { diff --git a/test/store.test.ts b/test/store.test.ts index d4f99dd..073297d 100644 --- a/test/store.test.ts +++ b/test/store.test.ts @@ -1327,6 +1327,34 @@ describe("FTS Search", () => { await cleanupTestDb(store); }); + test("searchFTS title boost outweighs higher body frequency", async () => { + const store = await createTestStore(); + const collectionName = await createTestCollection(); + + // Document with "quantum" mentioned in a longer body but NOT in the title + await insertTestDocument(store.db, collectionName, { + name: "body-only", + title: "General Science Notes", + body: "This research paper discusses quantum mechanics and the quantum model of computation. 
The quantum approach offers improvements over classical methods.", + displayPath: "test/body-only.md", + }); + + // Document with "quantum" in the title but a shorter body mention + await insertTestDocument(store.db, collectionName, { + name: "title-match", + title: "Quantum Computing Overview", + body: "An introduction to the fundamentals of this emerging computing paradigm.", + displayPath: "test/title-match.md", + }); + + const results = store.searchFTS("quantum", 10); + expect(results.length).toBe(2); + // Title-match doc should rank higher due to BM25 column weights boosting title + expect(results[0]!.displayPath).toBe(`${collectionName}/test/title-match.md`); + + await cleanupTestDb(store); + }); + test("searchFTS respects limit parameter", async () => { const store = await createTestStore(); const collectionName = await createTestCollection(); diff --git a/test/structured-search.test.ts b/test/structured-search.test.ts index 5c4e97f..d704210 100644 --- a/test/structured-search.test.ts +++ b/test/structured-search.test.ts @@ -399,6 +399,14 @@ describe("buildFTS5Query (lex parser)", () => { return term.replace(/[^\p{L}\p{N}']/gu, '').toLowerCase(); } + function isHyphenatedToken(token: string): boolean { + return /^[\p{L}\p{N}][\p{L}\p{N}'-]*-[\p{L}\p{N}][\p{L}\p{N}'-]*$/u.test(token); + } + + function sanitizeHyphenatedTerm(term: string): string { + return term.split('-').map(t => sanitizeFTS5Term(t)).filter(t => t).join(' '); + } + function buildFTS5Query(query: string): string | null { const positive: string[] = []; const negative: string[] = []; @@ -424,8 +432,14 @@ describe("buildFTS5Query (lex parser)", () => { const start = i; while (i < s.length && !/[\s"]/.test(s[i]!)) i++; const term = s.slice(start, i); - const sanitized = sanitizeFTS5Term(term); - if (sanitized) (negated ? negative : positive).push(`"${sanitized}"*`); + + if (isHyphenatedToken(term)) { + const sanitized = sanitizeHyphenatedTerm(term); + if (sanitized) (negated ? 
negative : positive).push(`"${sanitized}"`); + } else { + const sanitized = sanitizeFTS5Term(term); + if (sanitized) (negated ? negative : positive).push(`"${sanitized}"*`); + } } } @@ -488,4 +502,37 @@ describe("buildFTS5Query (lex parser)", () => { test("special chars in terms stripped", () => { expect(buildFTS5Query("hello!world")).toBe('"helloworld"*'); }); + + // Hyphenated token tests + test("hyphenated term → phrase match", () => { + expect(buildFTS5Query("multi-agent")).toBe('"multi agent"'); + }); + + test("hyphenated identifier → phrase match", () => { + expect(buildFTS5Query("DEC-0054")).toBe('"dec 0054"'); + }); + + test("hyphenated model name → phrase match", () => { + expect(buildFTS5Query("gpt-4")).toBe('"gpt 4"'); + }); + + test("multi-hyphen term → phrase match", () => { + expect(buildFTS5Query("foo-bar-baz")).toBe('"foo bar baz"'); + }); + + test("hyphenated term mixed with plain terms", () => { + expect(buildFTS5Query("multi-agent memory")).toBe('"multi agent" AND "memory"*'); + }); + + test("negation still works alongside hyphenated terms", () => { + expect(buildFTS5Query("multi-agent -sports")).toBe('"multi agent" NOT "sports"*'); + }); + + test("negated hyphenated term", () => { + expect(buildFTS5Query("performance -multi-agent")).toBe('"performance"* NOT "multi agent"'); + }); + + test("plain negation still works (not confused with hyphen)", () => { + expect(buildFTS5Query("performance -sports")).toBe('"performance"* NOT "sports"*'); + }); });