/**
 * structured-search.test.ts - Tests for structured search functionality
 *
 * Tests cover:
 * - CLI query parser (parseStructuredQuery)
 * - StructuredSubSearch type validation
 * - Basic structuredSearch function behavior
 *
 * Run with: bun test structured-search.test.ts
 */

import { describe, test, expect, beforeAll, afterAll } from "vitest";
import { mkdtemp, rm } from "node:fs/promises";
import { tmpdir } from "node:os";
import { join } from "node:path";
import {
  createStore,
  structuredSearch,
  type StructuredSubSearch,
  type Store,
} from "../src/store.js";
import { disposeDefaultLlamaCpp } from "../src/llm.js";

// =============================================================================
// parseStructuredQuery Tests (CLI Parser)
// =============================================================================

/**
 * Parse structured search query syntax.
 * This is a copy of the function from qmd.ts for isolated testing; keep the
 * two in sync when the production parser changes.
 *
 * The input is split on newlines; each line is trimmed and blank lines are
 * dropped. A line starting with `lex:`, `vec:` or `hyde:` (case-insensitive)
 * becomes a sub-search of that type. A prefixed line whose remaining text is
 * empty is skipped.
 *
 * @param query - Raw query text, possibly spanning several lines.
 * @returns The sub-searches in input order (a single unprefixed line, when
 *   mixed with prefixed lines, is placed first as a `lex` search), or `null`
 *   when the query contains no usable prefixed line and the caller should
 *   fall back to normal expansion.
 * @throws Error when more than one non-empty line lacks a prefix.
 */
function parseStructuredQuery(query: string): StructuredSubSearch[] | null {
  const lines = query.split('\n').map(l => l.trim()).filter(l => l.length > 0);
  if (lines.length === 0) return null;

  // Anchored at line start so prefix-like text later in a line is left alone.
  const prefixRe = /^(lex|vec|hyde):\s*/i;
  const searches: StructuredSubSearch[] = [];
  const plainLines: string[] = [];

  for (const line of lines) {
    const match = line.match(prefixRe);
    if (match) {
      const type = match[1]!.toLowerCase() as 'lex' | 'vec' | 'hyde';
      const text = line.slice(match[0].length).trim();
      // A prefix with nothing after it contributes no sub-search.
      if (text.length > 0) {
        searches.push({ type, query: text });
      }
    } else {
      plainLines.push(line);
    }
  }

  // Exactly one plain line and nothing prefixed -> null (use normal expansion)
  if (searches.length === 0 && plainLines.length === 1) {
    return null;
  }

  // Multiple plain lines without prefixes -> ambiguous, error
  if (plainLines.length > 1) {
    throw new Error("Ambiguous query: multiple lines without lex:/vec:/hyde: prefix.");
  }

  // Mix of prefixed and one plain line -> treat plain as lex, placed first
  if (plainLines.length === 1) {
    searches.unshift({ type: 'lex', query: plainLines[0]! });
  }

  // Still empty here when every prefixed line had an empty value.
  return searches.length > 0 ? searches : null;
}

describe("parseStructuredQuery", () => {
  describe("plain queries (returns null for normal expansion)", () => {
    test("single line without prefix", () => {
      expect(parseStructuredQuery("CAP theorem")).toBeNull();
      expect(parseStructuredQuery("distributed systems")).toBeNull();
    });

    test("empty queries", () => {
      expect(parseStructuredQuery("")).toBeNull();
      expect(parseStructuredQuery(" ")).toBeNull();
      expect(parseStructuredQuery("\n\n")).toBeNull();
    });
  });

  describe("single prefixed queries", () => {
    test("lex: prefix", () => {
      const result = parseStructuredQuery("lex: CAP theorem");
      expect(result).toEqual([{ type: "lex", query: "CAP theorem" }]);
    });

    test("vec: prefix", () => {
      const result = parseStructuredQuery("vec: what is the CAP theorem");
      expect(result).toEqual([{ type: "vec", query: "what is the CAP theorem" }]);
    });

    test("hyde: prefix", () => {
      const result = parseStructuredQuery("hyde: The CAP theorem states that...");
      expect(result).toEqual([{ type: "hyde", query: "The CAP theorem states that..." }]);
    });

    test("uppercase prefix", () => {
      expect(parseStructuredQuery("LEX: keywords")).toEqual([{ type: "lex", query: "keywords" }]);
      expect(parseStructuredQuery("VEC: question")).toEqual([{ type: "vec", query: "question" }]);
      expect(parseStructuredQuery("HYDE: passage")).toEqual([{ type: "hyde", query: "passage" }]);
    });

    test("mixed case prefix", () => {
      expect(parseStructuredQuery("Lex: test")).toEqual([{ type: "lex", query: "test" }]);
      expect(parseStructuredQuery("VeC: test")).toEqual([{ type: "vec", query: "test" }]);
    });
  });

  describe("multiple prefixed queries", () => {
    test("lex + vec", () => {
      const result = parseStructuredQuery("lex: keywords\nvec: natural language");
      expect(result).toEqual([
        { type: "lex", query: "keywords" },
        { type: "vec", query: "natural language" },
      ]);
    });

    test("all three types", () => {
      const result = parseStructuredQuery("lex: keywords\nvec: question\nhyde: hypothetical doc");
      expect(result).toEqual([
        { type: "lex", query: "keywords" },
        { type: "vec", query: "question" },
        { type: "hyde", query: "hypothetical doc" },
      ]);
    });

    test("duplicate types allowed", () => {
      const result = parseStructuredQuery("lex: term1\nlex: term2\nlex: term3");
      expect(result).toEqual([
        { type: "lex", query: "term1" },
        { type: "lex", query: "term2" },
        { type: "lex", query: "term3" },
      ]);
    });

    test("order preserved", () => {
      const result = parseStructuredQuery("hyde: passage\nvec: question\nlex: keywords");
      expect(result).toEqual([
        { type: "hyde", query: "passage" },
        { type: "vec", query: "question" },
        { type: "lex", query: "keywords" },
      ]);
    });
  });

  describe("mixed plain and prefixed", () => {
    test("single plain line with prefixed lines -> plain becomes lex first", () => {
      const result = parseStructuredQuery("plain keywords\nvec: semantic question");
      expect(result).toEqual([
        { type: "lex", query: "plain keywords" },
        { type: "vec", query: "semantic question" },
      ]);
    });

    test("plain line prepended before other prefixed", () => {
      const result = parseStructuredQuery("keywords\nhyde: passage\nvec: question");
      expect(result).toEqual([
        { type: "lex", query: "keywords" },
        { type: "hyde", query: "passage" },
        { type: "vec", query: "question" },
      ]);
    });
  });

  describe("error cases", () => {
    test("multiple plain lines throws", () => {
      expect(() => parseStructuredQuery("line one\nline two")).toThrow("Ambiguous query");
    });

    test("three plain lines throws", () => {
      expect(() => parseStructuredQuery("a\nb\nc")).toThrow("Ambiguous query");
    });
  });

  describe("whitespace handling", () => {
    test("empty lines ignored", () => {
      const result = parseStructuredQuery("lex: keywords\n\nvec: question\n");
      expect(result).toEqual([
        { type: "lex", query: "keywords" },
        { type: "vec", query: "question" },
      ]);
    });

    test("whitespace-only lines ignored", () => {
      const result = parseStructuredQuery("lex: keywords\n \nvec: question");
      expect(result).toEqual([
        { type: "lex", query: "keywords" },
        { type: "vec", query: "question" },
      ]);
    });

    test("leading/trailing whitespace trimmed from lines", () => {
      const result = parseStructuredQuery(" lex: keywords \n vec: question ");
      expect(result).toEqual([
        { type: "lex", query: "keywords" },
        { type: "vec", query: "question" },
      ]);
    });

    test("internal whitespace preserved in query", () => {
      const result = parseStructuredQuery("lex: multiple spaces ");
      expect(result).toEqual([{ type: "lex", query: "multiple spaces" }]);
    });

    test("empty prefix value skipped", () => {
      const result = parseStructuredQuery("lex: \nvec: actual query");
      expect(result).toEqual([{ type: "vec", query: "actual query" }]);
    });

    test("only empty prefix values returns null", () => {
      const result = parseStructuredQuery("lex: \nvec: \nhyde: ");
      expect(result).toBeNull();
    });
  });

  describe("edge cases", () => {
    test("colon in query text preserved", () => {
      const result = parseStructuredQuery("lex: time: 12:30 PM");
      expect(result).toEqual([{ type: "lex", query: "time: 12:30 PM" }]);
    });

    test("prefix-like text in query preserved", () => {
      const result = parseStructuredQuery("vec: what does lex: mean");
      expect(result).toEqual([{ type: "vec", query: "what does lex: mean" }]);
    });

    test("newline in hyde passage (as single line)", () => {
      // If user wants actual newlines in hyde, they need to escape or use multiline syntax
      const result = parseStructuredQuery("hyde: The answer is X. It means Y.");
      expect(result).toEqual([{ type: "hyde", query: "The answer is X. It means Y." }]);
    });
  });
});

// =============================================================================
// StructuredSubSearch Type Tests
// =============================================================================

// These mainly act as compile-time checks that each literal is assignable to
// StructuredSubSearch; the runtime assertions only read the values back.
describe("StructuredSubSearch type", () => {
  test("accepts lex type", () => {
    const search: StructuredSubSearch = { type: "lex", query: "test" };
    expect(search.type).toBe("lex");
    expect(search.query).toBe("test");
  });

  test("accepts vec type", () => {
    const search: StructuredSubSearch = { type: "vec", query: "test" };
    expect(search.type).toBe("vec");
    expect(search.query).toBe("test");
  });

  test("accepts hyde type", () => {
    const search: StructuredSubSearch = { type: "hyde", query: "test" };
    expect(search.type).toBe("hyde");
    expect(search.query).toBe("test");
  });
});

// =============================================================================
// structuredSearch Function Tests
// =============================================================================

// Runs against a store created in a fresh temp directory; no documents are
// inserted by this suite.
describe("structuredSearch", () => {
  let testDir: string;
  let store: Store;
  // Value of QMD_CONFIG_DIR before this suite overrides it (undefined = unset).
  let previousConfigDir: string | undefined;

  beforeAll(async () => {
    testDir = await mkdtemp(join(tmpdir(), "qmd-structured-test-"));
    const testDbPath = join(testDir, "test.sqlite");
    const testConfigDir = await mkdtemp(join(testDir, "config-"));
    previousConfigDir = process.env.QMD_CONFIG_DIR;
    process.env.QMD_CONFIG_DIR = testConfigDir;
    store = createStore(testDbPath);
  });

  afterAll(async () => {
    try {
      store.close();
      await disposeDefaultLlamaCpp();
    } finally {
      // Restore the environment so later suites in the same process do not
      // inherit a config dir that is about to be deleted. Assigning undefined
      // to process.env would store the string "undefined", hence the delete.
      if (previousConfigDir === undefined) {
        delete process.env.QMD_CONFIG_DIR;
      } else {
        process.env.QMD_CONFIG_DIR = previousConfigDir;
      }
      // Remove the temp directory even when closing/disposing above throws.
      if (testDir) {
        await rm(testDir, { recursive: true, force: true });
      }
    }
  });

  test("returns empty array for empty searches", async () => {
    const results = await structuredSearch(store, []);
    expect(results).toEqual([]);
  });

  test("returns empty array when no documents match", async () => {
    const results = await structuredSearch(store, [
      { type: "lex", query: "nonexistent-term-xyz123" }
    ]);
    expect(results).toEqual([]);
  });

  test("accepts all search types without error", async () => {
    // These may return empty results but should not throw
    await expect(structuredSearch(store, [{ type: "lex", query: "test" }])).resolves.toBeDefined();
    // vec and hyde require embeddings, so just test lex
  });

  test("respects limit option", async () => {
    const results = await structuredSearch(store, [
      { type: "lex", query: "test" }
    ], { limit: 5 });
    expect(results.length).toBeLessThanOrEqual(5);
  });

  test("respects minScore option", async () => {
    const results = await structuredSearch(store, [
      { type: "lex", query: "test" }
    ], { minScore: 0.5 });
    for (const r of results) {
      expect(r.score).toBeGreaterThanOrEqual(0.5);
    }
  });
});