qmd/test/ast.test.ts
James Risberg 244ddf5ecb feat: AST-aware chunking for code files via tree-sitter
Add opt-in AST-aware chunk boundary detection for code files using
web-tree-sitter. When enabled with `--chunk-strategy auto`, code files
(.ts, .tsx, .js, .jsx, .py, .go, .rs) are chunked at function, class,
and import boundaries instead of arbitrary text positions. Default
behavior (`regex`) is unchanged — no surprises on upgrade.

In testing on QMD's own codebase, AST mode split 42% fewer function
bodies across chunk boundaries compared to regex-only chunking.

Usage:
  qmd embed --chunk-strategy auto
  qmd query "search terms" --chunk-strategy auto

What's included:
- Language detection from file extension with support for TypeScript,
  JavaScript (including arrow functions and function expressions),
  Python, Go, and Rust
- Per-language tree-sitter queries with scored break points aligned to
  the existing markdown scale (class=100, function=90, type=80, import=60)
- AST break points merged with regex break points — highest score wins
  at each position, so embedded markdown (comments, docstrings) still
  benefits from regex patterns
- Refactored chunking core: chunkDocumentWithBreakPoints() extracted,
  mergeBreakPoints() added, async chunkDocumentAsync() wrapper for AST
- ChunkStrategy type ("auto" | "regex") threaded through
  generateEmbeddings(), hybridQuery(), structuredSearch(), CLI, and SDK
- getASTStatus() health check wired into `qmd status`
- Parse failures log a warning and fall back to regex — never crash

Hardening:
- Grammar packages are optionalDependencies with pinned versions to
  prevent ABI breaks from semver drift
- web-tree-sitter is a direct dependency (pinned)
- Errors are logged (not silently swallowed) for debuggability
- Tested on both Node.js and Bun (Bun is actually faster)

Testing:
- 26 unit tests (test/ast.test.ts) — all 4 languages, error handling
- 7 integration tests (test/store.test.ts) — merge, equivalence, bypass
- Standalone test-ast-chunking.mjs with 63 synthetic tests and a
  real-collection performance scanner (npx tsx test-ast-chunking.mjs ~/code)
- Validated end-to-end with qmd embed + qmd query on QMD's own codebase
- Zero markdown regressions across all test paths

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-22 01:22:39 -04:00

330 lines
11 KiB
TypeScript

/**
* ast.test.ts - Tests for AST-aware chunking support
*
* Tests language detection, AST break point extraction for each
* supported language, and graceful fallback on errors.
*/
import { describe, test, expect } from "vitest";
import { detectLanguage, getASTBreakPoints, extractSymbols } from "../src/ast.js";
import type { SupportedLanguage } from "../src/ast.js";
// =============================================================================
// Language Detection
// =============================================================================
describe("detectLanguage", () => {
test("recognizes TypeScript extensions", () => {
expect(detectLanguage("src/auth.ts")).toBe("typescript");
expect(detectLanguage("src/auth.mts")).toBe("typescript");
expect(detectLanguage("src/auth.cts")).toBe("typescript");
});
test("recognizes TSX extension", () => {
expect(detectLanguage("src/App.tsx")).toBe("tsx");
});
test("recognizes JavaScript extensions", () => {
expect(detectLanguage("src/util.js")).toBe("javascript");
expect(detectLanguage("src/util.mjs")).toBe("javascript");
expect(detectLanguage("src/util.cjs")).toBe("javascript");
});
test("recognizes JSX as tsx", () => {
expect(detectLanguage("src/App.jsx")).toBe("tsx");
});
test("recognizes Python extension", () => {
expect(detectLanguage("src/auth.py")).toBe("python");
});
test("recognizes Go extension", () => {
expect(detectLanguage("src/auth.go")).toBe("go");
});
test("recognizes Rust extension", () => {
expect(detectLanguage("src/auth.rs")).toBe("rust");
});
test("returns null for markdown", () => {
expect(detectLanguage("docs/README.md")).toBeNull();
});
test("returns null for unknown extensions", () => {
expect(detectLanguage("data/file.csv")).toBeNull();
expect(detectLanguage("config.yaml")).toBeNull();
expect(detectLanguage("Makefile")).toBeNull();
});
test("is case-insensitive for extensions", () => {
expect(detectLanguage("src/Auth.TS")).toBe("typescript");
expect(detectLanguage("src/Auth.PY")).toBe("python");
});
test("works with virtual qmd:// paths", () => {
expect(detectLanguage("qmd://myproject/src/auth.ts")).toBe("typescript");
expect(detectLanguage("qmd://docs/README.md")).toBeNull();
});
});
// =============================================================================
// AST Break Points - TypeScript
// =============================================================================
describe("getASTBreakPoints - TypeScript", () => {
const TS_SAMPLE = `import { Database } from './db';
import type { User } from './types';
interface AuthConfig {
secret: string;
ttl: number;
}
type UserId = string;
export class AuthService {
constructor(private db: Database) {}
async authenticate(user: User, token: string): Promise<boolean> {
const session = await this.db.findSession(token);
return session?.userId === user.id;
}
validateToken(token: string): boolean {
return token.length === 64;
}
}
export function hashPassword(password: string): string {
return crypto.createHash('sha256').update(password).digest('hex');
}
`;
test("produces break points at function, class, and import boundaries", async () => {
const points = await getASTBreakPoints(TS_SAMPLE, "src/auth.ts");
expect(points.length).toBeGreaterThan(0);
// Should have import, interface, type, class (via export), method, and function break points
const types = points.map(p => p.type);
expect(types.some(t => t.includes("import"))).toBe(true);
expect(types.some(t => t.includes("iface"))).toBe(true);
expect(types.some(t => t.includes("type"))).toBe(true);
expect(types.some(t => t.includes("export") || t.includes("class"))).toBe(true);
expect(types.some(t => t.includes("method"))).toBe(true);
});
test("break points are sorted by position", async () => {
const points = await getASTBreakPoints(TS_SAMPLE, "src/auth.ts");
for (let i = 1; i < points.length; i++) {
expect(points[i]!.pos).toBeGreaterThanOrEqual(points[i - 1]!.pos);
}
});
test("scores align with expected hierarchy", async () => {
const points = await getASTBreakPoints(TS_SAMPLE, "src/auth.ts");
// Class/interface should score 100
const ifacePoint = points.find(p => p.type === "ast:iface");
expect(ifacePoint?.score).toBe(100);
// Function/method should score 90
const methodPoint = points.find(p => p.type === "ast:method");
expect(methodPoint?.score).toBe(90);
// Import should score 60
const importPoint = points.find(p => p.type === "ast:import");
expect(importPoint?.score).toBe(60);
});
test("break point positions match actual content positions", async () => {
const points = await getASTBreakPoints(TS_SAMPLE, "src/auth.ts");
// First import should be at position 0
const firstImport = points.find(p => p.type === "ast:import");
expect(firstImport).toBeDefined();
expect(TS_SAMPLE.slice(firstImport!.pos, firstImport!.pos + 6)).toBe("import");
});
});
// =============================================================================
// AST Break Points - Python
// =============================================================================
describe("getASTBreakPoints - Python", () => {
const PY_SAMPLE = `import os
from typing import Optional
class AuthService:
def __init__(self, db):
self.db = db
async def authenticate(self, user, token):
session = await self.db.find(token)
return session.user_id == user.id
def validate_token(self, token):
return len(token) == 64
def hash_password(password: str) -> str:
return hashlib.sha256(password.encode()).hexdigest()
@decorator
def decorated_func():
pass
`;
test("produces break points for class, function, import, and decorated definitions", async () => {
const points = await getASTBreakPoints(PY_SAMPLE, "auth.py");
const types = points.map(p => p.type);
expect(types.some(t => t.includes("import"))).toBe(true);
expect(types.some(t => t.includes("class"))).toBe(true);
expect(types.some(t => t.includes("func"))).toBe(true);
expect(types.some(t => t.includes("decorated"))).toBe(true);
});
test("captures method definitions inside classes", async () => {
const points = await getASTBreakPoints(PY_SAMPLE, "auth.py");
// Should capture __init__, authenticate, and validate_token as func
const funcPoints = points.filter(p => p.type === "ast:func");
expect(funcPoints.length).toBeGreaterThanOrEqual(3);
});
});
// =============================================================================
// AST Break Points - Go
// =============================================================================
describe("getASTBreakPoints - Go", () => {
const GO_SAMPLE = `package main
import "fmt"
type AuthService struct {
db *Database
}
func (s *AuthService) Authenticate(user User) bool {
return true
}
func HashPassword(password string) string {
return "hash"
}
`;
test("produces break points for type, function, method, and import", async () => {
const points = await getASTBreakPoints(GO_SAMPLE, "auth.go");
const types = points.map(p => p.type);
expect(types.some(t => t.includes("import"))).toBe(true);
expect(types.some(t => t.includes("type"))).toBe(true);
expect(types.some(t => t.includes("method"))).toBe(true);
expect(types.some(t => t.includes("func"))).toBe(true);
});
test("function and method both score 90", async () => {
const points = await getASTBreakPoints(GO_SAMPLE, "auth.go");
const funcPoint = points.find(p => p.type === "ast:func");
const methodPoint = points.find(p => p.type === "ast:method");
expect(funcPoint?.score).toBe(90);
expect(methodPoint?.score).toBe(90);
});
});
// =============================================================================
// AST Break Points - Rust
// =============================================================================
describe("getASTBreakPoints - Rust", () => {
const RS_SAMPLE = `use std::collections::HashMap;
struct AuthService {
db: Database,
}
impl AuthService {
fn authenticate(&self, user: &User) -> bool {
true
}
}
trait Authenticatable {
fn validate(&self) -> bool;
}
enum Role {
Admin,
User,
}
fn hash_password(password: &str) -> String {
String::new()
}
`;
test("produces break points for struct, impl, trait, enum, function, and use", async () => {
const points = await getASTBreakPoints(RS_SAMPLE, "auth.rs");
const types = points.map(p => p.type);
expect(types.some(t => t.includes("import"))).toBe(true); // use_declaration -> @import
expect(types.some(t => t.includes("struct"))).toBe(true);
expect(types.some(t => t.includes("impl"))).toBe(true);
expect(types.some(t => t.includes("trait"))).toBe(true);
expect(types.some(t => t.includes("enum"))).toBe(true);
expect(types.some(t => t.includes("func"))).toBe(true);
});
test("struct, impl, and trait all score 100", async () => {
const points = await getASTBreakPoints(RS_SAMPLE, "auth.rs");
const structPoint = points.find(p => p.type === "ast:struct");
const implPoint = points.find(p => p.type === "ast:impl");
const traitPoint = points.find(p => p.type === "ast:trait");
expect(structPoint?.score).toBe(100);
expect(implPoint?.score).toBe(100);
expect(traitPoint?.score).toBe(100);
});
});
// =============================================================================
// Error Handling & Fallback
// =============================================================================
describe("getASTBreakPoints - error handling", () => {
test("returns empty array for unsupported file types", async () => {
const points = await getASTBreakPoints("# Hello World", "readme.md");
expect(points).toEqual([]);
});
test("returns empty array for unknown extensions", async () => {
const points = await getASTBreakPoints("data,here", "file.csv");
expect(points).toEqual([]);
});
test("handles empty content gracefully", async () => {
const points = await getASTBreakPoints("", "empty.ts");
expect(points).toEqual([]);
});
test("handles syntactically invalid code gracefully", async () => {
// Tree-sitter is error-tolerant, so this should still parse (with error nodes)
// but should not crash
const points = await getASTBreakPoints("function { broken syntax %%%", "broken.ts");
// Should either return some partial break points or empty array — not throw
expect(Array.isArray(points)).toBe(true);
});
});
// =============================================================================
// Symbol Extraction Stub (Phase 2)
// =============================================================================
describe("extractSymbols", () => {
test("returns empty array (Phase 2 stub)", () => {
const symbols = extractSymbols("function foo() {}", "typescript", 0, 18);
expect(symbols).toEqual([]);
});
});