/**
 * Shared LLM-as-judge helpers for eval and E2E tests.
 *
 * Provides callJudge (generic JSON-from-LLM with cache + tier support),
 * judge (doc quality scorer), and outcomeJudge (planted-bug detection scorer).
 *
 * Requires: ANTHROPIC_API_KEY env var (skipped on cache hit)
 *
 * Env vars:
 *   EVAL_JUDGE_TIER — model tier for judge calls (fast/standard/full, default: standard)
 *   EVAL_CACHE=0    — bypass cache, always re-run
 */

import Anthropic from '@anthropic-ai/sdk';
import { computeCacheKey, cacheRead, cacheWrite } from '../../lib/eval-cache';
import { resolveJudgeTier, tierToModel } from '../../lib/eval-tier';

/** Documentation-quality scores produced by {@link judge}. */
export interface JudgeScore {
  clarity: number; // 1-5
  completeness: number; // 1-5
  actionability: number; // 1-5
  reasoning: string;
}

/**
 * Planted-bug detection verdict produced by {@link outcomeJudge}.
 *
 * Field meanings follow the rules stated in the outcomeJudge prompt; the
 * values come straight from the model's JSON and are not validated here.
 */
export interface OutcomeJudgeResult {
  /** Ground-truth bug IDs the report identified. */
  detected: string[];
  /** Ground-truth bug IDs the report did not identify. */
  missed: string[];
  /** Count of reported issues that match no planted bug and are not legitimate. */
  false_positives: number;
  /** Per the prompt rules this is the length of `detected` (a count, not a ratio). */
  detection_rate: number;
  /** 1-5 rating of the evidence attached to detected bugs. */
  evidence_quality: number;
  reasoning: string;
}

/** Bookkeeping about a single judge call. */
export interface JudgeMeta {
  model: string;
  /** 0 when the result came from the cache. */
  input_tokens: number;
  /** 0 when the result came from the cache. */
  output_tokens: number;
  cached: boolean;
}

/** Pause before the single retry that follows a 429 response. */
const RATE_LIMIT_RETRY_DELAY_MS = 1000;

/** True when a thrown value carries `status === 429`. */
function isRateLimitError(err: unknown): boolean {
  return (
    typeof err === 'object' &&
    err !== null &&
    (err as { status?: unknown }).status === 429
  );
}

/**
 * Call the judge model with a prompt and extract the JSON object from its reply.
 *
 * The model is chosen via resolveJudgeTier/tierToModel. Results are cached in
 * the 'llm-judge' namespace under a key derived from the model name and the
 * prompt text; on a cache hit no API client is created and token counts are
 * reported as 0.
 *
 * Only the first content block of the response is inspected. The JSON is taken
 * greedily from the first `{` to the last `}` in that text and cast to `T`
 * without any shape validation.
 *
 * @typeParam T - Shape the caller expects the judge's JSON to have (unchecked).
 * @param prompt - Full prompt sent as a single user message.
 * @returns The parsed result plus metadata about the call.
 * @throws Error when the reply contains no `{...}` span.
 * @throws SyntaxError when the extracted span is not valid JSON.
 * @throws Whatever the API client throws, except that one 429 is retried once
 *   after a short delay (a failure of the retry propagates).
 */
export async function callJudge<T>(prompt: string): Promise<{ result: T; meta: JudgeMeta }> {
  const model = tierToModel(resolveJudgeTier());

  // Check cache (keyed by model + prompt content)
  const cacheKey = computeCacheKey([], `${model}:${prompt}`);
  const cached = cacheRead('llm-judge', cacheKey);
  if (cached !== null) {
    return {
      result: cached as T,
      meta: { model, input_tokens: 0, output_tokens: 0, cached: true },
    };
  }

  const client = new Anthropic();
  const makeRequest = () =>
    client.messages.create({
      model,
      max_tokens: 1024,
      messages: [{ role: 'user', content: prompt }],
    });

  let response: Awaited<ReturnType<typeof makeRequest>>;
  try {
    response = await makeRequest();
  } catch (err: unknown) {
    if (!isRateLimitError(err)) throw err;
    await new Promise((resolve) => setTimeout(resolve, RATE_LIMIT_RETRY_DELAY_MS));
    response = await makeRequest();
  }

  // An empty content array or a non-text first block yields '' and therefore
  // the descriptive non-JSON error below, rather than a bare TypeError.
  const firstBlock = response.content[0];
  const text = firstBlock?.type === 'text' ? firstBlock.text : '';

  const jsonMatch = text.match(/\{[\s\S]*\}/);
  if (!jsonMatch) throw new Error(`Judge returned non-JSON: ${text.slice(0, 200)}`);

  let result: T;
  try {
    result = JSON.parse(jsonMatch[0]) as T;
  } catch (err: unknown) {
    // Keep the SyntaxError class but say what the judge actually sent back.
    const detail = err instanceof Error ? err.message : String(err);
    throw new SyntaxError(
      `Judge returned malformed JSON (${detail}): ${jsonMatch[0].slice(0, 200)}`,
    );
  }

  // Write to cache
  cacheWrite('llm-judge', cacheKey, result, { model });

  const meta: JudgeMeta = {
    model,
    input_tokens: response.usage?.input_tokens ?? 0,
    output_tokens: response.usage?.output_tokens ?? 0,
    cached: false,
  };

  return { result, meta };
}

/**
 * Score documentation quality on clarity/completeness/actionability (1-5).
 *
 * @param section - Human-readable name of what is being rated; interpolated
 *   into the prompt twice.
 * @param content - The documentation text to evaluate.
 * @returns The judge's scores plus call metadata.
 */
export async function judge(
  section: string,
  content: string,
): Promise<{ result: JudgeScore; meta: JudgeMeta }> {
  // NOTE(review): the fragments are joined with single spaces and contain one
  // embedded "\n" so the runtime prompt stays byte-identical to the previous
  // text. The prompt feeds the cache key in callJudge, so reflowing it would
  // invalidate cached verdicts. Interpolated values stay inside template
  // literals because Array.prototype.join renders undefined/null as ''.
  const prompt = [
    `You are evaluating documentation quality for an AI coding agent's CLI tool reference.`,
    'The agent reads this documentation to learn how to use a headless browser CLI. It needs to:',
    '1. Understand what each command does',
    '2. Know what arguments to pass',
    '3. Know valid values for enum-like parameters',
    '4. \nConstruct correct command invocations without guessing',
    `Rate the following ${section} on three dimensions (1-5 scale):`,
    '- **clarity** (1-5): Can an agent understand what each command/flag does from the description alone?',
    '- **completeness** (1-5): Are arguments, valid values, and important behaviors documented? Would an agent need to guess anything?',
    '- **actionability** (1-5): Can an agent construct correct command invocations from this reference alone?',
    'Scoring guide:',
    '- 5: Excellent — no ambiguity, all info present',
    '- 4: Good — minor gaps an experienced agent could infer',
    '- 3: Adequate — some guessing required',
    '- 2: Poor — significant info missing',
    '- 1: Unusable — agent would fail without external help',
    'Respond with ONLY valid JSON in this exact format:',
    '{"clarity": N, "completeness": N, "actionability": N, "reasoning": "brief explanation"}',
    `Here is the ${section} to evaluate:`,
    `${content}`,
  ].join(' ');

  return callJudge<JudgeScore>(prompt);
}

/**
 * Evaluate a QA report against planted-bug ground truth.
 * Returns detection metrics for the planted bugs.
 * Note: outcomeJudge returns just the result (not meta) for backward compat
 * with E2E test callers. Cache still works internally.
 *
 * @param groundTruth - Object read for `total_bugs` and `bugs` (each bug is
 *   read for `id`; the whole `bugs` array is serialized into the prompt).
 * @param report - The QA report text to evaluate.
 * @returns The judge's detection verdict.
 */
export async function outcomeJudge(
  groundTruth: any,
  report: string,
): Promise<OutcomeJudgeResult> {
  // NOTE(review): same byte-identical prompt construction as in judge() —
  // single-space joins plus one embedded "\n" — so existing cache keys hold.
  const prompt = [
    'You are evaluating a QA testing report against known ground truth bugs.',
    `GROUND TRUTH (${groundTruth.total_bugs} planted bugs):`,
    `${JSON.stringify(groundTruth.bugs, null, 2)}`,
    'QA REPORT (generated by an AI agent):',
    `${report}`,
    'For each planted bug, determine if the report identified it.',
    'A bug counts as "detected" if the report describes the same defect, even if the wording differs.',
    'Use the detection_hint keywords as guidance.',
    `Also count false positives: issues in the report that don't correspond to any planted bug AND aren't legitimate issues with the page.`,
    '\nRespond with ONLY valid JSON:',
    '{ "detected": ["bug-id-1", "bug-id-2"], "missed": ["bug-id-3"], "false_positives": 0, "detection_rate": 2, "evidence_quality": 4, "reasoning": "brief explanation" }',
    'Rules:',
    `- "detected" and "missed" arrays must only contain IDs from the ground truth: ${groundTruth.bugs.map((b: any) => b.id).join(', ')}`,
    '- detection_rate = length of detected array',
    '- evidence_quality (1-5): Do detected bugs have screenshots, repro steps, or specific element references?',
    '5 = excellent evidence for every bug, 1 = no evidence at all',
  ].join(' ');

  const { result } = await callJudge<OutcomeJudgeResult>(prompt);
  return result;
}