feat: 3-tier eval suite with planted-bug outcome testing (EVALS=1)

Adds comprehensive eval infrastructure:

- Tier 1 (free): 13 new static tests — cross-skill path consistency, QA structure validation, greptile format, planted-bug fixture validation
- Tier 2 (Agent SDK E2E): /qa quick, /review with pre-built git repo, 3 planted-bug outcome evals (static, SPA, checkout — each with 5 bugs)
- Tier 3 (LLM judge): QA workflow quality, health rubric clarity, cross-skill consistency, baseline score pinning

New fixtures: 3 HTML pages with 15 total planted bugs, ground truth JSON, review-eval-vuln.rb, eval-baselines.json. Shared llm-judge.ts helper (DRY). Unified EVALS=1 flag replaces SKILL_E2E + ANTHROPIC_API_KEY checks. `bun run test:evals` runs everything that costs money (~$4/run).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-05-21 12:18:24 +08:00 · 2026-03-14 01:17:36 -05:00
parent 5155fe3a28
commit 76803d789a
17 changed files with 1352 additions and 94 deletions
--- a/test/helpers/llm-judge.ts
+++ b/test/helpers/llm-judge.ts
@@ -0,0 +1,130 @@
+/**
+ * Shared LLM-as-judge helpers for eval and E2E tests.
+ *
+ * Provides callJudge (generic JSON-from-LLM), judge (doc quality scorer),
+ * and outcomeJudge (planted-bug detection scorer).
+ *
+ * Requires: ANTHROPIC_API_KEY env var
+ */
+
+import Anthropic from '@anthropic-ai/sdk';
+
+export interface JudgeScore {
+  clarity: number;       // 1-5; can an agent tell what each command/flag does from the description alone
+  completeness: number;  // 1-5; are arguments, valid values and important behaviors documented
+  actionability: number; // 1-5; can an agent construct correct invocations from the reference alone
+  reasoning: string;     // brief free-text explanation returned by the judge
+}
+
+export interface OutcomeJudgeResult {
+  detected: string[];       // ground-truth bug IDs the judge found described in the report
+  missed: string[];         // ground-truth bug IDs the judge did not find in the report
+  false_positives: number;  // report issues matching no planted bug and not otherwise legitimate
+  detection_rate: number;   // a count, not a ratio: the judge prompt defines it as the length of the detected array
+  evidence_quality: number; // 1-5; 5 = excellent evidence for every bug, 1 = no evidence at all
+  reasoning: string;        // brief free-text explanation returned by the judge
+}
+
+/**
+ * Send prompt as a single user message to claude-sonnet-4-6 (max_tokens 1024), then parse the span from the first "{" to the last "}" of the reply as JSON and return it cast to T (no runtime shape validation).
+ * Retries exactly once, after a fixed 1 s wait, when the request fails with status 429; any other error, a failed retry, and JSON.parse errors propagate. Throws "Judge returned non-JSON" when the reply has no {...} span.
+ */
+export async function callJudge<T>(prompt: string): Promise<T> {
+  const client = new Anthropic(); // constructed without options; the file header states ANTHROPIC_API_KEY is required
+
+  const makeRequest = () => client.messages.create({ // wrapped in a closure so the retry can re-issue the identical request
+    model: 'claude-sonnet-4-6',
+    max_tokens: 1024,
+    messages: [{ role: 'user', content: prompt }],
+  });
+
+  let response;
+  try {
+    response = await makeRequest();
+  } catch (err: any) {
+    if (err.status === 429) { // rate limited
+      await new Promise(r => setTimeout(r, 1000)); // fixed 1 s pause, no exponential back-off
+      response = await makeRequest(); // single retry; if this also fails the error propagates
+    } else {
+      throw err;
+    }
+  }
+
+  const text = response.content[0].type === 'text' ? response.content[0].text : ''; // only the first content block is read; a non-text block yields an empty string
+  const jsonMatch = text.match(/\{[\s\S]*\}/); // greedy match spanning newlines: first opening brace through last closing brace
+  if (!jsonMatch) throw new Error(`Judge returned non-JSON: ${text.slice(0, 200)}`); // message carries the first 200 chars of the reply
+  return JSON.parse(jsonMatch[0]) as T; // type assertion only; callers must trust the judge to follow the requested schema
+}
+
+/**
+ * Ask the LLM judge (via callJudge) to rate CLI-reference documentation on clarity/completeness/actionability (1-5 each); section is the label interpolated into the prompt, content is the documentation text appended at its end.
+ */
+export async function judge(section: string, content: string): Promise<JudgeScore> {
+  return callJudge<JudgeScore>(`You are evaluating documentation quality for an AI coding agent's CLI tool reference.
+
+The agent reads this documentation to learn how to use a headless browser CLI. It needs to:
+1. Understand what each command does
+2. Know what arguments to pass
+3. Know valid values for enum-like parameters
+4. Construct correct command invocations without guessing
+
+Rate the following ${section} on three dimensions (1-5 scale):
+
+- **clarity** (1-5): Can an agent understand what each command/flag does from the description alone?
+- **completeness** (1-5): Are arguments, valid values, and important behaviors documented? Would an agent need to guess anything?
+- **actionability** (1-5): Can an agent construct correct command invocations from this reference alone?
+
+Scoring guide:
+- 5: Excellent — no ambiguity, all info present
+- 4: Good — minor gaps an experienced agent could infer
+- 3: Adequate — some guessing required
+- 2: Poor — significant info missing
+- 1: Unusable — agent would fail without external help
+
+Respond with ONLY valid JSON in this exact format:
+{"clarity": N, "completeness": N, "actionability": N, "reasoning": "brief explanation"}
+
+Here is the ${section} to evaluate:
+
+${content}`);
+}
+
+/**
+ * Ask the LLM judge (via callJudge) which planted bugs a QA report identified. The prompt embeds groundTruth.total_bugs, the JSON of groundTruth.bugs, the list of bug ids, and the report text verbatim.
+ * Resolves to the judge's parsed reply (detected/missed ids, false-positive count, detection count, 1-5 evidence score); the reply is not cross-checked against the ground truth in this function.
+ */
+export async function outcomeJudge(
+  groundTruth: any, // untyped; this function reads total_bugs and bugs (an array whose items have an id)
+  report: string, // QA report text, inserted into the prompt as-is
+): Promise<OutcomeJudgeResult> {
+  return callJudge<OutcomeJudgeResult>(`You are evaluating a QA testing report against known ground truth bugs.
+
+GROUND TRUTH (${groundTruth.total_bugs} planted bugs):
+${JSON.stringify(groundTruth.bugs, null, 2)}
+
+QA REPORT (generated by an AI agent):
+${report}
+
+For each planted bug, determine if the report identified it. A bug counts as
+"detected" if the report describes the same defect, even if the wording differs.
+Use the detection_hint keywords as guidance.
+
+Also count false positives: issues in the report that don't correspond to any
+planted bug AND aren't legitimate issues with the page.
+
+Respond with ONLY valid JSON:
+{
+  "detected": ["bug-id-1", "bug-id-2"],
+  "missed": ["bug-id-3"],
+  "false_positives": 0,
+  "detection_rate": 2,
+  "evidence_quality": 4,
+  "reasoning": "brief explanation"
+}
+
+Rules:
+- "detected" and "missed" arrays must only contain IDs from the ground truth: ${groundTruth.bugs.map((b: any) => b.id).join(', ')}
+- detection_rate = length of detected array
+- evidence_quality (1-5): Do detected bugs have screenshots, repro steps, or specific element references?
+  5 = excellent evidence for every bug, 1 = no evidence at all`);
+}
--- a/test/helpers/session-runner.ts
+++ b/test/helpers/session-runner.ts
@@ -9,12 +9,21 @@ import { query } from '@anthropic-ai/claude-agent-sdk';
 import * as fs from 'fs';
 import * as path from 'path';

+export interface CostEstimate {
+  inputChars: number;      // summed length of user-type message content (JSON-serialized when not a string)
+  outputChars: number;     // summed length of assistant-type message content (JSON-serialized when not a string)
+  estimatedTokens: number; // (inputChars + outputChars) / 4, rounded
+  estimatedCost: number;  // USD (approximate), rounded to two decimals
+  turnsUsed: number;       // number of assistant-type messages that carried content
+}
+
 export interface SkillTestResult {
  messages: any[];
  toolCalls: Array<{ tool: string; input: any; output: string }>;
  browseErrors: string[];
  exitReason: string;
  duration: number;
+  costEstimate: CostEstimate; // rough estimate derived from message sizes, not from reported usage
 }

 const BROWSE_ERROR_PATTERNS = [
@@ -36,7 +45,7 @@ export async function runSkillTest(options: {
  if (process.env.CLAUDECODE || process.env.CLAUDE_CODE_ENTRYPOINT) {
    throw new Error(
      'Cannot run E2E skill tests inside a Claude Code session. ' +
-      'Run from a plain terminal: SKILL_E2E=1 bun test test/skill-e2e.test.ts'
+      'Run from a plain terminal: EVALS=1 bun test test/skill-e2e.test.ts'
    );
  }

@@ -156,5 +165,39 @@ export async function runSkillTest(options: {
    }
  }

-  return { messages, toolCalls, browseErrors, exitReason, duration };
+  // Estimate cost from message sizes (chars / 4 ≈ tokens, approximate); this block looks only at text lengths
+  let inputChars = 0;
+  let outputChars = 0;
+  let turnsUsed = 0;
+
+  for (const msg of messages) {
+    const content = msg.message?.content;
+    if (!content) continue; // messages without a content payload contribute nothing
+    const text = typeof content === 'string'
+      ? content
+      : JSON.stringify(content); // non-string content is measured by its JSON serialization, so structural characters are counted too
+
+    if (msg.type === 'user') {
+      inputChars += text.length;
+    } else if (msg.type === 'assistant') {
+      outputChars += text.length;
+      turnsUsed++; // one turn per assistant message that has content
+    }
+  }
+
+  const estimatedTokens = Math.round((inputChars + outputChars) / 4);
+  // Approximate pricing: sonnet input ~$3/M, output ~$15/M tokens; user text is priced as input and assistant text as output
+  const inputTokens = Math.round(inputChars / 4);
+  const outputTokens = Math.round(outputChars / 4);
+  const estimatedCost = (inputTokens * 3 + outputTokens * 15) / 1_000_000;
+
+  const costEstimate: CostEstimate = {
+    inputChars,
+    outputChars,
+    estimatedTokens,
+    estimatedCost: Math.round(estimatedCost * 100) / 100, // rounded to whole cents, so estimates under half a cent report as 0
+    turnsUsed,
+  };
+
+  return { messages, toolCalls, browseErrors, exitReason, duration, costEstimate };
 }
--- a/test/helpers/skill-parser.ts
+++ b/test/helpers/skill-parser.ts
@@ -13,6 +13,7 @@
 import { ALL_COMMANDS } from '../../browse/src/commands';
 import { parseSnapshotArgs } from '../../browse/src/snapshot';
 import * as fs from 'fs';
+import * as path from 'path';

 export interface BrowseCommand {
  command: string;
@@ -131,3 +132,75 @@ export function validateSkill(skillPath: string): ValidationResult {

  return result;
 }
+
+/**
+ * Extract REMOTE_SLUG=$(...) assignments from the .md files directly inside each given subdirectory of rootDir (no recursion; subdirectories that do not exist are skipped). A line counts only if, after trimming, the whole line is the assignment.
+ * Returns a Map from "<subdir>/<file>" → array of the trimmed matching lines in file order; files without a match get no entry.
+ */
+export function extractRemoteSlugPatterns(rootDir: string, subdirs: string[]): Map<string, string[]> {
+  const results = new Map<string, string[]>();
+  const pattern = /^REMOTE_SLUG=\$\(.*\)$/;
+
+  for (const subdir of subdirs) {
+    const dir = path.join(rootDir, subdir);
+    if (!fs.existsSync(dir)) continue; // tolerate a missing subdirectory instead of throwing
+
+    const files = fs.readdirSync(dir).filter(f => f.endsWith('.md')); // direct children only
+    for (const file of files) {
+      const filePath = path.join(dir, file);
+      const content = fs.readFileSync(filePath, 'utf-8');
+      const matches: string[] = [];
+
+      for (const line of content.split('\n')) {
+        const trimmed = line.trim(); // drops indentation and any trailing carriage return before the anchored test
+        if (pattern.test(trimmed)) {
+          matches.push(trimmed);
+        }
+      }
+
+      if (matches.length > 0) {
+        results.set(`${subdir}/${file}`, matches); // key is built with a forward slash from the subdir argument as given
+      }
+    }
+  }
+
+  return results;
+}
+
+/**
+ * Parse a markdown weight table anchored to the first occurrence of the text "### Weights" in content.
+ * Expects two-cell rows like: | Category | 15% | — the category may contain only word characters and whitespace, and the percentage must be a whole number.
+ * Returns Map<category, number> where number is the percentage (e.g., 15); the map is empty when the heading is absent or no row matches.
+ */
+export function extractWeightsFromTable(content: string): Map<string, number> {
+  const weights = new Map<string, number>();
+
+  // Locate the ### Weights anchor (plain substring search, first hit wins)
+  const weightsIdx = content.indexOf('### Weights');
+  if (weightsIdx === -1) return weights;
+
+  // Scan line by line from the anchor onward (stop at next heading or end)
+  const section = content.slice(weightsIdx);
+  const lines = section.split('\n');
+
+  for (let i = 1; i < lines.length; i++) { // start at 1 to skip the anchor line itself
+    const line = lines[i].trim();
+
+    // Stop at any line starting with # that is not ### or deeper (i.e. # or ## headings)
+    if (line.startsWith('#') && !line.startsWith('###')) break;
+    if (line.startsWith('### ') && i > 0) break; // next same-level heading ends the section; i > 0 always holds here; #### and deeper do not stop the scan
+
+    // Parse table rows: | Category | N% | — rows with extra cells, decimals, or punctuation in the category are ignored
+    const match = line.match(/^\|\s*(\w[\w\s]*\w|\w+)\s*\|\s*(\d+)%\s*\|$/);
+    if (match) {
+      const category = match[1].trim();
+      const pct = parseInt(match[2], 10);
+      // Skip a row whose first cell is literally Category (header guard)
+      if (category !== 'Category' && !isNaN(pct)) {
+        weights.set(category, pct); // a repeated category overwrites the earlier value
+      }
+    }
+  }
+
+  return weights;
+}