feat: 3-tier eval suite with planted-bug outcome testing (EVALS=1)

Adds comprehensive eval infrastructure:
- Tier 1 (free): 13 new static tests — cross-skill path consistency, QA
  structure validation, greptile format, planted-bug fixture validation
- Tier 2 (Agent SDK E2E): /qa quick, /review with pre-built git repo,
  3 planted-bug outcome evals (static, SPA, checkout — each with 5 bugs)
- Tier 3 (LLM judge): QA workflow quality, health rubric clarity,
  cross-skill consistency, baseline score pinning

New fixtures: 3 HTML pages with 15 total planted bugs, ground truth JSON,
review-eval-vuln.rb, eval-baselines.json. Shared llm-judge.ts helper (DRY).

Unified EVALS=1 flag replaces SKILL_E2E + ANTHROPIC_API_KEY checks.
`bun run test:evals` runs everything that costs money (~$4/run).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-03-14 01:17:36 -05:00
parent 5155fe3a28
commit 76803d789a
17 changed files with 1352 additions and 94 deletions

130
test/helpers/llm-judge.ts Normal file
View File

@@ -0,0 +1,130 @@
/**
* Shared LLM-as-judge helpers for eval and E2E tests.
*
* Provides callJudge (generic JSON-from-LLM), judge (doc quality scorer),
* and outcomeJudge (planted-bug detection scorer).
*
* Requires: ANTHROPIC_API_KEY env var
*/
import Anthropic from '@anthropic-ai/sdk';
/**
 * Documentation-quality verdict requested from the LLM by `judge`.
 * The three numeric fields use the 1-5 scale spelled out in the judge prompt
 * (5 = excellent, 1 = unusable). Values come straight from the model's JSON
 * reply and are not range-checked here.
 */
export interface JudgeScore {
// Can an agent understand what each command/flag does from the description alone?
clarity: number; // 1-5
// Are arguments, valid values, and important behaviors documented?
completeness: number; // 1-5
// Can an agent construct correct command invocations from the reference alone?
actionability: number; // 1-5
// Brief free-text explanation supplied by the judge model.
reasoning: string;
}
/**
 * Planted-bug detection verdict requested from the LLM by `outcomeJudge`.
 * All fields are taken from the model's JSON reply without further validation.
 */
export interface OutcomeJudgeResult {
// IDs of ground-truth bugs the QA report identified.
detected: string[];
// IDs of ground-truth bugs the QA report did not identify.
missed: string[];
// Count of reported issues that match no planted bug and are not legitimate page issues.
false_positives: number;
// NOTE: despite the name this is a count, not a fraction — the prompt defines it
// as the length of the `detected` array.
detection_rate: number;
// 1-5 rating of the evidence (screenshots, repro steps, element references) for detected bugs.
evidence_quality: number;
// Brief free-text explanation supplied by the judge model.
reasoning: string;
}
/**
 * Send `prompt` to claude-sonnet-4-6 as a single user message and parse the
 * JSON object embedded in the reply.
 *
 * A request that fails with HTTP status 429 is retried once after a
 * one-second pause; any other failure, or a failure of the retry, propagates
 * to the caller.
 *
 * @param prompt - Complete prompt text; it should instruct the model to
 *   answer with a JSON object.
 * @returns The parsed JSON cast to `T`. The shape is NOT validated at runtime.
 * @throws Error when the reply contains no `{...}` span. Errors from
 *   `JSON.parse` propagate when that span is not valid JSON.
 */
export async function callJudge<T>(prompt: string): Promise<T> {
  const client = new Anthropic();
  const makeRequest = () => client.messages.create({
    model: 'claude-sonnet-4-6',
    max_tokens: 1024,
    messages: [{ role: 'user', content: prompt }],
  });

  let response;
  try {
    response = await makeRequest();
  } catch (err: unknown) {
    // Narrow before touching `.status`: the thrown value may be anything,
    // including null/undefined, and must be rethrown untouched in that case.
    const status =
      typeof err === 'object' && err !== null
        ? (err as { status?: unknown }).status
        : undefined;
    if (status !== 429) throw err;

    await new Promise(r => setTimeout(r, 1000));
    response = await makeRequest();
  }

  // Use the first text block wherever it sits; the content array may be empty
  // or may start with a non-text block, so indexing [0] blindly is unsafe.
  let text = '';
  for (const block of response.content) {
    if (block.type === 'text') {
      text = block.text;
      break;
    }
  }

  // Greedy match from the first "{" to the last "}" so that any prose the
  // model wraps around the JSON object is ignored.
  const jsonMatch = text.match(/\{[\s\S]*\}/);
  if (!jsonMatch) throw new Error(`Judge returned non-JSON: ${text.slice(0, 200)}`);
  return JSON.parse(jsonMatch[0]) as T;
}
/**
 * Ask the LLM judge to rate a piece of CLI reference documentation.
 *
 * @param section - Human-readable label of what is being rated; interpolated
 *   into the prompt text.
 * @param content - The documentation text to evaluate.
 * @returns Clarity, completeness and actionability scores (1-5 each) plus the
 *   judge's brief reasoning.
 */
export async function judge(section: string, content: string): Promise<JudgeScore> {
  const prompt = `You are evaluating documentation quality for an AI coding agent's CLI tool reference.
The agent reads this documentation to learn how to use a headless browser CLI. It needs to:
1. Understand what each command does
2. Know what arguments to pass
3. Know valid values for enum-like parameters
4. Construct correct command invocations without guessing
Rate the following ${section} on three dimensions (1-5 scale):
- **clarity** (1-5): Can an agent understand what each command/flag does from the description alone?
- **completeness** (1-5): Are arguments, valid values, and important behaviors documented? Would an agent need to guess anything?
- **actionability** (1-5): Can an agent construct correct command invocations from this reference alone?
Scoring guide:
- 5: Excellent — no ambiguity, all info present
- 4: Good — minor gaps an experienced agent could infer
- 3: Adequate — some guessing required
- 2: Poor — significant info missing
- 1: Unusable — agent would fail without external help
Respond with ONLY valid JSON in this exact format:
{"clarity": N, "completeness": N, "actionability": N, "reasoning": "brief explanation"}
Here is the ${section} to evaluate:
${content}`;

  return callJudge<JudgeScore>(prompt);
}
/**
 * Score a QA report against the planted-bug ground truth via the LLM judge.
 *
 * @param groundTruth - Fixture object; this function reads `total_bugs` and
 *   `bugs` (serialized whole into the prompt, with each entry's `id` listed as
 *   the set of allowed IDs).
 * @param report - The QA report text produced by the agent under test.
 * @returns Detected/missed bug IDs, the false-positive count, the number of
 *   detected bugs and a 1-5 evidence-quality rating, as reported by the judge.
 */
export async function outcomeJudge(
  groundTruth: any,
  report: string,
): Promise<OutcomeJudgeResult> {
  const prompt = `You are evaluating a QA testing report against known ground truth bugs.
GROUND TRUTH (${groundTruth.total_bugs} planted bugs):
${JSON.stringify(groundTruth.bugs, null, 2)}
QA REPORT (generated by an AI agent):
${report}
For each planted bug, determine if the report identified it. A bug counts as
"detected" if the report describes the same defect, even if the wording differs.
Use the detection_hint keywords as guidance.
Also count false positives: issues in the report that don't correspond to any
planted bug AND aren't legitimate issues with the page.
Respond with ONLY valid JSON:
{
"detected": ["bug-id-1", "bug-id-2"],
"missed": ["bug-id-3"],
"false_positives": 0,
"detection_rate": 2,
"evidence_quality": 4,
"reasoning": "brief explanation"
}
Rules:
- "detected" and "missed" arrays must only contain IDs from the ground truth: ${groundTruth.bugs.map((bug: any) => bug.id).join(', ')}
- detection_rate = length of detected array
- evidence_quality (1-5): Do detected bugs have screenshots, repro steps, or specific element references?
5 = excellent evidence for every bug, 1 = no evidence at all`;

  return callJudge<OutcomeJudgeResult>(prompt);
}

View File

@@ -9,12 +9,21 @@ import { query } from '@anthropic-ai/claude-agent-sdk';
import * as fs from 'fs';
import * as path from 'path';
/**
 * Rough size and cost figures for one skill test run, derived from the text
 * length of the collected messages (about 4 characters per token). These are
 * estimates, not billed usage.
 */
export interface CostEstimate {
// Total characters of content across messages of type 'user'.
inputChars: number;
// Total characters of content across messages of type 'assistant'.
outputChars: number;
// (inputChars + outputChars) / 4, rounded.
estimatedTokens: number;
// Priced at roughly $3 per million input tokens and $15 per million output
// tokens, then rounded to the nearest cent.
estimatedCost: number; // USD (approximate)
// Number of assistant messages that carried content.
turnsUsed: number;
}
/**
 * Everything `runSkillTest` returns about a single run.
 */
export interface SkillTestResult {
// Messages collected during the run; the cost estimate reads `type` and
// `message.content` from each entry.
messages: any[];
// One entry per tool invocation: tool name, its input, and its output text.
toolCalls: Array<{ tool: string; input: any; output: string }>;
// Presumably tool outputs that matched BROWSE_ERROR_PATTERNS — TODO confirm against runSkillTest.
browseErrors: string[];
exitReason: string;
// NOTE(review): unit not visible in this view (likely milliseconds) — confirm.
duration: number;
// Approximate size/cost figures computed from message text lengths.
costEstimate: CostEstimate;
}
const BROWSE_ERROR_PATTERNS = [
@@ -36,7 +45,7 @@ export async function runSkillTest(options: {
if (process.env.CLAUDECODE || process.env.CLAUDE_CODE_ENTRYPOINT) {
throw new Error(
'Cannot run E2E skill tests inside a Claude Code session. ' +
'Run from a plain terminal: SKILL_E2E=1 bun test test/skill-e2e.test.ts'
'Run from a plain terminal: EVALS=1 bun test test/skill-e2e.test.ts'
);
}
@@ -156,5 +165,39 @@ export async function runSkillTest(options: {
}
}
return { messages, toolCalls, browseErrors, exitReason, duration };
// Estimate cost from message sizes (chars / 4 ≈ tokens, approximate)
let inputChars = 0;
let outputChars = 0;
let turnsUsed = 0;
for (const msg of messages) {
const content = msg.message?.content;
if (!content) continue;
const text = typeof content === 'string'
? content
: JSON.stringify(content);
if (msg.type === 'user') {
inputChars += text.length;
} else if (msg.type === 'assistant') {
outputChars += text.length;
turnsUsed++;
}
}
const estimatedTokens = Math.round((inputChars + outputChars) / 4);
// Approximate pricing: sonnet input ~$3/M, output ~$15/M tokens
const inputTokens = Math.round(inputChars / 4);
const outputTokens = Math.round(outputChars / 4);
const estimatedCost = (inputTokens * 3 + outputTokens * 15) / 1_000_000;
const costEstimate: CostEstimate = {
inputChars,
outputChars,
estimatedTokens,
estimatedCost: Math.round(estimatedCost * 100) / 100,
turnsUsed,
};
return { messages, toolCalls, browseErrors, exitReason, duration, costEstimate };
}

View File

@@ -13,6 +13,7 @@
import { ALL_COMMANDS } from '../../browse/src/commands';
import { parseSnapshotArgs } from '../../browse/src/snapshot';
import * as fs from 'fs';
import * as path from 'path';
export interface BrowseCommand {
command: string;
@@ -131,3 +132,75 @@ export function validateSkill(skillPath: string): ValidationResult {
return result;
}
/**
 * Collect every `REMOTE_SLUG=$(...)` assignment line from the `.md` files that
 * sit directly inside each of the given subdirectories.
 *
 * @param rootDir - Directory the subdirectory names are resolved against.
 * @param subdirs - Subdirectory names to scan; ones that do not exist are skipped.
 * @returns Map keyed by `subdir/filename` whose value is the list of matching
 *   lines (whitespace-trimmed, in file order). Files without a match are omitted.
 */
export function extractRemoteSlugPatterns(rootDir: string, subdirs: string[]): Map<string, string[]> {
  // The whole trimmed line must be the assignment, nothing before or after it.
  const assignment = /^REMOTE_SLUG=\$\(.*\)$/;
  const found = new Map<string, string[]>();

  subdirs.forEach(subdir => {
    const dirPath = path.join(rootDir, subdir);
    if (!fs.existsSync(dirPath)) return;

    const markdownNames = fs.readdirSync(dirPath).filter(name => name.endsWith('.md'));
    for (const name of markdownNames) {
      const hits = fs
        .readFileSync(path.join(dirPath, name), 'utf-8')
        .split('\n')
        .map(rawLine => rawLine.trim())
        .filter(candidate => assignment.test(candidate));

      if (hits.length > 0) {
        found.set(`${subdir}/${name}`, hits);
      }
    }
  });

  return found;
}
/**
 * Read the percentage table that follows the first "### Weights" heading.
 *
 * Rows are expected in the form `| Category | 15% |`. A row whose first cell
 * is literally "Category" is treated as the header and ignored. Scanning ends
 * at the next `#`/`##` heading or the next `### ` heading.
 *
 * @param content - Markdown text to search.
 * @returns Map from category name to its integer percentage (e.g. 15); empty
 *   when the heading is absent or no row matches.
 */
export function extractWeightsFromTable(content: string): Map<string, number> {
  const table = new Map<string, number>();

  const headingAt = content.indexOf('### Weights');
  if (headingAt < 0) return table;

  // | <words> | <digits>% |
  const rowPattern = /^\|\s*(\w[\w\s]*\w|\w+)\s*\|\s*(\d+)%\s*\|$/;

  // Drop the first line: it is the "### Weights" heading itself.
  const bodyLines = content.slice(headingAt).split('\n').slice(1);

  for (const rawLine of bodyLines) {
    const line = rawLine.trim();

    // A level-1/2 heading or another level-3 heading closes the section.
    const shallowHeading = line.startsWith('#') && !line.startsWith('###');
    if (shallowHeading || line.startsWith('### ')) break;

    const row = rowPattern.exec(line);
    if (row === null) continue;

    const name = row[1].trim();
    if (name === 'Category') continue; // header row

    table.set(name, parseInt(row[2], 10));
  }

  return table;
}