mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-21 12:18:24 +08:00
feat: 3-tier eval suite with planted-bug outcome testing (EVALS=1)
Adds comprehensive eval infrastructure: - Tier 1 (free): 13 new static tests — cross-skill path consistency, QA structure validation, greptile format, planted-bug fixture validation - Tier 2 (Agent SDK E2E): /qa quick, /review with pre-built git repo, 3 planted-bug outcome evals (static, SPA, checkout — each with 5 bugs) - Tier 3 (LLM judge): QA workflow quality, health rubric clarity, cross-skill consistency, baseline score pinning New fixtures: 3 HTML pages with 15 total planted bugs, ground truth JSON, review-eval-vuln.rb, eval-baselines.json. Shared llm-judge.ts helper (DRY). Unified EVALS=1 flag replaces SKILL_E2E + ANTHROPIC_API_KEY checks. `bun run test:evals` runs everything that costs money (~$4/run). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
130
test/helpers/llm-judge.ts
Normal file
130
test/helpers/llm-judge.ts
Normal file
@@ -0,0 +1,130 @@
|
||||
/**
|
||||
* Shared LLM-as-judge helpers for eval and E2E tests.
|
||||
*
|
||||
* Provides callJudge (generic JSON-from-LLM), judge (doc quality scorer),
|
||||
* and outcomeJudge (planted-bug detection scorer).
|
||||
*
|
||||
* Requires: ANTHROPIC_API_KEY env var
|
||||
*/
|
||||
|
||||
import Anthropic from '@anthropic-ai/sdk';
|
||||
|
||||
/**
 * Documentation-quality verdict produced by the LLM judge.
 *
 * Each numeric dimension is scored on a 1-5 scale.
 */
export interface JudgeScore {
  /** How understandable the documentation is (1-5). */
  clarity: number;
  /** How fully arguments, values, and behaviors are covered (1-5). */
  completeness: number;
  /** How readily correct invocations can be built from the text (1-5). */
  actionability: number;
  /** Brief explanation supplied by the judge. */
  reasoning: string;
}
|
||||
|
||||
/**
 * Judge verdict comparing a QA report against planted-bug ground truth.
 *
 * Field names mirror the JSON keys the judge prompt asks the model to emit,
 * which is why they are snake_case.
 */
export interface OutcomeJudgeResult {
  /** IDs of planted bugs the report identified. */
  detected: string[];
  /** IDs of planted bugs the report did not identify. */
  missed: string[];
  /**
   * Number of reported issues that match no planted bug and are not
   * legitimate issues with the page.
   */
  false_positives: number;
  /**
   * Despite the name, the judge prompt defines this as the length of the
   * `detected` array (a count, not a ratio).
   */
  detection_rate: number;
  /** Quality of supporting evidence for detected bugs (1-5). */
  evidence_quality: number;
  /** Brief explanation supplied by the judge. */
  reasoning: string;
}
|
||||
|
||||
/**
 * Send a single-turn prompt to claude-sonnet-4-6 and parse the first
 * `{ ... }` span found in the reply as JSON.
 *
 * A request that fails with HTTP status 429 is retried exactly once after a
 * one-second pause. Any other failure, or a failure of the retry itself, is
 * rethrown unchanged.
 *
 * @param prompt - Full text of the user message sent to the model.
 * @returns The parsed JSON value, cast to `T`. The shape is NOT validated at
 *   runtime; callers should assert on the fields they rely on.
 * @throws Error when the reply contains no text with a `{ ... }` span.
 * @throws SyntaxError when the extracted span is not valid JSON.
 */
export async function callJudge<T>(prompt: string): Promise<T> {
  const client = new Anthropic();

  const makeRequest = () =>
    client.messages.create({
      model: 'claude-sonnet-4-6',
      max_tokens: 1024,
      messages: [{ role: 'user', content: prompt }],
    });

  let response;
  try {
    response = await makeRequest();
  } catch (err: unknown) {
    // Only rate-limit errors are worth retrying; everything else propagates.
    const status =
      typeof err === 'object' && err !== null ? (err as { status?: unknown }).status : undefined;
    if (status !== 429) throw err;
    await new Promise((resolve) => setTimeout(resolve, 1000));
    response = await makeRequest();
  }

  // Gather every text block instead of assuming content[0] exists and is
  // text: an empty content array or a leading non-text block would otherwise
  // crash or hide the JSON payload.
  let text = '';
  for (const block of response.content) {
    if (block.type === 'text') text += block.text;
  }

  // Greedy match from the first '{' to the last '}' so nested objects survive.
  const jsonMatch = text.match(/\{[\s\S]*\}/);
  if (!jsonMatch) throw new Error(`Judge returned non-JSON: ${text.slice(0, 200)}`);

  try {
    return JSON.parse(jsonMatch[0]) as T;
  } catch (err: unknown) {
    // Keep the SyntaxError type but add the offending text for debugging.
    const detail = err instanceof Error ? err.message : String(err);
    throw new SyntaxError(`Judge returned malformed JSON (${detail}): ${jsonMatch[0].slice(0, 200)}`);
  }
}
|
||||
|
||||
/**
 * Ask the LLM judge to rate a piece of CLI reference documentation.
 *
 * @param section - Human-readable name of what is being rated; interpolated
 *   into the prompt twice.
 * @param content - The documentation text to evaluate, appended verbatim.
 * @returns Scores for clarity, completeness, and actionability (each 1-5)
 *   plus the judge's reasoning, as parsed by `callJudge`.
 */
export async function judge(section: string, content: string): Promise<JudgeScore> {
  const prompt = `You are evaluating documentation quality for an AI coding agent's CLI tool reference.

The agent reads this documentation to learn how to use a headless browser CLI. It needs to:
1. Understand what each command does
2. Know what arguments to pass
3. Know valid values for enum-like parameters
4. Construct correct command invocations without guessing

Rate the following ${section} on three dimensions (1-5 scale):

- **clarity** (1-5): Can an agent understand what each command/flag does from the description alone?
- **completeness** (1-5): Are arguments, valid values, and important behaviors documented? Would an agent need to guess anything?
- **actionability** (1-5): Can an agent construct correct command invocations from this reference alone?

Scoring guide:
- 5: Excellent — no ambiguity, all info present
- 4: Good — minor gaps an experienced agent could infer
- 3: Adequate — some guessing required
- 2: Poor — significant info missing
- 1: Unusable — agent would fail without external help

Respond with ONLY valid JSON in this exact format:
{"clarity": N, "completeness": N, "actionability": N, "reasoning": "brief explanation"}

Here is the ${section} to evaluate:

${content}`;

  return callJudge<JudgeScore>(prompt);
}
|
||||
|
||||
/**
 * Ask the LLM judge to score a QA report against planted-bug ground truth.
 *
 * @param groundTruth - Fixture object; this function reads `total_bugs` and
 *   `bugs` (an array whose items each carry an `id`). The full `bugs` array
 *   is serialized into the prompt.
 * @param report - The QA report text produced by the agent under test.
 * @returns Detection metrics as parsed by `callJudge`.
 */
export async function outcomeJudge(
  groundTruth: any,
  report: string,
): Promise<OutcomeJudgeResult> {
  // Pull the interpolated pieces out first so the prompt template stays readable.
  const plantedCount = groundTruth.total_bugs;
  const bugsJson = JSON.stringify(groundTruth.bugs, null, 2);
  const allowedIds = groundTruth.bugs.map((b: any) => b.id).join(', ');

  const prompt = `You are evaluating a QA testing report against known ground truth bugs.

GROUND TRUTH (${plantedCount} planted bugs):
${bugsJson}

QA REPORT (generated by an AI agent):
${report}

For each planted bug, determine if the report identified it. A bug counts as
"detected" if the report describes the same defect, even if the wording differs.
Use the detection_hint keywords as guidance.

Also count false positives: issues in the report that don't correspond to any
planted bug AND aren't legitimate issues with the page.

Respond with ONLY valid JSON:
{
"detected": ["bug-id-1", "bug-id-2"],
"missed": ["bug-id-3"],
"false_positives": 0,
"detection_rate": 2,
"evidence_quality": 4,
"reasoning": "brief explanation"
}

Rules:
- "detected" and "missed" arrays must only contain IDs from the ground truth: ${allowedIds}
- detection_rate = length of detected array
- evidence_quality (1-5): Do detected bugs have screenshots, repro steps, or specific element references?
5 = excellent evidence for every bug, 1 = no evidence at all`;

  return callJudge<OutcomeJudgeResult>(prompt);
}
|
||||
@@ -9,12 +9,21 @@ import { query } from '@anthropic-ai/claude-agent-sdk';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
|
||||
/**
 * Rough cost figures for one skill test run, derived from message sizes
 * using the approximation of four characters per token.
 */
export interface CostEstimate {
  /** Total characters across messages of type 'user'. */
  inputChars: number;
  /** Total characters across messages of type 'assistant'. */
  outputChars: number;
  /** (inputChars + outputChars) / 4, rounded to the nearest integer. */
  estimatedTokens: number;
  /** Approximate cost in USD, rounded to two decimal places. */
  estimatedCost: number;
  /** Number of assistant messages that carried content. */
  turnsUsed: number;
}
|
||||
|
||||
/**
 * Everything `runSkillTest` reports back about a single agent run.
 */
export interface SkillTestResult {
  /** Raw messages collected during the run. */
  messages: any[];
  /** One entry per tool invocation: tool name, its input, and its output text. */
  toolCalls: { tool: string; input: any; output: string }[];
  /** Browse error strings collected during the run (presumably matched against BROWSE_ERROR_PATTERNS; confirm in runSkillTest). */
  browseErrors: string[];
  /** Why the run ended. */
  exitReason: string;
  /** Elapsed time of the run (unit not visible here; confirm in runSkillTest). */
  duration: number;
  /** Approximate token and dollar cost computed from message sizes. */
  costEstimate: CostEstimate;
}
|
||||
|
||||
const BROWSE_ERROR_PATTERNS = [
|
||||
@@ -36,7 +45,7 @@ export async function runSkillTest(options: {
|
||||
if (process.env.CLAUDECODE || process.env.CLAUDE_CODE_ENTRYPOINT) {
|
||||
throw new Error(
|
||||
'Cannot run E2E skill tests inside a Claude Code session. ' +
|
||||
'Run from a plain terminal: SKILL_E2E=1 bun test test/skill-e2e.test.ts'
|
||||
'Run from a plain terminal: EVALS=1 bun test test/skill-e2e.test.ts'
|
||||
);
|
||||
}
|
||||
|
||||
@@ -156,5 +165,39 @@ export async function runSkillTest(options: {
|
||||
}
|
||||
}
|
||||
|
||||
return { messages, toolCalls, browseErrors, exitReason, duration };
|
||||
// Estimate cost from message sizes (chars / 4 ≈ tokens, approximate)
|
||||
let inputChars = 0;
|
||||
let outputChars = 0;
|
||||
let turnsUsed = 0;
|
||||
|
||||
for (const msg of messages) {
|
||||
const content = msg.message?.content;
|
||||
if (!content) continue;
|
||||
const text = typeof content === 'string'
|
||||
? content
|
||||
: JSON.stringify(content);
|
||||
|
||||
if (msg.type === 'user') {
|
||||
inputChars += text.length;
|
||||
} else if (msg.type === 'assistant') {
|
||||
outputChars += text.length;
|
||||
turnsUsed++;
|
||||
}
|
||||
}
|
||||
|
||||
const estimatedTokens = Math.round((inputChars + outputChars) / 4);
|
||||
// Approximate pricing: sonnet input ~$3/M, output ~$15/M tokens
|
||||
const inputTokens = Math.round(inputChars / 4);
|
||||
const outputTokens = Math.round(outputChars / 4);
|
||||
const estimatedCost = (inputTokens * 3 + outputTokens * 15) / 1_000_000;
|
||||
|
||||
const costEstimate: CostEstimate = {
|
||||
inputChars,
|
||||
outputChars,
|
||||
estimatedTokens,
|
||||
estimatedCost: Math.round(estimatedCost * 100) / 100,
|
||||
turnsUsed,
|
||||
};
|
||||
|
||||
return { messages, toolCalls, browseErrors, exitReason, duration, costEstimate };
|
||||
}
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
import { ALL_COMMANDS } from '../../browse/src/commands';
|
||||
import { parseSnapshotArgs } from '../../browse/src/snapshot';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
|
||||
export interface BrowseCommand {
|
||||
command: string;
|
||||
@@ -131,3 +132,75 @@ export function validateSkill(skillPath: string): ValidationResult {
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
 * Collect every `REMOTE_SLUG=$(...)` assignment line from the `.md` files
 * that sit directly inside the given subdirectories (no recursion).
 *
 * Lines are trimmed before matching, so indented assignments and CRLF line
 * endings are both handled. Subdirectories that do not exist, or that are
 * not directories, are skipped, as are directory entries that merely have a
 * `.md` name but are not regular files.
 *
 * @param rootDir - Directory that `subdirs` are resolved against.
 * @param subdirs - Subdirectory names, relative to `rootDir`, to scan.
 * @returns Map from `"<subdir>/<file>"` to the trimmed assignment lines found
 *   in that file. Files with no matching line are omitted.
 */
export function extractRemoteSlugPatterns(rootDir: string, subdirs: string[]): Map<string, string[]> {
  const results = new Map<string, string[]>();
  const pattern = /^REMOTE_SLUG=\$\(.*\)$/;

  for (const subdir of subdirs) {
    const dir = path.join(rootDir, subdir);
    // Guard against a missing path AND a path that is a plain file; calling
    // readdirSync on the latter would throw ENOTDIR.
    if (!fs.existsSync(dir) || !fs.statSync(dir).isDirectory()) continue;

    const files = fs.readdirSync(dir).filter(f => f.endsWith('.md'));
    for (const file of files) {
      const filePath = path.join(dir, file);
      // statSync follows symlinks, so linked markdown files are still read
      // while a directory named "*.md" no longer triggers EISDIR.
      if (!fs.statSync(filePath).isFile()) continue;

      const content = fs.readFileSync(filePath, 'utf-8');
      const matches = content
        .split('\n')
        .map(line => line.trim())
        .filter(line => pattern.test(line));

      if (matches.length > 0) {
        results.set(`${subdir}/${file}`, matches);
      }
    }
  }

  return results;
}
|
||||
|
||||
/**
 * Read percentage weights out of the markdown table that follows the first
 * "### Weights" heading in `content`.
 *
 * Scanning starts on the line after the heading and ends at the next `#`,
 * `##`, or `### ` heading (deeper headings such as `####` do not end it).
 * Only two-column rows shaped like `| Category Name | 15% |` are recognized,
 * where the name consists of word characters and whitespace; a row whose
 * name is literally "Category" is treated as a header and ignored.
 *
 * @param content - Markdown text to search.
 * @returns Map from category name to its integer percentage (e.g. 15).
 *   Empty when no "### Weights" heading is present.
 */
export function extractWeightsFromTable(content: string): Map<string, number> {
  const parsed = new Map<string, number>();

  const headingAt = content.indexOf('### Weights');
  if (headingAt < 0) {
    return parsed;
  }

  const rowPattern = /^\|\s*(\w[\w\s]*\w|\w+)\s*\|\s*(\d+)%\s*\|$/;

  // Drop the heading line itself; everything after it is a candidate row.
  const candidates = content.slice(headingAt).split('\n').slice(1);

  for (const raw of candidates) {
    const row = raw.trim();

    // A shallower heading (# or ##) or a sibling "### " heading ends the section.
    const isShallowHeading = row.startsWith('#') && !row.startsWith('###');
    if (isShallowHeading || row.startsWith('### ')) {
      break;
    }

    const cells = rowPattern.exec(row);
    if (cells === null) {
      continue;
    }

    const label = cells[1].trim();
    const percent = parseInt(cells[2], 10);
    if (label === 'Category' || isNaN(percent)) {
      continue;
    }

    parsed.set(label, percent);
  }

  return parsed;
}
|
||||
|
||||
Reference in New Issue
Block a user