mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-19 02:42:29 +08:00
feat: 3-tier eval suite with planted-bug outcome testing (EVALS=1)
Adds comprehensive eval infrastructure: - Tier 1 (free): 13 new static tests — cross-skill path consistency, QA structure validation, greptile format, planted-bug fixture validation - Tier 2 (Agent SDK E2E): /qa quick, /review with pre-built git repo, 3 planted-bug outcome evals (static, SPA, checkout — each with 5 bugs) - Tier 3 (LLM judge): QA workflow quality, health rubric clarity, cross-skill consistency, baseline score pinning New fixtures: 3 HTML pages with 15 total planted bugs, ground truth JSON, review-eval-vuln.rb, eval-baselines.json. Shared llm-judge.ts helper (DRY). Unified EVALS=1 flag replaces SKILL_E2E + ANTHROPIC_API_KEY checks. `bun run test:evals` runs everything that costs money (~$4/run). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
import { describe, test, expect } from 'bun:test';
|
||||
import { validateSkill } from './helpers/skill-parser';
|
||||
import { validateSkill, extractRemoteSlugPatterns, extractWeightsFromTable } from './helpers/skill-parser';
|
||||
import { ALL_COMMANDS, COMMAND_DESCRIPTIONS, READ_COMMANDS, WRITE_COMMANDS, META_COMMANDS } from '../browse/src/commands';
|
||||
import { SNAPSHOT_FLAGS } from '../browse/src/snapshot';
|
||||
import * as fs from 'fs';
|
||||
@@ -151,3 +151,222 @@ describe('Generated SKILL.md freshness', () => {
|
||||
expect(content).toContain('AUTO-GENERATED');
|
||||
});
|
||||
});
|
||||
|
||||
// --- Part 7: Cross-skill path consistency (A1) ---
|
||||
|
||||
/**
 * Cross-skill path consistency (A1).
 *
 * Static checks that the skill documents agree with each other on how
 * REMOTE_SLUG is derived and on where greptile history is written and read.
 */
describe('Cross-skill path consistency', () => {
  test('REMOTE_SLUG derivation pattern is identical across files that use it', () => {
    const patterns = extractRemoteSlugPatterns(ROOT, ['qa', 'review']);
    const allPatterns: string[] = [];

    // Flatten the per-file pattern lists into one list; the file key is not needed here.
    for (const [, filePatterns] of patterns) {
      allPatterns.push(...filePatterns);
    }

    // Should find at least 2 occurrences (qa/SKILL.md + review/greptile-triage.md)
    expect(allPatterns.length).toBeGreaterThanOrEqual(2);

    // All occurrences must be character-for-character identical
    const unique = new Set(allPatterns);
    if (unique.size > 1) {
      const variants = Array.from(unique);
      // Throw instead of using a matcher so the failure lists every variant found.
      throw new Error(
        `REMOTE_SLUG pattern differs across files:\n` +
          variants.map((v, i) => `  ${i + 1}: ${v}`).join('\n')
      );
    }
  });

  test('all greptile-history write references specify both per-project and global paths', () => {
    const filesToCheck = [
      'review/SKILL.md',
      'ship/SKILL.md',
      'review/greptile-triage.md',
    ];

    // Collect every offending file rather than asserting inside the loop, so a
    // failure names the file(s) instead of reporting a bare "expected true".
    const filesMissingBothPaths: string[] = [];

    for (const file of filesToCheck) {
      const filePath = path.join(ROOT, file);
      // Files that do not exist are skipped, not treated as failures.
      if (!fs.existsSync(filePath)) continue;
      const content = fs.readFileSync(filePath, 'utf-8');

      // Accept either the prose wording or the two literal history paths.
      const hasBoth =
        (content.includes('per-project') && content.includes('global')) ||
        (content.includes('$REMOTE_SLUG/greptile-history') &&
          content.includes('~/.gstack/greptile-history'));

      if (!hasBoth) {
        filesMissingBothPaths.push(file);
      }
    }

    expect(filesMissingBothPaths).toEqual([]);
  });

  test('greptile-triage.md contains both project and global history paths', () => {
    const content = fs.readFileSync(path.join(ROOT, 'review', 'greptile-triage.md'), 'utf-8');
    expect(content).toContain('$REMOTE_SLUG/greptile-history.md');
    expect(content).toContain('~/.gstack/greptile-history.md');
  });

  test('retro/SKILL.md reads global greptile-history (not per-project)', () => {
    const content = fs.readFileSync(path.join(ROOT, 'retro', 'SKILL.md'), 'utf-8');
    expect(content).toContain('~/.gstack/greptile-history.md');
    // Should NOT reference per-project path for reads
    expect(content).not.toContain('$REMOTE_SLUG/greptile-history.md');
  });
});
|
||||
|
||||
// --- Part 7: QA skill structure validation (A2) ---
|
||||
|
||||
/**
 * QA skill structure validation (A2).
 *
 * Reads qa/SKILL.md once and checks that the expected phase names, risk
 * heuristic rows, health-score weights, depth tiers and report layout
 * references all appear in it.
 */
describe('QA skill structure validation', () => {
  const skillDoc = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8');

  /** Asserts, in order, that each needle occurs somewhere in qa/SKILL.md. */
  const expectDocMentions = (needles: readonly string[]): void => {
    needles.forEach((needle) => {
      expect(skillDoc).toContain(needle);
    });
  };

  test('qa/SKILL.md has all 7 phases', () => {
    // Phase number and phase title are checked as independent substrings.
    expectDocMentions([
      'Phase 1', 'Initialize',
      'Phase 2', 'Authenticate',
      'Phase 3', 'Recon',
      'Phase 4', 'Test Plan',
      'Phase 5', 'Execute',
      'Phase 6', 'Document',
      'Phase 7', 'Wrap',
    ]);
  });

  test('risk heuristic table has all required patterns', () => {
    expectDocMentions([
      'Form/payment/auth/checkout',
      'Controller/route with mutations',
      'Config/env/deployment',
      'API endpoint handlers',
      'View/template/component',
      'Model/service with business logic',
      'CSS/style-only',
      'Docs/readme/comments',
      'Test files only',
    ]);

    // Risk levels
    expectDocMentions(['HIGH', 'MEDIUM', 'LOW', 'SKIP']);
  });

  test('health score weights sum to 100%', () => {
    const weightTable = extractWeightsFromTable(skillDoc);
    expect(weightTable.size).toBeGreaterThan(0);

    const total = Array.from(weightTable.values()).reduce((acc, pct) => acc + pct, 0);
    expect(total).toBe(100);
  });

  test('health score has all 8 categories', () => {
    const weightTable = extractWeightsFromTable(skillDoc);
    const requiredCategories = [
      'Console', 'Links', 'Visual', 'Functional',
      'UX', 'Performance', 'Content', 'Accessibility',
    ];

    requiredCategories.forEach((category) => {
      expect(weightTable.has(category)).toBe(true);
    });
    // No extra categories beyond the eight listed above.
    expect(weightTable.size).toBe(8);
  });

  test('has three tier definitions (Quick/Standard/Exhaustive)', () => {
    expectDocMentions(['Quick Depth', 'Standard Depth', 'Exhaustive Depth']);
  });

  test('output structure references report directory layout', () => {
    expectDocMentions([
      'index.md',
      'test-plan-',
      'qa-report-',
      'baseline.json',
      'screenshots/',
    ]);
  });
});
|
||||
|
||||
// --- Part 7: Greptile history format consistency (A3) ---
|
||||
|
||||
/**
 * Greptile history format consistency (A3).
 *
 * Checks that review/greptile-triage.md spells out the history entry
 * placeholders and category names, and that the review and ship skills
 * both mention greptile-triage.md.
 */
describe('Greptile history format consistency', () => {
  /** Reads a file under ROOT as UTF-8 text. */
  const readDoc = (...segments: string[]): string =>
    fs.readFileSync(path.join(ROOT, ...segments), 'utf-8');

  test('greptile-triage.md defines the canonical history format', () => {
    const triageDoc = readDoc('review', 'greptile-triage.md');
    const placeholders = [
      '<YYYY-MM-DD>',
      '<owner/repo>',
      // Left unclosed on purpose: only the opening of the tag is matched.
      '<type',
      '<file-pattern>',
      '<category>',
    ];
    placeholders.forEach((token) => {
      expect(triageDoc).toContain(token);
    });
  });

  test('review/SKILL.md and ship/SKILL.md both reference greptile-triage.md for write details', () => {
    // Read both documents before asserting on either one.
    const reviewDoc = readDoc('review', 'SKILL.md');
    const shipDoc = readDoc('ship', 'SKILL.md');

    // Lower-cased so the file-name mention matches regardless of casing.
    expect(reviewDoc.toLowerCase()).toContain('greptile-triage.md');
    expect(shipDoc.toLowerCase()).toContain('greptile-triage.md');
  });

  test('greptile-triage.md defines all 9 valid categories', () => {
    const triageDoc = readDoc('review', 'greptile-triage.md');
    const validCategories = [
      'race-condition', 'null-check', 'error-handling', 'style',
      'type-safety', 'security', 'performance', 'correctness', 'other',
    ];
    validCategories.forEach((category) => {
      expect(triageDoc).toContain(category);
    });
  });
});
|
||||
|
||||
// --- Part 7: Planted-bug fixture validation (A4) ---
|
||||
|
||||
/**
 * Planted-bug fixture validation (A4).
 *
 * Static checks that the eval fixtures still contain the bugs the eval
 * suite expects: each ground-truth JSON lists exactly five bugs, the
 * qa-eval.html page still carries its planted defects, and the Ruby review
 * fixture still contains the expected vulnerability patterns.
 */
describe('Planted-bug fixture validation', () => {
  // One entry per ground-truth file; `label` is the prefix of the test name.
  const groundTruthFixtures = [
    { label: 'qa-eval', file: 'qa-eval-ground-truth.json' },
    { label: 'qa-eval-spa', file: 'qa-eval-spa-ground-truth.json' },
    { label: 'qa-eval-checkout', file: 'qa-eval-checkout-ground-truth.json' },
  ];

  for (const { label, file } of groundTruthFixtures) {
    test(`${label} ground truth has exactly 5 planted bugs`, () => {
      const groundTruth = JSON.parse(
        fs.readFileSync(path.join(ROOT, 'test', 'fixtures', file), 'utf-8')
      );
      // The bug list and the declared total must both be five.
      expect(groundTruth.bugs).toHaveLength(5);
      expect(groundTruth.total_bugs).toBe(5);
    });
  }

  test('qa-eval.html contains the planted bugs', () => {
    const html = fs.readFileSync(path.join(ROOT, 'browse', 'test', 'fixtures', 'qa-eval.html'), 'utf-8');
    // BUG 1: broken link
    expect(html).toContain('/nonexistent-404-page');
    // BUG 2: disabled submit
    expect(html).toContain('disabled');
    // BUG 3: overflow
    expect(html).toContain('overflow: hidden');
    // BUG 4: missing alt
    // Inspect each logo <img> tag as a whole so an `alt` attribute is detected
    // whether it appears before or after `src`. A single regex of the form
    // `src=...[^>]*alt=` only catches `alt` written after `src`.
    const logoTags = html.match(/<img[^>]*src="\/logo\.png"[^>]*>/g) ?? [];
    expect(logoTags.length).toBeGreaterThan(0);
    for (const tag of logoTags) {
      // Leading whitespace is required so attributes like `data-alt=` do not count.
      expect(tag).not.toMatch(/\salt\s*=/i);
    }
    // BUG 5: console error
    expect(html).toContain("Cannot read properties of undefined");
  });

  test('review-eval-vuln.rb contains expected vulnerability patterns', () => {
    const content = fs.readFileSync(path.join(ROOT, 'test', 'fixtures', 'review-eval-vuln.rb'), 'utf-8');
    expect(content).toContain('params[:id]');
    expect(content).toContain('update_column');
  });
});
|
||||
|
||||
Reference in New Issue
Block a user