feat: eval efficiency metrics — turns, duration, commentary across all surfaces

Add generateCommentary() for natural-language delta interpretation,
per-test turns/duration in comparison and summary output, judgePassed
unit tests, 3 new E2E tests (qa-only, qa fix loop, plan artifact).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-03-15 21:17:12 -05:00
parent 4120537d66
commit 03a6270b9c
5 changed files with 825 additions and 35 deletions

View File

@@ -47,6 +47,8 @@ interface RunSummary {
passed: number;
total: number;
cost: number;
duration: number;
turns: number;
}
const runs: RunSummary[] = [];
@@ -55,6 +57,7 @@ for (const file of files) {
const data = JSON.parse(fs.readFileSync(path.join(EVAL_DIR, file), 'utf-8'));
if (filterBranch && data.branch !== filterBranch) continue;
if (filterTier && data.tier !== filterTier) continue;
const totalTurns = (data.tests || []).reduce((s: number, t: any) => s + (t.turns_used || 0), 0);
runs.push({
file,
timestamp: data.timestamp || '',
@@ -64,6 +67,8 @@ for (const file of files) {
passed: data.passed || 0,
total: data.total_tests || 0,
cost: data.total_cost_usd || 0,
duration: data.total_duration_ms || 0,
turns: totalTurns,
});
} catch { continue; }
}
@@ -77,29 +82,35 @@ const displayed = runs.slice(0, limit);
// Print table
console.log('');
console.log(`Eval History (${runs.length} total runs)`);
console.log('═'.repeat(90));
console.log('═'.repeat(105));
console.log(
' ' +
'Date'.padEnd(17) +
'Branch'.padEnd(28) +
'Branch'.padEnd(25) +
'Tier'.padEnd(12) +
'Pass'.padEnd(8) +
'Cost'.padEnd(8) +
'Turns'.padEnd(7) +
'Duration'.padEnd(10) +
'Version'
);
console.log('─'.repeat(90));
console.log('─'.repeat(105));
for (const run of displayed) {
const date = run.timestamp.replace('T', ' ').slice(0, 16);
const branch = run.branch.length > 26 ? run.branch.slice(0, 23) + '...' : run.branch.padEnd(28);
const branch = run.branch.length > 23 ? run.branch.slice(0, 20) + '...' : run.branch.padEnd(25);
const pass = `${run.passed}/${run.total}`.padEnd(8);
const cost = `$${run.cost.toFixed(2)}`.padEnd(8);
console.log(` ${date.padEnd(17)}${branch}${run.tier.padEnd(12)}${pass}${cost}v${run.version}`);
const turns = run.turns > 0 ? `${run.turns}t`.padEnd(7) : ''.padEnd(7);
const dur = run.duration > 0 ? `${Math.round(run.duration / 1000)}s`.padEnd(10) : ''.padEnd(10);
console.log(` ${date.padEnd(17)}${branch}${run.tier.padEnd(12)}${pass}${cost}${turns}${dur}v${run.version}`);
}
console.log('─'.repeat(90));
console.log('─'.repeat(105));
const totalCost = runs.reduce((s, r) => s + r.cost, 0);
console.log(` ${runs.length} runs | Total spend: $${totalCost.toFixed(2)} | Showing: ${displayed.length}`);
const totalDur = runs.reduce((s, r) => s + r.duration, 0);
const totalTurns = runs.reduce((s, r) => s + r.turns, 0);
console.log(` ${runs.length} runs | $${totalCost.toFixed(2)} total | ${totalTurns} turns | ${Math.round(totalDur / 1000)}s | Showing: ${displayed.length}`);
console.log(` Dir: ${EVAL_DIR}`);
console.log('');