feat: eval persistence with auto-compare against previous run

EvalCollector accumulates test results during eval runs, writes JSON to ~/.gstack-dev/evals/{version}-{branch}-{tier}-{timestamp}.json, prints a summary table, and automatically compares against the previous run. - EvalCollector class with addTest() / finalize() / summary table - findPreviousRun() prefers same branch, falls back to any branch - compareEvalResults() matches tests by name, detects improved/regressed - extractToolSummary() counts tool types from transcript events - formatComparison() renders delta table with per-test + aggregate diffs - Wire into skill-e2e.test.ts (recordE2E helper) and skill-llm-eval.test.ts - 19 unit tests for collector + comparison functions - schema_version: 1 for forward compatibility Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-05-18 10:31:30 +08:00 · 2026-03-14 03:49:47 -05:00
parent e7347c2f8f
commit 84f52f3bad
4 changed files with 975 additions and 20 deletions
--- a/test/skill-llm-eval.test.ts
+++ b/test/skill-llm-eval.test.ts
@@ -10,21 +10,26 @@
 * Cost: ~$0.05-0.15 per run (sonnet)
 */

-import { describe, test, expect } from 'bun:test';
+import { describe, test, expect, afterAll } from 'bun:test';
 import Anthropic from '@anthropic-ai/sdk';
 import * as fs from 'fs';
 import * as path from 'path';
 import { callJudge, judge } from './helpers/llm-judge';
 import type { JudgeScore } from './helpers/llm-judge';
+import { EvalCollector } from './helpers/eval-store';

 const ROOT = path.resolve(import.meta.dir, '..');
 // Run when EVALS=1 is set (requires ANTHROPIC_API_KEY in env)
-const describeEval = process.env.EVALS ? describe : describe.skip;
+const evalsEnabled = !!process.env.EVALS;
+const describeEval = evalsEnabled ? describe : describe.skip;
+
+// Eval result collector
+const evalCollector = evalsEnabled ? new EvalCollector('llm-judge') : null;

 describeEval('LLM-as-judge quality evals', () => {
  test('command reference table scores >= 4 on all dimensions', async () => {
+    const t0 = Date.now();
    const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
-    // Extract just the command reference section
    const start = content.indexOf('## Command Reference');
    const end = content.indexOf('## Tips');
    const section = content.slice(start, end);
@@ -32,12 +37,24 @@ describeEval('LLM-as-judge quality evals', () => {
    const scores = await judge('command reference table', section);
    console.log('Command reference scores:', JSON.stringify(scores, null, 2));

+    evalCollector?.addTest({
+      name: 'command reference table',
+      suite: 'LLM-as-judge quality evals',
+      tier: 'llm-judge',
+      passed: scores.clarity >= 4 && scores.completeness >= 4 && scores.actionability >= 4,
+      duration_ms: Date.now() - t0,
+      cost_usd: 0.02,
+      judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
+      judge_reasoning: scores.reasoning,
+    });
+
    expect(scores.clarity).toBeGreaterThanOrEqual(4);
    expect(scores.completeness).toBeGreaterThanOrEqual(4);
    expect(scores.actionability).toBeGreaterThanOrEqual(4);
  }, 30_000);

  test('snapshot flags section scores >= 4 on all dimensions', async () => {
+    const t0 = Date.now();
    const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
    const start = content.indexOf('## Snapshot System');
    const end = content.indexOf('## Command Reference');
@@ -46,26 +63,49 @@ describeEval('LLM-as-judge quality evals', () => {
    const scores = await judge('snapshot flags reference', section);
    console.log('Snapshot flags scores:', JSON.stringify(scores, null, 2));

+    evalCollector?.addTest({
+      name: 'snapshot flags reference',
+      suite: 'LLM-as-judge quality evals',
+      tier: 'llm-judge',
+      passed: scores.clarity >= 4 && scores.completeness >= 4 && scores.actionability >= 4,
+      duration_ms: Date.now() - t0,
+      cost_usd: 0.02,
+      judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
+      judge_reasoning: scores.reasoning,
+    });
+
    expect(scores.clarity).toBeGreaterThanOrEqual(4);
    expect(scores.completeness).toBeGreaterThanOrEqual(4);
    expect(scores.actionability).toBeGreaterThanOrEqual(4);
  }, 30_000);

  test('browse/SKILL.md overall scores >= 4', async () => {
+    const t0 = Date.now();
    const content = fs.readFileSync(path.join(ROOT, 'browse', 'SKILL.md'), 'utf-8');
-    // Just the reference sections (skip examples/patterns)
    const start = content.indexOf('## Snapshot Flags');
    const section = content.slice(start);

    const scores = await judge('browse skill reference (flags + commands)', section);
    console.log('Browse SKILL.md scores:', JSON.stringify(scores, null, 2));

+    evalCollector?.addTest({
+      name: 'browse/SKILL.md reference',
+      suite: 'LLM-as-judge quality evals',
+      tier: 'llm-judge',
+      passed: scores.clarity >= 4 && scores.completeness >= 4 && scores.actionability >= 4,
+      duration_ms: Date.now() - t0,
+      cost_usd: 0.02,
+      judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
+      judge_reasoning: scores.reasoning,
+    });
+
    expect(scores.clarity).toBeGreaterThanOrEqual(4);
    expect(scores.completeness).toBeGreaterThanOrEqual(4);
    expect(scores.actionability).toBeGreaterThanOrEqual(4);
  }, 30_000);

  test('setup block scores >= 4 on actionability and clarity', async () => {
+    const t0 = Date.now();
    const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
    const setupStart = content.indexOf('## SETUP');
    const setupEnd = content.indexOf('## IMPORTANT');
@@ -74,13 +114,23 @@ describeEval('LLM-as-judge quality evals', () => {
    const scores = await judge('setup/binary discovery instructions', section);
    console.log('Setup block scores:', JSON.stringify(scores, null, 2));

+    evalCollector?.addTest({
+      name: 'setup block',
+      suite: 'LLM-as-judge quality evals',
+      tier: 'llm-judge',
+      passed: scores.actionability >= 4 && scores.clarity >= 4,
+      duration_ms: Date.now() - t0,
+      cost_usd: 0.02,
+      judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
+      judge_reasoning: scores.reasoning,
+    });
+
    expect(scores.actionability).toBeGreaterThanOrEqual(4);
    expect(scores.clarity).toBeGreaterThanOrEqual(4);
  }, 30_000);

  test('regression check: compare branch vs baseline quality', async () => {
-    // This test compares the generated output against the hand-maintained
-    // baseline from main. The generated version should score equal or higher.
+    const t0 = Date.now();
    const generated = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
    const genStart = generated.indexOf('## Command Reference');
    const genEnd = generated.indexOf('## Tips');
@@ -151,7 +201,17 @@ Scores are 1-5 overall quality.`,
    const result = JSON.parse(jsonMatch[0]);
    console.log('Regression comparison:', JSON.stringify(result, null, 2));

-    // Generated version should be at least as good as hand-maintained
+    evalCollector?.addTest({
+      name: 'regression vs baseline',
+      suite: 'LLM-as-judge quality evals',
+      tier: 'llm-judge',
+      passed: result.b_score >= result.a_score,
+      duration_ms: Date.now() - t0,
+      cost_usd: 0.02,
+      judge_scores: { a_score: result.a_score, b_score: result.b_score },
+      judge_reasoning: result.reasoning,
+    });
+
    expect(result.b_score).toBeGreaterThanOrEqual(result.a_score);
  }, 30_000);
 });
@@ -162,13 +222,11 @@ describeEval('QA skill quality evals', () => {
  const qaContent = fs.readFileSync(path.join(ROOT, 'qa', 'SKILL.md'), 'utf-8');

  test('qa/SKILL.md workflow quality scores >= 4', async () => {
-    // Extract the workflow section (Phases 1-7)
+    const t0 = Date.now();
    const start = qaContent.indexOf('## Workflow');
    const end = qaContent.indexOf('## Health Score Rubric');
    const section = qaContent.slice(start, end);

-    // Use workflow-specific prompt (not the CLI-reference judge, since this is a
-    // workflow doc that references $B commands defined in a separate browse SKILL.md)
    const scores = await callJudge<JudgeScore>(`You are evaluating the quality of a QA testing workflow document for an AI coding agent.

 The agent reads this document to learn how to systematically QA test a web application. The workflow references
@@ -188,16 +246,27 @@ Here is the QA workflow to evaluate:
 ${section}`);
    console.log('QA workflow scores:', JSON.stringify(scores, null, 2));

+    evalCollector?.addTest({
+      name: 'qa/SKILL.md workflow',
+      suite: 'QA skill quality evals',
+      tier: 'llm-judge',
+      passed: scores.clarity >= 4 && scores.completeness >= 4 && scores.actionability >= 4,
+      duration_ms: Date.now() - t0,
+      cost_usd: 0.02,
+      judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
+      judge_reasoning: scores.reasoning,
+    });
+
    expect(scores.clarity).toBeGreaterThanOrEqual(4);
    expect(scores.completeness).toBeGreaterThanOrEqual(4);
    expect(scores.actionability).toBeGreaterThanOrEqual(4);
  }, 30_000);

  test('qa/SKILL.md health score rubric is unambiguous', async () => {
+    const t0 = Date.now();
    const start = qaContent.indexOf('## Health Score Rubric');
    const section = qaContent.slice(start);

-    // Use rubric-specific prompt
    const scores = await callJudge<JudgeScore>(`You are evaluating a health score rubric that an AI agent must follow to compute a numeric QA score.

 The agent uses this rubric after QA testing a website. It needs to:
@@ -218,11 +287,18 @@ Here is the rubric to evaluate:
 ${section}`);
    console.log('QA health rubric scores:', JSON.stringify(scores, null, 2));

+    evalCollector?.addTest({
+      name: 'qa/SKILL.md health rubric',
+      suite: 'QA skill quality evals',
+      tier: 'llm-judge',
+      passed: scores.clarity >= 4 && scores.completeness >= 3 && scores.actionability >= 4,
+      duration_ms: Date.now() - t0,
+      cost_usd: 0.02,
+      judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
+      judge_reasoning: scores.reasoning,
+    });
+
    expect(scores.clarity).toBeGreaterThanOrEqual(4);
-    // Completeness threshold is 3 — the rubric intentionally leaves some edge cases
-    // to agent judgment (e.g., partial testing, cross-category findings). The judge
-    // consistently flags these as gaps, but over-specifying would make the rubric
-    // rigid and harder to follow. Clarity + actionability >= 4 is what matters.
    expect(scores.completeness).toBeGreaterThanOrEqual(3);
    expect(scores.actionability).toBeGreaterThanOrEqual(4);
  }, 30_000);
@@ -232,12 +308,12 @@ ${section}`);

 describeEval('Cross-skill consistency evals', () => {
  test('greptile-history patterns are consistent across all skills', async () => {
+    const t0 = Date.now();
    const reviewContent = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8');
    const shipContent = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8');
    const triageContent = fs.readFileSync(path.join(ROOT, 'review', 'greptile-triage.md'), 'utf-8');
    const retroContent = fs.readFileSync(path.join(ROOT, 'retro', 'SKILL.md'), 'utf-8');

-    // Extract greptile-related lines from each file
    const extractGrepLines = (content: string, filename: string) => {
      const lines = content.split('\n')
        .filter(l => /greptile|history\.md|REMOTE_SLUG/i.test(l))
@@ -277,6 +353,17 @@ score (1-5): 5 = perfectly consistent, 1 = contradictory`);

    console.log('Cross-skill consistency:', JSON.stringify(result, null, 2));

+    evalCollector?.addTest({
+      name: 'cross-skill greptile consistency',
+      suite: 'Cross-skill consistency evals',
+      tier: 'llm-judge',
+      passed: result.consistent && result.score >= 4,
+      duration_ms: Date.now() - t0,
+      cost_usd: 0.02,
+      judge_scores: { consistency_score: result.score },
+      judge_reasoning: result.reasoning,
+    });
+
    expect(result.consistent).toBe(true);
    expect(result.score).toBeGreaterThanOrEqual(4);
  }, 30_000);
@@ -288,6 +375,7 @@ describeEval('Baseline score pinning', () => {
  const baselinesPath = path.join(ROOT, 'test', 'fixtures', 'eval-baselines.json');

  test('LLM eval scores do not regress below baselines', async () => {
+    const t0 = Date.now();
    if (!fs.existsSync(baselinesPath)) {
      console.log('No baseline file found — skipping pinning check');
      return;
@@ -296,7 +384,6 @@ describeEval('Baseline score pinning', () => {
    const baselines = JSON.parse(fs.readFileSync(baselinesPath, 'utf-8'));
    const regressions: string[] = [];

-    // Test command reference
    const skillContent = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
    const cmdStart = skillContent.indexOf('## Command Reference');
    const cmdEnd = skillContent.indexOf('## Tips');
@@ -309,7 +396,6 @@ describeEval('Baseline score pinning', () => {
      }
    }

-    // Update baselines if requested
    if (process.env.UPDATE_BASELINES) {
      baselines.command_reference = {
        clarity: cmdScores.clarity,
@@ -320,8 +406,31 @@ describeEval('Baseline score pinning', () => {
      console.log('Updated eval baselines');
    }

-    if (regressions.length > 0) {
+    const passed = regressions.length === 0;
+    evalCollector?.addTest({
+      name: 'baseline score pinning',
+      suite: 'Baseline score pinning',
+      tier: 'llm-judge',
+      passed,
+      duration_ms: Date.now() - t0,
+      cost_usd: 0.02,
+      judge_scores: { clarity: cmdScores.clarity, completeness: cmdScores.completeness, actionability: cmdScores.actionability },
+      judge_reasoning: passed ? 'All scores at or above baseline' : regressions.join('; '),
+    });
+
+    if (!passed) {
      throw new Error(`Score regressions detected:\n${regressions.join('\n')}`);
    }
  }, 60_000);
 });
+
+// Module-level afterAll — finalize eval collector after all tests complete
+afterAll(async () => {
+  if (evalCollector) {
+    try {
+      await evalCollector.finalize();
+    } catch (err) {
+      console.error('Failed to save eval results:', err);
+    }
+  }
+});