feat: QA restructure, browser ref staleness, eval efficiency metrics (v0.4.0) (#83)

* feat: browser ref staleness detection via async count() validation

  resolveRef() now checks element count to detect stale refs after page mutations (e.g. SPA navigation). RefEntry stores role+name metadata for better diagnostics. 3 new snapshot tests for staleness detection.

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* feat: qa-only skill, qa fix loop, plan-to-QA artifact flow

  Add /qa-only (report-only, Edit tool blocked), restructure /qa with find-fix-verify cycle, add {{QA_METHODOLOGY}} DRY placeholder for shared methodology. /plan-eng-review now writes test-plan artifacts to ~/.gstack/projects/<slug>/ for QA consumption.

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* feat: eval efficiency metrics — turns, duration, commentary across all surfaces

  Add generateCommentary() for natural-language delta interpretation, per-test turns/duration in comparison and summary output, judgePassed unit tests, 3 new E2E tests (qa-only, qa fix loop, plan artifact).

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* chore: bump version and changelog (v0.4.0)

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* docs: update ARCHITECTURE, BROWSER, CONTRIBUTING, README for v0.4.0

  - ARCHITECTURE: add ref staleness detection section, update RefEntry type
  - BROWSER: add ref staleness paragraph to snapshot system docs
  - CONTRIBUTING: update eval tool descriptions with commentary feature
  - README: fix missing qa-only in project-local uninstall command

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* docs: add user-facing benefit descriptions to v0.4.0 changelog

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-05-17 09:41:28 +08:00 · 2026-03-15 23:55:39 -05:00
parent bb46ca6b21
commit f3ee0ee28a
30 changed files with 2317 additions and 354 deletions
--- a/scripts/eval-list.ts
+++ b/scripts/eval-list.ts
@@ -47,6 +47,8 @@ interface RunSummary {
  passed: number;
  total: number;
  cost: number;
+  duration: number;
+  turns: number;
 }

 const runs: RunSummary[] = [];
@@ -55,6 +57,7 @@ for (const file of files) {
    const data = JSON.parse(fs.readFileSync(path.join(EVAL_DIR, file), 'utf-8'));
    if (filterBranch && data.branch !== filterBranch) continue;
    if (filterTier && data.tier !== filterTier) continue;
+    const totalTurns = (data.tests || []).reduce((s: number, t: any) => s + (t.turns_used || 0), 0);
    runs.push({
      file,
      timestamp: data.timestamp || '',
@@ -64,6 +67,8 @@ for (const file of files) {
      passed: data.passed || 0,
      total: data.total_tests || 0,
      cost: data.total_cost_usd || 0,
+      duration: data.total_duration_ms || 0,
+      turns: totalTurns,
    });
  } catch { continue; }
 }
@@ -77,29 +82,35 @@ const displayed = runs.slice(0, limit);
 // Print table
 console.log('');
 console.log(`Eval History (${runs.length} total runs)`);
-console.log('═'.repeat(90));
+console.log('═'.repeat(105));
 console.log(
  '  ' +
  'Date'.padEnd(17) +
-  'Branch'.padEnd(28) +
+  'Branch'.padEnd(25) +
  'Tier'.padEnd(12) +
  'Pass'.padEnd(8) +
  'Cost'.padEnd(8) +
+  'Turns'.padEnd(7) +
+  'Duration'.padEnd(10) +
  'Version'
 );
-console.log('─'.repeat(90));
+console.log('─'.repeat(105));

 for (const run of displayed) {
  const date = run.timestamp.replace('T', ' ').slice(0, 16);
-  const branch = run.branch.length > 26 ? run.branch.slice(0, 23) + '...' : run.branch.padEnd(28);
+  const branch = (run.branch.length > 23 ? run.branch.slice(0, 20) + '...' : run.branch).padEnd(25); // pad AFTER truncating: a truncated name is 23 chars and must still fill the 25-char column
  const pass = `${run.passed}/${run.total}`.padEnd(8);
  const cost = `$${run.cost.toFixed(2)}`.padEnd(8);
-  console.log(`  ${date.padEnd(17)}${branch}${run.tier.padEnd(12)}${pass}${cost}v${run.version}`);
+  const turns = run.turns > 0 ? `${run.turns}t`.padEnd(7) : ''.padEnd(7);
+  const dur = run.duration > 0 ? `${Math.round(run.duration / 1000)}s`.padEnd(10) : ''.padEnd(10);
+  console.log(`  ${date.padEnd(17)}${branch}${run.tier.padEnd(12)}${pass}${cost}${turns}${dur}v${run.version}`);
 }

-console.log('─'.repeat(90));
+console.log('─'.repeat(105));

 const totalCost = runs.reduce((s, r) => s + r.cost, 0);
-console.log(`  ${runs.length} runs | Total spend: $${totalCost.toFixed(2)} | Showing: ${displayed.length}`);
+const totalDur = runs.reduce((s, r) => s + r.duration, 0);
+const totalTurns = runs.reduce((s, r) => s + r.turns, 0);
+console.log(`  ${runs.length} runs | $${totalCost.toFixed(2)} total | ${totalTurns} turns | ${Math.round(totalDur / 1000)}s | Showing: ${displayed.length}`);
 console.log(`  Dir: ${EVAL_DIR}`);
 console.log('');