feat(test): 3 gate-tier real-PTY E2E tests

skill-e2e-auq-format-compliance.test.ts (~$0.50/run, 90-130s):
- Asserts /plan-ceo-review's first AUQ contains all 7 mandated format elements (ELI10, Recommendation, Pros/Cons with ✅/❌, Net, (recommended) label). Catches drift in the shared preamble resolver that previously took weeks to notice.
- Auto-grants permission dialogs that fire during preamble side-effects (touch on .feature-prompted markers in fresh user environments).
- Verified PASS in 126s.

skill-e2e-plan-design-with-ui.test.ts (~$0.80/run, 50-90s):
- Counterpart to the existing no-UI early-exit test. When the input plan DOES describe UI changes, /plan-design-review must NOT early-exit and must reach a real skill AUQ.
- Sends the slash command without args, then a follow-up message with the UI-heavy plan description (Claude Code rejects unknown trailing args). Asserts evidence does NOT contain "no UI scope".
- Verified PASS in 54s.

skill-budget-regression.test.ts (free, gate):
- Library-only assertion. Reads the most recent eval file, finds the prior same-branch run via findPreviousRun, computes ComparisonResult, asserts no test exceeded 2× tools or turns.
- Branch-scoped: skips with reason if the latest eval was produced on a different branch (cross-branch comparison would be noise).
- First-run grace (vacuous pass) when no prior data exists.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 10:31:30 +08:00 · 2026-04-26 04:36:35 -07:00
parent 137b11f39a
commit 2b1a0da7c1
3 changed files with 487 additions and 0 deletions
--- a/test/skill-e2e-plan-design-with-ui.test.ts
+++ b/test/skill-e2e-plan-design-with-ui.test.ts
@@ -0,0 +1,143 @@
+/**
+ * /plan-design-review with UI scope (gate, paid, real-PTY).
+ *
+ * Counterpart to the existing no-UI early-exit test. When the input plan
+ * DOES describe UI changes, /plan-design-review must NOT early-exit and
+ * must reach a real skill numbered-option AUQ (its first design-rating
+ * question), with the captured evidence NOT echoing the early-exit phrase.
+ *
+ * Why: today we only test the negative path (no-UI → early-exit). A
+ * regression that flips the UI-detection logic — making EVERY plan early-
+ * exit — would pass the no-UI test (vacuously) and ship undetected. This
+ * test is the positive coverage.
+ *
+ * How: launch claude in plan mode in the gstack repo cwd (so the skill
+ * registry is loaded). Send /plan-design-review alone (trailing args are
+ * rejected), then a follow-up message with the UI-heavy plan and fixture
+ * path so the skill reviews it rather than git diff or .claude/plans/.
+ * Grant permission dialogs. Wait for a non-permission numbered-option list
+ * or plan-ready. Assert evidence lacks "no UI scope" / "isn't applicable".
+ */
+
+import { describe, test } from 'bun:test';
+import * as path from 'path';
+import {
+  launchClaudePty,
+  isNumberedOptionListVisible,
+  isPermissionDialogVisible,
+  parseNumberedOptions,
+  isPlanReadyVisible,
+} from './helpers/claude-pty-runner';
+
+// Paid gate-tier suite: active only when EVALS is non-empty and EVALS_TIER is exactly 'gate'.
+const shouldRun = Boolean(process.env.EVALS) && process.env.EVALS_TIER === 'gate';
+const describeE2E = shouldRun ? describe : describe.skip;
+const ROOT = path.resolve(import.meta.dir, '..'); // one level above this test file's directory
+const FIXTURE = path.join(ROOT, 'test', 'fixtures', 'plans', 'ui-heavy-feature.md');
+
+describeE2E('/plan-design-review with UI scope (gate)', () => {
+  // Positive path: a UI-heavy plan must reach a skill AUQ or plan-ready, never early-exit.
+  test(
+    'reaches a real skill AUQ (or plan_ready) without echoing the no-UI early-exit phrase',
+    async () => {
+      const fixtureRelPath = path.relative(ROOT, FIXTURE); // repo-relative path quoted in the prompt
+      const session = await launchClaudePty({
+        permissionMode: 'plan',
+        cwd: ROOT,
+        timeoutMs: 480_000,
+      });
+
+      let outcome: 'real_auq' | 'plan_ready' | 'timeout' | 'exited' = 'timeout';
+      let evidence = ''; // screen tail (last 3000 chars) at the moment an outcome is decided
+      let debugBuffer = ''; // captured at end so a timeout/exit error has data
+
+      try {
+        await Bun.sleep(8000); // fixed pause before the first keystroke
+        const since = session.mark();
+        // Send the slash command alone first; then provide the UI-heavy
+        // plan content as a follow-up message. Claude Code rejects slash
+        // commands with trailing arguments unless the skill defines them.
+        session.send('/plan-design-review\r');
+        await Bun.sleep(3000);
+        session.send(
+          `Please review this plan for UI scope:\n\n` +
+          `Title: User Dashboard Page\n` +
+          `New React page UserDashboard.tsx with three subcomponents: ` +
+          `ActivityFeed, NotificationsPanel, QuickActions. ` +
+          `Tailwind CSS responsive layout (mobile/desktop breakpoints), ` +
+          `loading skeletons, empty states, hover states on every interactive element, ` +
+          `modal dialog for "mark all read", toast notifications for action feedback. ` +
+          `Reference plan file: ${fixtureRelPath}\r`
+        );
+
+        const budgetMs = 360_000; // polling budget; smaller than the PTY and test timeouts
+        const start = Date.now();
+        let lastPermSig = ''; // last screen tail for which a permission grant was sent
+        while (Date.now() - start < budgetMs) {
+          await Bun.sleep(2500); // poll interval
+          if (session.exited()) {
+            outcome = 'exited';
+            evidence = session.visibleSince(since).slice(-3000);
+            break;
+          }
+          const visible = session.visibleSince(since);
+
+          // Classify the recent tail only — old permission text persists
+          // in visibleSince(since) and would otherwise re-trigger forever.
+          const recentTail = visible.slice(-2500);
+
+          // Real skill AUQ visible (not a permission dialog)?
+          if (
+            isNumberedOptionListVisible(recentTail) &&
+            parseNumberedOptions(recentTail).length >= 2 &&
+            !isPermissionDialogVisible(recentTail)
+          ) {
+            outcome = 'real_auq';
+            evidence = visible.slice(-3000);
+            break;
+          }
+
+          // Permission dialog: grant once per unique rendering (keyed on the last 500 chars).
+          if (isPermissionDialogVisible(recentTail)) {
+            const sig = visible.slice(-500);
+            if (sig !== lastPermSig) {
+              lastPermSig = sig;
+              session.send('1\r'); // choose option 1 (presumably "allow" — confirm against the dialog)
+              await Bun.sleep(1500);
+              continue;
+            }
+          }
+
+          // Plan-ready terminal — also acceptable (skill ran end-to-end
+          // and surfaced claude's "Ready to execute" prompt).
+          if (isPlanReadyVisible(visible)) {
+            outcome = 'plan_ready';
+            evidence = visible.slice(-3000);
+            break;
+          }
+        }
+        // Capture buffer state at end so a timeout or exit error has diagnostic data.
+        debugBuffer = session.visibleSince(since).slice(-4000);
+      } finally {
+        await session.close();
+      }
+
+      // PASS: real_auq or plan_ready, AND evidence does NOT echo the
+      // early-exit phrase. The dump label names the actual failure mode.
+      if (outcome === 'exited' || outcome === 'timeout') {
+        throw new Error(
+          `plan-design-review with UI scope FAILED: outcome=${outcome}\n` +
+            `--- buffer at ${outcome === 'exited' ? 'exit' : 'timeout'} (last 4KB) ---\n${debugBuffer || evidence}`,
+        );
+      }
+      const NO_UI_PHRASE = /no\s+UI\s+scope|isn'?t\s+applicable/i;
+      if (NO_UI_PHRASE.test(evidence)) {
+        throw new Error(
+          `plan-design-review early-exited despite UI-heavy fixture.\n` +
+            `--- evidence (last 3KB) ---\n${evidence}`,
+        );
+      }
+    },
+    540_000,
+  );
+});