mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-18 10:31:30 +08:00
feat(test): 3 gate-tier real-PTY E2E tests
skill-e2e-auq-format-compliance.test.ts (~$0.50/run, 90-130s):
- Asserts /plan-ceo-review's first AUQ contains all 7 mandated format elements (ELI10, Recommendation, Pros/Cons with ✅/❌, Net, (recommended) label). Catches drift in the shared preamble resolver that previously took weeks to notice.
- Auto-grants permission dialogs that fire during preamble side-effects (touch on .feature-prompted markers in fresh user environments).
- Verified PASS in 126s.

skill-e2e-plan-design-with-ui.test.ts (~$0.80/run, 50-90s):
- Counterpart to the existing no-UI early-exit test. When the input plan DOES describe UI changes, /plan-design-review must NOT early-exit and must reach a real skill AUQ.
- Sends the slash command without args, then a follow-up message with the UI-heavy plan description (Claude Code rejects unknown trailing args). Asserts evidence does NOT contain "no UI scope".
- Verified PASS in 54s.

skill-budget-regression.test.ts (free, gate):
- Library-only assertion. Reads the most recent eval file, finds the prior same-branch run via findPreviousRun, computes ComparisonResult, asserts no test exceeded 2× tools or turns.
- Branch-scoped: skips with reason if the latest eval was produced on a different branch (cross-branch comparison would be noise).
- First-run grace (vacuous pass) when no prior data exists.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
143
test/skill-e2e-plan-design-with-ui.test.ts
Normal file
143
test/skill-e2e-plan-design-with-ui.test.ts
Normal file
@@ -0,0 +1,143 @@
|
||||
/**
|
||||
* /plan-design-review with UI scope (gate, paid, real-PTY).
|
||||
*
|
||||
* Counterpart to the existing no-UI early-exit test. When the input plan
|
||||
* DOES describe UI changes, /plan-design-review must NOT early-exit and
|
||||
* must reach a real skill numbered-option AUQ (its first design-rating
|
||||
* question), with the captured evidence NOT echoing the early-exit phrase.
|
||||
*
|
||||
* Why: today we only test the negative path (no-UI → early-exit). A
|
||||
* regression that flips the UI-detection logic — making EVERY plan early-
|
||||
* exit — would pass the no-UI test (vacuously) and ship undetected. This
|
||||
* test is the positive coverage.
|
||||
*
|
||||
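* How: launch claude in plan mode in the gstack repo cwd (so the skill
* registry is loaded). Send /plan-design-review on its own, then send the
* UI-heavy plan description (plus the fixture path) as a follow-up
* message, so the skill reviews the UI-heavy plan rather than git diff or
* .claude/plans/. The command is sent without arguments because Claude
* Code rejects slash commands with trailing arguments unless the skill
* defines them. Drive past permission dialogs. Wait for a numbered-
* option list that is NOT a permission dialog (reaching the plan-ready
* prompt is also accepted). Assert evidence does NOT contain
* "no UI scope".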
|
||||
*/
|
||||
|
||||
import { describe, test } from 'bun:test';
|
||||
import * as path from 'path';
|
||||
import {
|
||||
launchClaudePty,
|
||||
isNumberedOptionListVisible,
|
||||
isPermissionDialogVisible,
|
||||
parseNumberedOptions,
|
||||
isPlanReadyVisible,
|
||||
} from './helpers/claude-pty-runner';
|
||||
|
||||
// Opt-in switch for this paid suite: EVALS must be set to a non-empty value
// and EVALS_TIER must be exactly 'gate'; otherwise the suite is skipped.
const shouldRun = process.env.EVALS_TIER === 'gate' && Boolean(process.env.EVALS);
const describeE2E = !shouldRun ? describe.skip : describe;

// ROOT is the parent of the directory that holds this test file; it is used
// both as the PTY session's cwd and as the base for the fixture path.
const ROOT = path.resolve(import.meta.dir, '..');
const FIXTURE_SEGMENTS = ['test', 'fixtures', 'plans', 'ui-heavy-feature.md'];
const FIXTURE = path.join(ROOT, ...FIXTURE_SEGMENTS);
|
||||
|
||||
describeE2E('/plan-design-review with UI scope (gate)', () => {
  /**
   * Positive-path coverage for /plan-design-review: given a UI-heavy plan,
   * the session must reach either a numbered-option question that is not a
   * permission dialog ('real_auq') or the plan-ready prompt ('plan_ready'),
   * and the captured evidence must not match the early-exit phrase regex.
   *
   * Throws (failing the test) when the session exits, when the polling
   * budget elapses without a terminal state, or when the evidence matches
   * the early-exit phrase.
   */
  test(
    'reaches a real skill AUQ (or plan_ready) without echoing the no-UI early-exit phrase',
    async () => {
      // Timing knobs (milliseconds).
      const BOOT_WAIT_MS = 8000; // pause after launch before the first send
      const FOLLOW_UP_WAIT_MS = 3000; // pause between slash command and plan text
      const POLL_INTERVAL_MS = 2500; // sleep at the top of every poll iteration
      const POST_GRANT_WAIT_MS = 1500; // pause after answering a permission dialog
      const POLL_BUDGET_MS = 360_000; // total time allowed for the polling loop

      // Buffer window sizes (string length units taken from the end).
      const RECENT_TAIL_CHARS = 2500; // window used to classify the current screen
      const PERM_SIG_CHARS = 500; // window used to de-duplicate permission grants
      const EVIDENCE_CHARS = 3000; // window kept as evidence for assertions
      const DEBUG_CHARS = 4000; // window dumped into the failure message

      const fixtureRelPath = path.relative(ROOT, FIXTURE);

      const session = await launchClaudePty({
        permissionMode: 'plan',
        cwd: ROOT,
        timeoutMs: 480_000,
      });

      // Defaults to 'timeout' so that falling out of the loop without a
      // terminal state is reported as such.
      let outcome: 'real_auq' | 'plan_ready' | 'timeout' | 'exited' = 'timeout';
      let evidence = '';
      let debugBuffer = ''; // captured after the loop so a failure has data

      try {
        await Bun.sleep(BOOT_WAIT_MS);
        const since = session.mark();

        // Send the slash command alone first; then provide the UI-heavy
        // plan content as a follow-up message. Claude Code rejects slash
        // commands with trailing arguments unless the skill defines them.
        session.send('/plan-design-review\r');
        await Bun.sleep(FOLLOW_UP_WAIT_MS);
        session.send(
          `Please review this plan for UI scope:\n\n` +
            `Title: User Dashboard Page\n` +
            `New React page UserDashboard.tsx with three subcomponents: ` +
            `ActivityFeed, NotificationsPanel, QuickActions. ` +
            `Tailwind CSS responsive layout (mobile/desktop breakpoints), ` +
            `loading skeletons, empty states, hover states on every interactive element, ` +
            `modal dialog for "mark all read", toast notifications for action feedback. ` +
            `Reference plan file: ${fixtureRelPath}\r`,
        );

        const start = Date.now();
        let lastPermSig = '';

        while (Date.now() - start < POLL_BUDGET_MS) {
          await Bun.sleep(POLL_INTERVAL_MS);

          if (session.exited()) {
            outcome = 'exited';
            evidence = session.visibleSince(since).slice(-EVIDENCE_CHARS);
            break;
          }

          const visible = session.visibleSince(since);

          // Classify the recent tail only — old permission text persists
          // in visibleSince(since) and would otherwise re-trigger forever.
          const recentTail = visible.slice(-RECENT_TAIL_CHARS);

          // Evaluated once per poll; both branches below depend on it.
          const permissionDialogUp = isPermissionDialogVisible(recentTail);

          // Real skill AUQ visible (not a permission dialog)?
          if (
            !permissionDialogUp &&
            isNumberedOptionListVisible(recentTail) &&
            parseNumberedOptions(recentTail).length >= 2
          ) {
            outcome = 'real_auq';
            evidence = visible.slice(-EVIDENCE_CHARS);
            break;
          }

          // Permission dialog: grant once per unique rendering.
          // NOTE(review): if stale permission text is still inside the tail
          // when a skill AUQ renders, this branch would send '1' to the AUQ
          // instead — confirm isPermissionDialogVisible only matches a live
          // dialog.
          if (permissionDialogUp) {
            const sig = visible.slice(-PERM_SIG_CHARS);
            if (sig !== lastPermSig) {
              lastPermSig = sig;
              session.send('1\r');
              await Bun.sleep(POST_GRANT_WAIT_MS);
              continue;
            }
          }

          // Plan-ready terminal — also acceptable (skill ran end-to-end
          // and surfaced claude's "Ready to execute" prompt). Checked
          // against the whole buffer, not just the recent tail.
          if (isPlanReadyVisible(visible)) {
            outcome = 'plan_ready';
            evidence = visible.slice(-EVIDENCE_CHARS);
            break;
          }
        }

        // Capture buffer state at end so a failure has diagnostic data.
        debugBuffer = session.visibleSince(since).slice(-DEBUG_CHARS);
      } finally {
        // Always close the session, including when the body above throws.
        await session.close();
      }

      // PASS: real_auq or plan_ready, AND evidence does NOT echo the
      // early-exit phrase.
      if (outcome === 'exited' || outcome === 'timeout') {
        // The dump is labelled generically because this branch also covers
        // an early process exit, not only a timeout.
        throw new Error(
          `plan-design-review with UI scope FAILED: outcome=${outcome}\n` +
            `--- buffer at end of run (last 4KB) ---\n${debugBuffer || evidence}`,
        );
      }

      const NO_UI_PHRASE = /no\s+UI\s+scope|isn'?t\s+applicable/i;
      if (NO_UI_PHRASE.test(evidence)) {
        throw new Error(
          `plan-design-review early-exited despite UI-heavy fixture.\n` +
            `--- evidence (last 3KB) ---\n${evidence}`,
        );
      }
    },
    540_000,
  );
});
|
||||
Reference in New Issue
Block a user