mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-18 18:32:28 +08:00
feat(test): 3 periodic-tier real-PTY E2E tests
skill-e2e-plan-ceo-mode-routing.test.ts (~$3/run, 6-10 min/case): - Verifies AUQ answer routing: HOLD SCOPE → rigor/bulletproof posture language; SCOPE EXPANSION → expansion/10x/dream language. Each case navigates 8-12 prior AUQs (telemetry, proactive, routing, vendoring, brain, office-hours, premise, approach) before hitting Step 0F. - Periodic, not gate: navigation phase too slow for PR-blocking. V2 expansion to 4 modes (SELECTIVE + REDUCTION) when nav is faster. skill-e2e-ship-idempotency.test.ts (~$3/run, 5-10 min): - Builds a real git fixture with VERSION 0.0.2 already bumped, matching package.json, CHANGELOG entry, pushed to a local bare remote. Runs /ship in plan mode and asserts STATE: ALREADY_BUMPED echoes from the Step 12 idempotency check, OR plan_ready terminates without mutation. - Snapshots VERSION + package.json + CHANGELOG entry count + commit count + branch HEAD before/after; fails if any changed. skill-e2e-autoplan-chain.test.ts (~$8/run, 12-18 min): - Asserts /autoplan phases run sequentially: tees timestamps as each "**Phase N complete.**" marker first appears. Phase 1 (CEO) must precede Phase 3 (Eng); Phase 2 (Design) is optional but if it appears, must sit between 1 and 3. - Auto-grants permission dialogs that fire during phase transitions. All three auto-handle permission dialogs (preamble side-effects on fresh user envs without .feature-prompted-* markers). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
176
test/skill-e2e-autoplan-chain.test.ts
Normal file
176
test/skill-e2e-autoplan-chain.test.ts
Normal file
@@ -0,0 +1,176 @@
|
|||||||
|
/**
|
||||||
|
* /autoplan cross-skill chain (periodic, paid, real-PTY).
|
||||||
|
*
|
||||||
|
* Asserts: when /autoplan runs against a plan fixture, the phase markers
|
||||||
|
* the autoplan template emits appear in the correct order:
|
||||||
|
*
|
||||||
|
* "**Phase 1 complete." (CEO) →
|
||||||
|
* "**Phase 2 complete." (Design — only if UI scope detected) →
|
||||||
|
* "**Phase 3 complete." (Eng) →
|
||||||
|
* "**Phase 3.5 complete." (DX — optional, skipped if no DX scope)
|
||||||
|
*
|
||||||
|
* Why this exists: each individual phase has its own plan-mode smoke
|
||||||
|
* test. Nothing verifies the SEQUENCING — that phases don't run in
|
||||||
|
* parallel, that Phase 3 doesn't start before Phase 1 ends, that
|
||||||
|
* conditional phases (Design, DX) are skipped when their scope is absent.
|
||||||
|
* A regression where the autoplan template wires phases concurrently
|
||||||
|
* would not be caught by per-phase tests.
|
||||||
|
*
|
||||||
|
* Approach: tee timestamps as each "**Phase N complete." marker first
|
||||||
|
* appears in the visible buffer. Assert observed ordering. Phase 2 is
|
||||||
|
* optional — UI-heavy fixture should make it run; backend-only fixtures
|
||||||
|
* should make it skip.
|
||||||
|
*
|
||||||
|
* Cost: ~$5-8/run, 10-15 min wall clock. Periodic — runs weekly.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { describe, test, expect } from 'bun:test';
|
||||||
|
import { spawnSync } from 'child_process';
|
||||||
|
import * as fs from 'fs';
|
||||||
|
import * as path from 'path';
|
||||||
|
import * as os from 'os';
|
||||||
|
import {
|
||||||
|
launchClaudePty,
|
||||||
|
isPlanReadyVisible,
|
||||||
|
isPermissionDialogVisible,
|
||||||
|
isNumberedOptionListVisible,
|
||||||
|
} from './helpers/claude-pty-runner';
|
||||||
|
|
||||||
|
// Paid suite: runs only when EVALS is set AND the tier is explicitly "periodic";
// otherwise the whole describe block is registered as skipped.
const shouldRun = Boolean(process.env.EVALS) && process.env.EVALS_TIER === 'periodic';
const describeE2E = shouldRun ? describe : describe.skip;

// Parent of the directory containing this test file.
const ROOT = path.resolve(import.meta.dir, '..');
// Plan fixture that gets copied into the scratch repo's .claude/plans directory.
const UI_FIXTURE = path.join(ROOT, 'test', 'fixtures', 'plans', 'ui-heavy-feature.md');
|
||||||
|
|
||||||
|
/** First sighting of a "Phase N complete" marker in the PTY output. */
interface PhaseHit {
  /** Phase number parsed from the marker text; may be fractional (e.g. 3.5). */
  phase: number;
  /** `Date.now()` value taken during the poll in which the marker was first seen. */
  ts: number;
}
|
||||||
|
|
||||||
|
describeE2E('/autoplan chain ordering (periodic)', () => {
  test(
    'phases run sequentially: Phase 1 (CEO) before Phase 3 (Eng), Phase 2 (Design) between when present',
    async () => {
      // Scratch repo seeded with the UI-heavy plan fixture so Phase 2 (Design) has scope to run.
      const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-autoplan-chain-'));
      try {
        // Fail fast on fixture-setup errors instead of launching a paid session
        // against a half-initialised repo.
        const gitRun = (args: string[]): void => {
          const result = spawnSync('git', args, { cwd: tempDir, stdio: 'pipe', timeout: 5000 });
          if (result.status !== 0) {
            const detail = result.error?.message ?? result.stderr?.toString() ?? '';
            throw new Error(`fixture setup failed at "git ${args.join(' ')}":\n${detail}`);
          }
        };
        gitRun(['init', '-b', 'main']);
        gitRun(['config', 'user.email', 'test@test.com']);
        gitRun(['config', 'user.name', 'Test']);
        // A global commit.gpgsign=true would otherwise make the commit below fail.
        gitRun(['config', 'commit.gpgsign', 'false']);

        const plansDir = path.join(tempDir, '.claude', 'plans');
        fs.mkdirSync(plansDir, { recursive: true });
        fs.copyFileSync(UI_FIXTURE, path.join(plansDir, 'ui-heavy-feature.md'));
        fs.writeFileSync(path.join(tempDir, 'README.md'), '# Autoplan chain fixture\n');
        gitRun(['add', '.']);
        gitRun(['commit', '-m', 'init UI-heavy fixture']);

        const session = await launchClaudePty({
          permissionMode: 'plan',
          cwd: tempDir,
          timeoutMs: 1_080_000, // 18 min, slightly above test budget
        });

        // Markers in order of FIRST sighting. Within one poll the buffer is scanned
        // front to back, so array order doubles as the observed phase order.
        const hits: PhaseHit[] = [];
        let outcome: 'chain_complete' | 'plan_ready' | 'timeout' | 'exited' = 'timeout';
        let evidence = '';

        try {
          await Bun.sleep(8000);
          const since = session.mark();
          session.send('/autoplan\r');

          const budgetMs = 900_000; // 15 min
          const start = Date.now();
          // Phase markers in autoplan/SKILL.md (lines 1126, 1211, 1331, 1437):
          //   "**Phase 1 complete." / "**Phase 2 complete." / "**Phase 3 complete." / "**Phase 3.5 complete."
          // NOTE(review): this pattern requires a closing "**" right after "complete"
          // (optionally after a period) — confirm that matches the rendered markers.
          const phasePattern = /\*\*Phase\s+(\d+(?:\.\d+)?)\s+complete\.?\*\*/g;

          let lastPermSig = '';
          while (Date.now() - start < budgetMs) {
            await Bun.sleep(5000);
            const visible = session.visibleSince(since);
            // Refresh on every poll so a timeout failure still carries the buffer tail.
            evidence = visible.slice(-3000);

            if (session.exited()) {
              outcome = 'exited';
              break;
            }

            // Auto-grant any permission dialog so autoplan can keep moving
            // through its phases. The autoplan template auto-decides AUQs
            // it owns; only permission prompts (file/tool grants) need our
            // hand-pressing. Classify on tail to avoid stale matches.
            const recentTail = visible.slice(-1500);
            if (isNumberedOptionListVisible(recentTail) && isPermissionDialogVisible(recentTail)) {
              const sig = visible.slice(-500);
              if (sig !== lastPermSig) {
                lastPermSig = sig;
                session.send('1\r');
                await Bun.sleep(2000);
                continue;
              }
            }

            // Record any phase markers not seen on earlier polls.
            for (const m of visible.matchAll(phasePattern)) {
              const phaseNum = parseFloat(m[1] ?? '0');
              if (Number.isNaN(phaseNum)) continue;
              if (hits.some(h => h.phase === phaseNum)) continue;
              hits.push({ phase: phaseNum, ts: Date.now() });
            }

            // Terminal: Phase 3 (Eng) seen — chain reached the required end.
            if (hits.some(h => h.phase === 3)) {
              outcome = 'chain_complete';
              break;
            }

            // Plan-ready as a fallback terminal — autoplan finished without
            // surfacing a Phase 3 marker. This is a regression surface.
            if (isPlanReadyVisible(visible)) {
              outcome = 'plan_ready';
              break;
            }
          }
        } finally {
          await session.close();
        }

        if (outcome === 'exited' || outcome === 'timeout') {
          throw new Error(
            `autoplan chain test FAILED: outcome=${outcome}, hits=${JSON.stringify(hits)}\n` +
              `--- evidence (last 3KB) ---\n${evidence}`,
          );
        }

        // Phase 1 (CEO) and Phase 3 (Eng) are both required; a plan_ready
        // terminal without them lands here as a failure.
        const seenOrder = hits.map(h => h.phase);
        const ceoPos = seenOrder.indexOf(1);
        const designPos = seenOrder.indexOf(2);
        const engPos = seenOrder.indexOf(3);
        if (ceoPos === -1 || engPos === -1) {
          throw new Error(
            `Required phase markers missing. Saw: ${JSON.stringify(hits)}\n` +
              `--- evidence ---\n${evidence}`,
          );
        }

        // Sequencing: CEO must complete before Eng; Design (if observed) must sit
        // between them. Compare first-sighting order rather than timestamps:
        // markers that land in the same poll are stamped within the same
        // millisecond, so strict timestamp comparisons can fail on a correct run.
        expect(ceoPos).toBeLessThan(engPos);
        if (designPos !== -1) {
          expect(designPos).toBeGreaterThan(ceoPos);
          expect(designPos).toBeLessThan(engPos);
        }
      } finally {
        try { fs.rmSync(tempDir, { recursive: true, force: true }); } catch { /* ignore */ }
      }
    },
    1_200_000, // 20 min absolute test ceiling
  );
});
|
||||||
204
test/skill-e2e-plan-ceo-mode-routing.test.ts
Normal file
204
test/skill-e2e-plan-ceo-mode-routing.test.ts
Normal file
@@ -0,0 +1,204 @@
|
|||||||
|
/**
|
||||||
|
* /plan-ceo-review mode-routing E2E (periodic, paid, real-PTY).
|
||||||
|
*
|
||||||
|
* Asserts: when /plan-ceo-review reaches its Step 0F mode-selection
|
||||||
|
* AskUserQuestion and the user picks HOLD SCOPE or SCOPE EXPANSION,
|
||||||
|
* the downstream rendered output reflects that mode's distinctive
|
||||||
|
* posture language.
|
||||||
|
*
|
||||||
|
* Why this exists: existing tests verify that the question fires. Nothing
|
||||||
|
* verifies the answer actually routes. A regression where Step 0F shows
|
||||||
|
* the question but the agent ignores the choice (e.g. always defaults
|
||||||
|
* to EXPANSION) would not be caught by any prior test.
|
||||||
|
*
|
||||||
|
* Tier: periodic (not gate). Each run navigates 8-12 prior AUQs (telemetry,
|
||||||
|
* proactive, routing, vendoring, brain, office-hours, premise×3, approach)
|
||||||
|
* before reaching Step 0F. At ~30s per AUQ that's a 4-6 min navigation
|
||||||
|
* phase per case. The full 2-case suite runs ~12-15 min, $3-4. Too slow
|
||||||
|
* for gate-tier; weekly is fine.
|
||||||
|
*
|
||||||
|
* Mode coverage: HOLD SCOPE + SCOPE EXPANSION cover the two posture poles
|
||||||
|
* (rigor vs ambition). SELECTIVE EXPANSION and SCOPE REDUCTION are V2 once
|
||||||
|
* the navigation phase is shorter or has a deterministic fast-path through
|
||||||
|
* Step 0A/0C-bis.
|
||||||
|
*
|
||||||
|
* Posture assertions: each mode has distinct downstream language. The
|
||||||
|
* checks below are deliberately permissive — they catch the binary
|
||||||
|
* "did the mode posture even apply" question, not Opus-specific phrasing.
|
||||||
|
*
|
||||||
|
* HOLD SCOPE — "rigor" or "bulletproof" or "hold scope"
|
||||||
|
* SCOPE EXPANSION — "expansion" or "10x" or "delight" or "dream"
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { describe, test } from 'bun:test';
|
||||||
|
import {
|
||||||
|
launchClaudePty,
|
||||||
|
isNumberedOptionListVisible,
|
||||||
|
isPermissionDialogVisible,
|
||||||
|
parseNumberedOptions,
|
||||||
|
isPlanReadyVisible,
|
||||||
|
type ClaudePtySession,
|
||||||
|
} from './helpers/claude-pty-runner';
|
||||||
|
|
||||||
|
// Paid suite: runs only when EVALS is set AND the tier is explicitly "periodic";
// otherwise the whole describe block is registered as skipped.
const shouldRun = Boolean(process.env.EVALS) && process.env.EVALS_TIER === 'periodic';
const describeE2E = shouldRun ? describe : describe.skip;
|
||||||
|
|
||||||
|
// Case-insensitive match for any of the four mode names; used to recognise
// the mode-selection AUQ by its option labels.
const MODE_RE = /HOLD SCOPE|SCOPE EXPANSION|SELECTIVE EXPANSION|SCOPE REDUCTION/i;
|
||||||
|
|
||||||
|
/** One routing scenario: the mode to pick and the language expected afterwards. */
interface ModeCase {
  mode: 'HOLD SCOPE' | 'SCOPE EXPANSION';
  /**
   * Pattern tested against the text rendered after the mode pick. A single
   * matching alternative is enough for the case to pass.
   */
  postureRe: RegExp;
}

// NOTE(review): both patterns include words that also appear in the mode labels
// themselves ("hold scope", "expansion"); if the picked option is echoed back
// after selection the case could pass without real downstream posture text — confirm.
const CASES: ModeCase[] = [
  {
    mode: 'HOLD SCOPE',
    postureRe: /\b(rigor|bulletproof|hold\s*scope|maximum\s+rigor)\b/i,
  },
  {
    mode: 'SCOPE EXPANSION',
    postureRe: /\b(expansion|10x|delight|dream|cathedral|opt[\s-]?in)\b/i,
  },
];
|
||||||
|
|
||||||
|
/**
 * Drive the session through the AUQs that precede the mode-selection AUQ.
 *
 * Polls the output rendered since `since`. Each newly rendered numbered
 * option list is handled as follows:
 *   - any label matches one of the 4 mode names → this is the mode AUQ; return
 *   - the recent tail looks like a permission dialog → send "1" (not counted)
 *   - otherwise → send "1" and count it as a navigated prior AUQ
 *
 * @param session    Live PTY session.
 * @param since      Buffer mark from which rendered output is read.
 * @param targetMode Mode whose option index should be returned.
 * @param opts       `maxNav`: max prior AUQs to answer (default 12);
 *                   `budgetMs`: overall polling budget (default 420_000).
 * @returns The option index whose label contains `targetMode`, plus the
 *          visible text at the moment the mode AUQ was detected. The mode
 *          AUQ itself is NOT answered here.
 * @throws If the session exits, the mode AUQ lacks the target option,
 *         `maxNav` prior AUQs were already answered, or the budget elapses.
 */
async function navigateToModeAuq(
  session: ClaudePtySession,
  since: number,
  targetMode: ModeCase['mode'],
  opts: { maxNav?: number; budgetMs?: number } = {},
): Promise<{ modeIndex: number; visibleAtMode: string }> {
  // /plan-ceo-review's mode AUQ (Step 0F) sits behind several preamble
  // and Step 0A-0C-bis gates: telemetry, proactive, routing, vendoring,
  // brain privacy, office-hours offer, premise challenge (3 questions),
  // approach selection. 12 hops is the conservative ceiling.
  const maxNav = opts.maxNav ?? 12;
  const budgetMs = opts.budgetMs ?? 420_000;
  const start = Date.now();
  let priorAnswered = 0;
  // Signature of the last option list acted on; '' means none yet.
  let lastSig = '';

  const formatList = (list: Array<{ index: number; label: string }>): string =>
    list.map(o => `  ${o.index}. ${o.label}`).join('\n');

  while (Date.now() - start < budgetMs) {
    if (session.exited()) {
      throw new Error(
        `claude exited (code=${session.exitCode()}) during nav.\n` +
          `Last visible:\n${session.visibleSince(since).slice(-2000)}`,
      );
    }
    await Bun.sleep(2000);
    const visible = session.visibleSince(since);
    if (!isNumberedOptionListVisible(visible)) continue;
    // Named `options` (not `opts`) so the function parameter is not shadowed.
    const options = parseNumberedOptions(visible);
    if (options.length < 2) continue;

    // Has the rendered list changed since last poll? If not, we're seeing
    // the same prompt and shouldn't double-press.
    const sig = options.map(o => `${o.index}:${o.label}`).join('|');
    if (sig === lastSig) continue;
    lastSig = sig;

    // Is THIS the mode AUQ?
    if (options.some(o => MODE_RE.test(o.label))) {
      const target = options.find(o => o.label.toUpperCase().includes(targetMode));
      if (!target) {
        throw new Error(
          `Mode AUQ rendered but target "${targetMode}" not in option labels:\n` +
            formatList(options),
        );
      }
      return { modeIndex: target.index, visibleAtMode: visible };
    }

    // Permission dialog? Grant with "1" but don't count it against nav budget.
    // Classify on the recent tail only — old permission text persists in
    // visibleSince and would re-trigger forever.
    if (isPermissionDialogVisible(visible.slice(-1500))) {
      session.send('1\r');
      await Bun.sleep(1500);
      continue;
    }

    // Not the mode AUQ — answer with option 1 (recommended) and continue.
    if (priorAnswered >= maxNav) {
      throw new Error(
        `Navigated ${maxNav} prior AUQs without reaching the mode AUQ. ` +
          `Last list:\n${formatList(options)}`,
      );
    }
    priorAnswered++;
    session.send('1\r');
    // Give the agent a beat to advance before re-polling.
    await Bun.sleep(2000);
  }

  // Include progress and the buffer tail so a stalled navigation can be diagnosed.
  throw new Error(
    `Mode AUQ not reached within ${budgetMs}ms (answered ${priorAnswered} prior AUQs).\n` +
      `Last visible:\n${session.visibleSince(since).slice(-2000)}`,
  );
}
|
||||||
|
|
||||||
|
// Suite is gated on EVALS_TIER === 'periodic', so it is labelled as periodic.
describeE2E('/plan-ceo-review mode routing (periodic)', () => {
  for (const c of CASES) {
    test(
      `mode "${c.mode}" routes to its distinctive posture`,
      async () => {
        // NOTE(review): timeoutMs is assumed to bound the PTY session's lifetime —
        // confirm against launchClaudePty.
        const ptyTimeoutMs = 540_000;
        const startedAt = Date.now();
        const session = await launchClaudePty({
          permissionMode: 'plan',
          timeoutMs: ptyTimeoutMs,
        });
        try {
          await Bun.sleep(8000);
          const since = session.mark();
          session.send('/plan-ceo-review\r');

          const { modeIndex } = await navigateToModeAuq(session, since, c.mode);

          // Remember the buffer position at mode-pick time, then send the index.
          const sincePick = session.rawOutput().length;
          session.send(`${modeIndex}\r`);

          // Wait for a posture-distinctive substring downstream of the pick.
          // Boot (8s) + worst-case navigation (420s) + a fixed 240s wait would
          // overrun both the PTY timeout and the 600s test ceiling, so clamp the
          // wait to what is left and fail with the diagnostic below instead.
          const remainingMs = startedAt + ptyTimeoutMs - 20_000 - Date.now();
          const budgetMs = Math.max(0, Math.min(240_000, remainingMs));
          const start = Date.now();
          let postureMatched = false;
          let downstreamSnapshot = '';
          // do/while: inspect the buffer at least once even if the budget is spent.
          do {
            await Bun.sleep(2500);
            if (session.exited()) {
              throw new Error(
                `claude exited (code=${session.exitCode()}) after mode pick.\n` +
                  `Downstream:\n${session.visibleSince(sincePick).slice(-2000)}`,
              );
            }
            downstreamSnapshot = session.visibleSince(sincePick);
            // Plan-ready or a follow-up AUQ alone is not a terminal signal: the
            // posture text may still arrive as the agent finishes writing, so
            // the loop only ends on a posture match or when the clock runs out.
            if (c.postureRe.test(downstreamSnapshot)) {
              postureMatched = true;
              break;
            }
          } while (Date.now() - start < budgetMs);

          if (!postureMatched) {
            throw new Error(
              `Mode "${c.mode}" routing FAILED: no posture match for ${c.postureRe.source}.\n` +
                `--- downstream visible since mode pick (last 3KB) ---\n` +
                downstreamSnapshot.slice(-3000),
            );
          }
        } finally {
          await session.close();
        }
      },
      600_000,
    );
  }
});
|
||||||
271
test/skill-e2e-ship-idempotency.test.ts
Normal file
271
test/skill-e2e-ship-idempotency.test.ts
Normal file
@@ -0,0 +1,271 @@
|
|||||||
|
/**
|
||||||
|
* /ship idempotency E2E (periodic, paid, real-PTY).
|
||||||
|
*
|
||||||
|
* Asserts: when /ship runs against a branch that has ALREADY been bumped
|
||||||
|
* (VERSION ahead of base AND package.json synced AND a CHANGELOG entry
|
||||||
|
* exists for the bumped version), the workflow:
|
||||||
|
*
|
||||||
|
* 1. Detects ALREADY_BUMPED state via the Step 12 idempotency check
|
||||||
|
* 2. Does NOT echo STATE: FRESH (which would trigger a second bump)
|
||||||
|
* 3. Does NOT mutate the fixture's VERSION file
|
||||||
|
* 4. Does NOT append a duplicate CHANGELOG [0.0.2] entry
|
||||||
|
* 5. Does NOT create a new "chore: bump version" commit
|
||||||
|
*
|
||||||
|
* Why real-PTY: the existing ship-idempotency test in skill-e2e.test.ts
|
||||||
|
* uses the SDK harness with a synthetic prompt asking the agent to "run
|
||||||
|
* ONLY the idempotency checks." This test exercises the actual /ship
|
||||||
|
* skill end-to-end against a real git fixture so a regression that
|
||||||
|
* silently re-bumps despite the check passing would be caught.
|
||||||
|
*
|
||||||
|
* Plan-mode framing: we run /ship in plan mode so the agent cannot push,
|
||||||
|
* commit, or open PRs. The Step 12 idempotency check is read-only
|
||||||
|
* (reads VERSION + package.json + git rev-parse) and runs fine in plan
|
||||||
|
* mode. The plan-ready output serves as the terminal signal — the agent
|
||||||
|
* has done its analysis and produced a plan describing what it would do.
|
||||||
|
*
|
||||||
|
* If the agent decides to bump or push despite the fixture's
|
||||||
|
* ALREADY_BUMPED state, that intent surfaces in the plan or in
|
||||||
|
* tool-call attempts, which we detect.
|
||||||
|
*
|
||||||
|
* Cost: ~$2-4/run. Periodic tier — long, runs weekly.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { describe, test, expect } from 'bun:test';
|
||||||
|
import { spawnSync } from 'child_process';
|
||||||
|
import * as fs from 'fs';
|
||||||
|
import * as path from 'path';
|
||||||
|
import * as os from 'os';
|
||||||
|
import {
|
||||||
|
launchClaudePty,
|
||||||
|
isPermissionDialogVisible,
|
||||||
|
isNumberedOptionListVisible,
|
||||||
|
} from './helpers/claude-pty-runner';
|
||||||
|
|
||||||
|
// Paid suite: runs only when EVALS is set AND the tier is explicitly "periodic";
// otherwise the whole describe block is registered as skipped.
const shouldRun = Boolean(process.env.EVALS) && process.env.EVALS_TIER === 'periodic';
const describeE2E = shouldRun ? describe : describe.skip;
|
||||||
|
|
||||||
|
/** Paths and setup trace of the git fixture produced by buildShippedFixture. */
interface ShipFixture {
  /** Checked-out working tree; sibling of `bareRemote` under one temp root. */
  workTree: string;
  /** Bare repository registered as `origin` for the work tree. */
  bareRemote: string;
  /** Every shell command run during setup, each prefixed with its cwd. */
  setupLog: string[];
}

/**
 * Build a self-contained git fixture representing an already-shipped state:
 *   - main branch at VERSION 0.0.1, with one CHANGELOG entry [0.0.1]
 *   - feat/already-shipped branch at VERSION 0.0.2 (bumped + synced),
 *     CHANGELOG has [0.0.2] entry on top of [0.0.1], one feature commit
 *   - bareRemote is the origin; both branches are pushed
 *
 * The caller owns cleanup of the temp root (`path.dirname(workTree)`) on
 * success. If any setup step fails, the temp root is removed here before
 * the error is rethrown, so a failed setup leaves nothing behind.
 *
 * @returns Work-tree and bare-remote paths plus the setup command log.
 * @throws Error starting with `fixture setup failed at "<cmd>"` when a
 *         command exits non-zero or cannot be spawned.
 */
function buildShippedFixture(): ShipFixture {
  const root = fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-ship-fixture-'));
  const workTree = path.join(root, 'workspace');
  const bareRemote = path.join(root, 'origin.git');
  const setupLog: string[] = [];

  // Run one shell command; a non-zero exit or a spawn failure aborts setup.
  const sh = (cmd: string, cwd: string): void => {
    setupLog.push(`[${cwd}] ${cmd}`);
    const result = spawnSync('bash', ['-c', cmd], { cwd, stdio: 'pipe', timeout: 15_000 });
    if (result.status !== 0) {
      const stderr = result.stderr?.toString() ?? '';
      // result.error is set when the process could not be spawned or timed out;
      // stderr is empty in that case, so surface it explicitly.
      const spawnError = result.error ? `spawn error: ${result.error.message}\n` : '';
      throw new Error(
        `fixture setup failed at "${cmd}":\n${spawnError}${stderr}\n--- log ---\n${setupLog.join('\n')}`,
      );
    }
  };

  // Write VERSION, package.json and CHANGELOG.md for one release state.
  const writeRelease = (version: string, changelog: string): void => {
    fs.writeFileSync(path.join(workTree, 'VERSION'), `${version}\n`);
    fs.writeFileSync(
      path.join(workTree, 'package.json'),
      JSON.stringify({ name: 'fixture', version, private: true }, null, 2) + '\n',
    );
    fs.writeFileSync(path.join(workTree, 'CHANGELOG.md'), changelog);
  };

  try {
    fs.mkdirSync(workTree, { recursive: true });

    // Bare remote.
    sh(`git init --bare "${bareRemote}"`, root);

    // Initial commit on main.
    sh('git init -b main', workTree);
    sh('git config user.email "test@test.com"', workTree);
    sh('git config user.name "Test"', workTree);
    sh('git config commit.gpgsign false', workTree);

    writeRelease('0.0.1', `# Changelog\n\n## [0.0.1] - 2026-01-01\n\n- Initial release\n`);
    fs.writeFileSync(path.join(workTree, 'README.md'), '# Fixture\n');

    sh('git add VERSION package.json CHANGELOG.md README.md', workTree);
    sh('git commit -m "chore: initial release v0.0.1"', workTree);
    sh(`git remote add origin "${bareRemote}"`, workTree);
    sh('git push -u origin main', workTree);

    // Feature branch with ALREADY_BUMPED state.
    sh('git checkout -b feat/already-shipped', workTree);
    writeRelease(
      '0.0.2',
      `# Changelog\n\n## [0.0.2] - 2026-04-25\n\n**Feature shipped.**\n\nAdded the new feature.\n\n## [0.0.1] - 2026-01-01\n\n- Initial release\n`,
    );
    fs.writeFileSync(path.join(workTree, 'feature.md'), '# Feature\n\nAlready shipped.\n');

    sh('git add VERSION package.json CHANGELOG.md feature.md', workTree);
    // The \n escapes become real newlines inside the double-quoted shell string,
    // producing a subject line plus body.
    sh('git commit -m "feat: add new feature\n\nbumps VERSION to 0.0.2"', workTree);
    sh('git push -u origin feat/already-shipped', workTree);

    return { workTree, bareRemote, setupLog };
  } catch (err) {
    // Don't leak the temp root when setup fails part-way; cleanup is best-effort
    // so it never masks the original setup error.
    try { fs.rmSync(root, { recursive: true, force: true }); } catch { /* ignore */ }
    throw err;
  }
}
|
||||||
|
|
||||||
|
/** The fixture state that must be identical before and after a /ship run. */
interface FixtureSnapshot {
  /** Trimmed contents of the VERSION file. */
  versionFile: string;
  /** `version` field read from package.json. */
  packageVersion: string;
  /** Number of `## [0.0.2]` headings in CHANGELOG.md. */
  changelogEntryCount: number;
  /** Number of commit subjects in main..HEAD matching "chore: bump version". */
  bumpCommitCount: number;
  /** Output of `git rev-parse HEAD`, or '' when git produced no stdout. */
  branchHead: string;
}

/**
 * Capture the fixture's version files and git position for later comparison.
 *
 * @param workTree Directory containing VERSION, package.json and CHANGELOG.md.
 * @returns A snapshot of the values described on FixtureSnapshot.
 */
function snapshotFixture(workTree: string): FixtureSnapshot {
  const readText = (name: string): string => fs.readFileSync(path.join(workTree, name), 'utf-8');
  // stdout of a git command run in the work tree; '' when there is none.
  const gitStdout = (args: string[]): string =>
    spawnSync('git', args, { cwd: workTree, stdio: 'pipe' }).stdout?.toString() ?? '';

  const versionFile = readText('VERSION').trim();
  const pkg = JSON.parse(readText('package.json'));

  // Count `## [0.0.2]` headings — should stay at 1 across re-runs.
  const changelogEntryCount = readText('CHANGELOG.md').match(/^##\s*\[0\.0\.2\]/gm)?.length ?? 0;

  const branchHead = gitStdout(['rev-parse', 'HEAD']).trim();

  // Count "chore: bump version" commits on this branch since main.
  let bumpCommitCount = 0;
  for (const subject of gitStdout(['log', '--format=%s', 'main..HEAD']).split('\n')) {
    if (/chore:\s*bump\s+version/i.test(subject)) bumpCommitCount += 1;
  }

  return {
    versionFile,
    packageVersion: pkg.version,
    changelogEntryCount,
    bumpCommitCount,
    branchHead,
  };
}
|
||||||
|
|
||||||
|
describeE2E('/ship idempotency E2E (periodic, real-PTY)', () => {
  test(
    'rerunning /ship on an already-shipped branch detects ALREADY_BUMPED and does not mutate fixture',
    async () => {
      const fixture = buildShippedFixture();
      // Everything after fixture creation sits inside this try so the temp root
      // is removed even if the snapshot or the PTY launch throws.
      try {
        const before = snapshotFixture(fixture.workTree);

        const session = await launchClaudePty({
          permissionMode: 'plan',
          cwd: fixture.workTree,
          timeoutMs: 720_000,
          // Mock token so the agent has no real GitHub credentials to act with.
          env: { GH_TOKEN: 'mock-not-real', NO_COLOR: '1' },
        });

        let outcome: 'detected' | 'plan_ready' | 'attempted_mutation' | 'timeout' | 'exited' = 'timeout';
        let evidence = '';

        try {
          await Bun.sleep(8000);
          const since = session.mark();
          session.send('/ship\r');

          const budgetMs = 600_000;
          const start = Date.now();
          let lastPermSig = '';
          while (Date.now() - start < budgetMs) {
            await Bun.sleep(3000);
            const visible = session.visibleSince(since);
            // Refresh on every poll so a timeout failure still carries the buffer tail.
            evidence = visible.slice(-3000);

            if (session.exited()) {
              outcome = 'exited';
              break;
            }

            // Auto-grant any permission dialogs the preamble triggers
            // (e.g. touch on a marker file claude considers sensitive).
            // Classify on the recent tail; don't double-press the same render.
            const tail = visible.slice(-1500);
            if (isNumberedOptionListVisible(tail) && isPermissionDialogVisible(tail)) {
              const sig = visible.slice(-500);
              if (sig !== lastPermSig) {
                lastPermSig = sig;
                session.send('1\r');
                await Bun.sleep(1500);
                continue;
              }
            }

            // Positive: the idempotency-check echoed ALREADY_BUMPED.
            if (/STATE:\s*ALREADY_BUMPED/.test(visible)) {
              outcome = 'detected';
              break;
            }

            // Negative regressions:
            //   - bump-action bash block ran (would echo on FRESH path)
            //   - agent attempted git commit -m "chore: bump version"
            //   - agent attempted git push
            if (
              /STATE:\s*FRESH(?![\w-])/i.test(visible) ||
              /git\s+commit\s+.*chore:\s*bump\s+version/i.test(visible) ||
              /git\s+push.*origin/i.test(visible)
            ) {
              outcome = 'attempted_mutation';
              break;
            }

            // Plan-ready outcome (acceptable terminal): the agent finished
            // analysis. We'll accept this if no mutation signals showed up.
            if (/ready to execute|Would you like to proceed/i.test(visible)) {
              outcome = 'plan_ready';
              break;
            }
          }
        } finally {
          await session.close();
        }

        // Re-read the fixture after the session is closed.
        const after = snapshotFixture(fixture.workTree);

        if (outcome === 'attempted_mutation') {
          throw new Error(
            `/ship attempted to mutate already-shipped state.\n` +
              `--- evidence (last 3KB) ---\n${evidence}\n` +
              `--- before ---\n${JSON.stringify(before, null, 2)}\n` +
              `--- after ---\n${JSON.stringify(after, null, 2)}`,
          );
        }
        if (outcome === 'exited') {
          throw new Error(`claude exited unexpectedly.\n--- evidence ---\n${evidence}`);
        }
        if (outcome === 'timeout') {
          throw new Error(
            `Timed out before any terminal outcome.\n--- evidence (last 3KB) ---\n${evidence}`,
          );
        }
        // Detected or plan_ready — both are acceptable terminal outcomes.
        expect(['detected', 'plan_ready']).toContain(outcome);
        // Fixture must not have been mutated regardless of outcome. Comparing the
        // snapshots directly makes a failure name the field that changed.
        expect(after).toEqual(before);
      } finally {
        // Clean up fixture root.
        try { fs.rmSync(path.dirname(fixture.workTree), { recursive: true, force: true }); } catch { /* ignore */ }
      }
    },
    900_000, // 15 min wall clock
  );
});
|
||||||
Reference in New Issue
Block a user