feat(test): 3 periodic-tier real-PTY E2E tests

skill-e2e-plan-ceo-mode-routing.test.ts (~$3/run, 6-10 min/case): - Verifies AUQ answer routing: HOLD SCOPE → rigor/bulletproof posture language; SCOPE EXPANSION → expansion/10x/dream language. Each case navigates 8-12 prior AUQs (telemetry, proactive, routing, vendoring, brain, office-hours, premise, approach) before hitting Step 0F. - Periodic, not gate: navigation phase too slow for PR-blocking. V2 expansion to 4 modes (SELECTIVE + REDUCTION) when nav is faster. skill-e2e-ship-idempotency.test.ts (~$3/run, 5-10 min): - Builds a real git fixture with VERSION 0.0.2 already bumped, matching package.json, CHANGELOG entry, pushed to a local bare remote. Runs /ship in plan mode and asserts STATE: ALREADY_BUMPED echoes from the Step 12 idempotency check, OR plan_ready terminates without mutation. - Snapshots VERSION + package.json + CHANGELOG entry count + commit count + branch HEAD before/after; fails if any changed. skill-e2e-autoplan-chain.test.ts (~$8/run, 12-18 min): - Asserts /autoplan phases run sequentially: tees timestamps as each "**Phase N complete.**" marker first appears. Phase 1 (CEO) must precede Phase 3 (Eng); Phase 2 (Design) is optional but if it appears, must sit between 1 and 3. - Auto-grants permission dialogs that fire during phase transitions. All three auto-handle permission dialogs (preamble side-effects on fresh user envs without .feature-prompted-* markers). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 18:32:28 +08:00 · 2026-04-26 04:36:51 -07:00
parent 2b1a0da7c1
commit e6fd776a37
3 changed files with 651 additions and 0 deletions
--- a/test/skill-e2e-autoplan-chain.test.ts
+++ b/test/skill-e2e-autoplan-chain.test.ts
@@ -0,0 +1,176 @@
 /**
 * /autoplan cross-skill chain (periodic, paid, real-PTY).
 *
 * Asserts: when /autoplan runs against a plan fixture, the phase markers
 * the autoplan template emits appear in the correct order:
 *
 *   "**Phase 1 complete.**" (CEO)        →
 *   "**Phase 2 complete.**" (Design — only if UI scope detected) →
 *   "**Phase 3 complete.**" (Eng)        →
 *   "**Phase 3.5 complete.**" (DX — optional, skipped if no DX scope)
 *
 * Why this exists: each individual phase has its own plan-mode smoke
 * test. Nothing verifies the SEQUENCING — that phases don't run in
 * parallel, that Phase 3 doesn't start before Phase 1 ends, that
 * conditional phases (Design, DX) are skipped when their scope is absent.
 * A regression where the autoplan template wires phases concurrently
 * would not be caught by per-phase tests.
 *
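 * Approach: record each "**Phase N complete.**" marker, with a timestamp,
 * the first time it appears in the visible buffer, then assert the observed
 * ordering. Phase 2 is optional: the UI-heavy fixture used here should make
 * it run (a backend-only fixture should make it skip), but this test only
 * checks Phase 2's position when its marker actually appears.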
 *
 * Cost: ~$5-8/run, 10-15 min wall clock. Periodic — runs weekly.
 */
 import { describe, test, expect } from 'bun:test';
 import { spawnSync } from 'child_process';
 import * as fs from 'fs';
 import * as path from 'path';
 import * as os from 'os';
 import {
  launchClaudePty,
  isPlanReadyVisible,
  isPermissionDialogVisible,
  isNumberedOptionListVisible,
 } from './helpers/claude-pty-runner';
 // Opt-in gate: the suite only runs when EVALS is set to a non-empty value
 // and EVALS_TIER is exactly 'periodic'; otherwise the whole describe is skipped.
 const shouldRun = Boolean(process.env.EVALS) && process.env.EVALS_TIER === 'periodic';
 const describeE2E = shouldRun ? describe : describe.skip;
 // Parent of this file's directory, and the plan fixture copied into the temp repo.
 const ROOT = path.resolve(import.meta.dir, '..');
 const UI_FIXTURE = path.join(ROOT, 'test', 'fixtures', 'plans', 'ui-heavy-feature.md');
 /** One observed "**Phase N complete.**" marker. */
 interface PhaseHit {
  /** Phase number parsed from the marker text (e.g. 1, 2, 3, 3.5). */
  phase: number;
  /** `Date.now()` taken when the marker was first recorded. */
  ts: number;
 }
 describeE2E('/autoplan chain ordering (periodic)', () => {
  test(
    'phases run sequentially: Phase 1 (CEO) before Phase 3 (Eng), Phase 2 (Design) between when present',
    async () => {
      // UI-heavy fixture so Phase 2 runs.
      const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-autoplan-chain-'));
      try {
        // Fail fast when fixture setup breaks instead of launching a paid
        // session against a half-built repo.
        const gitRun = (args: string[]): void => {
          const res = spawnSync('git', args, { cwd: tempDir, stdio: 'pipe', timeout: 5000 });
          if (res.status !== 0) {
            const detail = res.error?.message ?? res.stderr?.toString() ?? '';
            throw new Error(`fixture setup failed at "git ${args.join(' ')}":\n${detail}`);
          }
        };
        gitRun(['init', '-b', 'main']);
        gitRun(['config', 'user.email', 'test@test.com']);
        gitRun(['config', 'user.name', 'Test']);
        // A globally configured commit-signing requirement would otherwise
        // make the fixture commit fail.
        gitRun(['config', 'commit.gpgsign', 'false']);
        const plansDir = path.join(tempDir, '.claude', 'plans');
        fs.mkdirSync(plansDir, { recursive: true });
        fs.copyFileSync(UI_FIXTURE, path.join(plansDir, 'ui-heavy-feature.md'));
        fs.writeFileSync(path.join(tempDir, 'README.md'), '# Autoplan chain fixture\n');
        gitRun(['add', '.']);
        gitRun(['commit', '-m', 'init UI-heavy fixture']);
        const session = await launchClaudePty({
          permissionMode: 'plan',
          cwd: tempDir,
          timeoutMs: 1_080_000, // 18 min, slightly above test budget
        });
        // Appended in first-appearance order: across polls that is time
        // order, within one poll it is buffer order (the scan runs left to
        // right). `ts` is kept for diagnostics only.
        const hits: PhaseHit[] = [];
        let outcome: 'chain_complete' | 'plan_ready' | 'timeout' | 'exited' = 'timeout';
        let evidence = '';
        try {
          await Bun.sleep(8000);
          const since = session.mark();
          session.send('/autoplan\r');
          const budgetMs = 900_000; // 15 min
          const start = Date.now();
          // Matches "**Phase N complete.**" (trailing period optional), where
          // N is an integer or a decimal such as 3.5.
          const phasePattern = /\*\*Phase\s+(\d+(?:\.\d+)?)\s+complete\.?\*\*/g;
          let lastPermSig = '';
          while (Date.now() - start < budgetMs) {
            await Bun.sleep(5000);
            if (session.exited()) {
              outcome = 'exited';
              evidence = session.visibleSince(since).slice(-3000);
              break;
            }
            const visible = session.visibleSince(since);
            // Auto-grant any permission dialog so autoplan can keep moving
            // through its phases. The autoplan template auto-decides AUQs
            // it owns; only permission prompts (file/tool grants) need our
            // hand-pressing. Classify on tail to avoid stale matches.
            const recentTail = visible.slice(-1500);
            if (isNumberedOptionListVisible(recentTail) && isPermissionDialogVisible(recentTail)) {
              const sig = visible.slice(-500);
              if (sig !== lastPermSig) {
                lastPermSig = sig;
                session.send('1\r');
                await Bun.sleep(2000);
                continue;
              }
            }
            // Re-scan the whole buffer and record any phase not yet seen.
            // matchAll works on a copy of the regex, so no lastIndex reset
            // is needed between polls.
            for (const m of visible.matchAll(phasePattern)) {
              const phaseNum = parseFloat(m[1] ?? '0');
              if (Number.isNaN(phaseNum)) continue;
              if (hits.some(h => h.phase === phaseNum)) continue;
              hits.push({ phase: phaseNum, ts: Date.now() });
            }
            // Terminal: Phase 3 (Eng) seen — chain reached the required end.
            if (hits.some(h => h.phase === 3)) {
              outcome = 'chain_complete';
              evidence = visible.slice(-3000);
              break;
            }
            // Plan-ready as a fallback terminal — autoplan finished without
            // surfacing a Phase 3 marker. This is a regression surface.
            if (isPlanReadyVisible(visible)) {
              outcome = 'plan_ready';
              evidence = visible.slice(-3000);
              break;
            }
          }
          // The loop only fills `evidence` when it breaks; capture the tail
          // here so a timeout failure is not reported with an empty buffer.
          if (outcome === 'timeout') {
            evidence = session.visibleSince(since).slice(-3000);
          }
        } finally {
          await session.close();
        }
        if (outcome === 'exited' || outcome === 'timeout') {
          throw new Error(
            `autoplan chain test FAILED: outcome=${outcome}, hits=${JSON.stringify(hits)}\n` +
              `--- evidence (last 3KB) ---\n${evidence}`,
          );
        }
        // Phase 1 (CEO) and Phase 3 (Eng) MUST both have been seen; this also
        // fails the plan_ready fallback, which by definition lacks Phase 3.
        const seen = hits.map(h => h.phase);
        if (!seen.includes(1) || !seen.includes(3)) {
          throw new Error(
            `Required phase markers missing. Saw: ${JSON.stringify(hits)}\n` +
              `--- evidence ---\n${evidence}`,
          );
        }
        // Sequencing: CEO before Eng, with Design (if observed) in between.
        // Compare discovery order rather than millisecond timestamps: two
        // markers first seen in the same poll can share a Date.now() value,
        // which would fail a strict less-than check on a correct run.
        const hasDesign = seen.includes(2);
        const coreOrder = seen.filter(p => p === 1 || p === 2 || p === 3);
        expect(coreOrder).toEqual(hasDesign ? [1, 2, 3] : [1, 3]);
      } finally {
        try { fs.rmSync(tempDir, { recursive: true, force: true }); } catch { /* ignore */ }
      }
    },
    1_200_000, // 20 min absolute test ceiling
  );
 });
--- a/test/skill-e2e-plan-ceo-mode-routing.test.ts
+++ b/test/skill-e2e-plan-ceo-mode-routing.test.ts
@@ -0,0 +1,204 @@
 /**
 * /plan-ceo-review mode-routing E2E (periodic, paid, real-PTY).
 *
 * Asserts: when /plan-ceo-review reaches its Step 0F mode-selection
 * AskUserQuestion and the user picks HOLD SCOPE or SCOPE EXPANSION,
 * the downstream rendered output reflects that mode's distinctive
 * posture language.
 *
 * Why this exists: existing tests verify that the question fires. Nothing
 * verifies the answer actually routes. A regression where Step 0F shows
 * the question but the agent ignores the choice (e.g. always defaults
 * to EXPANSION) would not be caught by any prior test.
 *
 * Tier: periodic (not gate). Each run navigates 8-12 prior AUQs (telemetry,
 * proactive, routing, vendoring, brain, office-hours, premise×3, approach)
 * before reaching Step 0F. At ~30s per AUQ that's a 4-6 min navigation
 * phase per case. The full 2-case suite runs ~12-15 min, $3-4. Too slow
 * for gate-tier; weekly is fine.
 *
 * Mode coverage: HOLD SCOPE + SCOPE EXPANSION cover the two posture poles
 * (rigor vs ambition). SELECTIVE EXPANSION and SCOPE REDUCTION are V2 once
 * the navigation phase is shorter or has a deterministic fast-path through
 * Step 0A/0C-bis.
 *
 * Posture assertions: each mode has distinct downstream language. The
 * checks below are deliberately permissive — they catch the binary
 * "did the mode posture even apply" question, not Opus-specific phrasing.
 *
 *   HOLD SCOPE        — "rigor", "bulletproof", "hold scope", or "maximum rigor"
 *   SCOPE EXPANSION   — "expansion", "10x", "delight", "dream", "cathedral", or "opt-in"
 */
 import { describe, test } from 'bun:test';
 import {
  launchClaudePty,
  isNumberedOptionListVisible,
  isPermissionDialogVisible,
  parseNumberedOptions,
  isPlanReadyVisible,
  type ClaudePtySession,
 } from './helpers/claude-pty-runner';
 // Opt-in gate: run only when EVALS is set to a non-empty value and
 // EVALS_TIER is exactly 'periodic'; otherwise the suite is registered as skipped.
 const shouldRun = Boolean(process.env.EVALS) && process.env.EVALS_TIER === 'periodic';
 const describeE2E = shouldRun ? describe : describe.skip;
 /** Matches any of the four mode names, case-insensitively, inside an option label. */
 const MODE_RE = /HOLD SCOPE|SCOPE EXPANSION|SELECTIVE EXPANSION|SCOPE REDUCTION/i;
 /** One routing scenario: the mode to pick and the language expected afterwards. */
 interface ModeCase {
  /** Mode name; compared against upper-cased option labels. */
  mode: 'HOLD SCOPE' | 'SCOPE EXPANSION';
  /** Regex applied to visible-since-mode-pick text. At least one alternative must match. */
  postureRe: RegExp;
 }
 // Two cases covering the two modes this suite exercises.
 const CASES: ModeCase[] = [
  {
    mode: 'HOLD SCOPE',
    postureRe: /\b(rigor|bulletproof|hold\s*scope|maximum\s+rigor)\b/i,
  },
  {
    mode: 'SCOPE EXPANSION',
    postureRe: /\b(expansion|10x|delight|dream|cathedral|opt[\s-]?in)\b/i,
  },
 ];
 /**
 * Walk through the prompts that precede the mode selection, pressing
 * option 1 on each, until a numbered list appears in which at least one
 * label names one of the four modes.
 *
 * @param session     Live PTY session that is polled and sent keystrokes.
 * @param since       Marker handed to `session.visibleSince` on every poll.
 * @param targetMode  Mode whose option index should be returned.
 * @param opts        `maxNav`: how many non-mode, non-permission prompts may
 *                    be answered (default 12). `budgetMs`: wall-clock budget
 *                    for the polling loop (default 420_000).
 * @returns the index of the option whose upper-cased label contains
 *          `targetMode`, plus the visible text in which that list was seen.
 * @throws if the session exits, the mode list lacks `targetMode`, more than
 *         `maxNav` prior prompts had to be answered, or the budget runs out.
 */
 async function navigateToModeAuq(
  session: ClaudePtySession,
  since: number,
  targetMode: ModeCase['mode'],
  opts: { maxNav?: number; budgetMs?: number } = {},
 ): Promise<{ modeIndex: number; visibleAtMode: string }> {
  // The mode AUQ (Step 0F) sits behind preamble and Step 0A-0C-bis gates:
  // telemetry, proactive, routing, vendoring, brain privacy, office-hours
  // offer, premise challenge (3 questions), approach selection. Twelve hops
  // is the conservative ceiling.
  const navLimit = opts.maxNav ?? 12;
  const timeLimitMs = opts.budgetMs ?? 420_000;
  type Choice = { index: number; label: string };
  const signatureOf = (list: Choice[]): string =>
    list.map(o => `${o.index}:${o.label}`).join('|');
  const renderList = (list: Choice[]): string =>
    list.map(o => `  ${o.index}. ${o.label}`).join('\n');
  const deadline = Date.now() + timeLimitMs;
  let answered = 0;
  // Signature of the last list acted upon; empty until a list is seen.
  let previousSignature = '';
  while (Date.now() < deadline) {
    if (session.exited()) {
      throw new Error(
        `claude exited (code=${session.exitCode()}) during nav.\n` +
        `Last visible:\n${session.visibleSince(since).slice(-2000)}`,
      );
    }
    await Bun.sleep(2000);
    const screen = session.visibleSince(since);
    if (!isNumberedOptionListVisible(screen)) continue;
    const choices = parseNumberedOptions(screen);
    if (choices.length < 2) continue;
    // An unchanged list means the same prompt is still showing; pressing
    // again would double-answer it.
    const signature = signatureOf(choices);
    if (signature === previousSignature) continue;
    previousSignature = signature;
    // Mode prompt reached: resolve the requested mode or fail loudly.
    if (choices.some(o => MODE_RE.test(o.label))) {
      const wanted = choices.find(o => o.label.toUpperCase().includes(targetMode));
      if (wanted) {
        return { modeIndex: wanted.index, visibleAtMode: screen };
      }
      throw new Error(
        `Mode AUQ rendered but target "${targetMode}" not in option labels:\n` +
        renderList(choices),
      );
    }
    // Permission dialogs are granted with "1" and do not count toward the
    // hop limit. Only the recent tail is classified, because old permission
    // text stays in the buffer and would otherwise match on every poll.
    if (isPermissionDialogVisible(screen.slice(-1500))) {
      session.send('1\r');
      await Bun.sleep(1500);
      continue;
    }
    // Any other prompt: take option 1 (recommended), within the hop limit.
    if (answered >= navLimit) {
      throw new Error(
        `Navigated ${navLimit} prior AUQs without reaching the mode AUQ. ` +
        `Last list:\n${renderList(choices)}`,
      );
    }
    answered += 1;
    session.send('1\r');
    // Let the agent advance before the next poll.
    await Bun.sleep(2000);
  }
  throw new Error(`Mode AUQ not reached within ${timeLimitMs}ms`);
 }
 describeE2E('/plan-ceo-review mode routing (periodic)', () => {
  // Per-case phase budgets. The session timeout and the test ceiling are
  // derived from their sum so that the descriptive failures thrown below
  // (nav budget exhausted, no posture match) fire before a bare timeout
  // does. Worst case: 8 s boot + 420 s nav + 240 s posture wait = 668 s.
  const bootWaitMs = 8_000;
  const navBudgetMs = 420_000;
  const postureBudgetMs = 240_000;
  const worstCaseMs = bootWaitMs + navBudgetMs + postureBudgetMs;
  // NOTE(review): timeoutMs is presumably the PTY session's own hard limit —
  // confirm against the runner helper.
  const sessionTimeoutMs = worstCaseMs + 30_000;
  const testTimeoutMs = worstCaseMs + 60_000;
  for (const c of CASES) {
    test(
      `mode "${c.mode}" routes to its distinctive posture`,
      async () => {
        const session = await launchClaudePty({
          permissionMode: 'plan',
          timeoutMs: sessionTimeoutMs,
        });
        try {
          await Bun.sleep(bootWaitMs);
          const since = session.mark();
          session.send('/plan-ceo-review\r');
          const { modeIndex } = await navigateToModeAuq(session, since, c.mode, {
            budgetMs: navBudgetMs,
          });
          // Mark the buffer at mode-pick time, then send the index. mark()
          // is used (as for `since`) so the offset is in whatever unit
          // visibleSince() expects.
          const sincePick = session.mark();
          session.send(`${modeIndex}\r`);
          // Poll until a posture-distinctive substring shows up or the
          // budget runs out. Plan-ready or a follow-up AUQ alone does not
          // end the wait: posture text may arrive as the agent finishes
          // writing the plan.
          // NOTE(review): postureRe also accepts the mode's own name
          // ("hold scope" / "expansion"); if the selected option label is
          // echoed after the pick this matches trivially — confirm.
          const start = Date.now();
          let postureMatched = false;
          let downstreamSnapshot = '';
          while (Date.now() - start < postureBudgetMs) {
            await Bun.sleep(2500);
            if (session.exited()) {
              throw new Error(
                `claude exited (code=${session.exitCode()}) after mode pick.\n` +
                `Downstream:\n${session.visibleSince(sincePick).slice(-2000)}`,
              );
            }
            downstreamSnapshot = session.visibleSince(sincePick);
            if (c.postureRe.test(downstreamSnapshot)) {
              postureMatched = true;
              break;
            }
          }
          if (!postureMatched) {
            throw new Error(
              `Mode "${c.mode}" routing FAILED: no posture match for ${c.postureRe.source}.\n` +
              `--- downstream visible since mode pick (last 3KB) ---\n` +
              downstreamSnapshot.slice(-3000),
            );
          }
        } finally {
          await session.close();
        }
      },
      testTimeoutMs,
    );
  }
 });
--- a/test/skill-e2e-ship-idempotency.test.ts
+++ b/test/skill-e2e-ship-idempotency.test.ts
@@ -0,0 +1,271 @@
 /**
 * /ship idempotency E2E (periodic, paid, real-PTY).
 *
 * Asserts: when /ship runs against a branch that has ALREADY been bumped
 * (VERSION ahead of base AND package.json synced AND a CHANGELOG entry
 * exists for the bumped version), the workflow:
 *
 *   1. Detects ALREADY_BUMPED state via the Step 12 idempotency check
 *   2. Does NOT echo STATE: FRESH (which would trigger a second bump)
 *   3. Does NOT mutate the fixture's VERSION file
 *   4. Does NOT append a duplicate CHANGELOG [0.0.2] entry
 *   5. Does NOT create a new "chore: bump version" commit
 *
 * Why real-PTY: the existing ship-idempotency test in skill-e2e.test.ts
 * uses the SDK harness with a synthetic prompt asking the agent to "run
 * ONLY the idempotency checks." This test exercises the actual /ship
 * skill end-to-end against a real git fixture so a regression that
 * silently re-bumps despite the check passing would be caught.
 *
 * Plan-mode framing: we run /ship in plan mode so the agent cannot push,
 * commit, or open PRs. The Step 12 idempotency check is read-only
 * (reads VERSION + package.json + git rev-parse) and runs fine in plan
 * mode. The plan-ready output serves as the terminal signal — the agent
 * has done its analysis and produced a plan describing what it would do.
 *
 * If the agent decides to bump or push despite the fixture's
 * ALREADY_BUMPED state, that intent surfaces in the plan or in
 * tool-call attempts, which we detect.
 *
 * Cost: ~$2-4/run. Periodic tier — long, runs weekly.
 */
 import { describe, test, expect } from 'bun:test';
 import { spawnSync } from 'child_process';
 import * as fs from 'fs';
 import * as path from 'path';
 import * as os from 'os';
 import {
  launchClaudePty,
  isPermissionDialogVisible,
  isNumberedOptionListVisible,
 } from './helpers/claude-pty-runner';
 // Opt-in gate: run only when EVALS is set to a non-empty value and
 // EVALS_TIER is exactly 'periodic'; otherwise the suite is registered as skipped.
 const shouldRun = Boolean(process.env.EVALS) && process.env.EVALS_TIER === 'periodic';
 const describeE2E = shouldRun ? describe : describe.skip;
 /** Paths and setup log of the throwaway git fixture. */
 interface ShipFixture {
  /** Working tree that /ship is run in. */
  workTree: string;
  /** Bare repository registered as `origin` of the working tree. */
  bareRemote: string;
  /** Full bash log of `git` and helper commands run during setup. */
  setupLog: string[];
 }
 /**
 * Build a self-contained git fixture representing an already-shipped state:
 *   - main branch at VERSION 0.0.1, with one CHANGELOG entry [0.0.1]
 *   - feat/already-shipped branch at VERSION 0.0.2 (bumped + synced),
 *     CHANGELOG has [0.0.2] entry on top of [0.0.1], one feature commit
 *   - bareRemote is the origin; both branches are pushed
 *
 * Both directories live under one fresh temp root (the parent of `workTree`).
 * The caller removes that root after a successful build; if any setup step
 * fails, the root is removed here before the error is rethrown.
 *
 * @returns the work-tree dir for /ship to operate on, the bare remote path,
 *          and the log of setup commands.
 * @throws if any setup command exits non-zero or cannot be spawned.
 */
 function buildShippedFixture(): ShipFixture {
  const root = fs.mkdtempSync(path.join(os.tmpdir(), 'gstack-ship-fixture-'));
  const workTree = path.join(root, 'workspace');
  const bareRemote = path.join(root, 'origin.git');
  const setupLog: string[] = [];
  // Run one shell command, logging it first; throw with stderr (and the
  // spawn-level error, e.g. ENOENT or a timeout) plus the log so far.
  const sh = (cmd: string, cwd: string): void => {
    setupLog.push(`[${cwd}] ${cmd}`);
    const result = spawnSync('bash', ['-c', cmd], { cwd, stdio: 'pipe', timeout: 15_000 });
    if (result.status !== 0) {
      const stderr = result.stderr?.toString() ?? '';
      const spawnError = result.error ? `\nspawn error: ${result.error.message}` : '';
      throw new Error(
        `fixture setup failed at "${cmd}":\n${stderr}${spawnError}\n--- log ---\n${setupLog.join('\n')}`,
      );
    }
  };
  try {
    fs.mkdirSync(workTree, { recursive: true });
    // Bare remote.
    sh(`git init --bare "${bareRemote}"`, root);
    // Initial commit on main.
    sh('git init -b main', workTree);
    sh('git config user.email "test@test.com"', workTree);
    sh('git config user.name "Test"', workTree);
    sh('git config commit.gpgsign false', workTree);
    fs.writeFileSync(path.join(workTree, 'VERSION'), '0.0.1\n');
    fs.writeFileSync(
      path.join(workTree, 'package.json'),
      JSON.stringify({ name: 'fixture', version: '0.0.1', private: true }, null, 2) + '\n',
    );
    fs.writeFileSync(
      path.join(workTree, 'CHANGELOG.md'),
      `# Changelog\n\n## [0.0.1] - 2026-01-01\n\n- Initial release\n`,
    );
    fs.writeFileSync(path.join(workTree, 'README.md'), '# Fixture\n');
    sh('git add VERSION package.json CHANGELOG.md README.md', workTree);
    sh('git commit -m "chore: initial release v0.0.1"', workTree);
    sh(`git remote add origin "${bareRemote}"`, workTree);
    sh('git push -u origin main', workTree);
    // Feature branch with ALREADY_BUMPED state.
    sh('git checkout -b feat/already-shipped', workTree);
    fs.writeFileSync(path.join(workTree, 'VERSION'), '0.0.2\n');
    fs.writeFileSync(
      path.join(workTree, 'package.json'),
      JSON.stringify({ name: 'fixture', version: '0.0.2', private: true }, null, 2) + '\n',
    );
    fs.writeFileSync(
      path.join(workTree, 'CHANGELOG.md'),
      `# Changelog\n\n## [0.0.2] - 2026-04-25\n\n**Feature shipped.**\n\nAdded the new feature.\n\n## [0.0.1] - 2026-01-01\n\n- Initial release\n`,
    );
    fs.writeFileSync(path.join(workTree, 'feature.md'), '# Feature\n\nAlready shipped.\n');
    sh('git add VERSION package.json CHANGELOG.md feature.md', workTree);
    // The \n escapes are real newlines inside the double-quoted shell
    // string, giving a subject line plus a body.
    sh('git commit -m "feat: add new feature\n\nbumps VERSION to 0.0.2"', workTree);
    sh('git push -u origin feat/already-shipped', workTree);
    return { workTree, bareRemote, setupLog };
  } catch (err: unknown) {
    // No caller holds the path yet, so a failed build must clean up itself.
    try { fs.rmSync(root, { recursive: true, force: true }); } catch { /* best effort */ }
    throw err;
  }
 }
 /** The fixture state compared before and after the run. */
 interface FixtureSnapshot {
  /** Trimmed contents of the VERSION file. */
  versionFile: string;
  /** `version` field of package.json. */
  packageVersion: string;
  /** Number of `## [0.0.2]` headings in CHANGELOG.md. */
  changelogEntryCount: number;
  /** Commits in main..HEAD whose subject matches "chore: bump version". */
  bumpCommitCount: number;
  /** Output of `git rev-parse HEAD`, or '' when git produced no output. */
  branchHead: string;
 }
 /**
 * Read the version files and git state of `workTree` into a snapshot.
 * Missing files throw; git output that cannot be read is treated as empty.
 */
 function snapshotFixture(workTree: string): FixtureSnapshot {
  const readText = (name: string): string =>
    fs.readFileSync(path.join(workTree, name), 'utf-8');
  const gitOut = (args: string[]): string =>
    spawnSync('git', args, { cwd: workTree, stdio: 'pipe' }).stdout?.toString() ?? '';

  const versionFile = readText('VERSION').trim();
  const manifest: { version: string } = JSON.parse(readText('package.json'));
  // Should stay at 1 across re-runs.
  const changelogEntryCount = readText('CHANGELOG.md').match(/^##\s*\[0\.0\.2\]/gm)?.length ?? 0;
  const branchHead = gitOut(['rev-parse', 'HEAD']).trim();
  // Tally bump commits on this branch since main.
  let bumpCommitCount = 0;
  for (const subject of gitOut(['log', '--format=%s', 'main..HEAD']).split('\n')) {
    if (/chore:\s*bump\s+version/i.test(subject)) bumpCommitCount += 1;
  }
  return {
    versionFile,
    packageVersion: manifest.version,
    changelogEntryCount,
    bumpCommitCount,
    branchHead,
  };
 }
 describeE2E('/ship idempotency E2E (periodic, real-PTY)', () => {
  test(
    'rerunning /ship on an already-shipped branch detects ALREADY_BUMPED and does not mutate fixture',
    async () => {
      const fixture = buildShippedFixture();
      // Everything after the fixture exists runs under this try so the temp
      // root is removed even when the launch or a snapshot throws.
      try {
        const before = snapshotFixture(fixture.workTree);
        const session = await launchClaudePty({
          permissionMode: 'plan',
          cwd: fixture.workTree,
          timeoutMs: 720_000,
          // Disable network-y pieces so the agent can't reach actual github.
          env: { GH_TOKEN: 'mock-not-real', NO_COLOR: '1' },
        });
        let outcome: 'detected' | 'plan_ready' | 'attempted_mutation' | 'timeout' | 'exited' = 'timeout';
        let evidence = '';
        try {
          await Bun.sleep(8000);
          const since = session.mark();
          session.send('/ship\r');
          const budgetMs = 600_000;
          const start = Date.now();
          let lastPermSig = '';
          while (Date.now() - start < budgetMs) {
            await Bun.sleep(3000);
            if (session.exited()) {
              outcome = 'exited';
              evidence = session.visibleSince(since).slice(-3000);
              break;
            }
            const visible = session.visibleSince(since);
            // Auto-grant any permission dialogs the preamble triggers
            // (e.g. touch on a marker file claude considers sensitive).
            // Classify on the recent tail; don't double-press the same render.
            const tail = visible.slice(-1500);
            if (isNumberedOptionListVisible(tail) && isPermissionDialogVisible(tail)) {
              const sig = visible.slice(-500);
              if (sig !== lastPermSig) {
                lastPermSig = sig;
                session.send('1\r');
                await Bun.sleep(1500);
                continue;
              }
            }
            // Positive: the idempotency-check echoed ALREADY_BUMPED.
            if (/STATE:\s*ALREADY_BUMPED/.test(visible)) {
              outcome = 'detected';
              evidence = visible.slice(-3000);
              break;
            }
            // Negative regressions detected from the visible text:
            //   - "STATE: FRESH" echoed (the path that leads to a second bump)
            //   - a git commit with a "chore: bump version" message
            //   - a git push to origin
            // Edits to VERSION / CHANGELOG.md are not matched here; the
            // before/after snapshot comparison below covers them.
            if (
              /STATE:\s*FRESH(?![\w-])/i.test(visible) ||
              /git\s+commit\s+.*chore:\s*bump\s+version/i.test(visible) ||
              /git\s+push.*origin/i.test(visible)
            ) {
              outcome = 'attempted_mutation';
              evidence = visible.slice(-3000);
              break;
            }
            // Plan-ready outcome (acceptable terminal): the agent finished
            // analysis. We'll accept this if no mutation signals showed up.
            if (/ready to execute|Would you like to proceed/i.test(visible)) {
              outcome = 'plan_ready';
              evidence = visible.slice(-3000);
              break;
            }
          }
          // The loop only fills `evidence` when it breaks; capture the tail
          // here so a timeout failure is not reported with an empty buffer.
          if (outcome === 'timeout') {
            evidence = session.visibleSince(since).slice(-3000);
          }
        } finally {
          await session.close();
        }
        // Re-read the fixture after the session is closed.
        const after = snapshotFixture(fixture.workTree);
        if (outcome === 'attempted_mutation') {
          throw new Error(
            `/ship attempted to mutate already-shipped state.\n` +
              `--- evidence (last 3KB) ---\n${evidence}\n` +
              `--- before ---\n${JSON.stringify(before, null, 2)}\n` +
              `--- after  ---\n${JSON.stringify(after, null, 2)}`,
          );
        }
        if (outcome === 'exited') {
          throw new Error(`claude exited unexpectedly.\n--- evidence ---\n${evidence}`);
        }
        if (outcome === 'timeout') {
          throw new Error(
            `Timed out before any terminal outcome.\n--- evidence (last 3KB) ---\n${evidence}`,
          );
        }
        // Detected or plan_ready — both are acceptable terminal outcomes.
        expect(['detected', 'plan_ready']).toContain(outcome);
        // Fixture must not have been mutated. Comparing the snapshots
        // directly checks the same five fields and prints which one changed.
        expect(after).toEqual(before);
      } finally {
        // Clean up fixture root.
        try { fs.rmSync(path.dirname(fixture.workTree), { recursive: true, force: true }); } catch { /* ignore */ }
      }
    },
    900_000, // 15 min wall clock
  );
 });