test: rewrite 5 plan-mode E2E tests on the real-PTY harness

Replaces SDK-based assertions with runPlanSkillObservation contract. Each
test launches real claude --permission-mode plan, invokes the skill, and
asserts the outcome reaches 'asked' or 'plan_ready' within a 300s budget
(no silent Write/Edit, no crash, no timeout).

Affected:
- test/skill-e2e-plan-ceo-plan-mode.test.ts
- test/skill-e2e-plan-eng-plan-mode.test.ts
- test/skill-e2e-plan-design-plan-mode.test.ts
- test/skill-e2e-plan-devex-plan-mode.test.ts
- test/skill-e2e-plan-mode-no-op.test.ts (inPlanMode: false; tests the
  preamble plan-mode-info no-op path)

test/e2e-harness-audit.test.ts — recognize runPlanSkillObservation as a
valid coverage path alongside the legacy canUseTool / runPlanModeSkillTest.

test/helpers/touchfiles.ts — point the 5 plan-mode test selections and
the e2e-harness-audit selection at test/helpers/claude-pty-runner.ts
instead of the deleted plan-mode-helpers.ts.

Proof: bun test EVALS=1 EVALS_TIER=gate on these 5 files runs sequentially
in 790s and passes 5/5. Same tests were 0/5 on origin/main, on v1.0.0.0,
and on this branch with the SDK harness.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-04-25 21:23:54 -07:00
parent 1b1fd30ec7
commit 38f31e3b1d
7 changed files with 140 additions and 113 deletions

View File

@@ -84,14 +84,15 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
// Plan-mode smoke tests — gate-tier safety regression tests. Each fires when
// any of: the interactive skill's template, the plan-mode resolver
// (completion-status now owns generatePlanModeInfo), preamble composition,
// the Agent SDK harness, or the shared plan-mode-helpers change.
'plan-ceo-review-plan-mode': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/agent-sdk-runner.ts', 'test/helpers/plan-mode-helpers.ts'],
'plan-eng-review-plan-mode': ['plan-eng-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/agent-sdk-runner.ts', 'test/helpers/plan-mode-helpers.ts'],
'plan-design-review-plan-mode': ['plan-design-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/agent-sdk-runner.ts', 'test/helpers/plan-mode-helpers.ts'],
'plan-devex-review-plan-mode': ['plan-devex-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/agent-sdk-runner.ts', 'test/helpers/plan-mode-helpers.ts'],
'plan-mode-no-op': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/agent-sdk-runner.ts', 'test/helpers/plan-mode-helpers.ts'],
'e2e-harness-audit': ['plan-ceo-review/**', 'plan-eng-review/**', 'plan-design-review/**', 'plan-devex-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'test/helpers/agent-sdk-runner.ts', 'test/helpers/plan-mode-helpers.ts'],
// (completion-status owns generatePlanModeInfo), preamble composition, or
// the real-PTY runner (which the tests now use instead of the SDK harness)
// change.
'plan-ceo-review-plan-mode': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
'plan-eng-review-plan-mode': ['plan-eng-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
'plan-design-review-plan-mode': ['plan-design-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
'plan-devex-review-plan-mode': ['plan-devex-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
'plan-mode-no-op': ['plan-ceo-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/preamble.ts', 'test/helpers/claude-pty-runner.ts'],
'e2e-harness-audit': ['plan-ceo-review/**', 'plan-eng-review/**', 'plan-design-review/**', 'plan-devex-review/**', 'scripts/resolvers/preamble/generate-completion-status.ts', 'test/helpers/agent-sdk-runner.ts', 'test/helpers/claude-pty-runner.ts'],
'brain-privacy-gate': ['scripts/resolvers/preamble/generate-brain-sync-block.ts', 'scripts/resolvers/preamble.ts', 'bin/gstack-brain-sync', 'bin/gstack-brain-init', 'bin/gstack-config', 'test/helpers/agent-sdk-runner.ts'],
// AskUserQuestion format regression (RECOMMENDATION + Completeness: N/10)