diff --git a/test/fixtures/forcing-finding-seeds.ts b/test/fixtures/forcing-finding-seeds.ts new file mode 100644 index 00000000..fec88793 --- /dev/null +++ b/test/fixtures/forcing-finding-seeds.ts @@ -0,0 +1,83 @@ +/** + * Per-skill draft-plan seeds engineered to surface at least one + * review-phase finding in the corresponding plan-* review skill. + * + * Used by gate-tier finding-floor tests + * (test/skill-e2e-plan-{eng,ceo,design,devex}-finding-floor.test.ts) as + * the minimum-cost regression for the May 2026 transcript bug: + * "/plan-eng-review reviewed a real PR diff, wrote a multi-section + * review plan to ~/.claude/plans/ and called ExitPlanMode without + * ever firing AskUserQuestion." + * + * Each seed is small and pre-loaded with one obvious finding the + * matching skill cannot honestly miss. Floor tests assert + * `outcome === 'auq_observed'` — i.e., the model fired at least one review-phase + * AUQ before reaching plan_ready / completion_summary / ceiling. + * + * Each seed includes the standard "write your plan-mode plan to /tmp/…" + * preamble that the existing periodic finding-count fixtures use, so + * the agent has a concrete plan-file target. The /tmp path is unique + * per skill to avoid collisions if floor tests run in parallel. + * + * For a deeper [N-1, N+2] count band assertion, see the periodic + * test/skill-e2e-plan-{X}-finding-count.test.ts fixtures. + */ + +export const FORCING_FLOOR_ENG = [ + 'Please review this plan thoroughly. As you go, write your plan-mode plan to /tmp/gstack-test-plan-eng-floor.md (use Edit/Write to that exact path).', + '', + '# Plan: Add request-id propagation across services', + '', + '## Architecture', + "We'll roll a custom UUIDv7 generator inline in each service rather than", + "use Node's crypto.randomUUID() built-in. 
Same shape, but we want full", + 'control over the entropy source for "future flexibility" — no concrete', + 'reason yet.', +].join('\n'); + +export const FORCING_FLOOR_CEO = [ + 'Please review this plan thoroughly. As you go, write your plan-mode plan to /tmp/gstack-test-plan-ceo-floor.md (use Edit/Write to that exact path).', + '', + '# Plan: Launch a "developer-friendly" pricing tier', + '', + '## Goal', + 'Increase developer adoption.', + '', + '## Success metric', + 'More signups.', + '', + '## Premise', + "We haven't talked to any developers about whether the current pricing", + 'is actually a barrier. The team agreed it "feels like" it should be cheaper.', +].join('\n'); + +export const FORCING_FLOOR_DESIGN = [ + 'Please review this plan thoroughly. As you go, write your plan-mode plan to /tmp/gstack-test-plan-design-floor.md (use Edit/Write to that exact path).', + '', + '# Plan: Marketing landing page', + '', + '## Layout', + 'All headings, taglines, and body copy will be center-aligned for a', + '"clean modern look." The hero h1 sits 8px above the subhead with no', + 'breathing room; the CTA button is the same visual weight as a', + 'secondary "Learn more" link directly beside it.', +].join('\n'); + +export const FORCING_FLOOR_DEVEX = [ + 'Please review this plan thoroughly. 
As you go, write your plan-mode plan to /tmp/gstack-test-plan-devex-floor.md (use Edit/Write to that exact path).', + '', + '# Plan: SDK quickstart docs', + '', + '## Onboarding flow', + 'Step 1: clone the repo.', + 'Step 2: install bun manually if not present.', + 'Step 3: copy .env.example to .env and fill in 8 environment variables.', + 'Step 4: run database migrations against your local Postgres.', + 'Step 5: start the dev server.', + 'Step 6: open the docs in a separate tab.', + 'Step 7: register an API key by emailing the team.', + 'Step 8: paste the key into your .env, restart the server, then make', + 'your first SDK call.', + '', + 'No quickstart command, no hosted sandbox, no copy-pasteable curl example.', +].join('\n'); diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts index 42ce4027..774c26d8 100644 --- a/test/helpers/touchfiles.ts +++ b/test/helpers/touchfiles.ts @@ -133,6 +133,16 @@ export const E2E_TOUCHFILES: Record = { 'plan-eng-finding-count': ['plan-eng-review/**', 'scripts/resolvers/preamble.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'test/helpers/claude-pty-runner.ts', 'test/skill-e2e-plan-eng-finding-count.test.ts'], 'plan-design-finding-count': ['plan-design-review/**', 'scripts/resolvers/preamble.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'test/helpers/claude-pty-runner.ts', 'test/skill-e2e-plan-design-finding-count.test.ts'], 'plan-devex-finding-count': ['plan-devex-review/**', 'scripts/resolvers/preamble.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'test/helpers/claude-pty-runner.ts', 'test/skill-e2e-plan-devex-finding-count.test.ts'], + + // Gate-tier reviewCount-floor counterparts. 
Catch the May 2026 transcript + // bug (model wrote a plan-mode plan and ExitPlanMode'd without firing any + // review-phase AskUserQuestion). Unlike the periodic finding-count tests + // (runPlanSkillCounting), these use runPlanSkillFloorCheck: smaller seeds, + // pass on first AUQ. 10 min budget per test; ~30-90s typical on success. + 'plan-eng-finding-floor': ['plan-eng-review/**', 'scripts/resolvers/preamble.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/review.ts', 'test/helpers/claude-pty-runner.ts', 'test/fixtures/forcing-finding-seeds.ts', 'test/skill-e2e-plan-eng-finding-floor.test.ts'], + 'plan-ceo-finding-floor': ['plan-ceo-review/**', 'scripts/resolvers/preamble.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/review.ts', 'test/helpers/claude-pty-runner.ts', 'test/fixtures/forcing-finding-seeds.ts', 'test/skill-e2e-plan-ceo-finding-floor.test.ts'], + 'plan-design-finding-floor': ['plan-design-review/**', 'scripts/resolvers/preamble.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/review.ts', 'test/helpers/claude-pty-runner.ts', 'test/fixtures/forcing-finding-seeds.ts', 'test/skill-e2e-plan-design-finding-floor.test.ts'], + 'plan-devex-finding-floor': ['plan-devex-review/**', 'scripts/resolvers/preamble.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/review.ts', 'test/helpers/claude-pty-runner.ts', 'test/fixtures/forcing-finding-seeds.ts', 'test/skill-e2e-plan-devex-finding-floor.test.ts'], 'brain-privacy-gate': ['scripts/resolvers/preamble/generate-brain-sync-block.ts', 'scripts/resolvers/preamble.ts', 'bin/gstack-brain-sync', 'bin/gstack-brain-init', 'bin/gstack-config', 
'test/helpers/agent-sdk-runner.ts'], // AskUserQuestion format regression (RECOMMENDATION + Completeness: N/10) @@ -422,6 +432,10 @@ export const E2E_TIERS: Record = { 'plan-eng-finding-count': 'periodic', 'plan-design-finding-count': 'periodic', 'plan-devex-finding-count': 'periodic', + 'plan-eng-finding-floor': 'gate', + 'plan-ceo-finding-floor': 'gate', + 'plan-design-finding-floor': 'gate', + 'plan-devex-finding-floor': 'gate', // Privacy gate for gstack-brain-sync — periodic (non-deterministic LLM call, // costs ~$0.30-$0.50 per run, not needed on every commit) diff --git a/test/skill-e2e-plan-ceo-finding-floor.test.ts b/test/skill-e2e-plan-ceo-finding-floor.test.ts new file mode 100644 index 00000000..ba3d7c96 --- /dev/null +++ b/test/skill-e2e-plan-ceo-finding-floor.test.ts @@ -0,0 +1,37 @@ +/** + * /plan-ceo-review AskUserQuestion floor regression (gate, paid, real-PTY). + * + * See test/skill-e2e-plan-eng-finding-floor.test.ts for the contract. + */ + +import { describe, test } from 'bun:test'; +import { runPlanSkillFloorCheck } from './helpers/claude-pty-runner'; +import { FORCING_FLOOR_CEO } from './fixtures/forcing-finding-seeds'; + +const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate'; +const describeE2E = shouldRun ? 
describe : describe.skip; + +describeE2E('/plan-ceo-review AskUserQuestion floor (gate)', () => { + test( + 'seeded forcing finding causes the agent to fire at least one AskUserQuestion', + async () => { + const obs = await runPlanSkillFloorCheck({ + skillName: 'plan-ceo-review', + slashCommand: '/plan-ceo-review', + followUpPrompt: FORCING_FLOOR_CEO, + cwd: process.cwd(), + timeoutMs: 600_000, + env: { QUESTION_TUNING: 'false', EXPLAIN_LEVEL: 'default' }, + }); + + if (obs.outcome !== 'auq_observed') { + throw new Error( + `floor test FAILED: outcome=${obs.outcome} elapsed=${obs.elapsedMs}ms\n` + + `summary: ${obs.summary}\n` + + `--- evidence (last 3KB) ---\n${obs.evidence}`, + ); + } + }, + 660_000, + ); +}); diff --git a/test/skill-e2e-plan-design-finding-floor.test.ts b/test/skill-e2e-plan-design-finding-floor.test.ts new file mode 100644 index 00000000..ea3d4704 --- /dev/null +++ b/test/skill-e2e-plan-design-finding-floor.test.ts @@ -0,0 +1,37 @@ +/** + * /plan-design-review AskUserQuestion floor regression (gate, paid, real-PTY). + * + * See test/skill-e2e-plan-eng-finding-floor.test.ts for the contract. + */ + +import { describe, test } from 'bun:test'; +import { runPlanSkillFloorCheck } from './helpers/claude-pty-runner'; +import { FORCING_FLOOR_DESIGN } from './fixtures/forcing-finding-seeds'; + +const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate'; +const describeE2E = shouldRun ? 
describe : describe.skip; + +describeE2E('/plan-design-review AskUserQuestion floor (gate)', () => { + test( + 'seeded forcing finding causes the agent to fire at least one AskUserQuestion', + async () => { + const obs = await runPlanSkillFloorCheck({ + skillName: 'plan-design-review', + slashCommand: '/plan-design-review', + followUpPrompt: FORCING_FLOOR_DESIGN, + cwd: process.cwd(), + timeoutMs: 600_000, + env: { QUESTION_TUNING: 'false', EXPLAIN_LEVEL: 'default' }, + }); + + if (obs.outcome !== 'auq_observed') { + throw new Error( + `floor test FAILED: outcome=${obs.outcome} elapsed=${obs.elapsedMs}ms\n` + + `summary: ${obs.summary}\n` + + `--- evidence (last 3KB) ---\n${obs.evidence}`, + ); + } + }, + 660_000, + ); +}); diff --git a/test/skill-e2e-plan-devex-finding-floor.test.ts b/test/skill-e2e-plan-devex-finding-floor.test.ts new file mode 100644 index 00000000..e2e394fc --- /dev/null +++ b/test/skill-e2e-plan-devex-finding-floor.test.ts @@ -0,0 +1,37 @@ +/** + * /plan-devex-review AskUserQuestion floor regression (gate, paid, real-PTY). + * + * See test/skill-e2e-plan-eng-finding-floor.test.ts for the contract. + */ + +import { describe, test } from 'bun:test'; +import { runPlanSkillFloorCheck } from './helpers/claude-pty-runner'; +import { FORCING_FLOOR_DEVEX } from './fixtures/forcing-finding-seeds'; + +const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate'; +const describeE2E = shouldRun ? 
describe : describe.skip; + +describeE2E('/plan-devex-review AskUserQuestion floor (gate)', () => { + test( + 'seeded forcing finding causes the agent to fire at least one AskUserQuestion', + async () => { + const obs = await runPlanSkillFloorCheck({ + skillName: 'plan-devex-review', + slashCommand: '/plan-devex-review', + followUpPrompt: FORCING_FLOOR_DEVEX, + cwd: process.cwd(), + timeoutMs: 600_000, + env: { QUESTION_TUNING: 'false', EXPLAIN_LEVEL: 'default' }, + }); + + if (obs.outcome !== 'auq_observed') { + throw new Error( + `floor test FAILED: outcome=${obs.outcome} elapsed=${obs.elapsedMs}ms\n` + + `summary: ${obs.summary}\n` + + `--- evidence (last 3KB) ---\n${obs.evidence}`, + ); + } + }, + 660_000, + ); +}); diff --git a/test/skill-e2e-plan-eng-finding-floor.test.ts b/test/skill-e2e-plan-eng-finding-floor.test.ts new file mode 100644 index 00000000..bd35e1c1 --- /dev/null +++ b/test/skill-e2e-plan-eng-finding-floor.test.ts @@ -0,0 +1,52 @@ +/** + * /plan-eng-review AskUserQuestion floor regression (gate, paid, real-PTY). + * + * Catches the May 2026 transcript bug where /plan-eng-review wrote a + * multi-section review plan to ~/.claude/plans/ and called ExitPlanMode + * without firing any AskUserQuestion. See + * `.context/attachments/pasted_text_2026-05-06_10-25-23.txt`. + * + * Uses runPlanSkillFloorCheck — a minimal "did the agent fire ANY AUQ?" + * observer that exits early on the first non-permission numbered-option + * render. See claude-pty-runner.ts for why this is separate from the + * runPlanSkillCounting harness used by periodic finding-count tests. + * + * Tier: gate. Budget: 10 min (early exit on success ~30-90s typical). + * Cost: ~$0.50-$1.50 per run depending on early-exit timing. 
+ */ + +import { describe, test } from 'bun:test'; +import { runPlanSkillFloorCheck } from './helpers/claude-pty-runner'; +import { FORCING_FLOOR_ENG } from './fixtures/forcing-finding-seeds'; + +const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'gate'; +const describeE2E = shouldRun ? describe : describe.skip; + +describeE2E('/plan-eng-review AskUserQuestion floor (gate)', () => { + test( + 'seeded forcing finding causes the agent to fire at least one AskUserQuestion', + async () => { + const obs = await runPlanSkillFloorCheck({ + skillName: 'plan-eng-review', + slashCommand: '/plan-eng-review', + followUpPrompt: FORCING_FLOOR_ENG, + cwd: process.cwd(), + timeoutMs: 600_000, + env: { QUESTION_TUNING: 'false', EXPLAIN_LEVEL: 'default' }, + }); + + if (obs.outcome !== 'auq_observed') { + throw new Error( + `floor test FAILED: outcome=${obs.outcome} elapsed=${obs.elapsedMs}ms\n` + + `summary: ${obs.summary}\n` + + `If outcome is plan_ready or completion_summary, this is the transcript-bug ` + + `regression — agent reached terminal without firing AskUserQuestion. 
See ` + + `.context/attachments/pasted_text_2026-05-06_10-25-23.txt.\n` + + `If outcome is timeout, agent may just be slow — re-run or increase budget.\n` + + `--- evidence (last 3KB) ---\n${obs.evidence}`, + ); + } + }, + 660_000, + ); +}); diff --git a/test/touchfiles.test.ts b/test/touchfiles.test.ts index 8fb66161..8cd5af2d 100644 --- a/test/touchfiles.test.ts +++ b/test/touchfiles.test.ts @@ -103,8 +103,10 @@ describe('selectTests', () => { // auto-decide-preserved also depend on plan-ceo-review/** expect(result.selected).toContain('autoplan-auto-mode'); expect(result.selected).toContain('auto-decide-preserved'); - expect(result.selected.length).toBe(21); - expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 21); + // v1.27+ gate-tier reviewCount-floor regression for transcript bug + expect(result.selected).toContain('plan-ceo-finding-floor'); + expect(result.selected.length).toBe(22); + expect(result.skipped.length).toBe(Object.keys(E2E_TOUCHFILES).length - 22); }); test('global touchfile triggers ALL tests', () => {