From ece2650d4fd76c66531d56f625f59472bd9cbe82 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Sun, 3 May 2026 20:08:15 -0700 Subject: [PATCH] feat(test/helpers): initialPlanContent + wrote_findings_before_asking + shared report-at-bottom assertion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three additions to claude-pty-runner.ts: 1. runPlanSkillObservation gains initialPlanContent?: string. Pre-pumps a user message containing the seeded plan before invoking the skill, with a 3s gap so the message renders before the slash command. claude has no --plan-file flag (verified via claude --help), so message-pump is the route. Lets STOP-gate regression tests force complexity findings. 2. ClassifyResult gains wrote_findings_before_asking with companion strictPlanWrites?: boolean opt on classifyVisible. Fires when a Write/ Edit to .claude/plans/* precedes any AskUserQuestion render in the session window. Default off — preserves zero-findings → write plan → plan_ready as legitimate for unseeded smokes. Six new unit tests cover before/after-AUQ ordering, permission-dialog edge case, strict-off path. 3. assertReportAtBottomIfPlanWritten(obs) shared helper. Wraps the existing assertReviewReportAtBottom(content) and gates on obs.planFile (artifact existing), so the assertion fires under both 'asked' and 'plan_ready' when a plan was actually written. Also: runPlanSkillObservation now captures obs.planFile on every classifier outcome, not just 'plan_ready'. Catches the case where the skill wrote a plan partway through then paused on a question. Co-Authored-By: Claude Opus 4.7 (1M context) --- test/helpers/claude-pty-runner.ts | 139 ++++++++++++++++++-- test/helpers/claude-pty-runner.unit.test.ts | 72 ++++++++++ 2 files changed, 202 insertions(+), 9 deletions(-) diff --git a/test/helpers/claude-pty-runner.ts b/test/helpers/claude-pty-runner.ts index 58a17b51..796d125d 100644 --- a/test/helpers/claude-pty-runner.ts +++ b/test/helpers/claude-pty-runner.ts @@ -454,6 +454,7 @@ export function optionsSignature( */ export type ClassifyResult = | { outcome: 'silent_write'; summary: string } + | { outcome: 'wrote_findings_before_asking'; summary: string } | { outcome: 'auto_decided'; summary: string } | { outcome: 'plan_ready'; summary: string } | { outcome: 'asked'; summary: string } @@ -467,16 +468,73 @@ const SANCTIONED_WRITE_SUBSTRINGS = [ 'TODOS.md', ]; -export function classifyVisible(visible: string): ClassifyResult { +/** + * Find the position of the first AskUserQuestion-style numbered-option list + * that is NOT a permission dialog. Returns -1 if none has rendered yet. + * + * Used by the strict-plan-writes detector (D4) to distinguish legitimate + * post-AUQ plan writes from the transcript bug ("write findings to plan + * before asking"). + */ +function findFirstAuqRenderIndex(visible: string): number { + const re = /❯\s*1\./g; + let m: RegExpExecArray | null; + while ((m = re.exec(visible)) !== null) { + // 200 bytes back + TAIL_SCAN_BYTES forward gives enough context for + // isPermissionDialogVisible to recognize the typical permission UI. + const surroundStart = Math.max(0, m.index - 200); + const surroundEnd = Math.min(visible.length, m.index + TAIL_SCAN_BYTES); + const surround = visible.slice(surroundStart, surroundEnd); + if (!isPermissionDialogVisible(surround)) { + return m.index; + } + } + return -1; +} + +export function classifyVisible( + visible: string, + opts?: { + /** + * When true, treat Write/Edit to `.claude/plans/*` BEFORE any + * AskUserQuestion render as `wrote_findings_before_asking` rather than + * letting the sanctioned-write list silently approve it. Used by tests + * that seed a draft plan with guaranteed-finding-triggering complexity + * (D3-B), where a pre-AUQ plan write is the precise transcript bug. + * Default false — preserves existing behavior for unseeded smoke tests + * where zero-findings → write plan → plan_ready is legitimate. + */ + strictPlanWrites?: boolean; + }, +): ClassifyResult { // Silent-write detection: any Write/Edit tool render that targets a path // OUTSIDE the sanctioned dirs, AND no numbered prompt is currently on screen // (a numbered prompt means a permission/AskUserQuestion is gating the write, // not an actual silent write). const writeRe = /⏺\s*(?:Write|Edit)\(([^)]+)\)/g; let m: RegExpExecArray | null; + const auqRenderIdx = opts?.strictPlanWrites ? findFirstAuqRenderIndex(visible) : -1; while ((m = writeRe.exec(visible)) !== null) { const target = m[1] ?? ''; + const writePos = m.index; + const isPlanWrite = target.includes('.claude/plans'); const sanctioned = SANCTIONED_WRITE_SUBSTRINGS.some((s) => target.includes(s)); + + // D4-B: when strictPlanWrites is on, plan writes that precede the first + // AUQ render are flagged. Legitimate end-of-workflow plan writes happen + // AFTER an AUQ has rendered (i.e., the user has been asked). The + // transcript bug is a plan write WITHOUT any AUQ render preceding it. + if (opts?.strictPlanWrites && isPlanWrite) { + if (auqRenderIdx < 0 || writePos < auqRenderIdx) { + return { + outcome: 'wrote_findings_before_asking', + summary: `Write/Edit to ${target} fired before any AskUserQuestion render`, + }; + } + // post-AUQ plan write — legitimate, fall through to other writes + continue; + } + if (!sanctioned && !isNumberedOptionListVisible(visible)) { return { outcome: 'silent_write', @@ -712,6 +770,36 @@ export function assertReviewReportAtBottom( return { ok: true }; } +/** + * Test helper: if `obs.planFile` was set, read it and assert + * `## GSTACK REVIEW REPORT` is the last `## ` section. Throws on + * violation with a diagnostic message including the plan path, + * the reason, any trailing headings, and the last 2KB of TTY output. + * + * Used by the four plan-mode E2E tests + * (skill-e2e-plan-{eng,ceo,design,devex}-plan-mode.test.ts) to enforce + * the {{PLAN_FILE_REVIEW_REPORT}} resolver contract uniformly. Gates on + * `obs.planFile` (artifact existing), not on `obs.outcome === 'plan_ready'`, + * so it also catches the report-missing case under `'asked'` / + * `'wrote_findings_before_asking'` when a plan was already written. + */ +export function assertReportAtBottomIfPlanWritten( + obs: { planFile?: string; evidence: string }, +): void { + if (!obs.planFile) return; + const content = fs.readFileSync(obs.planFile, 'utf-8'); + const verdict = assertReviewReportAtBottom(content); + if (!verdict.ok) { + const trailing = verdict.trailingHeadings?.length + ? `\ntrailing headings: ${verdict.trailingHeadings.join(', ')}` + : ''; + throw new Error( + `GSTACK REVIEW REPORT contract violation in ${obs.planFile}: ${verdict.reason}${trailing}\n` + + `--- evidence (last 2KB) ---\n${obs.evidence}`, + ); + } +} + /** * Per-skill Step-0 boundary predicates. Each fires `true` when the answered * AUQ's fingerprint matches the LAST question of that skill's Step 0 phase. @@ -1085,6 +1173,20 @@ export async function runPlanSkillObservation(opts: { * rendered AskUserQuestion list). */ env?: Record; + /** + * Seed an initial plan that the spawned `claude` process operates on. + * STOP-gate regression tests need a plan with guaranteed-finding-triggering + * complexity (8+ files, custom-vs-builtin smell) so the skill MUST emit + * AskUserQuestion or fall back to a Decisions section. Without this, + * plan-mode creates a fresh empty plan and the skill has nothing to find + * issues with. + * + * Implementation: claude has no `--plan-file` flag (verified via + * `claude --help`). We pre-pump a user message containing the draft + * plan, wait for it to register, then invoke the skill. The skill's + * Step 0 reads the prior conversation context so it sees the draft. + */ + initialPlanContent?: string; }): Promise { const startedAt = Date.now(); const session = await launchClaudePty({ @@ -1098,6 +1200,21 @@ export async function runPlanSkillObservation(opts: { try { // Boot grace + trust-dialog auto-handle. await Bun.sleep(8000); + if (opts.initialPlanContent) { + // Pre-pump the draft as a user message so the skill's Step 0 has + // concrete content to scope-challenge. The trailing `\r` submits + // the message; embedded `\n` are preserved as line breaks within + // the message (claude-code uses Enter to send, Shift+Enter for + // newlines, but raw `\r` from a PTY just submits whatever's in + // the input buffer). + const seed = `Please review the following draft plan when I run the skill below:\n\n${opts.initialPlanContent}`; + session.send(`${seed}\r`); + // Wait for the seed message to render before sending the skill + // command. Without this gap the two messages can fuse and the + // skill name becomes part of the user prompt instead of a slash + // command. + await Bun.sleep(3000); + } const since = session.mark(); session.send(`/${opts.skillName}\r`); @@ -1123,20 +1240,24 @@ export async function runPlanSkillObservation(opts: { elapsedMs: Date.now() - startedAt, }; } - const classified = classifyVisible(visible); + const classified = classifyVisible(visible, { + strictPlanWrites: !!opts.initialPlanContent, + }); if (classified) { const obs: PlanSkillObservation = { ...classified, evidence: visible.slice(-2000), elapsedMs: Date.now() - startedAt, }; - // For plan_ready outcomes, capture the plan file path from the full - // visible buffer — tests under --disallowedTools verify the file's - // contents to distinguish legitimate fallback flow from silent-skip. - if (classified.outcome === 'plan_ready') { - const planFile = extractPlanFilePath(visible); - if (planFile) obs.planFile = planFile; - } + // Capture the plan file path on any outcome where one may have been + // written. Gating only on 'plan_ready' missed two cases: (1) the + // 'asked' outcome where the model wrote a plan partway through then + // paused on a question, and (2) 'wrote_findings_before_asking' where + // the bug is precisely that the plan was written. The + // assertReviewReportAtBottom checks downstream gate on planFile + // existing, not on the outcome. + const planFile = extractPlanFilePath(visible); + if (planFile) obs.planFile = planFile; return obs; } } diff --git a/test/helpers/claude-pty-runner.unit.test.ts b/test/helpers/claude-pty-runner.unit.test.ts index e830d730..dcb4f5ef 100644 --- a/test/helpers/claude-pty-runner.unit.test.ts +++ b/test/helpers/claude-pty-runner.unit.test.ts @@ -295,6 +295,78 @@ describe('classifyVisible (runtime path through the runner classifier)', () => { // recent-tail window would surface here. expect(TAIL_SCAN_BYTES).toBe(1500); }); + + // D4-B: strictPlanWrites detector. Catches the transcript bug where the + // model writes findings to the plan file before any AskUserQuestion fires. + test('strictPlanWrites: plan write before any AUQ → wrote_findings_before_asking', () => { + const visible = ` + ⏺ Edit(/Users/me/.claude/plans/some-plan.md) + ⎿ Updated 12 lines + `; + const result = classifyVisible(visible, { strictPlanWrites: true }); + expect(result?.outcome).toBe('wrote_findings_before_asking'); + expect(result?.summary).toContain('.claude/plans/some-plan.md'); + }); + + test('strictPlanWrites: plan write AFTER an AUQ render → not flagged', () => { + // AUQ renders first, then the model writes the plan post-answer. This is + // the legitimate end-of-workflow flow and must NOT trigger the detector. + const visible = ` + D1 — Some scope question + + ❯ 1. Option A + 2. Option B + + ⏺ Edit(/Users/me/.claude/plans/some-plan.md) + ⎿ Updated 12 lines + `; + const result = classifyVisible(visible, { strictPlanWrites: true }); + // Outcome is 'asked' (the numbered list rendered); the post-AUQ plan + // write is ignored by the detector. + expect(result?.outcome).toBe('asked'); + }); + + test('strictPlanWrites: AUQ first then plan write — write_pos > auq_pos → not flagged', () => { + // Same scenario, more explicit ordering: the regex finds the write at a + // position AFTER the numbered list. Detector lets it through. + const visible = [ + 'D1 — Choose your approach', + '', + '❯ 1. Approach A', + ' 2. Approach B', + '', + '⏺ Write(/Users/me/.claude/plans/draft.md)', + '⎿ Wrote 42 lines', + ].join('\n'); + const result = classifyVisible(visible, { strictPlanWrites: true }); + expect(result?.outcome).toBe('asked'); + }); + + test('strictPlanWrites: only a permission dialog visible → plan write still flagged', () => { + // A permission dialog ❯ 1./2. is NOT an AUQ; pre-AUQ plan writes still + // hit the detector even when a permission prompt is on screen. + const visible = ` + ⏺ Edit(/Users/me/.claude/plans/some-plan.md) + + Edit to /Users/me/.claude/plans/some-plan.md + + Do you want to proceed? + + ❯ 1. Yes + 2. No + `; + const result = classifyVisible(visible, { strictPlanWrites: true }); + expect(result?.outcome).toBe('wrote_findings_before_asking'); + }); + + test('strictPlanWrites OFF: plan write before AUQ → returns null (legacy behavior preserved)', () => { + const visible = ` + ⏺ Edit(/Users/me/.claude/plans/some-plan.md) + ⎿ Updated 12 lines + `; + // Without strictPlanWrites, the sanctioned-path list lets this through. + expect(classifyVisible(visible)).toBeNull(); + }); }); describe('parseNumberedOptions', () => {