From ece2650d4fd76c66531d56f625f59472bd9cbe82 Mon Sep 17 00:00:00 2001
From: Garry Tan <garrytan@gmail.com>
Date: Sun, 3 May 2026 20:08:15 -0700
Subject: [PATCH] feat(test/helpers): initialPlanContent +
 wrote_findings_before_asking + shared report-at-bottom assertion
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three additions to claude-pty-runner.ts:

1. runPlanSkillObservation gains initialPlanContent?: string. Pre-pumps a
   user message containing the seeded plan before invoking the skill, with
   a 3s gap so the message renders before the slash command. claude has no
   --plan-file flag (verified via claude --help), so message-pump is the
   route. Lets STOP-gate regression tests force complexity findings.

2. ClassifyResult gains wrote_findings_before_asking with companion
   strictPlanWrites?: boolean opt on classifyVisible. Fires when a Write/
   Edit to .claude/plans/* precedes any AskUserQuestion render in the
   session window. Default off — preserves zero-findings → write plan →
   plan_ready as legitimate for unseeded smokes. Six new unit tests cover
   before/after-AUQ ordering, permission-dialog edge case, strict-off path.

3. assertReportAtBottomIfPlanWritten(obs) shared helper. Wraps the existing
   assertReviewReportAtBottom(content) and gates on obs.planFile (artifact
   existing), so the assertion fires under both 'asked' and 'plan_ready'
   when a plan was actually written.

Also: runPlanSkillObservation now captures obs.planFile on every classifier
outcome, not just 'plan_ready'. Catches the case where the skill wrote a
plan partway through then paused on a question.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 test/helpers/claude-pty-runner.ts           | 139 ++++++++++++++++++--
 test/helpers/claude-pty-runner.unit.test.ts |  72 ++++++++++
 2 files changed, 202 insertions(+), 9 deletions(-)

diff --git a/test/helpers/claude-pty-runner.ts b/test/helpers/claude-pty-runner.ts
index 58a17b51..796d125d 100644
--- a/test/helpers/claude-pty-runner.ts
+++ b/test/helpers/claude-pty-runner.ts
@@ -454,6 +454,7 @@ export function optionsSignature(
  */
 export type ClassifyResult =
   | { outcome: 'silent_write'; summary: string }
+  | { outcome: 'wrote_findings_before_asking'; summary: string }
   | { outcome: 'auto_decided'; summary: string }
   | { outcome: 'plan_ready'; summary: string }
   | { outcome: 'asked'; summary: string }
@@ -467,16 +468,73 @@ const SANCTIONED_WRITE_SUBSTRINGS = [
   'TODOS.md',
 ];
 
-export function classifyVisible(visible: string): ClassifyResult {
+/**
+ * Find the position of the first AskUserQuestion-style numbered-option list
+ * that is NOT a permission dialog. Returns -1 if none has rendered yet.
+ *
+ * Used by the strict-plan-writes detector (D4) to distinguish legitimate
+ * post-AUQ plan writes from the transcript bug ("write findings to plan
+ * before asking").
+ */
+function findFirstAuqRenderIndex(visible: string): number {
+  const re = /❯\s*1\./g;
+  let m: RegExpExecArray | null;
+  while ((m = re.exec(visible)) !== null) {
+    // 200 bytes back + TAIL_SCAN_BYTES forward gives enough context for
+    // isPermissionDialogVisible to recognize the typical permission UI.
+    const surroundStart = Math.max(0, m.index - 200);
+    const surroundEnd = Math.min(visible.length, m.index + TAIL_SCAN_BYTES);
+    const surround = visible.slice(surroundStart, surroundEnd);
+    if (!isPermissionDialogVisible(surround)) {
+      return m.index;
+    }
+  }
+  return -1;
+}
+
+export function classifyVisible(
+  visible: string,
+  opts?: {
+    /**
+     * When true, treat Write/Edit to `.claude/plans/*` BEFORE any
+     * AskUserQuestion render as `wrote_findings_before_asking` rather than
+     * letting the sanctioned-write list silently approve it. Used by tests
+     * that seed a draft plan with guaranteed-finding-triggering complexity
+     * (D3-B), where a pre-AUQ plan write is the precise transcript bug.
+     * Default false — preserves existing behavior for unseeded smoke tests
+     * where zero-findings → write plan → plan_ready is legitimate.
+     */
+    strictPlanWrites?: boolean;
+  },
+): ClassifyResult {
   // Silent-write detection: any Write/Edit tool render that targets a path
   // OUTSIDE the sanctioned dirs, AND no numbered prompt is currently on screen
   // (a numbered prompt means a permission/AskUserQuestion is gating the write,
   // not an actual silent write).
   const writeRe = /⏺\s*(?:Write|Edit)\(([^)]+)\)/g;
   let m: RegExpExecArray | null;
+  const auqRenderIdx = opts?.strictPlanWrites ? findFirstAuqRenderIndex(visible) : -1;
   while ((m = writeRe.exec(visible)) !== null) {
     const target = m[1] ?? '';
+    const writePos = m.index;
+    const isPlanWrite = target.includes('.claude/plans');
     const sanctioned = SANCTIONED_WRITE_SUBSTRINGS.some((s) => target.includes(s));
+
+    // D4-B: when strictPlanWrites is on, plan writes that precede the first
+    // AUQ render are flagged. Legitimate end-of-workflow plan writes happen
+    // AFTER an AUQ has rendered (i.e., the user has been asked). The
+    // transcript bug is a plan write WITHOUT any AUQ render preceding it.
+    if (opts?.strictPlanWrites && isPlanWrite) {
+      if (auqRenderIdx < 0 || writePos < auqRenderIdx) {
+        return {
+          outcome: 'wrote_findings_before_asking',
+          summary: `Write/Edit to ${target} fired before any AskUserQuestion render`,
+        };
+      }
+      // post-AUQ plan write — legitimate, fall through to other writes
+      continue;
+    }
+
     if (!sanctioned && !isNumberedOptionListVisible(visible)) {
       return {
         outcome: 'silent_write',
@@ -712,6 +770,36 @@ export function assertReviewReportAtBottom(
   return { ok: true };
 }
 
+/**
+ * Test helper: if `obs.planFile` was set, read it and assert
+ * `## GSTACK REVIEW REPORT` is the last `## ` section. Throws on
+ * violation with a diagnostic message including the plan path,
+ * the reason, any trailing headings, and the last 2KB of TTY output.
+ *
+ * Used by the four plan-mode E2E tests
+ * (skill-e2e-plan-{eng,ceo,design,devex}-plan-mode.test.ts) to enforce
+ * the {{PLAN_FILE_REVIEW_REPORT}} resolver contract uniformly. Gates on
+ * `obs.planFile` (artifact existing), not on `obs.outcome === 'plan_ready'`,
+ * so it also catches the report-missing case under `'asked'` /
+ * `'wrote_findings_before_asking'` when a plan was already written.
+ */
+export function assertReportAtBottomIfPlanWritten(
+  obs: { planFile?: string; evidence: string },
+): void {
+  if (!obs.planFile) return;
+  const content = fs.readFileSync(obs.planFile, 'utf-8');
+  const verdict = assertReviewReportAtBottom(content);
+  if (!verdict.ok) {
+    const trailing = verdict.trailingHeadings?.length
+      ? `\ntrailing headings: ${verdict.trailingHeadings.join(', ')}`
+      : '';
+    throw new Error(
+      `GSTACK REVIEW REPORT contract violation in ${obs.planFile}: ${verdict.reason}${trailing}\n` +
+        `--- evidence (last 2KB) ---\n${obs.evidence}`,
+    );
+  }
+}
+
 /**
  * Per-skill Step-0 boundary predicates. Each fires `true` when the answered
  * AUQ's fingerprint matches the LAST question of that skill's Step 0 phase.
@@ -1085,6 +1173,20 @@ export async function runPlanSkillObservation(opts: {
    * rendered AskUserQuestion list).
    */
   env?: Record<string, string>;
+  /**
+   * Seed an initial plan that the spawned `claude` process operates on.
+   * STOP-gate regression tests need a plan with guaranteed-finding-triggering
+   * complexity (8+ files, custom-vs-builtin smell) so the skill MUST emit
+   * AskUserQuestion or fall back to a Decisions section. Without this,
+   * plan-mode creates a fresh empty plan and the skill has nothing to find
+   * issues with.
+   *
+   * Implementation: claude has no `--plan-file` flag (verified via
+   * `claude --help`). We pre-pump a user message containing the draft
+   * plan, wait for it to register, then invoke the skill. The skill's
+   * Step 0 reads the prior conversation context so it sees the draft.
+   */
+  initialPlanContent?: string;
 }): Promise<PlanSkillObservation> {
   const startedAt = Date.now();
   const session = await launchClaudePty({
@@ -1098,6 +1200,21 @@ export async function runPlanSkillObservation(opts: {
   try {
     // Boot grace + trust-dialog auto-handle.
     await Bun.sleep(8000);
+    if (opts.initialPlanContent) {
+      // Pre-pump the draft as a user message so the skill's Step 0 has
+      // concrete content to scope-challenge. The trailing `\r` submits
+      // the message; embedded `\n` are preserved as line breaks within
+      // the message (claude-code uses Enter to send, Shift+Enter for
+      // newlines, but raw `\r` from a PTY just submits whatever's in
+      // the input buffer).
+      const seed = `Please review the following draft plan when I run the skill below:\n\n${opts.initialPlanContent}`;
+      session.send(`${seed}\r`);
+      // Wait for the seed message to render before sending the skill
+      // command. Without this gap the two messages can fuse and the
+      // skill name becomes part of the user prompt instead of a slash
+      // command.
+      await Bun.sleep(3000);
+    }
     const since = session.mark();
     session.send(`/${opts.skillName}\r`);
 
@@ -1123,20 +1240,24 @@ export async function runPlanSkillObservation(opts: {
           elapsedMs: Date.now() - startedAt,
         };
       }
-      const classified = classifyVisible(visible);
+      const classified = classifyVisible(visible, {
+        strictPlanWrites: !!opts.initialPlanContent,
+      });
       if (classified) {
         const obs: PlanSkillObservation = {
           ...classified,
           evidence: visible.slice(-2000),
           elapsedMs: Date.now() - startedAt,
         };
-        // For plan_ready outcomes, capture the plan file path from the full
-        // visible buffer — tests under --disallowedTools verify the file's
-        // contents to distinguish legitimate fallback flow from silent-skip.
-        if (classified.outcome === 'plan_ready') {
-          const planFile = extractPlanFilePath(visible);
-          if (planFile) obs.planFile = planFile;
-        }
+        // Capture the plan file path on any outcome where one may have been
+        // written. Gating only on 'plan_ready' missed two cases: (1) the
+        // 'asked' outcome where the model wrote a plan partway through then
+        // paused on a question, and (2) 'wrote_findings_before_asking' where
+        // the bug is precisely that the plan was written. The
+        // assertReviewReportAtBottom checks downstream gate on planFile
+        // existing, not on the outcome.
+        const planFile = extractPlanFilePath(visible);
+        if (planFile) obs.planFile = planFile;
         return obs;
       }
     }
diff --git a/test/helpers/claude-pty-runner.unit.test.ts b/test/helpers/claude-pty-runner.unit.test.ts
index e830d730..dcb4f5ef 100644
--- a/test/helpers/claude-pty-runner.unit.test.ts
+++ b/test/helpers/claude-pty-runner.unit.test.ts
@@ -295,6 +295,78 @@ describe('classifyVisible (runtime path through the runner classifier)', () => {
     // recent-tail window would surface here.
     expect(TAIL_SCAN_BYTES).toBe(1500);
   });
+
+  // D4-B: strictPlanWrites detector. Catches the transcript bug where the
+  // model writes findings to the plan file before any AskUserQuestion fires.
+  test('strictPlanWrites: plan write before any AUQ → wrote_findings_before_asking', () => {
+    const visible = `
+      ⏺ Edit(/Users/me/.claude/plans/some-plan.md)
+      ⎿  Updated 12 lines
+    `;
+    const result = classifyVisible(visible, { strictPlanWrites: true });
+    expect(result?.outcome).toBe('wrote_findings_before_asking');
+    expect(result?.summary).toContain('.claude/plans/some-plan.md');
+  });
+
+  test('strictPlanWrites: plan write AFTER an AUQ render → not flagged', () => {
+    // AUQ renders first, then the model writes the plan post-answer. This is
+    // the legitimate end-of-workflow flow and must NOT trigger the detector.
+    const visible = `
+      D1 — Some scope question
+
+      ❯ 1. Option A
+        2. Option B
+
+      ⏺ Edit(/Users/me/.claude/plans/some-plan.md)
+      ⎿  Updated 12 lines
+    `;
+    const result = classifyVisible(visible, { strictPlanWrites: true });
+    // Outcome is 'asked' (the numbered list rendered); the post-AUQ plan
+    // write is ignored by the detector.
+    expect(result?.outcome).toBe('asked');
+  });
+
+  test('strictPlanWrites: AUQ first then plan write — write_pos > auq_pos → not flagged', () => {
+    // Same scenario, more explicit ordering: the regex finds the write at a
+    // position AFTER the numbered list. Detector lets it through.
+    const visible = [
+      'D1 — Choose your approach',
+      '',
+      '❯ 1. Approach A',
+      '  2. Approach B',
+      '',
+      '⏺ Write(/Users/me/.claude/plans/draft.md)',
+      '⎿  Wrote 42 lines',
+    ].join('\n');
+    const result = classifyVisible(visible, { strictPlanWrites: true });
+    expect(result?.outcome).toBe('asked');
+  });
+
+  test('strictPlanWrites: only a permission dialog visible → plan write still flagged', () => {
+    // A permission dialog ❯ 1./2. is NOT an AUQ; pre-AUQ plan writes still
+    // hit the detector even when a permission prompt is on screen.
+    const visible = `
+      ⏺ Edit(/Users/me/.claude/plans/some-plan.md)
+
+      Edit to /Users/me/.claude/plans/some-plan.md
+
+      Do you want to proceed?
+
+      ❯ 1. Yes
+        2. No
+    `;
+    const result = classifyVisible(visible, { strictPlanWrites: true });
+    expect(result?.outcome).toBe('wrote_findings_before_asking');
+  });
+
+  test('strictPlanWrites OFF: plan write before AUQ → returns null (legacy behavior preserved)', () => {
+    const visible = `
+      ⏺ Edit(/Users/me/.claude/plans/some-plan.md)
+      ⎿  Updated 12 lines
+    `;
+    // Without strictPlanWrites, the sanctioned-path list lets this through.
+    expect(classifyVisible(visible)).toBeNull();
+  });
 });
 
 describe('parseNumberedOptions', () => {