feat(test/helpers): initialPlanContent + wrote_findings_before_asking + shared report-at-bottom assertion

Three additions to claude-pty-runner.ts: 1. runPlanSkillObservation gains initialPlanContent?: string. Pre-pumps a user message containing the seeded plan before invoking the skill, with a 3s gap so the message renders before the slash command. claude has no --plan-file flag (verified via claude --help), so message-pump is the route. Lets STOP-gate regression tests force complexity findings. 2. ClassifyResult gains wrote_findings_before_asking with companion strictPlanWrites?: boolean opt on classifyVisible. Fires when a Write/ Edit to .claude/plans/* precedes any AskUserQuestion render in the session window. Default off — preserves zero-findings → write plan → plan_ready as legitimate for unseeded smokes. Six new unit tests cover before/after-AUQ ordering, permission-dialog edge case, strict-off path. 3. assertReportAtBottomIfPlanWritten(obs) shared helper. Wraps the existing assertReviewReportAtBottom(content) and gates on obs.planFile (artifact existing), so the assertion fires under both 'asked' and 'plan_ready' when a plan was actually written. Also: runPlanSkillObservation now captures obs.planFile on every classifier outcome, not just 'plan_ready'. Catches the case where the skill wrote a plan partway through then paused on a question. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 10:31:30 +08:00 · 2026-05-03 20:08:15 -07:00
parent 25ac219e98
commit ece2650d4f
2 changed files with 202 additions and 9 deletions
--- a/test/helpers/claude-pty-runner.ts
+++ b/test/helpers/claude-pty-runner.ts
@@ -454,6 +454,7 @@ export function optionsSignature(
 */
 export type ClassifyResult =
  | { outcome: 'silent_write'; summary: string }
  | { outcome: 'wrote_findings_before_asking'; summary: string }
  | { outcome: 'auto_decided'; summary: string }
  | { outcome: 'plan_ready'; summary: string }
  | { outcome: 'asked'; summary: string }
@@ -467,16 +468,73 @@ const SANCTIONED_WRITE_SUBSTRINGS = [
  'TODOS.md',
 ];
-export function classifyVisible(visible: string): ClassifyResult {
+/**
 * Find the position of the first AskUserQuestion-style numbered-option list
 * that is NOT a permission dialog. Returns -1 if none has rendered yet.
 *
 * Used by the strict-plan-writes detector (D4) to distinguish legitimate
 * post-AUQ plan writes from the transcript bug ("write findings to plan
 * before asking").
 */
 function findFirstAuqRenderIndex(visible: string): number {
  const re = /❯\s*1\./g;
  let m: RegExpExecArray | null;
  while ((m = re.exec(visible)) !== null) {
    // 200 bytes back + TAIL_SCAN_BYTES forward gives enough context for
    // isPermissionDialogVisible to recognize the typical permission UI.
    const surroundStart = Math.max(0, m.index - 200);
    const surroundEnd = Math.min(visible.length, m.index + TAIL_SCAN_BYTES);
    const surround = visible.slice(surroundStart, surroundEnd);
    if (!isPermissionDialogVisible(surround)) {
      return m.index;
    }
  }
  return -1;
 }
 export function classifyVisible(
  visible: string,
  opts?: {
    /**
     * When true, treat Write/Edit to `.claude/plans/*` BEFORE any
     * AskUserQuestion render as `wrote_findings_before_asking` rather than
     * letting the sanctioned-write list silently approve it. Used by tests
     * that seed a draft plan with guaranteed-finding-triggering complexity
     * (D3-B), where a pre-AUQ plan write is the precise transcript bug.
     * Default false — preserves existing behavior for unseeded smoke tests
     * where zero-findings → write plan → plan_ready is legitimate.
     */
    strictPlanWrites?: boolean;
  },
 ): ClassifyResult {
  // Silent-write detection: any Write/Edit tool render that targets a path
  // OUTSIDE the sanctioned dirs, AND no numbered prompt is currently on screen
  // (a numbered prompt means a permission/AskUserQuestion is gating the write,
  // not an actual silent write).
  const writeRe = /⏺\s*(?:Write|Edit)\(([^)]+)\)/g;
  let m: RegExpExecArray | null;
  const auqRenderIdx = opts?.strictPlanWrites ? findFirstAuqRenderIndex(visible) : -1;
  while ((m = writeRe.exec(visible)) !== null) {
    const target = m[1] ?? '';
    const writePos = m.index;
    const isPlanWrite = target.includes('.claude/plans');
    const sanctioned = SANCTIONED_WRITE_SUBSTRINGS.some((s) => target.includes(s));
    // D4-B: when strictPlanWrites is on, plan writes that precede the first
    // AUQ render are flagged. Legitimate end-of-workflow plan writes happen
    // AFTER an AUQ has rendered (i.e., the user has been asked). The
    // transcript bug is a plan write WITHOUT any AUQ render preceding it.
    if (opts?.strictPlanWrites && isPlanWrite) {
      if (auqRenderIdx < 0 || writePos < auqRenderIdx) {
        return {
          outcome: 'wrote_findings_before_asking',
          summary: `Write/Edit to ${target} fired before any AskUserQuestion render`,
        };
      }
      // post-AUQ plan write — legitimate, fall through to other writes
      continue;
    }
    if (!sanctioned && !isNumberedOptionListVisible(visible)) {
      return {
        outcome: 'silent_write',
@@ -712,6 +770,36 @@ export function assertReviewReportAtBottom(
  return { ok: true };
 }
 /**
 * Test helper: if `obs.planFile` was set, read it and assert
 * `## GSTACK REVIEW REPORT` is the last `## ` section. Throws on
 * violation with a diagnostic message including the plan path,
 * the reason, any trailing headings, and the last 2KB of TTY output.
 *
 * Used by the four plan-mode E2E tests
 * (skill-e2e-plan-{eng,ceo,design,devex}-plan-mode.test.ts) to enforce
 * the {{PLAN_FILE_REVIEW_REPORT}} resolver contract uniformly. Gates on
 * `obs.planFile` (artifact existing), not on `obs.outcome === 'plan_ready'`,
 * so it also catches the report-missing case under `'asked'` /
 * `'wrote_findings_before_asking'` when a plan was already written.
 */
 export function assertReportAtBottomIfPlanWritten(
  obs: { planFile?: string; evidence: string },
 ): void {
  if (!obs.planFile) return;
  const content = fs.readFileSync(obs.planFile, 'utf-8');
  const verdict = assertReviewReportAtBottom(content);
  if (!verdict.ok) {
    const trailing = verdict.trailingHeadings?.length
      ? `\ntrailing headings: ${verdict.trailingHeadings.join(', ')}`
      : '';
    throw new Error(
      `GSTACK REVIEW REPORT contract violation in ${obs.planFile}: ${verdict.reason}${trailing}\n` +
        `--- evidence (last 2KB) ---\n${obs.evidence}`,
    );
  }
 }
 /**
 * Per-skill Step-0 boundary predicates. Each fires `true` when the answered
 * AUQ's fingerprint matches the LAST question of that skill's Step 0 phase.
@@ -1085,6 +1173,20 @@ export async function runPlanSkillObservation(opts: {
   * rendered AskUserQuestion list).
   */
  env?: Record<string, string>;
  /**
   * Seed an initial plan that the spawned `claude` process operates on.
   * STOP-gate regression tests need a plan with guaranteed-finding-triggering
   * complexity (8+ files, custom-vs-builtin smell) so the skill MUST emit
   * AskUserQuestion or fall back to a Decisions section. Without this,
   * plan-mode creates a fresh empty plan and the skill has nothing to find
   * issues with.
   *
   * Implementation: claude has no `--plan-file` flag (verified via
   * `claude --help`). We pre-pump a user message containing the draft
   * plan, wait for it to register, then invoke the skill. The skill's
   * Step 0 reads the prior conversation context so it sees the draft.
   */
  initialPlanContent?: string;
 }): Promise<PlanSkillObservation> {
  const startedAt = Date.now();
  const session = await launchClaudePty({
@@ -1098,6 +1200,21 @@ export async function runPlanSkillObservation(opts: {
  try {
    // Boot grace + trust-dialog auto-handle.
    await Bun.sleep(8000);
    if (opts.initialPlanContent) {
      // Pre-pump the draft as a user message so the skill's Step 0 has
      // concrete content to scope-challenge. The trailing `\r` submits
      // the message; embedded `\n` are preserved as line breaks within
      // the message (claude-code uses Enter to send, Shift+Enter for
      // newlines, but raw `\r` from a PTY just submits whatever's in
      // the input buffer).
      const seed = `Please review the following draft plan when I run the skill below:\n\n${opts.initialPlanContent}`;
      session.send(`${seed}\r`);
      // Wait for the seed message to render before sending the skill
      // command. Without this gap the two messages can fuse and the
      // skill name becomes part of the user prompt instead of a slash
      // command.
      await Bun.sleep(3000);
    }
    const since = session.mark();
    session.send(`/${opts.skillName}\r`);
@@ -1123,20 +1240,24 @@ export async function runPlanSkillObservation(opts: {
          elapsedMs: Date.now() - startedAt,
        };
      }
-      const classified = classifyVisible(visible);
+      const classified = classifyVisible(visible, {
        strictPlanWrites: !!opts.initialPlanContent,
      });
      if (classified) {
        const obs: PlanSkillObservation = {
          ...classified,
          evidence: visible.slice(-2000),
          elapsedMs: Date.now() - startedAt,
        };
-        // For plan_ready outcomes, capture the plan file path from the full
+        // Capture the plan file path on any outcome where one may have been
-        // visible buffer — tests under --disallowedTools verify the file's
+        // written. Gating only on 'plan_ready' missed two cases: (1) the
-        // contents to distinguish legitimate fallback flow from silent-skip.
+        // 'asked' outcome where the model wrote a plan partway through then
-        if (classified.outcome === 'plan_ready') {
+        // paused on a question, and (2) 'wrote_findings_before_asking' where
-          const planFile = extractPlanFilePath(visible);
+        // the bug is precisely that the plan was written. The
-          if (planFile) obs.planFile = planFile;
+        // assertReviewReportAtBottom checks downstream gate on planFile
-        }
+        // existing, not on the outcome.
        const planFile = extractPlanFilePath(visible);
        if (planFile) obs.planFile = planFile;
        return obs;
      }
    }
--- a/test/helpers/claude-pty-runner.unit.test.ts
+++ b/test/helpers/claude-pty-runner.unit.test.ts
@@ -295,6 +295,78 @@ describe('classifyVisible (runtime path through the runner classifier)', () => {
    // recent-tail window would surface here.
    expect(TAIL_SCAN_BYTES).toBe(1500);
  });
  // D4-B: strictPlanWrites detector. Catches the transcript bug where the
  // model writes findings to the plan file before any AskUserQuestion fires.
  test('strictPlanWrites: plan write before any AUQ → wrote_findings_before_asking', () => {
    const visible = `
      ⏺ Edit(/Users/me/.claude/plans/some-plan.md)
      ⎿  Updated 12 lines
    `;
    const result = classifyVisible(visible, { strictPlanWrites: true });
    expect(result?.outcome).toBe('wrote_findings_before_asking');
    expect(result?.summary).toContain('.claude/plans/some-plan.md');
  });
  test('strictPlanWrites: plan write AFTER an AUQ render → not flagged', () => {
    // AUQ renders first, then the model writes the plan post-answer. This is
    // the legitimate end-of-workflow flow and must NOT trigger the detector.
    const visible = `
      D1 — Some scope question
      ❯ 1. Option A
        2. Option B
      ⏺ Edit(/Users/me/.claude/plans/some-plan.md)
      ⎿  Updated 12 lines
    `;
    const result = classifyVisible(visible, { strictPlanWrites: true });
    // Outcome is 'asked' (the numbered list rendered); the post-AUQ plan
    // write is ignored by the detector.
    expect(result?.outcome).toBe('asked');
  });
  test('strictPlanWrites: AUQ first then plan write — write_pos > auq_pos → not flagged', () => {
    // Same scenario, more explicit ordering: the regex finds the write at a
    // position AFTER the numbered list. Detector lets it through.
    const visible = [
      'D1 — Choose your approach',
      '',
      '❯ 1. Approach A',
      '  2. Approach B',
      '',
      '⏺ Write(/Users/me/.claude/plans/draft.md)',
      '⎿  Wrote 42 lines',
    ].join('\n');
    const result = classifyVisible(visible, { strictPlanWrites: true });
    expect(result?.outcome).toBe('asked');
  });
  test('strictPlanWrites: only a permission dialog visible → plan write still flagged', () => {
    // A permission dialog ❯ 1./2. is NOT an AUQ; pre-AUQ plan writes still
    // hit the detector even when a permission prompt is on screen.
    const visible = `
      ⏺ Edit(/Users/me/.claude/plans/some-plan.md)
      Edit to /Users/me/.claude/plans/some-plan.md
      Do you want to proceed?
      ❯ 1. Yes
        2. No
    `;
    const result = classifyVisible(visible, { strictPlanWrites: true });
    expect(result?.outcome).toBe('wrote_findings_before_asking');
  });
  test('strictPlanWrites OFF: plan write before AUQ → returns null (legacy behavior preserved)', () => {
    const visible = `
      ⏺ Edit(/Users/me/.claude/plans/some-plan.md)
      ⎿  Updated 12 lines
    `;
    // Without strictPlanWrites, the sanctioned-path list lets this through.
    expect(classifyVisible(visible)).toBeNull();
  });
 });
 describe('parseNumberedOptions', () => {