test(harness): plumb extraArgs and auto_decided outcome through PTY runner

runPlanSkillObservation now accepts extraArgs that pass through to launchClaudePty (which already supported them at the lower level), and exposes a new 'auto_decided' outcome, detected via isAutoDecidedVisible when the AUTO_DECIDE preamble template fires ("Auto-decided ... (your preference)"). Both pieces are needed for the v1.21+ AskUserQuestion-blocked regression tests in the next commit.

Detection order is deliberate: 'asked' (rendered numbered list) wins over 'auto_decided' (text only, no list), which in turn wins over 'plan_ready', so the auto-decide evidence isn't masked by a downstream plan-mode confirmation.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-20 11:19:56 +08:00 · 2026-04-30 21:13:30 -07:00
parent e8893a18b1
commit 6c2db0bec6
1 changed files with 55 additions and 16 deletions
--- a/test/helpers/claude-pty-runner.ts
+++ b/test/helpers/claude-pty-runner.ts
@@ -138,6 +138,19 @@ export function isPlanReadyVisible(visible: string): boolean {
  return /ready to execute|Would you like to proceed/i.test(visible);
 }
 /**
  * Report whether the AUTO_DECIDE preamble template appears in the visible
  * terminal text. The template the model prints is
  * "Auto-decided <summary> → <option> (your preference). Change with /plan-tune."
  * and is emitted when an AskUserQuestion is short-circuited by the
  * question-tuning resolver (`scripts/resolvers/question-tuning.ts:26`).
  *
  * Two cues are matched, each case-insensitively and each anywhere in the
  * text: the lead-in word "Auto-decided" (which must end on a word boundary)
  * and the literal "(your preference)" tag. BOTH must be present for a
  * positive result; their relative order and distance are not checked, and
  * the arrow glyph is not part of the match.
  *
  * @param visible Visible terminal text to scan.
  * @returns true only when both cues occur somewhere in `visible`.
  */
 export function isAutoDecidedVisible(visible: string): boolean {
   // Lead-in word of the template; \b rejects longer words such as "Auto-decidedly".
   const hasLeadIn = /Auto-decided\b/i.test(visible);
   // Trailing marker of the template, parentheses included.
   const hasPreferenceTag = /\(your preference\)/i.test(visible);
   return hasLeadIn && hasPreferenceTag;
 }
 /**
 * Detect a Claude Code permission dialog. These render as a numbered
 * option list (so isNumberedOptionListVisible matches them) but they
@@ -521,16 +534,23 @@ export async function invokeAndObserve(
 export interface PlanSkillObservation {
  /**
   * What happened first. One of:
-   *  - 'asked'      — skill emitted a numbered-option prompt (its Step 0
+   *  - 'asked'        — skill emitted a numbered-option prompt (its Step 0
-   *                   AskUserQuestion or the routing-injection prompt)
+   *                     AskUserQuestion or the routing-injection prompt)
-   *  - 'plan_ready' — claude wrote a plan and emitted its native
+   *  - 'auto_decided' — visible TTY shows "Auto-decided ... → ..." (the
-   *                   "Ready to execute" confirmation
+   *                     AUTO_DECIDE preamble template fired). Distinguishes
   *                     "the regression we're tracking" (auto-mode silently
   *                     auto-deciding questions the user wanted to see) from
   *                     "skill legitimately reached plan_ready". Detected
   *                     before plan_ready/silent_write so the auto-decide
   *                     evidence wins when both are present.
   *  - 'plan_ready'   — claude wrote a plan and emitted its native
   *                     "Ready to execute" confirmation
   *  - 'silent_write' — a Write/Edit landed BEFORE any prompt, to a path
-   *                   outside the sanctioned plan/project directories
+   *                     outside the sanctioned plan/project directories
-   *  - 'exited'     — claude process died before any of the above
+   *  - 'exited'       — claude process died before any of the above
-   *  - 'timeout'    — none of the above within budget
+   *  - 'timeout'      — none of the above within budget
   */
-  outcome: 'asked' | 'plan_ready' | 'silent_write' | 'exited' | 'timeout';
+  outcome: 'asked' | 'auto_decided' | 'plan_ready' | 'silent_write' | 'exited' | 'timeout';
  /** Human-readable summary. */
  summary: string;
  /** Visible terminal text since the slash command was sent (last 2KB). */
@@ -566,12 +586,19 @@ export async function runPlanSkillObservation(opts: {
  cwd?: string;
  /** Total budget for skill to reach a terminal outcome. Default 180000. */
  timeoutMs?: number;
  /** Extra CLI args appended after --permission-mode. Used by the v1.21+
   *  AskUserQuestion-blocked regression tests to pass
   *  `['--disallowedTools', 'AskUserQuestion']` (the flag set Conductor
   *  uses to remove native AskUserQuestion in favor of its MCP variant).
   *  Plumbs straight through to launchClaudePty. */
  extraArgs?: string[];
 }): Promise<PlanSkillObservation> {
  const startedAt = Date.now();
  const session = await launchClaudePty({
    permissionMode: opts.inPlanMode === false ? null : 'plan',
    cwd: opts.cwd,
    timeoutMs: (opts.timeoutMs ?? 180_000) + 30_000,
    extraArgs: opts.extraArgs,
  });
  try {
@@ -624,14 +651,10 @@ export async function runPlanSkillObservation(opts: {
          };
        }
      }
-      if (isPlanReadyVisible(visible)) {
+      // Order: 'asked' first (rendered numbered list = user being asked),
-        return {
+      // then 'auto_decided' (auto-decide text fired upstream of plan_ready
-          outcome: 'plan_ready',
+      // — surfacing this distinguishes the auto-mode regression from a
-          summary: 'skill ran end-to-end and emitted plan-mode "Ready to execute" confirmation',
+      // legitimate plan_ready outcome), then 'plan_ready'.
          evidence: visible.slice(-2000),
          elapsedMs: Date.now() - startedAt,
        };
      }
      if (isNumberedOptionListVisible(visible)) {
        return {
          outcome: 'asked',
@@ -640,6 +663,22 @@ export async function runPlanSkillObservation(opts: {
          elapsedMs: Date.now() - startedAt,
        };
      }
      if (isAutoDecidedVisible(visible)) {
        return {
          outcome: 'auto_decided',
          summary: 'skill auto-decided an AskUserQuestion via the AUTO_DECIDE preamble (the user never saw the prompt)',
          evidence: visible.slice(-2000),
          elapsedMs: Date.now() - startedAt,
        };
      }
      if (isPlanReadyVisible(visible)) {
        return {
          outcome: 'plan_ready',
          summary: 'skill ran end-to-end and emitted plan-mode "Ready to execute" confirmation',
          evidence: visible.slice(-2000),
          elapsedMs: Date.now() - startedAt,
        };
      }
    }
    return {