feat(test/helpers): runPlanSkillFloorCheck — minimal AskUserQuestion-floor observer

Adds a focused PTY observer that exits at the first non-permission
numbered-option render. Catches the May 2026 transcript-bug class
(model wrote plan + ExitPlanMode without firing any AUQ) without
needing to fingerprint or navigate past the AUQ.

Why separate from runPlanSkillCounting: plan-mode AUQs render every
option on a single logical line via cursor-positioning escapes that
stripAnsi can't simulate, so parseNumberedOptions returns < 2 options
and never records a fingerprint. Counting tests work on 25-min budgets
because eventually one frame parses cleanly; gate-tier floor tests
need to exit early on the first observation. Trades fingerprint
precision for early-exit reliability.

Also drops COMPLETION_SUMMARY_RE check from this helper — it matches
"GSTACK REVIEW REPORT" anywhere in the buffer including when the
agent does recon by reading existing plan files. plan_ready
(claude's actual "Ready to execute" confirmation) is the reliable
terminal signal for "agent finished without asking."

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-05-06 19:47:42 -07:00
parent 19e699ab9b
commit 866982decd

View File

@@ -1550,3 +1550,167 @@ export async function runPlanSkillCounting(opts: {
await session.close();
}
}
// ────────────────────────────────────────────────────────────────────────────
// runPlanSkillFloorCheck — minimal "did the agent fire ANY AskUserQuestion?"
// observer for gate-tier floor tests catching the May 2026 transcript bug
// (model wrote plan + ExitPlanMode'd with reviewCount=0).
//
// Why this exists separately from runPlanSkillCounting: plan-mode AUQs render
// every option on a single logical line via cursor-positioning escapes that
// stripAnsi can't simulate. parseNumberedOptions therefore returns < 2 options
// from those frames and never records a fingerprint. The full counting helper
// works for periodic finding-count tests because their 25-min budgets give the
// agent enough redraws that one frame eventually parses cleanly. Gate-tier
// floor tests don't have that wall-time budget and need to exit early on the
// first observation. This helper trades fingerprint precision for early-exit
// reliability.
//
// Contract:
// - PASS → outcome === 'auq_observed' (agent rendered any non-permission
// numbered-option list; we exit immediately and report success)
// - FAIL → outcome === 'plan_ready' | 'completion_summary' | 'silent_write'
// (agent reached a terminal state without ever firing an AUQ —
// this IS the transcript bug)
// - SOFT → outcome === 'timeout' (neither happened in budget; agent may
// just be slow — test should retry with a larger budget rather
// than treat as a hard regression)
// ────────────────────────────────────────────────────────────────────────────
export interface PlanSkillFloorObservation {
/** True iff a review-phase AUQ render was observed. */
auqObserved: boolean;
outcome:
| 'auq_observed'
| 'plan_ready'
| 'silent_write'
| 'exited'
| 'timeout';
summary: string;
/** Visible TTY tail (last 3KB) at terminal time. */
evidence: string;
/** Wall time (ms) until the outcome was decided. */
elapsedMs: number;
}
/**
* Drive a plan-* skill in plan mode and exit at the first non-permission
* numbered-option render. See block comment above for the contract.
*/
export async function runPlanSkillFloorCheck(opts: {
/** Skill name, e.g. 'plan-eng-review'. Used for diagnostic strings only. */
skillName: string;
/** Slash command to send alone, e.g. '/plan-eng-review'. */
slashCommand: string;
/** Plan content sent as a follow-up message ~3s after the slash command. */
followUpPrompt: string;
/** Working directory. Default process.cwd(). */
cwd?: string;
/** Total budget. Default 600000 (10 min). Tests exit early on AUQ. */
timeoutMs?: number;
/** Extra env merged into the spawned `claude` process. */
env?: Record<string, string>;
}): Promise<PlanSkillFloorObservation> {
const startedAt = Date.now();
const timeoutMs = opts.timeoutMs ?? 600_000;
const session = await launchClaudePty({
permissionMode: 'plan',
cwd: opts.cwd,
timeoutMs: timeoutMs + 60_000,
env: opts.env,
});
try {
await Bun.sleep(8000); // boot grace + auto-trust handler window
const since = session.mark();
session.send(`${opts.slashCommand}\r`);
await Bun.sleep(3000);
session.send(`${opts.followUpPrompt}\r`);
const start = Date.now();
while (Date.now() - start < timeoutMs) {
await Bun.sleep(2000);
const visible = session.visibleSince(since);
if (session.exited()) {
return {
auqObserved: false,
outcome: 'exited',
summary: `claude exited (code=${session.exitCode()}) before any AUQ render`,
evidence: visible.slice(-3000),
elapsedMs: Date.now() - startedAt,
};
}
if (visible.includes('Unknown command:')) {
return {
auqObserved: false,
outcome: 'exited',
summary: `claude rejected ${opts.slashCommand} as unknown command`,
evidence: visible.slice(-3000),
elapsedMs: Date.now() - startedAt,
};
}
// Success: ANY non-permission numbered-option list is an AUQ render.
// The bug we're catching is "fired zero AUQs," so observing one is
// sufficient — we don't need to fingerprint or navigate past it.
if (
isNumberedOptionListVisible(visible) &&
!isPermissionDialogVisible(visible.slice(-TAIL_SCAN_BYTES))
) {
return {
auqObserved: true,
outcome: 'auq_observed',
summary: 'agent rendered an AskUserQuestion (floor met)',
evidence: visible.slice(-3000),
elapsedMs: Date.now() - startedAt,
};
}
// Silent write outside sanctioned dirs is the transcript-bug shape.
const writeRe = /⏺\s*(?:Write|Edit)\(([^)]+)\)/g;
let m: RegExpExecArray | null;
while ((m = writeRe.exec(visible)) !== null) {
const target = m[1] ?? '';
const sanctioned = SANCTIONED_WRITE_SUBSTRINGS.some((s) => target.includes(s));
if (!sanctioned && !isNumberedOptionListVisible(visible)) {
return {
auqObserved: false,
outcome: 'silent_write',
summary: `Write/Edit to ${target} fired before any AskUserQuestion`,
evidence: visible.slice(-3000),
elapsedMs: Date.now() - startedAt,
};
}
}
// Reached terminal without AUQ → transcript-bug regression.
// Note: COMPLETION_SUMMARY_RE is intentionally NOT checked here — it
// matches "GSTACK REVIEW REPORT" anywhere in the buffer, including
// when the agent does recon by reading existing plan files (which
// contain that string as a generated section). The plan_ready check
// (claude's actual "Ready to execute" confirmation) is the reliable
// terminal signal for "agent finished without asking."
if (isPlanReadyVisible(visible)) {
return {
auqObserved: false,
outcome: 'plan_ready',
summary: 'agent reached plan_ready without firing any AskUserQuestion',
evidence: visible.slice(-3000),
elapsedMs: Date.now() - startedAt,
};
}
}
return {
auqObserved: false,
outcome: 'timeout',
summary: `no AUQ render and no terminal outcome within ${timeoutMs}ms`,
evidence: session.visibleSince(since).slice(-3000),
elapsedMs: Date.now() - startedAt,
};
} finally {
await session.close();
}
}