mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-20 11:19:56 +08:00
test(harness): plumb extraArgs and auto_decided outcome through PTY runner
runPlanSkillObservation now accepts extraArgs that pass through to launchClaudePty (which already supported them at the lower level), and exposes a new 'auto_decided' outcome detected via isAutoDecidedVisible when the AUTO_DECIDE preamble template fires (Auto-decided ... (your preference)). Both pieces are needed for the v1.21+ AskUserQuestion-blocked regression tests in the next commit. Detection order is deliberate: 'asked' (rendered numbered list) wins over 'auto_decided' (text only, no list), which wins over 'plan_ready' so the auto-decide evidence isn't masked by a downstream plan-mode confirmation. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -138,6 +138,19 @@ export function isPlanReadyVisible(visible: string): boolean {
|
|||||||
return /ready to execute|Would you like to proceed/i.test(visible);
|
return /ready to execute|Would you like to proceed/i.test(visible);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Detect the AUTO_DECIDE preamble template firing. The model prints
|
||||||
|
* "Auto-decided <summary> → <option> (your preference). Change with /plan-tune."
|
||||||
|
* when it short-circuits an AskUserQuestion via the question-tuning resolver
|
||||||
|
* (`scripts/resolvers/question-tuning.ts:26`). Both cues must be present —
|
||||||
|
* "Auto-decided" and "(your preference)" — but the wording can drift slightly
|
||||||
|
* between model invocations, so each cue is matched independently (case-insensitive,
|
||||||
|
* anywhere in `visible`, in either order). The arrow itself is not checked.
|
||||||
|
*/
|
||||||
|
export function isAutoDecidedVisible(visible: string): boolean {
|
||||||
|
return /Auto-decided\b/i.test(visible) && /\(your preference\)/i.test(visible);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Detect a Claude Code permission dialog. These render as a numbered
|
* Detect a Claude Code permission dialog. These render as a numbered
|
||||||
* option list (so isNumberedOptionListVisible matches them) but they
|
* option list (so isNumberedOptionListVisible matches them) but they
|
||||||
@@ -521,16 +534,23 @@ export async function invokeAndObserve(
|
|||||||
export interface PlanSkillObservation {
|
export interface PlanSkillObservation {
|
||||||
/**
|
/**
|
||||||
* What happened first. One of:
|
* What happened first. One of:
|
||||||
* - 'asked' — skill emitted a numbered-option prompt (its Step 0
|
* - 'asked' — skill emitted a numbered-option prompt (its Step 0
|
||||||
* AskUserQuestion or the routing-injection prompt)
|
* AskUserQuestion or the routing-injection prompt)
|
||||||
* - 'plan_ready' — claude wrote a plan and emitted its native
|
* - 'auto_decided' — visible TTY shows "Auto-decided ... (your preference)" (the
|
||||||
* "Ready to execute" confirmation
|
* AUTO_DECIDE preamble template fired). Distinguishes
|
||||||
|
* "the regression we're tracking" (auto-mode silently
|
||||||
|
* auto-deciding questions the user wanted to see) from
|
||||||
|
* "skill legitimately reached plan_ready". Detected
|
||||||
|
* before plan_ready/silent_write so the auto-decide
|
||||||
|
* evidence wins when both are present.
|
||||||
|
* - 'plan_ready' — claude wrote a plan and emitted its native
|
||||||
|
* "Ready to execute" confirmation
|
||||||
* - 'silent_write' — a Write/Edit landed BEFORE any prompt, to a path
|
* - 'silent_write' — a Write/Edit landed BEFORE any prompt, to a path
|
||||||
* outside the sanctioned plan/project directories
|
* outside the sanctioned plan/project directories
|
||||||
* - 'exited' — claude process died before any of the above
|
* - 'exited' — claude process died before any of the above
|
||||||
* - 'timeout' — none of the above within budget
|
* - 'timeout' — none of the above within budget
|
||||||
*/
|
*/
|
||||||
outcome: 'asked' | 'plan_ready' | 'silent_write' | 'exited' | 'timeout';
|
outcome: 'asked' | 'auto_decided' | 'plan_ready' | 'silent_write' | 'exited' | 'timeout';
|
||||||
/** Human-readable summary. */
|
/** Human-readable summary. */
|
||||||
summary: string;
|
summary: string;
|
||||||
/** Visible terminal text since the slash command was sent (last 2KB). */
|
/** Visible terminal text since the slash command was sent (last 2KB). */
|
||||||
@@ -566,12 +586,19 @@ export async function runPlanSkillObservation(opts: {
|
|||||||
cwd?: string;
|
cwd?: string;
|
||||||
/** Total budget for skill to reach a terminal outcome. Default 180000. */
|
/** Total budget for skill to reach a terminal outcome. Default 180000. */
|
||||||
timeoutMs?: number;
|
timeoutMs?: number;
|
||||||
|
/** Extra CLI args appended after --permission-mode. Used by the v1.21+
|
||||||
|
* AskUserQuestion-blocked regression tests to pass
|
||||||
|
* `['--disallowedTools', 'AskUserQuestion']` (the flag set Conductor
|
||||||
|
* uses to remove native AskUserQuestion in favor of its MCP variant).
|
||||||
|
* Plumbs straight through to launchClaudePty. */
|
||||||
|
extraArgs?: string[];
|
||||||
}): Promise<PlanSkillObservation> {
|
}): Promise<PlanSkillObservation> {
|
||||||
const startedAt = Date.now();
|
const startedAt = Date.now();
|
||||||
const session = await launchClaudePty({
|
const session = await launchClaudePty({
|
||||||
permissionMode: opts.inPlanMode === false ? null : 'plan',
|
permissionMode: opts.inPlanMode === false ? null : 'plan',
|
||||||
cwd: opts.cwd,
|
cwd: opts.cwd,
|
||||||
timeoutMs: (opts.timeoutMs ?? 180_000) + 30_000,
|
timeoutMs: (opts.timeoutMs ?? 180_000) + 30_000,
|
||||||
|
extraArgs: opts.extraArgs,
|
||||||
});
|
});
|
||||||
|
|
||||||
try {
|
try {
|
||||||
@@ -624,14 +651,10 @@ export async function runPlanSkillObservation(opts: {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (isPlanReadyVisible(visible)) {
|
// Order: 'asked' first (rendered numbered list = user being asked),
|
||||||
return {
|
// then 'auto_decided' (auto-decide text fired upstream of plan_ready
|
||||||
outcome: 'plan_ready',
|
// — surfacing this distinguishes the auto-mode regression from a
|
||||||
summary: 'skill ran end-to-end and emitted plan-mode "Ready to execute" confirmation',
|
// legitimate plan_ready outcome), then 'plan_ready'.
|
||||||
evidence: visible.slice(-2000),
|
|
||||||
elapsedMs: Date.now() - startedAt,
|
|
||||||
};
|
|
||||||
}
|
|
||||||
if (isNumberedOptionListVisible(visible)) {
|
if (isNumberedOptionListVisible(visible)) {
|
||||||
return {
|
return {
|
||||||
outcome: 'asked',
|
outcome: 'asked',
|
||||||
@@ -640,6 +663,22 @@ export async function runPlanSkillObservation(opts: {
|
|||||||
elapsedMs: Date.now() - startedAt,
|
elapsedMs: Date.now() - startedAt,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
if (isAutoDecidedVisible(visible)) {
|
||||||
|
return {
|
||||||
|
outcome: 'auto_decided',
|
||||||
|
summary: 'skill auto-decided an AskUserQuestion via the AUTO_DECIDE preamble (the user never saw the prompt)',
|
||||||
|
evidence: visible.slice(-2000),
|
||||||
|
elapsedMs: Date.now() - startedAt,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
if (isPlanReadyVisible(visible)) {
|
||||||
|
return {
|
||||||
|
outcome: 'plan_ready',
|
||||||
|
summary: 'skill ran end-to-end and emitted plan-mode "Ready to execute" confirmation',
|
||||||
|
evidence: visible.slice(-2000),
|
||||||
|
elapsedMs: Date.now() - startedAt,
|
||||||
|
};
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
|||||||
Reference in New Issue
Block a user