mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-20 11:19:56 +08:00
test: add firstAUQPick + plan-ceo skip-interview routing
Calibration run 1 surfaced a second issue beyond the parser bug: the default pick of 1 on /plan-ceo-review's scope-selection AUQ routes the agent to "branch diff vs main" — so it reviews the gstack PR itself (recursive!) instead of the seeded fixture plan we sent. Added firstAUQPick callback to runPlanSkillCounting. Override applies only to the FIRST AUQ; subsequent presses keep using defaultPick. ceoStep0Boundary now fires on either the mode-pick AUQ (existing path) or any AUQ containing "Skip interview and plan immediately" — which is the scope-selection AUQ. Picking that option bypasses Step 0 and routes straight to review-phase using the chat-paste plan as context. Plan-ceo test wires firstAUQPick = pickSkipInterview, which finds the "Skip interview" option by label. If no such label is present it falls back to the "describe inline" option, and to option 1 if neither label matches. Two new unit tests: ceoStep0Boundary fires on the scope-selection fixture; existing mode-pick fixture still fires. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -621,7 +621,12 @@ export function assertReviewReportAtBottom(
|
|||||||
* files import them directly.
|
* files import them directly.
|
||||||
*/
|
*/
|
||||||
export const ceoStep0Boundary: Step0BoundaryPredicate = (fp) =>
|
export const ceoStep0Boundary: Step0BoundaryPredicate = (fp) =>
|
||||||
fp.options.some((o) => MODE_RE.test(o.label));
|
// Mode-pick path (Step 0F): one of HOLD SCOPE / SCOPE EXPANSION / etc.
|
||||||
|
fp.options.some((o) => MODE_RE.test(o.label)) ||
|
||||||
|
// Skip-interview path: scope-selection AUQ has "Skip interview and plan
|
||||||
|
// immediately" — picking it bypasses the rest of Step 0 and routes
|
||||||
|
// directly to review-phase. Boundary fires on the scope AUQ itself.
|
||||||
|
fp.options.some((o) => /skip\s+interview|plan\s+immediately/i.test(o.label));
|
||||||
|
|
||||||
export const engStep0Boundary: Step0BoundaryPredicate = (fp) =>
|
export const engStep0Boundary: Step0BoundaryPredicate = (fp) =>
|
||||||
/scope reduction recommendation|cross[\s-]?project learnings/i.test(
|
/scope reduction recommendation|cross[\s-]?project learnings/i.test(
|
||||||
@@ -1097,6 +1102,18 @@ export async function runPlanSkillCounting(opts: {
|
|||||||
reviewCountCeiling: number;
|
reviewCountCeiling: number;
|
||||||
/** Numbered option to press by default. Defaults to 1 (recommended). */
|
/** Numbered option to press by default. Defaults to 1 (recommended). */
|
||||||
defaultPick?: number;
|
defaultPick?: number;
|
||||||
|
/**
|
||||||
|
* Optional override for the FIRST AUQ observed. Receives the fingerprint;
|
||||||
|
* returns the option index to press. Subsequent AUQs always use defaultPick.
|
||||||
|
*
|
||||||
|
* Skill-specific routing helper: /plan-ceo-review's first AUQ asks "what
|
||||||
|
* scope?" with options like "branch diff" / "describe inline" / "skip
|
||||||
|
* interview". Pressing the default 1 routes to "branch diff" (the wrong
|
||||||
|
* review target for a seeded fixture). firstAUQPick lets the test pick
|
||||||
|
* "Skip interview" or "describe inline" so the agent reviews the
|
||||||
|
* follow-up plan content the test sent, not the git diff.
|
||||||
|
*/
|
||||||
|
firstAUQPick?: (fp: AskUserQuestionFingerprint) => number;
|
||||||
/** Working directory. Default process.cwd() (repo cwd holds skill registry). */
|
/** Working directory. Default process.cwd() (repo cwd holds skill registry). */
|
||||||
cwd?: string;
|
cwd?: string;
|
||||||
/** Total budget for skill to reach a terminal outcome. Default 1_500_000 (25 min). */
|
/** Total budget for skill to reach a terminal outcome. Default 1_500_000 (25 min). */
|
||||||
@@ -1120,6 +1137,7 @@ export async function runPlanSkillCounting(opts: {
|
|||||||
let boundaryFired = false;
|
let boundaryFired = false;
|
||||||
let step0Count = 0;
|
let step0Count = 0;
|
||||||
let reviewCount = 0;
|
let reviewCount = 0;
|
||||||
|
let isFirstAUQ = true;
|
||||||
let lastSig = '';
|
let lastSig = '';
|
||||||
|
|
||||||
function snapshot(
|
function snapshot(
|
||||||
@@ -1239,8 +1257,11 @@ export async function runPlanSkillCounting(opts: {
|
|||||||
if (boundaryFired) reviewCount += 1;
|
if (boundaryFired) reviewCount += 1;
|
||||||
else step0Count += 1;
|
else step0Count += 1;
|
||||||
|
|
||||||
// Press to advance.
|
// Press to advance — first AUQ may use the override pick.
|
||||||
session.send(`${defaultPick}\r`);
|
const pickIdx =
|
||||||
|
isFirstAUQ && opts.firstAUQPick ? opts.firstAUQPick(fp) : defaultPick;
|
||||||
|
isFirstAUQ = false;
|
||||||
|
session.send(`${pickIdx}\r`);
|
||||||
|
|
||||||
// Evaluate boundary AFTER pressing — if THIS AUQ was the last Step 0
|
// Evaluate boundary AFTER pressing — if THIS AUQ was the last Step 0
|
||||||
// question, all subsequent AUQs go to reviewCount.
|
// question, all subsequent AUQs go to reviewCount.
|
||||||
|
|||||||
@@ -665,6 +665,26 @@ describe('Step0BoundaryPredicate per-skill', () => {
|
|||||||
expect(ceoStep0Boundary(f)).toBe(true);
|
expect(ceoStep0Boundary(f)).toBe(true);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test('FIRES on scope-selection AUQ with "Skip interview" option (skip-interview path)', () => {
  // After calibration run 1: plan-ceo's first AUQ is scope-selection,
  // and we route via "Skip interview and plan immediately" to bypass
  // Step 0 entirely. Boundary must fire on this AUQ so subsequent
  // AUQs go to reviewCount.
  const f = fp(
    'What scope do you want me to CEO-review?',
    [
      "The branch's diff vs main",
      'A specific plan file',
      "An idea you'll describe inline",
      'Cancel — wrong skill',
      'Type something.',
      'Chat about this',
      'Skip interview and plan immediately',
    ],
  );
  // Only the last label matches the skip-interview pattern; none of the
  // options is a mode-pick label, so this exercises the new path alone.
  expect(ceoStep0Boundary(f)).toBe(true);
});
|
||||||
|
|
||||||
test('does NOT fire on premise challenge AUQs', () => {
|
test('does NOT fire on premise challenge AUQs', () => {
|
||||||
const f = fp('D1 — Premise check: is this the right problem?', ['Yes', 'No', 'Other']);
|
const f = fp('D1 — Premise check: is this the right problem?', ['Yes', 'No', 'Other']);
|
||||||
expect(ceoStep0Boundary(f)).toBe(false);
|
expect(ceoStep0Boundary(f)).toBe(false);
|
||||||
|
|||||||
@@ -21,8 +21,36 @@ import {
|
|||||||
runPlanSkillCounting,
|
runPlanSkillCounting,
|
||||||
ceoStep0Boundary,
|
ceoStep0Boundary,
|
||||||
assertReviewReportAtBottom,
|
assertReviewReportAtBottom,
|
||||||
|
type AskUserQuestionFingerprint,
|
||||||
} from './helpers/claude-pty-runner';
|
} from './helpers/claude-pty-runner';
|
||||||
|
|
||||||
|
// Label patterns for the scope-selection AUQ, in priority order. Hoisted so
// the callback does not rebuild them per call; none carries the `g` flag, so
// `test()` keeps no state between invocations.
const SCOPE_PICK_PATTERNS: readonly RegExp[] = [
  // Preferred: "Skip interview and plan immediately".
  /skip\s+interview|plan\s+immediately/i,
  // Fallback: "describe inline" also routes to using our pasted plan.
  /describe.*inline|inline.*idea/i,
];

/**
 * /plan-ceo-review's first AUQ asks "what scope?" with options like
 *   1. Branch diff vs main
 *   2. A specific plan file or design doc
 *   3. An idea you'll describe inline
 *   ...
 *   7. Skip interview and plan immediately
 *
 * The default pick (1) routes to "branch diff vs main" — the wrong target
 * for our seeded fixture (the agent would review the gstack PR itself,
 * recursively). Picking "Skip interview and plan immediately" bypasses
 * Step 0 and routes the agent to review the chat context (where our
 * follow-up plan was pasted).
 *
 * @param fp Fingerprint of the AUQ being answered; only `options[].label`
 *   and `options[].index` are read.
 * @returns The `index` of the first option matching the highest-priority
 *   pattern ("skip interview", then "describe inline"), or 1 when no
 *   option label matches either pattern.
 */
function pickSkipInterview(fp: AskUserQuestionFingerprint): number {
  for (const pattern of SCOPE_PICK_PATTERNS) {
    const match = fp.options.find((o) => pattern.test(o.label));
    if (match) return match.index;
  }
  // Last resort when the labels have changed beyond recognition.
  return 1;
}
|
||||||
|
|
||||||
const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'periodic';
|
const shouldRun = !!process.env.EVALS && process.env.EVALS_TIER === 'periodic';
|
||||||
const describeE2E = shouldRun ? describe : describe.skip;
|
const describeE2E = shouldRun ? describe : describe.skip;
|
||||||
|
|
||||||
@@ -93,6 +121,7 @@ describeE2E('/plan-ceo-review per-finding AskUserQuestion count (periodic)', ()
|
|||||||
followUpPrompt: PLAN_CEO_5_FINDINGS,
|
followUpPrompt: PLAN_CEO_5_FINDINGS,
|
||||||
isLastStep0AUQ: ceoStep0Boundary,
|
isLastStep0AUQ: ceoStep0Boundary,
|
||||||
reviewCountCeiling: CEILING_DISTINCT + 1, // hard cap above assertion ceiling
|
reviewCountCeiling: CEILING_DISTINCT + 1, // hard cap above assertion ceiling
|
||||||
|
firstAUQPick: pickSkipInterview, // bypass scope-selection, route to review
|
||||||
cwd: process.cwd(),
|
cwd: process.cwd(),
|
||||||
timeoutMs: 1_500_000, // 25 min
|
timeoutMs: 1_500_000, // 25 min
|
||||||
env: { QUESTION_TUNING: 'false', EXPLAIN_LEVEL: 'default' },
|
env: { QUESTION_TUNING: 'false', EXPLAIN_LEVEL: 'default' },
|
||||||
|
|||||||
Reference in New Issue
Block a user