Merge remote-tracking branch 'origin/main' into garrytan/eng-review-askuser-fix

# Conflicts:
#	test/helpers/touchfiles.ts
This commit is contained in:
Garry Tan
2026-05-06 19:49:28 -07:00
86 changed files with 6010 additions and 1290 deletions

View File

@@ -136,14 +136,21 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {
// Gate-tier reviewCount-floor counterparts. Catch the May 2026 transcript
// bug (model wrote a plan-mode plan and ExitPlanMode'd without firing any
// review-phase AskUserQuestion). Same harness as the periodic
// finding-count tests (runPlanSkillCounting), smaller seeds, floor=1
// assertion. ~6 min wall time per test, ~25 min total for all four.
// review-phase AskUserQuestion). Uses runPlanSkillFloorCheck — minimal
// "did agent fire ANY AUQ?" observer that exits early on first non-permission
// numbered-option render. ~1-3 min typical wall time per test, ~$2-6 total.
'plan-eng-finding-floor': ['plan-eng-review/**', 'scripts/resolvers/preamble.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/review.ts', 'test/helpers/claude-pty-runner.ts', 'test/fixtures/forcing-finding-seeds.ts', 'test/skill-e2e-plan-eng-finding-floor.test.ts'],
'plan-ceo-finding-floor': ['plan-ceo-review/**', 'scripts/resolvers/preamble.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/review.ts', 'test/helpers/claude-pty-runner.ts', 'test/fixtures/forcing-finding-seeds.ts', 'test/skill-e2e-plan-ceo-finding-floor.test.ts'],
'plan-design-finding-floor': ['plan-design-review/**', 'scripts/resolvers/preamble.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/review.ts', 'test/helpers/claude-pty-runner.ts', 'test/fixtures/forcing-finding-seeds.ts', 'test/skill-e2e-plan-design-finding-floor.test.ts'],
'plan-devex-finding-floor': ['plan-devex-review/**', 'scripts/resolvers/preamble.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/review.ts', 'test/helpers/claude-pty-runner.ts', 'test/fixtures/forcing-finding-seeds.ts', 'test/skill-e2e-plan-devex-finding-floor.test.ts'],
'brain-privacy-gate': ['scripts/resolvers/preamble/generate-brain-sync-block.ts', 'scripts/resolvers/preamble.ts', 'bin/gstack-brain-sync', 'bin/gstack-brain-init', 'bin/gstack-config', 'test/helpers/agent-sdk-runner.ts'],
'brain-privacy-gate': ['scripts/resolvers/preamble/generate-brain-sync-block.ts', 'scripts/resolvers/preamble.ts', 'bin/gstack-brain-sync', 'bin/gstack-artifacts-init', 'bin/gstack-config', 'test/helpers/agent-sdk-runner.ts'],
// /setup-gbrain Path 4 (Remote MCP) — happy + bad-token end-to-end via
// Agent SDK. Gate-tier (deterministic stub server, fixed inputs); fires
// when the skill template, the verify helper, the artifacts-init helper,
// or the detect script changes.
'setup-gbrain-remote': ['setup-gbrain/SKILL.md.tmpl', 'bin/gstack-gbrain-mcp-verify', 'bin/gstack-artifacts-init', 'bin/gstack-gbrain-detect', 'test/helpers/agent-sdk-runner.ts'],
'setup-gbrain-bad-token': ['setup-gbrain/SKILL.md.tmpl', 'bin/gstack-gbrain-mcp-verify', 'test/helpers/agent-sdk-runner.ts'],
// AskUserQuestion format regression (RECOMMENDATION + Completeness: N/10)
// Fires when either template OR the two preamble resolvers change.
@@ -441,6 +448,16 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
// costs ~$0.30-$0.50 per run, not needed on every commit)
'brain-privacy-gate': 'periodic',
// /setup-gbrain Path 4 (Remote MCP) — periodic-tier. The stub HTTP
// server is deterministic but the model's interpretation of "follow
// Path 4 only" is not — assertions on which steps the model ran are
// flaky. The deterministic gate-tier coverage for Path 4 lives in
// test/setup-gbrain-path4-structure.test.ts (free, <200ms). These
// E2E tests stay available for on-demand verification of the live
// model's behavior against a stub MCP server.
'setup-gbrain-remote': 'periodic',
'setup-gbrain-bad-token': 'periodic',
// AskUserQuestion format regression — periodic (Opus 4.7 non-deterministic benchmark)
'plan-ceo-review-format-mode': 'periodic',
'plan-ceo-review-format-approach': 'periodic',