Merge remote-tracking branch 'origin/main' into garrytan/eng-review-askuser-fix

# Conflicts: # test/helpers/touchfiles.ts
2026-05-17 01:31:26 +08:00 · 2026-05-06 19:49:28 -07:00
parent 3aee5a7476 f44de365c5
commit 2d400fbc85
86 changed files with 6010 additions and 1290 deletions
--- a/test/helpers/touchfiles.ts
+++ b/test/helpers/touchfiles.ts
@@ -136,14 +136,21 @@ export const E2E_TOUCHFILES: Record<string, string[]> = {

  // Gate-tier reviewCount-floor counterparts. Catch the May 2026 transcript
  // bug (model wrote a plan-mode plan and ExitPlanMode'd without firing any
-  // review-phase AskUserQuestion). Same harness as the periodic
-  // finding-count tests (runPlanSkillCounting), smaller seeds, floor=1
-  // assertion. ~6 min wall time per test, ~25 min total for all four.
+  // review-phase AskUserQuestion). Uses runPlanSkillFloorCheck — minimal
+  // "did agent fire ANY AUQ?" observer that exits early on first non-permission
+  // numbered-option render. ~1-3 min typical wall time per test, ~$2-6 total.
  'plan-eng-finding-floor':      ['plan-eng-review/**', 'scripts/resolvers/preamble.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/review.ts', 'test/helpers/claude-pty-runner.ts', 'test/fixtures/forcing-finding-seeds.ts', 'test/skill-e2e-plan-eng-finding-floor.test.ts'],
  'plan-ceo-finding-floor':      ['plan-ceo-review/**', 'scripts/resolvers/preamble.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/review.ts', 'test/helpers/claude-pty-runner.ts', 'test/fixtures/forcing-finding-seeds.ts', 'test/skill-e2e-plan-ceo-finding-floor.test.ts'],
  'plan-design-finding-floor':   ['plan-design-review/**', 'scripts/resolvers/preamble.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/review.ts', 'test/helpers/claude-pty-runner.ts', 'test/fixtures/forcing-finding-seeds.ts', 'test/skill-e2e-plan-design-finding-floor.test.ts'],
  'plan-devex-finding-floor':    ['plan-devex-review/**', 'scripts/resolvers/preamble.ts', 'scripts/resolvers/preamble/generate-ask-user-format.ts', 'scripts/resolvers/preamble/generate-completion-status.ts', 'scripts/resolvers/review.ts', 'test/helpers/claude-pty-runner.ts', 'test/fixtures/forcing-finding-seeds.ts', 'test/skill-e2e-plan-devex-finding-floor.test.ts'],
-  'brain-privacy-gate':           ['scripts/resolvers/preamble/generate-brain-sync-block.ts', 'scripts/resolvers/preamble.ts', 'bin/gstack-brain-sync', 'bin/gstack-brain-init', 'bin/gstack-config', 'test/helpers/agent-sdk-runner.ts'],
+  'brain-privacy-gate':           ['scripts/resolvers/preamble/generate-brain-sync-block.ts', 'scripts/resolvers/preamble.ts', 'bin/gstack-brain-sync', 'bin/gstack-artifacts-init', 'bin/gstack-config', 'test/helpers/agent-sdk-runner.ts'],
+
+  // /setup-gbrain Path 4 (Remote MCP) — happy + bad-token end-to-end via
+  // Agent SDK. Gate-tier (deterministic stub server, fixed inputs); fires
+  // when the skill template, the verify helper, the artifacts-init helper,
+  // or the detect script changes.
+  'setup-gbrain-remote':          ['setup-gbrain/SKILL.md.tmpl', 'bin/gstack-gbrain-mcp-verify', 'bin/gstack-artifacts-init', 'bin/gstack-gbrain-detect', 'test/helpers/agent-sdk-runner.ts'],
+  'setup-gbrain-bad-token':       ['setup-gbrain/SKILL.md.tmpl', 'bin/gstack-gbrain-mcp-verify', 'test/helpers/agent-sdk-runner.ts'],

  // AskUserQuestion format regression (RECOMMENDATION + Completeness: N/10)
  // Fires when either template OR the two preamble resolvers change.
@@ -441,6 +448,16 @@ export const E2E_TIERS: Record<string, 'gate' | 'periodic'> = {
  // costs ~$0.30-$0.50 per run, not needed on every commit)
  'brain-privacy-gate': 'periodic',

+  // /setup-gbrain Path 4 (Remote MCP) — periodic-tier. The stub HTTP
+  // server is deterministic but the model's interpretation of "follow
+  // Path 4 only" is not — assertions on which steps the model ran are
+  // flaky. The deterministic gate-tier coverage for Path 4 lives in
+  // test/setup-gbrain-path4-structure.test.ts (free, <200ms). These
+  // E2E tests stay available for on-demand verification of the live
+  // model's behavior against a stub MCP server.
+  'setup-gbrain-remote': 'periodic',
+  'setup-gbrain-bad-token': 'periodic',
+
  // AskUserQuestion format regression — periodic (Opus 4.7 non-deterministic benchmark)
  'plan-ceo-review-format-mode': 'periodic',
  'plan-ceo-review-format-approach': 'periodic',