Files
gstack/test/skill-e2e-plan-prosons.test.ts
Garry Tan a64d70ba35 Merge remote-tracking branch 'origin/main' into garrytan/workspace-aware-ship
Rebumped v1.8.0.0 -> v1.11.0.0 (minor-past main's v1.10.1.0) using
bin/gstack-next-version — the same queue-aware path this branch introduces.
CHANGELOG repositioned so v1.11.0.0 sits above main's new entries
(v1.10.1.0 / v1.10.0.0 / v1.9.0.0).

Conflicts resolved:
- VERSION, package.json: rebumped to v1.11.0.0 (util-picked)
- bin/gstack-config: merged both lists (workspace_root + gbrain keys)
- CHANGELOG.md: hoisted v1.11.0.0 entry above main's new entries

Pre-existing failures in main (4) documented but not fixed in this PR:
1. gstack-brain-sync secret scan > blocks bearer-json (brain-sync tests)
2. no files larger than 2MB (security-bench fixture, already TODO'd)
3. selectTests > skill-specific change (touchfiles scoping)
4. Opus 4.7 overlay pacing directive (expectation stale after v1.10.1.0
   removed the Fan out nudge)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-23 21:20:25 -07:00

353 lines
14 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* v1.7.0.0 Pros/Cons format regression tests for plan reviews.
*
* Extends the v1.6.3.0 format harness (skill-e2e-plan-format.test.ts) with
* four new cases covering the Pros/Cons decision-brief format. They are listed
* here by topic; the "Case N" markers in the code below follow a different order:
*
* 1. Format positive — every AskUserQuestion renders with D<N> / ELI10 /
* Stakes / Recommendation / Pros/cons / ✅×2+ / ❌×1+ / Net tokens.
* 2. Hard-stop positive — destructive-action question may use the single
* "No cons — this is a hard-stop choice" escape.
* 3. Hard-stop NEGATIVE (CT2) — plan with genuine tradeoff, model must NOT
* dodge to the hard-stop escape. Forces real tradeoff articulation.
* 4. Neutral-posture NEGATIVE (CT2) — plan with one clearly-dominant option,
* model must emit (recommended) label and concrete recommendation, NOT
* "no preference — taste call" dodge.
*
* Capture pattern matches existing harness: agent writes verbatim
* AskUserQuestion text to $OUT_FILE; regex predicates run on the captured
* file. Classified periodic (Opus 4.7 non-deterministic).
*
* FOLLOW-UP (not in v1.7.0.0):
* - True cadence eval (3 findings → 3 distinct asks across turns). Current
* $OUT_FILE harness captures ONE would-be question per session. Multi-turn
* cadence needs new harness support. Filed in TODOs.
* - Expanded coverage for /ship /office-hours /investigate /qa /review
* /design-review /document-release. Touchfiles entries already exist; eval
* cases will land as follow-up PRs per skill.
*/
import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
import { runSkillTest } from './helpers/session-runner';
import {
ROOT, runId,
describeIfSelected, testConcurrentIfSelected,
logCost, recordE2E,
createEvalCollector, finalizeEvalCollector,
} from './helpers/e2e-helpers';
import { spawnSync } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
// Shared collector for this file: every recordE2E call below receives it, and
// the file-level afterAll passes it to finalizeEvalCollector.
const evalCollector = createEvalCollector('e2e-plan-prosons');
// v1.7.0.0 format tokens
// Decision header: "D", one or more digits, whitespace, then a "—" dash.
const D_NUMBER_RE = /D\d+\s+—/;
// "ELI10:" label, any letter case.
const ELI10_RE = /ELI10:/i;
// "Stakes if we pick wrong:" label, any letter case.
const STAKES_RE = /Stakes if we pick wrong:/i;
// "Recommendation:" label; only the first letter may vary in case.
const RECOMMENDATION_RE = /[Rr]ecommendation:/;
// "Pros/cons:" header, optional whitespace around the slash, any letter case.
const PROS_CONS_HEADER_RE = /Pros\s*\/\s*cons:/i;
// A line that starts with "Net:" (the m flag anchors ^ at every line start).
const NET_LINE_RE = /^Net:/m;
// The single-bullet escape phrase; case-sensitive, flexible whitespace around the dash.
const HARD_STOP_ESCAPE_RE = /✅\s+No cons\s+—\s+this is a hard-stop choice/;
// The phrase "taste call" anywhere in the text, any letter case.
const NEUTRAL_POSTURE_RE = /taste call/i;
// The literal, lowercase "(recommended)" option label.
const RECOMMENDED_LABEL_RE = /\(recommended\)/;
/**
 * Counts the non-overlapping occurrences of `char` in `text`.
 *
 * `char` is matched literally. It is never compiled into a regular expression,
 * so needles such as ".", "+" or "(" are counted as plain characters instead
 * of being interpreted as pattern syntax (or throwing a SyntaxError).
 *
 * @param text - Text to scan.
 * @param char - Literal character (or substring) to count.
 * @returns The number of occurrences; 0 when `char` is the empty string.
 */
function countChars(text: string, char: string): number {
  // An empty needle has no meaningful count, and split('') would count code units.
  if (char === '') return 0;
  // N occurrences of the separator always yield N + 1 pieces.
  return text.split(char).length - 1;
}
// Fixture plan with two caching options (Redis layer vs. in-memory LRU) that
// each carry real costs. The Format Positive and Hard-stop Negative suites
// pass it to setupPlanDir, which writes it to plan.md.
const TRADEOFF_PLAN = `# Plan: Add user dashboard caching
## Context
Dashboard renders in 3s on cold load, 800ms on warm cache. Users complain.
## Approach options
### Option A: Redis cache layer (complete)
- Add Redis with 5min TTL for dashboard aggregates.
- Cold path: compute + cache. Warm path: fetch from cache.
- Needs Redis infra, cache invalidation logic for activity updates.
- Covers all users, all flows, fails gracefully on cache miss.
### Option B: In-memory LRU cache (happy path only)
- Per-process LRU with 100-entry cap.
- No cross-process sharing; cache warms per-pod.
- Skips cache invalidation; stale reads up to 5min.
Both options have real pros and cons. This is a genuine tradeoff.
`;
// Fixture plan describing a single destructive, irreversible action with no
// partial variant. The Hard-stop Positive suite passes it to setupPlanDir,
// which writes it to plan.md.
const HARDSTOP_PLAN = `# Plan: Delete all user sessions
## Context
Security incident. All active sessions need to be terminated immediately.
## Action
Run \`DELETE FROM sessions WHERE TRUE\`. No dry-run mode.
This is a one-way door. There is no "partial" version.
`;
// Fixture plan in which Option A is written to clearly beat Option B, and the
// plan text says so explicitly. The Neutral-posture Negative suite passes it
// to setupPlanDir, which writes it to plan.md.
const DOMINANT_PLAN = `# Plan: Add input validation to signup endpoint
## Context
Signup endpoint currently accepts any email string and any password length.
Bug report: users type gibberish, signup succeeds, they can't log in.
## Options
### Option A: Full RFC 5322 email validation + min 8-char password + server-side checks
- Catches malformed emails, rejects weak passwords, validated on server.
- Prevents the reported bug and adjacent bugs.
- Standard web practice.
### Option B: Client-side type="email" only, no password validation
- Only catches some browsers' built-in validation.
- Attackers bypass by disabling JS.
- Does not fix the reported bug.
Option A clearly dominates on coverage. This is NOT a taste call.
`;
/**
 * Creates a throwaway working directory for one eval case.
 *
 * Steps, in order: make a unique directory under the OS temp dir, initialise
 * a git repository on branch `main` with a test identity, write `planContent`
 * to `plan.md` and commit it, then copy `<ROOT>/<skillName>/SKILL.md` into
 * `<dir>/<skillName>/SKILL.md` (copied after the commit, so it stays
 * uncommitted).
 *
 * The git commands are best-effort: their exit status is not inspected, so a
 * missing or failing git does not abort setup.
 *
 * @param tmpPrefix - Prefix for the generated temp directory name.
 * @param planContent - Text written verbatim to `plan.md`.
 * @param skillName - Skill directory name under ROOT that holds `SKILL.md`.
 * @returns Absolute path of the prepared directory.
 * @throws Any filesystem error from writing the plan or copying the skill
 *   file. The temp directory is removed before the error is rethrown.
 */
function setupPlanDir(tmpPrefix: string, planContent: string, skillName: string): string {
  const planDir = fs.mkdtempSync(path.join(os.tmpdir(), tmpPrefix));
  try {
    const run = (cmd: string, args: string[]) =>
      spawnSync(cmd, args, { cwd: planDir, stdio: 'pipe', timeout: 5000 });

    run('git', ['init', '-b', 'main']);
    run('git', ['config', 'user.email', 'test@test.com']);
    run('git', ['config', 'user.name', 'Test']);

    fs.writeFileSync(path.join(planDir, 'plan.md'), planContent);
    run('git', ['add', '.']);
    run('git', ['commit', '-m', 'add plan']);

    fs.mkdirSync(path.join(planDir, skillName), { recursive: true });
    fs.copyFileSync(
      path.join(ROOT, skillName, 'SKILL.md'),
      path.join(planDir, skillName, 'SKILL.md'),
    );
    return planDir;
  } catch (err) {
    // The caller never receives the path when setup fails, so nothing else
    // could remove this directory; clean it up here instead of leaking it.
    try {
      fs.rmSync(planDir, { recursive: true, force: true });
    } catch {
      // Keep the original failure as the reported error.
    }
    throw err;
  }
}
/**
 * Builds the capture directive that is embedded in each prompt in this file.
 *
 * The directive tells the agent to write the full question text to `outFile`
 * instead of asking the user, so the assertions can run on that file.
 *
 * @param outFile - Path the agent is told to write the question text to.
 * @returns The directive as a single line of text.
 */
function captureInstruction(outFile: string): string {
  const destination = `Write the verbatim text of the single AskUserQuestion you would have made to ${outFile}`;
  const requiredParts =
    '(full text including D<N> header, ELI10, Stakes, Recommendation, Pros/cons, and Net line — the complete rich markdown body).';
  const guardrails =
    'Do NOT call any tool to ask the user. Do NOT paraphrase. This is a format-capture test.';
  return [destination, requiredParts, guardrails].join(' ');
}
// --- Case 1: Format positive — all v1.7.0.0 tokens present ---
// Runs the plan-ceo-review skill against the caching tradeoff plan and checks
// that the captured question carries every format token.
describeIfSelected('Plan Prosons — Format Positive', ['plan-review-prosons-format'], () => {
  const TEST_ID = 'plan-review-prosons-format';
  const ACCEPTED_EXITS = ['success', 'error_max_turns'];
  let workDir: string;
  let capturePath: string;

  beforeAll(() => {
    workDir = setupPlanDir('skill-e2e-plan-prosons-format-', TRADEOFF_PLAN, 'plan-ceo-review');
    capturePath = path.join(workDir, 'ask-capture.md');
  });

  afterAll(() => {
    // Best-effort cleanup: a failed removal must not fail the suite.
    try {
      fs.rmSync(workDir, { recursive: true, force: true });
    } catch {}
  });

  testConcurrentIfSelected(TEST_ID, async () => {
    const prompt = [
      'Read plan-ceo-review/SKILL.md for the review workflow.',
      'Read plan.md — two cache approaches with real tradeoffs. Pick the architectural approach via AskUserQuestion (Step 0C-bis / Implementation Alternatives). These options differ in coverage.',
      captureInstruction(capturePath),
      'After writing the file, stop.',
    ].join('\n');

    const result = await runSkillTest({
      prompt,
      workingDirectory: workDir,
      maxTurns: 10,
      timeout: 240_000,
      testName: TEST_ID,
      runId,
      model: 'claude-opus-4-7',
    });

    logCost('/plan-review prosons format positive', result);
    const exitedCleanly = ACCEPTED_EXITS.includes(result.exitReason);
    recordE2E(evalCollector, '/plan-review-prosons-format', 'Plan Prosons — Format Positive', result, {
      passed: exitedCleanly,
    });

    expect(ACCEPTED_EXITS).toContain(result.exitReason);
    expect(fs.existsSync(capturePath)).toBe(true);
    const captured = fs.readFileSync(capturePath, 'utf-8');
    expect(captured.length).toBeGreaterThan(200);

    // Every Pros/Cons token must be present, checked in this order.
    const requiredTokens = [
      D_NUMBER_RE,
      ELI10_RE,
      STAKES_RE,
      RECOMMENDATION_RE,
      PROS_CONS_HEADER_RE,
      NET_LINE_RE,
    ];
    for (const token of requiredTokens) {
      expect(captured).toMatch(token);
    }

    // Bullet totals for two options: at least 2 ✅ and 1 ❌ each, so ≥4 ✅ and ≥2 ❌ overall.
    expect(countChars(captured, '✅')).toBeGreaterThanOrEqual(4);
    expect(countChars(captured, '❌')).toBeGreaterThanOrEqual(2);

    // One option has to carry the (recommended) label.
    expect(captured).toMatch(RECOMMENDED_LABEL_RE);
  }, 300_000);
});
// --- Case 2: Hard-stop escape NEGATIVE (CT2) ---
// Same tradeoff plan as Case 1. The captured question must list real pros and
// cons and must not fall back to the hard-stop escape phrase.
describeIfSelected('Plan Prosons — Hard-stop Negative', ['plan-review-prosons-hardstop-neg'], () => {
  const TEST_ID = 'plan-review-prosons-hardstop-neg';
  const ACCEPTED_EXITS = ['success', 'error_max_turns'];
  let workDir: string;
  let capturePath: string;

  beforeAll(() => {
    workDir = setupPlanDir('skill-e2e-plan-prosons-hardstop-neg-', TRADEOFF_PLAN, 'plan-ceo-review');
    capturePath = path.join(workDir, 'ask-capture.md');
  });

  afterAll(() => {
    // Best-effort cleanup: a failed removal must not fail the suite.
    try {
      fs.rmSync(workDir, { recursive: true, force: true });
    } catch {}
  });

  testConcurrentIfSelected(TEST_ID, async () => {
    const prompt = [
      'Read plan-ceo-review/SKILL.md.',
      'Read plan.md — this has REAL tradeoffs between Redis and in-memory caching (both have pros and cons). Pick the architectural approach via AskUserQuestion.',
      captureInstruction(capturePath),
      'After writing the file, stop.',
    ].join('\n');

    const result = await runSkillTest({
      prompt,
      workingDirectory: workDir,
      maxTurns: 10,
      timeout: 240_000,
      testName: TEST_ID,
      runId,
      model: 'claude-opus-4-7',
    });

    logCost('/plan-review prosons hard-stop negative', result);
    const exitedCleanly = ACCEPTED_EXITS.includes(result.exitReason);
    recordE2E(evalCollector, '/plan-review-prosons-hardstop-neg', 'Plan Prosons — Hard-stop Negative', result, {
      passed: exitedCleanly,
    });

    expect(ACCEPTED_EXITS).toContain(result.exitReason);
    expect(fs.existsSync(capturePath)).toBe(true);
    const captured = fs.readFileSync(capturePath, 'utf-8');
    expect(captured.length).toBeGreaterThan(200);

    // The plan is a genuine tradeoff, so the escape phrase is not acceptable.
    expect(captured).not.toMatch(HARD_STOP_ESCAPE_RE);

    // Real pros and cons: at least 2 ✅ and 1 ❌ per option across two options.
    expect(countChars(captured, '✅')).toBeGreaterThanOrEqual(4);
    expect(countChars(captured, '❌')).toBeGreaterThanOrEqual(2);
  }, 300_000);
});
// --- Case 3: Neutral-posture NEGATIVE (CT2) ---
// Uses the plan where Option A dominates. The captured question must commit to
// a recommendation with a stated reason rather than calling it a "taste call".
describeIfSelected('Plan Prosons — Neutral-posture Negative', ['plan-review-prosons-neutral-neg'], () => {
  const TEST_ID = 'plan-review-prosons-neutral-neg';
  const ACCEPTED_EXITS = ['success', 'error_max_turns'];
  let workDir: string;
  let capturePath: string;

  beforeAll(() => {
    workDir = setupPlanDir('skill-e2e-plan-prosons-neutral-neg-', DOMINANT_PLAN, 'plan-ceo-review');
    capturePath = path.join(workDir, 'ask-capture.md');
  });

  afterAll(() => {
    // Best-effort cleanup: a failed removal must not fail the suite.
    try {
      fs.rmSync(workDir, { recursive: true, force: true });
    } catch {}
  });

  testConcurrentIfSelected(TEST_ID, async () => {
    const prompt = [
      'Read plan-ceo-review/SKILL.md.',
      'Read plan.md — Option A dominates Option B on coverage. This is NOT a taste call. Pick the approach via AskUserQuestion (Step 0C-bis / Implementation Alternatives — coverage-differentiated, so Completeness: N/10 applies).',
      captureInstruction(capturePath),
      'After writing the file, stop.',
    ].join('\n');

    const result = await runSkillTest({
      prompt,
      workingDirectory: workDir,
      maxTurns: 10,
      timeout: 240_000,
      testName: TEST_ID,
      runId,
      model: 'claude-opus-4-7',
    });

    logCost('/plan-review prosons neutral negative', result);
    const exitedCleanly = ACCEPTED_EXITS.includes(result.exitReason);
    recordE2E(evalCollector, '/plan-review-prosons-neutral-neg', 'Plan Prosons — Neutral Negative', result, {
      passed: exitedCleanly,
    });

    expect(ACCEPTED_EXITS).toContain(result.exitReason);
    expect(fs.existsSync(capturePath)).toBe(true);
    const captured = fs.readFileSync(capturePath, 'utf-8');
    expect(captured.length).toBeGreaterThan(200);

    // A dominant option exists, so the "taste call" dodge is not acceptable.
    expect(captured).not.toMatch(NEUTRAL_POSTURE_RE);

    // The dominant option has to be marked (recommended).
    expect(captured).toMatch(RECOMMENDED_LABEL_RE);

    // The Recommendation line must give a reason: "because" on the same line.
    expect(captured).toMatch(/[Rr]ecommendation:.*because/);
  }, 300_000);
});
// --- Case 4: Hard-stop POSITIVE (escape allowed when legitimately one-sided) ---
// Uses the destructive one-way plan. Either the escape phrase or a real
// pros/cons list is accepted.
// NOTE(review): the selection key and test name say "cadence" although this
// suite checks the hard-stop positive case — presumably kept to match an
// existing touchfiles entry; confirm before renaming.
describeIfSelected('Plan Prosons — Hard-stop Positive', ['plan-ceo-review-prosons-cadence'], () => {
  const TEST_ID = 'plan-ceo-review-prosons-cadence';
  const ACCEPTED_EXITS = ['success', 'error_max_turns'];
  let workDir: string;
  let capturePath: string;

  beforeAll(() => {
    workDir = setupPlanDir('skill-e2e-plan-prosons-hardstop-pos-', HARDSTOP_PLAN, 'plan-ceo-review');
    capturePath = path.join(workDir, 'ask-capture.md');
  });

  afterAll(() => {
    // Best-effort cleanup: a failed removal must not fail the suite.
    try {
      fs.rmSync(workDir, { recursive: true, force: true });
    } catch {}
  });

  testConcurrentIfSelected(TEST_ID, async () => {
    const prompt = [
      'Read plan-ceo-review/SKILL.md.',
      'Read plan.md — this is a destructive one-way action (terminate all sessions). Ask the user to confirm via AskUserQuestion. This is a legitimate hard-stop choice — the hard-stop escape (`✅ No cons — this is a hard-stop choice`) is allowed here because there is no meaningful alternative besides doing or not doing the action.',
      captureInstruction(capturePath),
      'After writing the file, stop.',
    ].join('\n');

    const result = await runSkillTest({
      prompt,
      workingDirectory: workDir,
      maxTurns: 10,
      timeout: 240_000,
      testName: TEST_ID,
      runId,
      model: 'claude-opus-4-7',
    });

    logCost('/plan-review prosons hard-stop positive', result);
    const exitedCleanly = ACCEPTED_EXITS.includes(result.exitReason);
    recordE2E(evalCollector, '/plan-ceo-review-prosons-cadence', 'Plan Prosons — Hard-stop Positive', result, {
      passed: exitedCleanly,
    });

    expect(ACCEPTED_EXITS).toContain(result.exitReason);
    expect(fs.existsSync(capturePath)).toBe(true);
    const captured = fs.readFileSync(capturePath, 'utf-8');
    expect(captured.length).toBeGreaterThan(100);

    // The Pros/cons header is still required even when the escape is used.
    expect(captured).toMatch(PROS_CONS_HEADER_RE);

    // Two valid shapes: the escape phrase, or at least one ✅ and one ❌.
    const usedEscape = HARD_STOP_ESCAPE_RE.test(captured);
    const proCount = countChars(captured, '✅');
    const conCount = countChars(captured, '❌');
    const listedBothSides = proCount >= 1 && conCount >= 1;
    expect(usedEscape || listedBothSides).toBe(true);
  }, 300_000);
});
// File-level teardown, registered outside every describeIfSelected block:
// passes the shared evalCollector to finalizeEvalCollector and awaits it.
// NOTE(review): presumably this is the step that persists the recordE2E
// entries — confirm in helpers/e2e-helpers.
afterAll(async () => {
await finalizeEvalCollector(evalCollector);
});