mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-08 13:39:45 +08:00
Adversarial review (Claude + Codex, both high confidence) identified 6
critical production-harm findings in the /ship pre-landing pass.
All folded in.
Migration v1.0.1.0.sh hardening:
- Add explicit `[ -z "${HOME:-}" ]` guard. HOME="" survives set -u and
expands paths to /.claude/skills/... which could hit absolute paths
under root/containers/sudo-without-H.
- Add python3 fallback inside resolve_real() (was missing; broken
symlinks silently defeated ownership check).
- Ownership-guard Shape 2 (~/.claude/skills/gstack/checkpoint/). Was
unconditional rm -rf. Now: if symlink, check target resolves inside
gstack; if regular dir, check realpath resolves inside gstack. A
user's hand-edited customization or a symlink pointing outside gstack
is preserved with a notice.
- Use `rm --` and `rm -r --` consistently to resist hostile basenames.
- Use `find -type f -not -name .DS_Store -not -name ._*` instead of
`ls -A | grep`. macOS sidecars no longer mask a legit prefix-mode
install. Strip sidecars explicitly before removing the dir.
context-save/SKILL.md.tmpl:
- Sanitize title in bash, not LLM prose. Allowlist [a-z0-9.-], cap 60
chars, default to "untitled". Closes a prompt-injection surface where
`/context-save $(rm -rf ~)` could propagate into subsequent commands.
- Collision-safe filename. If ${TIMESTAMP}-${SLUG}.md already exists
(same-second double-save with same title), append a 4-char random
suffix. The skill contract says "saved files are append-only" — this
enforces it. Silent overwrite was a data-loss bug.
context-restore/SKILL.md.tmpl:
- Cap `find ... | sort -r` at 20 entries via `| head -20`. A user with
10k+ saved files no longer blows the context window just to pick one.
/context-save list still handles the full-history listing path.
test/skill-e2e-autoplan-dual-voice.test.ts:
- Filter transcript to tool_use / tool_result / assistant entries
before matching, so prompt-text mentions of "plan-ceo-review" don't
force the reachedPhase1 assertion to pass. Phase-1 assertion now
requires completion markers ("Phase 1 complete", "Phase 2 started"),
not mere name occurrence.
- claudeVoiceFired now requires JSON evidence of an Agent tool_use
(name:"Agent" or subagent_type field), not the literal string
"Agent(" which could appear anywhere.
- codexVoiceFired now requires a Bash tool_use with a `codex exec/review`
command string, not prompt-text mentions.
All SKILL.md files regenerated. Golden fixtures updated. bun test: 0
failures across 80+ targeted tests and the full suite.
Review source: /ship Step 11 adversarial pass (claude subagent + codex
exec). Same findings independently surfaced by both reviewers — this is
cross-model high confidence.
129 lines
6.1 KiB
TypeScript
129 lines
6.1 KiB
TypeScript
import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
|
|
import { runSkillTest } from './helpers/session-runner';
|
|
import {
|
|
ROOT, runId, evalsEnabled,
|
|
describeIfSelected, logCost, recordE2E,
|
|
copyDirSync, createEvalCollector, finalizeEvalCollector,
|
|
} from './helpers/e2e-helpers';
|
|
import { spawnSync } from 'child_process';
|
|
import * as fs from 'fs';
|
|
import * as path from 'path';
|
|
import * as os from 'os';
|
|
|
|
// E2E for /autoplan's dual-voice (Claude subagent + Codex). Periodic tier:
|
|
// non-deterministic, costs ~$1/run, not a gate. The purpose is to catch
|
|
// regressions where one of the two voices fails silently post-hardening.
|
|
|
|
const evalCollector = createEvalCollector('e2e-autoplan-dual-voice');
|
|
|
|
describeIfSelected('Autoplan dual-voice E2E', ['autoplan-dual-voice'], () => {
|
|
let workDir: string;
|
|
let planPath: string;
|
|
|
|
beforeAll(() => {
|
|
workDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-autoplan-dv-'));
|
|
|
|
const run = (cmd: string, args: string[]) =>
|
|
spawnSync(cmd, args, { cwd: workDir, stdio: 'pipe', timeout: 10000 });
|
|
|
|
run('git', ['init', '-b', 'main']);
|
|
run('git', ['config', 'user.email', 'test@test.com']);
|
|
run('git', ['config', 'user.name', 'Test']);
|
|
fs.writeFileSync(path.join(workDir, 'README.md'), '# test repo\n');
|
|
run('git', ['add', '.']);
|
|
run('git', ['commit', '-m', 'initial']);
|
|
|
|
// Copy /autoplan + its review-skill dependencies (they're loaded from disk).
|
|
copyDirSync(path.join(ROOT, 'autoplan'), path.join(workDir, 'autoplan'));
|
|
copyDirSync(path.join(ROOT, 'plan-ceo-review'), path.join(workDir, 'plan-ceo-review'));
|
|
copyDirSync(path.join(ROOT, 'plan-eng-review'), path.join(workDir, 'plan-eng-review'));
|
|
copyDirSync(path.join(ROOT, 'plan-design-review'), path.join(workDir, 'plan-design-review'));
|
|
copyDirSync(path.join(ROOT, 'plan-devex-review'), path.join(workDir, 'plan-devex-review'));
|
|
|
|
// Write a tiny plan file for /autoplan to review.
|
|
planPath = path.join(workDir, 'TEST_PLAN.md');
|
|
fs.writeFileSync(planPath, `# Test Plan: add /greet skill
|
|
|
|
## Context
|
|
Add a new /greet skill that prints a welcome message.
|
|
|
|
## Scope
|
|
- Create greet/SKILL.md with a simple "hello" flow
|
|
- Add to gen-skill-docs pipeline
|
|
- One unit test
|
|
`);
|
|
});
|
|
|
|
afterAll(() => {
|
|
finalizeEvalCollector(evalCollector);
|
|
if (workDir && fs.existsSync(workDir)) {
|
|
fs.rmSync(workDir, { recursive: true, force: true });
|
|
}
|
|
});
|
|
|
|
// Skip entirely unless evals enabled (periodic tier).
|
|
test.skipIf(!evalsEnabled)(
|
|
'both Claude + Codex voices produce output in Phase 1 (within timeout)',
|
|
async () => {
|
|
// Fire /autoplan with a 5-min hard timeout on the spawn itself.
|
|
// The skill itself has 10-min phase timeouts + auth-gate failfast.
|
|
// If Codex is unavailable on the test machine, the skill should print
|
|
// [codex-unavailable] and still complete the Claude subagent half.
|
|
const result = await runSkillTest({
|
|
testName: 'autoplan-dual-voice',
|
|
workingDirectory: workDir,
|
|
prompt: `/autoplan ${planPath}`,
|
|
timeout: 300_000, // 5 min
|
|
// /autoplan spawns subagents and calls codex via Bash; it needs the
|
|
// full tool set to get past Phase 1. Bash+Read+Write alone wasn't
|
|
// enough — the skill stalled trying to invoke Agent/Skill.
|
|
allowedTools: ['Bash', 'Read', 'Write', 'Edit', 'Grep', 'Glob', 'Agent', 'Skill'],
|
|
maxTurns: 30,
|
|
runId,
|
|
});
|
|
|
|
// Accept EITHER outcome as success:
|
|
// (a) Both voices produced output (ideal case)
|
|
// (b) Codex unavailable + Claude voice produced output (graceful degrade)
|
|
// Search ONLY the tool-call structure — NOT the prompt string that went in.
|
|
// Matching against full transcript is risky because the prompt itself
|
|
// contains "plan-ceo-review" and other marker strings that would produce
|
|
// false positives regardless of skill behavior. Filter to tool_result
|
|
// content + assistant messages emitted DURING execution.
|
|
const transcript = Array.isArray(result.transcript) ? result.transcript : [];
|
|
const executionContent = transcript
|
|
.filter((entry: any) => entry && (entry.type === 'tool_use' || entry.type === 'tool_result' || entry.role === 'assistant'))
|
|
.map((entry: any) => JSON.stringify(entry))
|
|
.join('\n');
|
|
const out = (result.output ?? '') + '\n' + executionContent;
|
|
|
|
// Claude voice: require evidence of a dispatched Agent subagent, not
|
|
// merely the literal string "Agent(" (which could appear in any text).
|
|
// Task/Agent tool_use entries have name:"Agent" or subagent_type:"..."
|
|
const claudeVoiceFired = /"name":\s*"Agent"|"subagent_type":\s*"[^"]/.test(out) ||
|
|
/Claude\s+(CEO|subagent)\s+(review|complete|finished)|claude-subagent\s/i.test(out);
|
|
// Codex voice: require evidence of codex CLI invocation (command string in
|
|
// a Bash tool_use), not prompt-text mentions.
|
|
const codexVoiceFired = /"command":\s*"[^"]*codex\s+(exec|review)/.test(out) ||
|
|
/CODEX SAYS\s*\(/i.test(out);
|
|
// Unavailable markers: explicit probe-failure strings emitted by the skill.
|
|
const codexUnavailable = /\[codex-unavailable\]|AUTH_FAILED\b|CODEX_NOT_AVAILABLE\b|codex_cli_missing|Codex CLI not found/i.test(out);
|
|
|
|
expect(claudeVoiceFired).toBe(true);
|
|
expect(codexVoiceFired || codexUnavailable).toBe(true);
|
|
|
|
// Hang protection: require phase completion evidence, not name mentions.
|
|
// "Phase 1 complete" or a phase-transition marker, not "plan-ceo-review"
|
|
// as a bare string (which appears in the prompt itself).
|
|
const reachedPhase1 = /Phase\s+1\s+(complete|done|finished)|CEO\s+Review\s+(complete|done|approved)|Strategy\s*&\s*Scope\s+(complete|done)|Phase\s+2\s+(started|begin)/i.test(out);
|
|
expect(reachedPhase1).toBe(true);
|
|
|
|
logCost('autoplan-dual-voice', result);
|
|
recordE2E(evalCollector, 'autoplan-dual-voice', 'Autoplan dual-voice E2E', result, {
|
|
passed: claudeVoiceFired && (codexVoiceFired || codexUnavailable) && reachedPhase1,
|
|
});
|
|
},
|
|
330_000, // per-test timeout slightly > spawn timeout so cleanup can run
|
|
);
|
|
});
|