mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-12 23:43:05 +08:00
* feat: /codex skill — multi-AI second opinion (review, challenge, consult) Three modes: code review with pass/fail gate, adversarial challenge mode, and conversational consult with session continuity. First multi-AI skill in gstack, wrapping OpenAI's Codex CLI. * feat: integrate /codex into /review, /ship, /plan-eng-review + dashboard /review offers Codex second opinion after completing its own review. /ship offers Codex review as optional gate before pushing. /plan-eng-review offers Codex plan critique after scope challenge. Review Readiness Dashboard shows Codex Review as optional row. * chore: bump version and changelog (v0.8.0) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * test: codex skill validation (12 stub tests) + E2E eval test Stub tests (free tier): verify template content — three modes, gate verdict, session continuity, cost tracking, cross-model comparison, binary discovery, error handling, mktemp usage, and integrations into /review, /ship, /plan-eng-review. E2E test (paid tier): runs /codex review on vulnerable fixture repo via session-runner, verifies output contains findings and GATE verdict. * fix: codex auth error message — use codex login, not OPENAI_API_KEY Codex authenticates via ChatGPT OAuth (codex login), not an env var. * feat: codex uses high reasoning effort by default gpt-5.2-codex is the only model available with ChatGPT login. All commands now use model_reasoning_effort="high" for maximum depth — the whole point is a thorough second opinion. * feat: crank codex reasoning to xhigh (maximum) * feat: per-mode reasoning (high for review/consult, xhigh for challenge) + web search Review and consult use high reasoning — thorough but not slow. Challenge (adversarial) uses xhigh — maximum depth for breaking code. All modes enable web_search_cached so Codex can look up docs/APIs. * refactor: don't hardcode model — use codex default (always latest) * feat: JSONL output for codex challenge + consult modes Use --json flag to parse codex's JSONL events, extracting reasoning traces ([codex thinking]), tool calls ([codex ran]), and token counts. This gives richer output than the -o flag alone — you can see what codex thought through before its answer. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * fix: only persist codex-review log when code review actually ran Don't write a codex-review entry to reviews.jsonl when only the adversarial challenge (option B) was selected — there's no gate verdict to record, and a false entry misleads the Review Readiness Dashboard into thinking a code review happened. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * feat: add codex plan review option to /plan-eng-review After scope challenge (Step 0), offer to have Codex independently review the plan with a brutally honest tech reviewer persona. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * test: update e2e test for codex skill Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * fix: codex integration bugs — plan content, review persistence, quoting, stderr - plan-eng-review: Codex now reads the plan file itself instead of inlining content as a CLI arg (avoids ARG_MAX for large plans) - review: add missing echo to persist codex-review results to reviews.jsonl - codex: consult mode uses $TMPERR (mktemp) instead of hardcoded stderr path - codex + review: quote $SLUG/$BRANCH_SLUG in review log paths - codex: scope plan lookup to current project, warn on cross-project fallback Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * fix: add .context/ to .gitignore to prevent session ID leaks Codex consult mode stores session IDs in .context/codex-session-id. Without this ignore rule, session IDs could leak into commits. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * feat: proactive skill suggestions + opt-out + trigger phrase tests - Preamble reads proactive config via gstack-config - Root SKILL.md.tmpl has lifecycle map (stage → skill suggestion) - Users can opt out ("stop suggesting") / opt in ("be proactive again") - Restored trigger phrase validation tests (16 skills × "Use when" check) - Added missing "Use when" trigger phrases to /debug and /office-hours Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * chore: update changelog for v0.8.0 — add proactive suggestions note Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
217 lines
8.9 KiB
TypeScript
217 lines
8.9 KiB
TypeScript
/**
|
|
* Diff-based test selection for E2E and LLM-judge evals.
|
|
*
|
|
* Each test declares which source files it depends on ("touchfiles").
|
|
* The test runner checks `git diff` and only runs tests whose
|
|
* dependencies were modified. Override with EVALS_ALL=1 to run everything.
|
|
*/
|
|
|
|
import { spawnSync } from 'child_process';
|
|
|
|
// --- Glob matching ---
|
|
|
|
/**
|
|
* Match a file path against a glob pattern.
|
|
* Supports:
|
|
* ** — match any number of path segments
|
|
* * — match within a single segment (no /)
|
|
*/
|
|
export function matchGlob(file: string, pattern: string): boolean {
|
|
const regexStr = pattern
|
|
.replace(/\./g, '\\.')
|
|
.replace(/\*\*/g, '{{GLOBSTAR}}')
|
|
.replace(/\*/g, '[^/]*')
|
|
.replace(/\{\{GLOBSTAR\}\}/g, '.*');
|
|
return new RegExp(`^${regexStr}$`).test(file);
|
|
}
|
|
|
|
// --- Touchfile maps ---
|
|
|
|
/**
|
|
* E2E test touchfiles — keyed by testName (the string passed to runSkillTest).
|
|
* Each test lists the file patterns that, if changed, require the test to run.
|
|
*/
|
|
export const E2E_TOUCHFILES: Record<string, string[]> = {
|
|
// Browse core
|
|
'browse-basic': ['browse/src/**'],
|
|
'browse-snapshot': ['browse/src/**'],
|
|
|
|
// SKILL.md setup + preamble (depend on ROOT SKILL.md only)
|
|
'skillmd-setup-discovery': ['SKILL.md', 'SKILL.md.tmpl'],
|
|
'skillmd-no-local-binary': ['SKILL.md', 'SKILL.md.tmpl'],
|
|
'skillmd-outside-git': ['SKILL.md', 'SKILL.md.tmpl'],
|
|
'contributor-mode': ['SKILL.md', 'SKILL.md.tmpl'],
|
|
'session-awareness': ['SKILL.md', 'SKILL.md.tmpl'],
|
|
|
|
// QA
|
|
'qa-quick': ['qa/**', 'browse/src/**'],
|
|
'qa-b6-static': ['qa/**', 'browse/src/**', 'browse/test/fixtures/qa-eval.html', 'test/fixtures/qa-eval-ground-truth.json'],
|
|
'qa-b7-spa': ['qa/**', 'browse/src/**', 'browse/test/fixtures/qa-eval-spa.html', 'test/fixtures/qa-eval-spa-ground-truth.json'],
|
|
'qa-b8-checkout': ['qa/**', 'browse/src/**', 'browse/test/fixtures/qa-eval-checkout.html', 'test/fixtures/qa-eval-checkout-ground-truth.json'],
|
|
'qa-only-no-fix': ['qa-only/**', 'qa/templates/**'],
|
|
'qa-fix-loop': ['qa/**', 'browse/src/**'],
|
|
|
|
// Review
|
|
'review-sql-injection': ['review/**', 'test/fixtures/review-eval-vuln.rb'],
|
|
'review-enum-completeness': ['review/**', 'test/fixtures/review-eval-enum*.rb'],
|
|
'review-base-branch': ['review/**'],
|
|
'review-design-lite': ['review/**', 'test/fixtures/review-eval-design-slop.*'],
|
|
|
|
// Plan reviews
|
|
'plan-ceo-review': ['plan-ceo-review/**'],
|
|
'plan-ceo-review-selective': ['plan-ceo-review/**'],
|
|
'plan-eng-review': ['plan-eng-review/**'],
|
|
'plan-eng-review-artifact': ['plan-eng-review/**'],
|
|
|
|
// Ship
|
|
'ship-base-branch': ['ship/**'],
|
|
|
|
// Retro
|
|
'retro': ['retro/**'],
|
|
'retro-base-branch': ['retro/**'],
|
|
|
|
// Document-release
|
|
'document-release': ['document-release/**'],
|
|
|
|
// Codex
|
|
'codex-review': ['codex/**'],
|
|
|
|
// QA bootstrap
|
|
'qa-bootstrap': ['qa/**', 'browse/src/**', 'ship/**'],
|
|
|
|
// Ship coverage audit
|
|
'ship-coverage-audit': ['ship/**'],
|
|
|
|
// Design
|
|
'design-consultation-core': ['design-consultation/**'],
|
|
'design-consultation-research': ['design-consultation/**'],
|
|
'design-consultation-existing': ['design-consultation/**'],
|
|
'design-consultation-preview': ['design-consultation/**'],
|
|
'plan-design-review-plan-mode': ['plan-design-review/**'],
|
|
'plan-design-review-no-ui-scope': ['plan-design-review/**'],
|
|
'design-review-fix': ['design-review/**', 'browse/src/**'],
|
|
|
|
// gstack-upgrade
|
|
'gstack-upgrade-happy-path': ['gstack-upgrade/**'],
|
|
|
|
// Skill routing — journey-stage tests (depend on ALL skill descriptions)
|
|
'journey-ideation': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
|
|
'journey-plan-eng': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
|
|
'journey-think-bigger': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
|
|
'journey-debug': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
|
|
'journey-qa': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
|
|
'journey-code-review': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
|
|
'journey-ship': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
|
|
'journey-docs': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
|
|
'journey-retro': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
|
|
'journey-design-system': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
|
|
'journey-visual-qa': ['*/SKILL.md.tmpl', 'SKILL.md.tmpl', 'scripts/gen-skill-docs.ts'],
|
|
};
|
|
|
|
/**
|
|
* LLM-judge test touchfiles — keyed by test description string.
|
|
*/
|
|
export const LLM_JUDGE_TOUCHFILES: Record<string, string[]> = {
|
|
'command reference table': ['SKILL.md', 'SKILL.md.tmpl', 'browse/src/commands.ts'],
|
|
'snapshot flags reference': ['SKILL.md', 'SKILL.md.tmpl', 'browse/src/snapshot.ts'],
|
|
'browse/SKILL.md reference': ['browse/SKILL.md', 'browse/SKILL.md.tmpl', 'browse/src/**'],
|
|
'setup block': ['SKILL.md', 'SKILL.md.tmpl'],
|
|
'regression vs baseline': ['SKILL.md', 'SKILL.md.tmpl', 'browse/src/commands.ts', 'test/fixtures/eval-baselines.json'],
|
|
'qa/SKILL.md workflow': ['qa/SKILL.md', 'qa/SKILL.md.tmpl'],
|
|
'qa/SKILL.md health rubric': ['qa/SKILL.md', 'qa/SKILL.md.tmpl'],
|
|
'cross-skill greptile consistency': ['review/SKILL.md', 'review/SKILL.md.tmpl', 'ship/SKILL.md', 'ship/SKILL.md.tmpl', 'review/greptile-triage.md', 'retro/SKILL.md', 'retro/SKILL.md.tmpl'],
|
|
'baseline score pinning': ['SKILL.md', 'SKILL.md.tmpl', 'test/fixtures/eval-baselines.json'],
|
|
|
|
// Ship & Release
|
|
'ship/SKILL.md workflow': ['ship/SKILL.md', 'ship/SKILL.md.tmpl'],
|
|
'document-release/SKILL.md workflow': ['document-release/SKILL.md', 'document-release/SKILL.md.tmpl'],
|
|
|
|
// Plan Reviews
|
|
'plan-ceo-review/SKILL.md modes': ['plan-ceo-review/SKILL.md', 'plan-ceo-review/SKILL.md.tmpl'],
|
|
'plan-eng-review/SKILL.md sections': ['plan-eng-review/SKILL.md', 'plan-eng-review/SKILL.md.tmpl'],
|
|
'plan-design-review/SKILL.md passes': ['plan-design-review/SKILL.md', 'plan-design-review/SKILL.md.tmpl'],
|
|
|
|
// Design skills
|
|
'design-review/SKILL.md fix loop': ['design-review/SKILL.md', 'design-review/SKILL.md.tmpl'],
|
|
'design-consultation/SKILL.md research': ['design-consultation/SKILL.md', 'design-consultation/SKILL.md.tmpl'],
|
|
|
|
// Other skills
|
|
'retro/SKILL.md instructions': ['retro/SKILL.md', 'retro/SKILL.md.tmpl'],
|
|
'qa-only/SKILL.md workflow': ['qa-only/SKILL.md', 'qa-only/SKILL.md.tmpl'],
|
|
'gstack-upgrade/SKILL.md upgrade flow': ['gstack-upgrade/SKILL.md', 'gstack-upgrade/SKILL.md.tmpl'],
|
|
};
|
|
|
|
/**
|
|
* Changes to any of these files trigger ALL tests (both E2E and LLM-judge).
|
|
*/
|
|
export const GLOBAL_TOUCHFILES = [
|
|
'test/helpers/session-runner.ts',
|
|
'test/helpers/eval-store.ts',
|
|
'test/helpers/llm-judge.ts',
|
|
'scripts/gen-skill-docs.ts',
|
|
'test/helpers/touchfiles.ts',
|
|
'browse/test/test-server.ts',
|
|
];
|
|
|
|
// --- Base branch detection ---
|
|
|
|
/**
|
|
* Detect the base branch by trying refs in order.
|
|
* Returns the first valid ref, or null if none found.
|
|
*/
|
|
export function detectBaseBranch(cwd: string): string | null {
|
|
for (const ref of ['origin/main', 'origin/master', 'main', 'master']) {
|
|
const result = spawnSync('git', ['rev-parse', '--verify', ref], {
|
|
cwd, stdio: 'pipe', timeout: 3000,
|
|
});
|
|
if (result.status === 0) return ref;
|
|
}
|
|
return null;
|
|
}
|
|
|
|
/**
|
|
* Get list of files changed between base branch and HEAD.
|
|
*/
|
|
export function getChangedFiles(baseBranch: string, cwd: string): string[] {
|
|
const result = spawnSync('git', ['diff', '--name-only', `${baseBranch}...HEAD`], {
|
|
cwd, stdio: 'pipe', timeout: 5000,
|
|
});
|
|
if (result.status !== 0) return [];
|
|
return result.stdout.toString().trim().split('\n').filter(Boolean);
|
|
}
|
|
|
|
// --- Test selection ---
|
|
|
|
/**
|
|
* Select tests to run based on changed files.
|
|
*
|
|
* Algorithm:
|
|
* 1. If any changed file matches a global touchfile → run ALL tests
|
|
* 2. Otherwise, for each test, check if any changed file matches its patterns
|
|
* 3. Return selected + skipped lists with reason
|
|
*/
|
|
export function selectTests(
|
|
changedFiles: string[],
|
|
touchfiles: Record<string, string[]>,
|
|
globalTouchfiles: string[] = GLOBAL_TOUCHFILES,
|
|
): { selected: string[]; skipped: string[]; reason: string } {
|
|
const allTestNames = Object.keys(touchfiles);
|
|
|
|
// Global touchfile hit → run all
|
|
for (const file of changedFiles) {
|
|
if (globalTouchfiles.some(g => matchGlob(file, g))) {
|
|
return { selected: allTestNames, skipped: [], reason: `global: ${file}` };
|
|
}
|
|
}
|
|
|
|
// Per-test matching
|
|
const selected: string[] = [];
|
|
const skipped: string[] = [];
|
|
for (const [testName, patterns] of Object.entries(touchfiles)) {
|
|
const hit = changedFiles.some(f => patterns.some(p => matchGlob(f, p)));
|
|
(hit ? selected : skipped).push(testName);
|
|
}
|
|
|
|
return { selected, skipped, reason: 'diff' };
|
|
}
|