feat: worktree isolation for E2E tests + infrastructure elegance (v0.11.12.0) (#425)

* refactor: extract gen-skill-docs into modular resolver architecture

  Break the 3000-line monolith into 10 domain modules under scripts/resolvers/: types, constants, preamble, utility, browse, design, testing, review, codex-helpers, and index. Each module owns one domain of template generation. The preamble module introduces a 4-tier composition system (T1-T4) so skills only pay for the preamble sections they actually need, reducing token usage for lightweight skills by ~40%. Adds a token budget dashboard that prints after every generation run showing per-skill and total token counts.

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* feat: tiered preamble — skills only pay for what they use

  Tag all 23 templates with preamble-tier (T1-T4). Lightweight skills like /browse and /benchmark get a minimal preamble (~40% fewer tokens), while review skills get the full stack. Regenerate all SKILL.md files.

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* feat: migrate eval storage to project-scoped paths

  Move eval results and E2E run artifacts from ~/.gstack-dev/evals/ to ~/.gstack/projects/$SLUG/evals/ so each project's eval history lives alongside its other gstack data. Falls back to legacy path if slug detection fails.

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* fix: sync package.json version with VERSION after merge

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* feat: add WorktreeManager for isolated test environments

  Reusable platform module (lib/worktree.ts) that creates git worktrees for test isolation and harvests useful changes as patches. Includes SHA-256 dedup, original SHA tracking for committed change detection, and automatic gitignored artifact copying (.agents/, browse/dist/). 12 unit tests covering lifecycle, harvest, dedup, and error handling.

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* feat: integrate worktree isolation into E2E test infrastructure

  Add createTestWorktree(), harvestAndCleanup(), and describeWithWorktree() helpers to e2e-helpers.ts. Add harvest field to EvalTestEntry for eval-store integration. Register lib/worktree.ts as a global touchfile.

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* feat: run Gemini and Codex E2E tests in worktrees

  Switch both test suites from cwd: ROOT to worktree isolation. Gemini (--yolo) no longer pollutes the working tree. Codex (read-only) gets worktree for consistency. Useful changes are harvested as patches for cherry-picking.

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* fix: skip symlinks in copyDirSync to prevent infinite recursion

  Adversarial review caught that .claude/skills/gstack may be a symlink back to the repo root, causing copyDirSync to recurse infinitely when copying gitignored artifacts into worktrees.

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* chore: bump version and changelog (v0.11.12.0)

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix: relax session-awareness assertion to accept structured options

  The LLM consistently presents well-formatted A/B choices with pros/cons but doesn't always use the exact string "RECOMMENDATION". Accept case-insensitive "recommend", "option a", "which do you want", or "which approach" as equivalent signals of a structured recommendation.

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-15 08:48:42 +08:00 · 2026-03-23 23:05:22 -07:00
parent 2c5ae38542
commit dc5e0538e5
79 changed files with 4217 additions and 4681 deletions
--- a/scripts/resolvers/codex-helpers.ts
+++ b/scripts/resolvers/codex-helpers.ts
@@ -0,0 +1,132 @@
+import type { Host } from './types';
+
+// Maximum length, in characters, of the string returned by
+// condenseOpenAIShortDescription (the trailing "..." counts toward the limit).
+const OPENAI_SHORT_DESCRIPTION_LIMIT = 120;
+
+/**
+ * Pull the `name` and `description` fields out of a leading `---` frontmatter block.
+ *
+ * The description may be a single-line value (`description: text`) or a literal
+ * block scalar (`description: |`, optionally with a chomping indicator such as
+ * `|-` or `|+`). For a block, the following blank or indented lines are
+ * collected with two leading spaces removed, joined with newlines, and trimmed.
+ *
+ * @param content - Full file text; the frontmatter must begin at offset 0.
+ * @returns The trimmed name and description. Both are empty strings when there
+ *   is no leading frontmatter or its closing `---` is missing; each is empty
+ *   when its key is absent or has no value.
+ */
+export function extractNameAndDescription(content: string): { name: string; description: string } {
+  const fmStart = content.indexOf('---\n');
+  if (fmStart !== 0) return { name: '', description: '' };
+  const fmEnd = content.indexOf('\n---', fmStart + 4);
+  if (fmEnd === -1) return { name: '', description: '' };
+
+  const frontmatter = content.slice(fmStart + 4, fmEnd);
+
+  // [ \t]* instead of \s*: \s also matches newlines, so an empty `name:` line
+  // would swallow the line break and capture the next line as the name.
+  const nameMatch = frontmatter.match(/^name:[ \t]*(.+)$/m);
+  const name = nameMatch ? nameMatch[1].trim() : '';
+
+  let description = '';
+  let inDescription = false;
+  const descLines: string[] = [];
+  for (const line of frontmatter.split('\n')) {
+    // Bare `description:` or a literal block header (`|`, `|-`, `|+`): the
+    // value lives on the indented lines that follow.
+    if (/^description:\s*(?:\|[+-]?)?\s*$/.test(line)) {
+      inDescription = true;
+      continue;
+    }
+    // Inline value on the same line as the key.
+    if (/^description:\s*\S/.test(line)) {
+      description = line.replace(/^description:\s*/, '').trim();
+      break;
+    }
+    if (inDescription) {
+      // Blank and indented lines belong to the block; the first unindented
+      // line is the next key and ends it.
+      if (line === '' || /^\s/.test(line)) {
+        descLines.push(line.replace(/^  /, ''));
+      } else {
+        break;
+      }
+    }
+  }
+  if (descLines.length > 0) {
+    description = descLines.join('\n').trim();
+  }
+
+  return { name, description };
+}
+
+/**
+ * Shorten a description to at most OPENAI_SHORT_DESCRIPTION_LIMIT characters.
+ *
+ * Only the first paragraph (text before the first blank line) is used, with
+ * every whitespace run collapsed to a single space. Text over the limit is cut
+ * to leave room for a trailing "...", backing up to the last space when that
+ * space lies beyond index 40 so the cut does not land mid-word.
+ *
+ * @param description - Full, possibly multi-paragraph description.
+ * @returns The single-line, possibly truncated description.
+ */
+export function condenseOpenAIShortDescription(description: string): string {
+  const [leadParagraph] = description.split(/\n\s*\n/);
+  // An empty first chunk (description starts with a blank line) falls back to the full text.
+  const oneLine = (leadParagraph || description).replace(/\s+/g, ' ').trim();
+  if (oneLine.length <= OPENAI_SHORT_DESCRIPTION_LIMIT) {
+    return oneLine;
+  }
+
+  const ellipsis = '...';
+  let head = oneLine.slice(0, OPENAI_SHORT_DESCRIPTION_LIMIT - ellipsis.length);
+  const wordBreak = head.lastIndexOf(' ');
+  if (wordBreak > 40) {
+    head = head.slice(0, wordBreak);
+  }
+  return head + ellipsis;
+}
+
+/**
+ * Render the YAML text for a skill: an `interface` section (display name,
+ * short description, default prompt) followed by a `policy` section that
+ * enables implicit invocation.
+ *
+ * @param displayName - Skill name; also embedded in the default prompt.
+ * @param shortDescription - One-line description for the skill.
+ * @returns The YAML document, ending with a newline.
+ */
+export function generateOpenAIYaml(displayName: string, shortDescription: string): string {
+  // JSON.stringify yields a double-quoted, escaped scalar for each value.
+  const quote = (value: string): string => JSON.stringify(value);
+  const yamlLines = [
+    'interface:',
+    `  display_name: ${quote(displayName)}`,
+    `  short_description: ${quote(shortDescription)}`,
+    `  default_prompt: ${quote(`Use ${displayName} for this task.`)}`,
+    'policy:',
+    '  allow_implicit_invocation: true',
+    '', // trailing newline
+  ];
+  return yamlLines.join('\n');
+}
+
+/**
+ * Map a skill directory to its `gstack`-prefixed skill name.
+ *
+ * @param skillDir - Skill directory name; `'.'` or `''` maps to plain `gstack`.
+ * @returns `gstack`, the directory unchanged if it already starts with
+ *   `gstack-`, or the directory with `gstack-` prepended.
+ */
+export function codexSkillName(skillDir: string): string {
+  const prefix = 'gstack-';
+  const isDotOrEmpty = skillDir === '' || skillDir === '.';
+  if (isDotOrEmpty) {
+    return 'gstack';
+  }
+  // Already prefixed (e.g. gstack-upgrade): return as-is rather than gstack-gstack-upgrade.
+  return skillDir.startsWith(prefix) ? skillDir : `${prefix}${skillDir}`;
+}
+
+/**
+ * Rewrite a skill file's frontmatter for Codex so that only `name` and
+ * `description` remain; allowed-tools, hooks, version, and every other field
+ * are dropped. The description is always re-emitted as a YAML `|` block scalar,
+ * so multiline descriptions survive.
+ *
+ * Content is returned untouched when `host` is `'claude'`, or when it has no
+ * leading `---` frontmatter block with a closing `---`.
+ *
+ * @param content - Full skill file text.
+ * @param host - Target host; anything other than `'claude'` triggers the rewrite.
+ * @returns The content with its frontmatter replaced, body unchanged.
+ * @throws Error if the extracted description is longer than 1024 characters.
+ */
+export function transformFrontmatter(content: string, host: Host): string {
+  if (host === 'claude') {
+    return content;
+  }
+
+  // The opening fence must sit at offset 0; only then is the closing fence searched for.
+  const openAt = content.indexOf('---\n');
+  const closeAt = openAt === 0 ? content.indexOf('\n---', openAt + 4) : -1;
+  if (closeAt === -1) {
+    return content;
+  }
+
+  const { name, description } = extractNameAndDescription(content);
+
+  // Codex 1024-char description limit: fail the build instead of shipping a broken skill.
+  const MAX_DESC = 1024;
+  if (description.length > MAX_DESC) {
+    throw new Error(
+      `Codex description for "${name}" is ${description.length} chars (max ${MAX_DESC}). ` +
+      `Compress the description in the .tmpl file.`
+    );
+  }
+
+  // Indent each description line by two spaces to sit under the `|` header.
+  const descBlock = description
+    .split('\n')
+    .map(line => `  ${line}`)
+    .join('\n');
+
+  // Everything after the closing fence, starting with the newline that follows it.
+  const rest = content.slice(closeAt + 4);
+  return `---\nname: ${name}\ndescription: |\n${descBlock}\n---${rest}`;
+}
+
+/**
+ * Build an inline safety advisory from the hook matchers declared in a template.
+ *
+ * Every `matcher: "Name"` entry in the content is collected (each name once, in
+ * order of first appearance). Bash, Edit, and Write get specific wording; any
+ * other name gets a generic "check <name> operations for safety" phrase.
+ *
+ * @param tmplContent - Template text to scan.
+ * @returns A Markdown blockquote describing the checks, or null when the
+ *   content has no line starting with `hooks:` or no matcher entries.
+ */
+export function extractHookSafetyProse(tmplContent: string): string | null {
+  if (!/^hooks:/m.test(tmplContent)) return null;
+
+  // A Set keeps insertion order, so this dedupes while preserving first-seen order.
+  const matchers = new Set<string>();
+  const matcherRegex = /matcher:\s*"(\w+)"/g;
+  let m: RegExpExecArray | null;
+  while ((m = matcherRegex.exec(tmplContent)) !== null) {
+    matchers.add(m[1]);
+  }
+
+  if (matchers.size === 0) return null;
+
+  // A Map rather than a plain object: matcher names come from the template, and
+  // an object lookup would resolve inherited keys such as "constructor" or
+  // "toString" to Object.prototype members instead of falling back.
+  const toolDescriptions = new Map<string, string>([
+    ['Bash', 'check bash commands for destructive operations (rm -rf, DROP TABLE, force-push, git reset --hard, etc.) before execution'],
+    ['Edit', 'verify file edits are within the allowed scope boundary before applying'],
+    ['Write', 'verify file writes are within the allowed scope boundary before applying'],
+  ]);
+
+  const safetyChecks = [...matchers]
+    .map(tool => toolDescriptions.get(tool) ?? `check ${tool} operations for safety`)
+    .join(', and ');
+
+  return `> **Safety Advisory:** This skill includes safety checks that ${safetyChecks}. When using this skill, always pause and verify before executing potentially destructive operations. If uncertain about a command's safety, ask the user for confirmation before proceeding.`;
+}