feat(windows): curated windows-free-tests CI job + test-free-shards curation

Codex's v1.18.0.0 review flagged that a windows-latest matrix entry on the existing Linux-container evals.yml workflow can't work as a drop-in, and that the free test suite has POSIX-bound dependencies a sharded runner doesn't fix on its own. This commit takes McGluut's test-free-shards.ts (190 LOC), adds a Windows-fragility scan, and runs the curated subset on a separate non-container windows-latest job. scripts/test-free-shards.ts: - Enumeration + paid-eval filtering + stable-hash sharding (FNV-1a). Adapted from McGluut/gstack fork. - Upstream-original: --windows-only filter scans each test's content for POSIX-bound patterns: hardcoded /bin/sh, spawn('sh', ...), bash -c, raw /tmp/, chmod, xargs, which claude. Files matching are excluded with the reason logged. Currently filters 25 of 128 free tests; remaining 103 run on windows-latest. .github/workflows/windows-free-tests.yml: - Separate non-container job (NOT a matrix entry on evals.yml). Runs: bun run test:windows # curated subset bun test browse/test/claude-bin.test.ts # PATHEXT+overrides on Windows bun test test/gstack-paths.test.ts # state-root resolution package.json: new test:free + test:windows scripts. Honest about scope (codex-flagged): this does NOT make the full free suite Windows-safe. The 25 excluded tests need POSIX-only surfaces ported off shell primitives (test/ship-version-sync.test.ts:72 hardcodes /bin/bash, etc). Tracked as a P4 follow-up TODO. Full Windows parity is the next wave; this release ships the curated lane. Tests: test/test-free-shards.test.ts has 14 unit tests covering enumeration, paid-eval filtering, Windows-fragility detection (POSIX patterns + safe code), and stable sharding determinism. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-21 12:18:24 +08:00 · 2026-04-27 23:02:08 -07:00
parent 87ce4c696f
commit 8745f89ad4
4 changed files with 471 additions and 1 deletions
--- a/.github/workflows/windows-free-tests.yml
+++ b/.github/workflows/windows-free-tests.yml
@@ -0,0 +1,57 @@
 name: Windows Free Tests
 # Curated subset of the free test suite that runs on windows-latest.
 #
 # Codex's v1.18.0.0 review flagged that the existing evals.yml workflow uses
 # a Linux container, so a windows-latest matrix entry there isn't a drop-in.
 # This workflow is non-container, runs the curated Windows-safe subset, plus
 # targeted resolver tests that exercise the Bun.which-based claude binary
 # resolution + the GSTACK_CLAUDE_BIN override path on Windows.
 #
 # What this DOES NOT do (out of scope for v1.18.0.0):
 #   - Run the full free suite on Windows. The 25 tests that hardcode /bin/sh,
 #     spawn('sh',...), or raw /tmp/ paths are excluded by scripts/test-free-shards.ts
 #     --windows-only. They need POSIX-bound surfaces to be ported off shell
 #     primitives before they can run on Windows. Tracked as a follow-up TODO.
 #   - Run Playwright/browser-backed tests. Browse server bring-up on Windows is
 #     a separate concern (PR #1238 windows-pty-bun-pty-fix is in flight).
 on:
  pull_request:
    branches: [main]
  workflow_dispatch:
 # github.head_ref is only populated for pull_request events. Fall back to
 # github.ref so manually dispatched runs are grouped per ref instead of all
 # landing in one shared "windows-free-" group and cancelling each other.
 concurrency:
  group: windows-free-${{ github.head_ref || github.ref }}
  cancel-in-progress: true
 jobs:
  # Single non-container job: install with Bun, print the curated list, run the
  # curated subset, then run two targeted test files on their own.
  windows-free-tests:
    runs-on: windows-latest
    timeout-minutes: 15
    steps:
      - uses: actions/checkout@v4
      - uses: oven-sh/setup-bun@v1
        with:
          bun-version: latest
      # No `shell:` key on this step, so it uses the runner's default shell;
      # every later step opts into bash explicitly.
      - name: Install dependencies
        run: bun install --frozen-lockfile
      # --list prints the curated files (and the excluded ones with reasons)
      # without running any tests.
      - name: Show curated subset (for build log audit trail)
        run: bun run scripts/test-free-shards.ts --windows-only --list
        shell: bash
      # test:windows is the package.json script wrapping
      # scripts/test-free-shards.ts --windows-only.
      - name: Run curated Windows-safe subset
        run: bun run test:windows
        shell: bash
      - name: Targeted Claude resolver tests (real PATHEXT coverage on Windows)
        run: bun test browse/test/claude-bin.test.ts
        shell: bash
      - name: gstack-paths helper test (resolves $GSTACK_STATE_ROOT etc. on Windows)
        run: bun test test/gstack-paths.test.ts
        shell: bash
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
  "name": "gstack",
-  "version": "1.15.0.0",
+  "version": "1.18.0.0",
  "description": "Garry's Stack — Claude Code skills + fast headless browser. One repo, one install, entire AI engineering workflow.",
  "license": "MIT",
  "type": "module",
@@ -17,6 +17,8 @@
    "dev": "bun run browse/src/cli.ts",
    "server": "bun run browse/src/server.ts",
    "test": "bun test browse/test/ test/ make-pdf/test/ --ignore 'test/skill-e2e-*.test.ts' --ignore test/skill-llm-eval.test.ts --ignore test/skill-routing-e2e.test.ts --ignore test/codex-e2e.test.ts --ignore test/gemini-e2e.test.ts && (bun run slop:diff 2>/dev/null || true)",
    "test:free": "bun run scripts/test-free-shards.ts",
    "test:windows": "bun run scripts/test-free-shards.ts --windows-only",
    "test:evals": "EVALS=1 bun test --retry 2 --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-llm-eval.test.ts test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts",
    "test:evals:all": "EVALS=1 EVALS_ALL=1 bun test --retry 2 --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-llm-eval.test.ts test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts",
    "test:e2e": "EVALS=1 bun test --retry 2 --concurrent --max-concurrency ${EVALS_CONCURRENCY:-15} test/skill-e2e-*.test.ts test/skill-routing-e2e.test.ts test/codex-e2e.test.ts test/gemini-e2e.test.ts",
--- a/scripts/test-free-shards.ts
+++ b/scripts/test-free-shards.ts
@@ -0,0 +1,283 @@
 #!/usr/bin/env bun
 /**
 * test-free-shards — enumerate, shard, and curate the free test suite.
 *
 * Three jobs:
 *   1. Enumeration. Walk `browse/test/`, `test/`, `make-pdf/test/` and return
 *      every `*.test.{ts,tsx,js,jsx,mjs,cjs,mts,cts}` that isn't a paid-eval test.
 *   2. Sharding. Stable-hash assign each test to one of N shards. Used by CI
 *      to parallelize the free suite when needed.
 *   3. Curation (Windows-safe filter). Scan each test's content for POSIX-only
 *      patterns (`/bin/bash`, `sh -c`, raw `/tmp/`, `chmod`, `xargs`). Files
 *      that match are excluded from the Windows-safe subset — they would fail
 *      on `windows-latest` no matter how the runner shards them.
 *
 * Adapted from the McGluut/gstack fork's test-free-shards.ts (190 LOC). The
 * Windows-safe filter is upstream-original — codex flagged that sharding alone
 * doesn't fix POSIX-bound tests, so we curate the subset that actually runs
 * on the windows-latest CI job.
 *
 * Usage:
 *   bun run scripts/test-free-shards.ts --list                    # show all
 *   bun run scripts/test-free-shards.ts --windows-only --list     # show curated
 *   bun run scripts/test-free-shards.ts --windows-only            # run curated
 *   bun run scripts/test-free-shards.ts --shards 4 --shard 1      # one shard
 */
 import * as fs from 'fs';
 import * as path from 'path';
 import { spawnSync } from 'child_process';
 // Repository root: one level above this script's directory (import.meta.dir).
 const ROOT = path.resolve(import.meta.dir, '..');
 // Directories, relative to ROOT, that are walked for test files.
 const TEST_ROOTS = ['browse/test', 'test', 'make-pdf/test'] as const;
 // Matches names ending in `.test.` + js/ts (optionally prefixed by c or m), tsx, or jsx.
 const TEST_FILE_REGEX = /\.test\.(?:[cm]?[jt]s|tsx|jsx)$/;
 // Tests that require API spend, external services, or e2e harnesses.
 // These are filtered out before any sharding or curation.
 // Patterns are anchored and written against forward-slash, repo-relative paths.
 const PAID_EVAL_TESTS = [
  /^browse\/test\/security-review-fullstack\.test\.ts$/,
  /^test\/skill-e2e-.*\.test\.ts$/,
  /^test\/skill-llm-eval\.test\.ts$/,
  /^test\/skill-routing-e2e\.test\.ts$/,
  /^test\/codex-e2e\.test\.ts$/,
  /^test\/gemini-e2e\.test\.ts$/,
 ] as const;
 // POSIX-only patterns that indicate a test will fail on windows-latest no
 // matter how the runner shards. Codex's v1.18.0.0 review flagged the first
 // three as concrete examples in the existing free suite (test/ship-version-sync.test.ts:72,
 // test/helpers/providers/claude.ts:22, package.json:12). We scan the test's
 // own content here so the filter stays automatic as new tests land.
 // Order matters: detectWindowsFragility reports the reason of the FIRST entry
 // that matches. Most patterns require a leading quote character, so they only
 // fire on string literals that start with the POSIX token.
 const WINDOWS_FRAGILE_PATTERNS: Array<{ pattern: RegExp; reason: string }> = [
  { pattern: /['"`]\/bin\/(?:ba)?sh/, reason: 'hardcoded /bin/sh or /bin/bash' },
  { pattern: /spawnSync\(['"]sh['"],|spawn\(['"]sh['"],|exec\(['"]sh /, reason: 'spawn("sh", ...)' },
  { pattern: /['"]bash -c['"]|['"]sh -c['"]/, reason: 'bash -c / sh -c' },
  { pattern: /['"`]\/tmp\//, reason: 'raw /tmp/ path (use os.tmpdir())' },
  { pattern: /['"]chmod\b/, reason: 'chmod shell command' },
  { pattern: /['"]xargs\b/, reason: 'xargs pipeline' },
  { pattern: /\bwhich claude\b/, reason: 'which claude (use Bun.which)' },
 ];
 // Shard count used when `--shards` is not passed on the command line.
 export const DEFAULT_SHARD_COUNT = 20;
 // Value passed to `bun test --timeout=` for every shard, in milliseconds.
 export const FREE_TEST_TIMEOUT_MS = 10_000;
 /**
 * Rewrite a path so that every backslash becomes a forward slash.
 *
 * @param filePath Path that may use Windows-style separators.
 * @returns The same path with `/` as the only separator character.
 */
 export function normalizeRelativePath(filePath: string): string {
  const segments = filePath.split('\\');
  return segments.join('/');
 }
 /**
 * Decide whether a repo-relative path names a free test file.
 *
 * The path is normalized to forward slashes first. It must match
 * TEST_FILE_REGEX and must not match any entry of PAID_EVAL_TESTS.
 *
 * @param relativePath Path relative to the repository root (either separator style).
 * @returns `true` for a test file that is not on the paid-eval list.
 */
 export function isFreeTestFile(relativePath: string): boolean {
  const candidate = normalizeRelativePath(relativePath);
  const looksLikeTest = TEST_FILE_REGEX.test(candidate);
  if (!looksLikeTest) return false;
  for (const paidPattern of PAID_EVAL_TESTS) {
    if (paidPattern.test(candidate)) return false;
  }
  return true;
 }
 /**
 * Scan a file's text for POSIX-only patterns.
 *
 * The file is read as UTF-8 and checked against WINDOWS_FRAGILE_PATTERNS in
 * declaration order; the reason of the first matching entry is returned.
 *
 * @param absolutePath Path of the file to scan.
 * @returns `{ reason }` for the first pattern that matches, or `null` when no
 *          pattern matches. `null` is also returned when the file cannot be read.
 */
 export function detectWindowsFragility(absolutePath: string): { reason: string } | null {
  let source: string;
  try {
    source = fs.readFileSync(absolutePath, 'utf-8');
  } catch {
    // A file that cannot be read is reported as having no fragility.
    return null;
  }
  const firstHit = WINDOWS_FRAGILE_PATTERNS.find(entry => entry.pattern.test(source));
  return firstHit ? { reason: firstHit.reason } : null;
 }
 /**
 * Recursively list every file under `dirPath` whose name matches TEST_FILE_REGEX.
 *
 * Directories are descended depth-first, in the order `readdirSync` returns
 * their entries. Returned paths are `dirPath` joined with each nested name.
 */
 function walkTestFiles(dirPath: string): string[] {
  return fs.readdirSync(dirPath, { withFileTypes: true }).flatMap(entry => {
    const fullPath = path.join(dirPath, entry.name);
    if (entry.isDirectory()) return walkTestFiles(fullPath);
    return TEST_FILE_REGEX.test(entry.name) ? [fullPath] : [];
  });
 }
 /**
 * Enumerate the free test files under every existing entry of TEST_ROOTS.
 *
 * @param rootDir Directory the test roots are resolved against (defaults to ROOT).
 * @returns Forward-slash paths relative to `rootDir`, de-duplicated and sorted.
 */
 export function collectFreeTestFiles(rootDir = ROOT): string[] {
  const unique = new Set<string>();
  const existingRoots = TEST_ROOTS
    .map(testRoot => path.join(rootDir, testRoot))
    .filter(absoluteRoot => fs.existsSync(absoluteRoot));
  for (const absoluteRoot of existingRoots) {
    walkTestFiles(absoluteRoot)
      .map(fullPath => normalizeRelativePath(path.relative(rootDir, fullPath)))
      .filter(relativePath => isFreeTestFile(relativePath))
      .forEach(relativePath => unique.add(relativePath));
  }
  return Array.from(unique).sort();
 }
 /** Outcome of splitting a file list into Windows-safe and excluded entries. */
 export interface CurationResult {
  // Paths for which no fragile pattern was detected.
  safe: string[];
  // Paths that matched a fragile pattern, each with that pattern's reason text.
  excluded: Array<{ file: string; reason: string }>;
 }
 /**
 * Partition test files into those safe to run on Windows and those excluded.
 *
 * Each path is resolved against `rootDir` and scanned with
 * detectWindowsFragility; input order is preserved within both output lists.
 *
 * @param files Paths relative to `rootDir`.
 * @param rootDir Base directory used to resolve each path (defaults to ROOT).
 * @returns The safe paths plus the excluded paths with their reasons.
 */
 export function curateWindowsSafe(files: string[], rootDir = ROOT): CurationResult {
  const result: CurationResult = { safe: [], excluded: [] };
  files.forEach(file => {
    const verdict = detectWindowsFragility(path.join(rootDir, file));
    if (verdict === null) {
      result.safe.push(file);
      return;
    }
    result.excluded.push({ file, reason: verdict.reason });
  });
  return result;
 }
 /**
 * 32-bit FNV-1a hash computed over the string's UTF-16 code units.
 *
 * @param input Text to hash.
 * @returns An unsigned 32-bit integer that is identical across runs for the same input.
 */
 export function stableHash(input: string): number {
  const FNV_OFFSET_BASIS = 0x811c9dc5;
  const FNV_PRIME = 0x01000193;
  let acc = FNV_OFFSET_BASIS;
  let pos = 0;
  while (pos < input.length) {
    // XOR the code unit in first, then multiply with 32-bit wraparound.
    acc = Math.imul(acc ^ input.charCodeAt(pos), FNV_PRIME);
    pos += 1;
  }
  // Reinterpret the signed 32-bit result as unsigned.
  return acc >>> 0;
 }
 /**
 * Distribute files over `shardCount` buckets using `stableHash(file) % shardCount`.
 *
 * Buckets that receive no file are dropped, so the result may contain fewer
 * than `shardCount` arrays. Each returned bucket is sorted.
 *
 * @param files Paths to distribute.
 * @param shardCount Number of buckets to hash into; must be a positive integer.
 * @returns The non-empty buckets, in bucket-index order.
 * @throws Error when `shardCount` is not a positive integer.
 */
 export function assignFilesToShards(files: string[], shardCount: number): string[][] {
  const isValidCount = Number.isInteger(shardCount) && shardCount > 0;
  if (!isValidCount) {
    throw new Error(`Shard count must be a positive integer. Received: ${shardCount}`);
  }
  const buckets: string[][] = [];
  for (let slot = 0; slot < shardCount; slot += 1) {
    buckets.push([]);
  }
  files.forEach(file => {
    buckets[stableHash(file) % shardCount].push(file);
  });
  const populated: string[][] = [];
  for (const bucket of buckets) {
    if (bucket.length === 0) continue;
    bucket.sort();
    populated.push(bucket);
  }
  return populated;
 }
 /**
 * Build the argument vector for one `bun test` invocation.
 *
 * `bun test` treats a positional argument as a path-substring filter unless
 * it starts with `./` or `/`. A bare `test/foo.test.ts` would therefore also
 * select `browse/test/foo.test.ts`, which could pull in files that curation
 * deliberately excluded. Bare relative paths are prefixed with `./` so each
 * one names exactly one file; already-explicit paths are passed through.
 *
 * @param files Test file paths, relative to the working directory or absolute.
 * @param timeoutMs Per-test timeout in milliseconds (defaults to FREE_TEST_TIMEOUT_MS).
 * @returns Arguments to pass after the Bun executable.
 */
 export function buildShardArgs(files: string[], timeoutMs: number = FREE_TEST_TIMEOUT_MS): string[] {
  const explicitPaths = files.map(file => {
    const alreadyExplicit = file.startsWith('./') || file.startsWith('../') || path.isAbsolute(file);
    return alreadyExplicit ? file : `./${file}`;
  });
  return ['test', ...explicitPaths, '--max-concurrency=1', `--timeout=${timeoutMs}`];
 }
 /** Options accepted on the command line. */
 type CliOptions = {
  dryRun: boolean;
  listOnly: boolean;
  windowsOnly: boolean;
  shardCount: number;
  // 1-based shard to run, or null to run every shard.
  shardIndex: number | null;
 };
 /**
 * Parse the value that follows an integer-valued flag.
 *
 * `Number.parseInt` alone accepts trailing garbage ("4x" -> 4, "1.5" -> 1) and
 * turns a following flag such as "--list" into NaN. The value is validated
 * up front so such mistakes fail at parse time with a message naming the flag.
 * Range checks (positive, within shard count) stay with the callers.
 */
 function parseIntegerFlag(flag: string, value: string | undefined): number {
  if (!value) throw new Error(`Missing value for ${flag}`);
  if (!/^-?\d+$/.test(value)) {
    throw new Error(`Invalid value for ${flag}: expected an integer. Received: ${value}`);
  }
  return Number.parseInt(value, 10);
 }
 /**
 * Turn raw CLI arguments into a CliOptions object.
 *
 * @param argv Arguments after the script name.
 * @returns Parsed options; unset flags keep their defaults
 *          (`shardCount` = DEFAULT_SHARD_COUNT, `shardIndex` = null).
 * @throws Error on an unknown argument, or on a missing or non-integer value
 *         for `--shards` / `--shard`.
 */
 function parseCliOptions(argv: string[]): CliOptions {
  const options: CliOptions = {
    dryRun: false,
    listOnly: false,
    windowsOnly: false,
    shardCount: DEFAULT_SHARD_COUNT,
    shardIndex: null,
  };
  for (let index = 0; index < argv.length; index += 1) {
    const arg = argv[index];
    switch (arg) {
      case '--dry-run':
        options.dryRun = true;
        break;
      case '--list':
        options.listOnly = true;
        break;
      case '--windows-only':
        options.windowsOnly = true;
        break;
      case '--shards':
        options.shardCount = parseIntegerFlag(arg, argv[index + 1]);
        index += 1; // skip the consumed value
        break;
      case '--shard':
        options.shardIndex = parseIntegerFlag(arg, argv[index + 1]);
        index += 1; // skip the consumed value
        break;
      default:
        throw new Error(`Unknown argument: ${arg}`);
    }
  }
  return options;
 }
 /**
 * Render one summary line per shard for `--dry-run` output.
 *
 * Each line shows the shard's 1-based position, its file count, and a preview
 * of at most three file names, followed by `, ...` when more files exist.
 */
 function formatShardSummary(shards: string[][]): string[] {
  const PREVIEW_LIMIT = 3;
  const total = shards.length;
  const lines: string[] = [];
  for (const [position, shardFiles] of shards.entries()) {
    let line = `Shard ${position + 1}/${total}: ${shardFiles.length} files`;
    const shown = shardFiles.slice(0, PREVIEW_LIMIT).join(', ');
    if (shown) {
      line += ` -> ${shown}`;
      if (shardFiles.length > PREVIEW_LIMIT) line += ', ...';
    }
    lines.push(line);
  }
  return lines;
 }
 /**
 * Run one shard's files in a child process and return its exit code.
 *
 * The child is the current executable (`process.execPath`) invoked with the
 * arguments from buildShardArgs, with ROOT as working directory and inherited
 * stdio and environment. A null exit status is reported as exit code 1.
 *
 * @param files Test files belonging to the shard.
 * @param shardNumber 1-based shard position, used only for log output.
 * @param totalShards Total shard count, used only for log output.
 * @returns The child's exit status, or 1 when the status is null.
 */
 function runShard(files: string[], shardNumber: number, totalShards: number): number {
  const label = `[test:free] shard ${shardNumber}/${totalShards} (${files.length} files)`;
  console.log(label);
  const { status } = spawnSync(process.execPath, buildShardArgs(files), {
    cwd: ROOT,
    stdio: 'inherit',
    env: process.env,
  });
  const exitCode = status ?? 1;
  if (status !== 0) {
    console.error(`${label} failed with exit code ${exitCode}`);
  }
  return exitCode;
 }
 /**
 * CLI entry point: discover free tests, optionally curate the Windows-safe
 * subset, then list, dry-run, or execute them shard by shard.
 *
 * @returns Process exit code: 0 on success (including --list and --dry-run),
 *          otherwise the exit code of the first failing shard.
 * @throws Error when no test files are discovered, on invalid CLI arguments,
 *         or when --shard is outside 1..(number of non-empty shards).
 */
 function main(): number {
  const options = parseCliOptions(process.argv.slice(2));
  const allFiles = collectFreeTestFiles();
  if (allFiles.length === 0) {
    throw new Error('No free test files were discovered.');
  }
  let files = allFiles;
  let curationReport: CurationResult | null = null;
  if (options.windowsOnly) {
    // Narrow the working set to files with no POSIX-fragile pattern.
    curationReport = curateWindowsSafe(allFiles);
    files = curationReport.safe;
    console.log(`[test:free] curated ${files.length} Windows-safe tests (${curationReport.excluded.length} excluded)`);
    // Exclusion reasons are printed only in --list mode.
    if (options.listOnly && curationReport.excluded.length > 0) {
      console.log('\nExcluded (POSIX-fragile):');
      for (const { file, reason } of curationReport.excluded) {
        console.log(`  - ${file}  [${reason}]`);
      }
    }
  }
  // --list takes precedence over --dry-run and --shard: print and exit.
  if (options.listOnly) {
    console.log(`\nDiscovered ${files.length} test files.`);
    for (const file of files) console.log(`  ${file}`);
    return 0;
  }
  // Empty shards are dropped by assignFilesToShards, so shards.length can be
  // smaller than options.shardCount.
  const shards = assignFilesToShards(files, options.shardCount);
  if (options.dryRun) {
    console.log(`\nWould run ${files.length} files across ${shards.length} shards.`);
    for (const line of formatShardSummary(shards)) console.log(line);
    return 0;
  }
  if (options.shardIndex !== null) {
    // --shard is 1-based and validated against the non-empty shard count.
    if (!Number.isInteger(options.shardIndex) || options.shardIndex < 1 || options.shardIndex > shards.length) {
      throw new Error(`--shard must be between 1 and ${shards.length}. Received: ${options.shardIndex}`);
    }
    return runShard(shards[options.shardIndex - 1], options.shardIndex, shards.length);
  }
  // No --shard given: run every shard in order, stopping at the first failure.
  // NOTE(review): if curation leaves zero files, there are zero shards and this
  // returns 0 without running anything.
  for (let index = 0; index < shards.length; index += 1) {
    const exitCode = runShard(shards[index], index + 1, shards.length);
    if (exitCode !== 0) return exitCode;
  }
  return 0;
 }
 // Run the CLI only when this file is the entry script, so the test file can
 // import the exported helpers without executing main().
 if (import.meta.main) {
  process.exitCode = main();
 }
--- a/test/test-free-shards.test.ts
+++ b/test/test-free-shards.test.ts
@@ -0,0 +1,128 @@
 import { describe, test, expect } from 'bun:test';
 import * as fs from 'fs';
 import * as path from 'path';
 import * as os from 'os';
 import {
  isFreeTestFile,
  collectFreeTestFiles,
  detectWindowsFragility,
  curateWindowsSafe,
  stableHash,
  assignFilesToShards,
  normalizeRelativePath,
 } from '../scripts/test-free-shards';
 const ROOT = path.resolve(import.meta.dir, '..');
 // Enumeration: file-name classification, repo discovery, and path normalization.
 describe('test-free-shards: enumeration', () => {
  test('isFreeTestFile rejects non-test files', () => {
    const expectations: Array<[string, boolean]> = [
      ['test/foo.ts', false],
      ['test/foo.test.ts', true],
      ['test/foo.test.tsx', true],
      ['test/foo.test.mjs', true],
    ];
    for (const [candidate, expected] of expectations) {
      expect(isFreeTestFile(candidate)).toBe(expected);
    }
  });
  test('isFreeTestFile rejects paid eval tests', () => {
    const paidPaths = [
      'test/skill-e2e-foo.test.ts',
      'test/skill-llm-eval.test.ts',
      'test/codex-e2e.test.ts',
      'test/gemini-e2e.test.ts',
    ];
    for (const paidPath of paidPaths) {
      expect(isFreeTestFile(paidPath)).toBe(false);
    }
  });
  test('collectFreeTestFiles returns sorted, deduped, only-free list', () => {
    const discovered = collectFreeTestFiles(ROOT);
    expect(discovered.length).toBeGreaterThan(10);
    // Already sorted: equal to a sorted copy of itself.
    const sortedCopy = discovered.slice().sort();
    expect(discovered).toEqual(sortedCopy);
    // No duplicates: a Set keeps every entry.
    const distinctCount = new Set(discovered).size;
    expect(distinctCount).toBe(discovered.length);
    discovered.forEach((entry) => {
      expect(isFreeTestFile(entry)).toBe(true);
    });
  });
  test('normalizeRelativePath converts Windows backslashes to forward slashes', () => {
    const expected = 'test/foo/bar.test.ts';
    expect(normalizeRelativePath('test\\foo\\bar.test.ts')).toBe(expected);
    expect(normalizeRelativePath('test/foo/bar.test.ts')).toBe(expected);
  });
 });
 // Windows curation: each fragile pattern is detected, safe code passes, and
 // the repo-wide partition is complete.
 describe('test-free-shards: Windows curation', () => {
  // Write `content` to a throwaway file, hand its path to `fn`, then remove
  // the temp directory whether or not `fn` throws.
  function withTempFile(content: string, fn: (filePath: string) => void): void {
    const scratchDir = fs.mkdtempSync(path.join(os.tmpdir(), 'curation-test-'));
    const samplePath = path.join(scratchDir, 'sample.test.ts');
    fs.writeFileSync(samplePath, content);
    try {
      fn(samplePath);
    } finally {
      fs.rmSync(scratchDir, { recursive: true, force: true });
    }
  }
  // One entry per fragile pattern: test title, file content, expected reason.
  const fragileCases = [
    {
      title: 'detects /bin/bash hardcode',
      source: `spawn('/bin/bash', ['-c', 'echo hi']);`,
      reason: 'hardcoded /bin/sh or /bin/bash',
    },
    {
      title: 'detects spawn("sh", ...)',
      source: `spawnSync('sh', ['-c', 'command -v claude']);`,
      reason: 'spawn("sh", ...)',
    },
    {
      title: 'detects raw /tmp/ paths',
      source: `const TMPERR = '/tmp/codex-err.txt';`,
      reason: 'raw /tmp/ path (use os.tmpdir())',
    },
    {
      title: 'detects which claude shell command',
      source: `execSync('which claude').trim();`,
      reason: 'which claude (use Bun.which)',
    },
  ];
  for (const { title, source, reason } of fragileCases) {
    test(title, () => {
      withTempFile(source, (f) => {
        expect(detectWindowsFragility(f)?.reason).toBe(reason);
      });
    });
  }
  test('Windows-safe code passes the filter', () => {
    const safeSource = `import { spawn } from 'child_process'; spawn(claude.command, args);`;
    withTempFile(safeSource, (f) => {
      expect(detectWindowsFragility(f)).toBeNull();
    });
  });
  test('curateWindowsSafe partitions files into safe + excluded', () => {
    const discovered = collectFreeTestFiles(ROOT);
    const { safe, excluded } = curateWindowsSafe(discovered, ROOT);
    expect(safe.length + excluded.length).toBe(discovered.length);
    // Sanity: at least one excluded entry, since we know test/ship-version-sync.test.ts uses /bin/bash
    expect(excluded.length).toBeGreaterThan(0);
    // Every excluded entry has a non-empty reason
    excluded.forEach((entry) => {
      expect(entry.reason.length).toBeGreaterThan(0);
    });
  });
 });
 // Sharding: hash determinism, complete distribution, input validation, and
 // run-to-run stability.
 describe('test-free-shards: sharding', () => {
  test('stableHash is deterministic', () => {
    const first = stableHash('foo.test.ts');
    const second = stableHash('foo.test.ts');
    expect(first).toBe(second);
    expect(stableHash('foo.test.ts')).not.toBe(stableHash('bar.test.ts'));
  });
  test('assignFilesToShards distributes files into N non-empty shards', () => {
    const input = ['a.test.ts', 'b.test.ts', 'c.test.ts', 'd.test.ts', 'e.test.ts'];
    const buckets = assignFilesToShards(input, 3);
    // Every input file appears exactly once across the buckets.
    const recombined = buckets.flat().sort();
    expect(recombined).toEqual(input.slice().sort());
    // No bucket in the result is empty.
    const hasEmptyBucket = buckets.some((bucket) => bucket.length === 0);
    expect(!hasEmptyBucket).toBe(true);
  });
  test('assignFilesToShards rejects invalid shard counts', () => {
    for (const badCount of [0, -1]) {
      expect(() => assignFilesToShards(['a.test.ts'], badCount)).toThrow();
    }
  });
  test('shards are stable across runs (same files always land in same shard)', () => {
    const input = ['x.test.ts', 'y.test.ts', 'z.test.ts'];
    const firstRun = assignFilesToShards(input, 5);
    const secondRun = assignFilesToShards(input, 5);
    expect(firstRun).toEqual(secondRun);
  });
 });