feat: parallelize CI evals — 12 runners (1 per suite) for ~3min wall clock

Switch eval workflow to use Docker container image with pre-baked
toolchain. Each of 12 matrix runners pulls the image, hardlinks
cached node_modules, builds browse, and runs one test suite.
Setup drops from ~70s to ~19s per runner. Wall clock is dominated
by the slowest individual suite, not the sequential sum of all suites.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-03-23 05:37:59 -07:00
parent 2cfba5c30c
commit a5c70977f1

View File

@@ -10,6 +10,11 @@ concurrency:
jobs: jobs:
evals: evals:
runs-on: ubicloud-standard-2 runs-on: ubicloud-standard-2
container:
image: ghcr.io/${{ github.repository }}/ci:latest
credentials:
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
timeout-minutes: 20 timeout-minutes: 20
strategy: strategy:
fail-fast: false fail-fast: false
@@ -44,22 +49,18 @@ jobs:
with: with:
fetch-depth: 0 fetch-depth: 0
- uses: oven-sh/setup-bun@v2 # Restore pre-installed node_modules from Docker image (~1s vs ~15s install)
# If package.json no longer matches the snapshot baked into the image (or the snapshot is missing), fall back to fresh install
- name: Cache bun dependencies - name: Restore deps
uses: actions/cache@v4 run: |
with: if diff -q /opt/node_modules_cache/.package-lock.json package.json >/dev/null 2>&1; then
path: ~/.bun/install/cache cp -al /opt/node_modules_cache node_modules
key: bun-${{ hashFiles('bun.lockb') }} else
restore-keys: bun- bun install
fi
- run: bun install
- run: bun run build - run: bun run build
- name: Install Claude CLI
run: npm i -g @anthropic-ai/claude-code
- name: Run ${{ matrix.suite.name }} - name: Run ${{ matrix.suite.name }}
env: env:
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
@@ -78,6 +79,11 @@ jobs:
report: report:
runs-on: ubicloud-standard-2 runs-on: ubicloud-standard-2
container:
image: ghcr.io/${{ github.repository }}/ci:latest
credentials:
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
needs: evals needs: evals
if: always() && github.event_name == 'pull_request' if: always() && github.event_name == 'pull_request'
timeout-minutes: 5 timeout-minutes: 5
@@ -129,7 +135,7 @@ jobs:
$(echo -e "$SUITE_LINES") $(echo -e "$SUITE_LINES")
--- ---
*12x ubicloud-standard-2 ($0.0008/min each) | Wall clock ≈ slowest suite*" *12x ubicloud-standard-2 (Docker: pre-baked toolchain + deps) | wall clock ≈ slowest suite*"
if [ "$FAILED" -gt 0 ]; then if [ "$FAILED" -gt 0 ]; then
FAILURES="" FAILURES=""