fix: stabilize three flaky E2E tests

ship-local-workflow: Use `git log --all` on bare remote so we count
commits on feature/ship-test, not just HEAD (main).

setup-cookies-detect: Accept "no browsers detected" as valid on CI
(headless Ubuntu has no browser cookie databases). Increase maxTurns
from 5→8 and make prompt explicit about always writing the file.

routing tests: Apply EVALS_TIER filtering — all routing tests are
periodic but the file had no tier awareness, so they ran under
EVALS_TIER=gate in CI and failed non-deterministically.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-03-24 14:08:38 -07:00
parent 91387d4766
commit 8763787cdf
2 changed files with 53 additions and 22 deletions

View File

@@ -175,9 +175,10 @@ describeIfSelected('Ship workflow E2E', ['ship-local-workflow'], () => {
logCost('/ship local workflow', result); logCost('/ship local workflow', result);
// Check push succeeded // Check push succeeded — check the feature branch on the bare remote
const remoteLog = spawnSync('git', ['log', '--oneline'], { cwd: shipRemoteDir, stdio: 'pipe' }); // (bare repo HEAD points to main which only has 1 commit; the push goes to feature/ship-test)
const remoteCommits = remoteLog.stdout.toString().trim().split('\n').length; const remoteLog = spawnSync('git', ['log', '--oneline', '--all'], { cwd: shipRemoteDir, stdio: 'pipe' });
const remoteCommits = remoteLog.stdout.toString().trim().split('\n').filter(l => l.length > 0).length;
// Check VERSION was bumped // Check VERSION was bumped
const versionContent = fs.existsSync(path.join(shipWorkDir, 'VERSION')) const versionContent = fs.existsSync(path.join(shipWorkDir, 'VERSION'))
@@ -217,12 +218,14 @@ describeIfSelected('Setup Browser Cookies E2E', ['setup-cookies-detect'], () =>
const result = await runSkillTest({ const result = await runSkillTest({
prompt: `Read setup-browser-cookies/SKILL.md for the cookie import workflow. prompt: `Read setup-browser-cookies/SKILL.md for the cookie import workflow.
This is a test environment. List which browsers you can detect on this system by checking for their cookie database files. This is a test environment. Check which browsers exist on this system by looking for their cookie database files.
Write the detected browsers to ${cookieDir}/detected-browsers.md. IMPORTANT: You MUST write a file called ${cookieDir}/detected-browsers.md with your findings.
If you find browsers, list them. If you find NO browsers, write "No browsers detected" to the file.
The file must always be created regardless of results.
Do NOT launch the cookie picker UI — just detect and report.`, Do NOT launch the cookie picker UI — just detect and report.`,
workingDirectory: cookieDir, workingDirectory: cookieDir,
maxTurns: 5, maxTurns: 8,
timeout: 45_000, timeout: 60_000,
testName: 'setup-cookies-detect', testName: 'setup-cookies-detect',
runId, runId,
}); });
@@ -233,17 +236,21 @@ Do NOT launch the cookie picker UI — just detect and report.`,
const detectExists = fs.existsSync(detectPath); const detectExists = fs.existsSync(detectPath);
const detectContent = detectExists ? fs.readFileSync(detectPath, 'utf-8') : ''; const detectContent = detectExists ? fs.readFileSync(detectPath, 'utf-8') : '';
const hasBrowserName = /chrome|arc|brave|edge|comet|safari|firefox/i.test(detectContent); const hasBrowserName = /chrome|arc|brave|edge|comet|safari|firefox/i.test(detectContent);
const hasNoBrowsers = /no browser|none|not found|not detected|could not|couldn't/i.test(detectContent);
// On CI (headless Ubuntu), no browsers are installed — "no browsers detected" is valid
const contentValid = hasBrowserName || hasNoBrowsers;
recordE2E(evalCollector, '/setup-browser-cookies detect', 'Setup Browser Cookies E2E', result, { recordE2E(evalCollector, '/setup-browser-cookies detect', 'Setup Browser Cookies E2E', result, {
passed: detectExists && hasBrowserName && ['success', 'error_max_turns'].includes(result.exitReason), passed: detectExists && contentValid && ['success', 'error_max_turns'].includes(result.exitReason),
}); });
expect(['success', 'error_max_turns']).toContain(result.exitReason); expect(['success', 'error_max_turns']).toContain(result.exitReason);
expect(detectExists).toBe(true); expect(detectExists).toBe(true);
if (detectExists) { if (detectExists) {
expect(hasBrowserName).toBe(true); expect(contentValid).toBe(true);
} }
}, 60_000); }, 90_000);
}); });
// --- gstack-upgrade E2E --- // --- gstack-upgrade E2E ---

View File

@@ -3,7 +3,7 @@ import { runSkillTest } from './helpers/session-runner';
import type { SkillTestResult } from './helpers/session-runner'; import type { SkillTestResult } from './helpers/session-runner';
import { EvalCollector } from './helpers/eval-store'; import { EvalCollector } from './helpers/eval-store';
import type { EvalTestEntry } from './helpers/eval-store'; import type { EvalTestEntry } from './helpers/eval-store';
import { selectTests, detectBaseBranch, getChangedFiles, E2E_TOUCHFILES, GLOBAL_TOUCHFILES } from './helpers/touchfiles'; import { selectTests, detectBaseBranch, getChangedFiles, E2E_TOUCHFILES, E2E_TIERS, GLOBAL_TOUCHFILES } from './helpers/touchfiles';
import { spawnSync } from 'child_process'; import { spawnSync } from 'child_process';
import * as fs from 'fs'; import * as fs from 'fs';
import * as path from 'path'; import * as path from 'path';
@@ -42,6 +42,21 @@ if (evalsEnabled && !process.env.EVALS_ALL) {
} }
} }
// Narrow the selection to the tier named by EVALS_TIER (same logic as e2e-helpers.ts)
if (evalsEnabled && process.env.EVALS_TIER) {
  const tier = process.env.EVALS_TIER as 'gate' | 'periodic';

  // Collect the names of every test registered under the requested tier.
  const namesInTier: string[] = [];
  for (const [testName, testTier] of Object.entries(E2E_TIERS)) {
    if (testTier === tier) {
      namesInTier.push(testName);
    }
  }

  // With no earlier selection the tier list becomes the selection;
  // otherwise keep only the already-selected tests that belong to the tier.
  selectedTests = selectedTests === null
    ? namesInTier
    : selectedTests.filter(candidate => namesInTier.includes(candidate));

  process.stderr.write(`Routing EVALS_TIER=${tier}: ${selectedTests.length} tests\n\n`);
}
// --- Helper functions --- // --- Helper functions ---
/** Copy all SKILL.md files for auto-discovery. /** Copy all SKILL.md files for auto-discovery.
@@ -140,6 +155,15 @@ function recordRouting(name: string, result: SkillTestResult, expectedSkill: str
}); });
} }
// Register a test only if it is in selectedTests (diff + tier filtering);
// a null selectedTests means no filtering is active, so every test is registered.
const testIfSelected = (name: string, fn: () => Promise<void>, timeout?: number) => {
  const isSelected = selectedTests === null || selectedTests.includes(name);
  if (isSelected) {
    test.concurrent(name, fn, timeout);
    return;
  }
  // Filtered out: still register the name as skipped, with a no-op body.
  test.skip(name, () => {});
};
// --- Tests --- // --- Tests ---
describeE2E('Skill Routing E2E — Developer Journey', () => { describeE2E('Skill Routing E2E — Developer Journey', () => {
@@ -147,7 +171,7 @@ describeE2E('Skill Routing E2E — Developer Journey', () => {
evalCollector?.finalize(); evalCollector?.finalize();
}); });
test.concurrent('journey-ideation', async () => { testIfSelected('journey-ideation', async () => {
const tmpDir = createRoutingWorkDir('ideation'); const tmpDir = createRoutingWorkDir('ideation');
try { try {
@@ -176,7 +200,7 @@ describeE2E('Skill Routing E2E — Developer Journey', () => {
} }
}, 150_000); }, 150_000);
test.concurrent('journey-plan-eng', async () => { testIfSelected('journey-plan-eng', async () => {
const tmpDir = createRoutingWorkDir('plan-eng'); const tmpDir = createRoutingWorkDir('plan-eng');
try { try {
fs.writeFileSync(path.join(tmpDir, 'plan.md'), `# Waitlist App Architecture fs.writeFileSync(path.join(tmpDir, 'plan.md'), `# Waitlist App Architecture
@@ -226,7 +250,7 @@ describeE2E('Skill Routing E2E — Developer Journey', () => {
} }
}, 150_000); }, 150_000);
test.concurrent('journey-think-bigger', async () => { testIfSelected('journey-think-bigger', async () => {
const tmpDir = createRoutingWorkDir('think-bigger'); const tmpDir = createRoutingWorkDir('think-bigger');
try { try {
fs.writeFileSync(path.join(tmpDir, 'plan.md'), `# Waitlist App Architecture fs.writeFileSync(path.join(tmpDir, 'plan.md'), `# Waitlist App Architecture
@@ -277,7 +301,7 @@ describeE2E('Skill Routing E2E — Developer Journey', () => {
} }
}, 180_000); }, 180_000);
test.concurrent('journey-debug', async () => { testIfSelected('journey-debug', async () => {
const tmpDir = createRoutingWorkDir('debug'); const tmpDir = createRoutingWorkDir('debug');
try { try {
const run = (cmd: string, args: string[]) => const run = (cmd: string, args: string[]) =>
@@ -335,7 +359,7 @@ export default app;
} }
}, 150_000); }, 150_000);
test.concurrent('journey-qa', async () => { testIfSelected('journey-qa', async () => {
const tmpDir = createRoutingWorkDir('qa'); const tmpDir = createRoutingWorkDir('qa');
try { try {
fs.writeFileSync(path.join(tmpDir, 'package.json'), JSON.stringify({ name: 'waitlist-app', scripts: { dev: 'next dev' } }, null, 2)); fs.writeFileSync(path.join(tmpDir, 'package.json'), JSON.stringify({ name: 'waitlist-app', scripts: { dev: 'next dev' } }, null, 2));
@@ -371,7 +395,7 @@ export default app;
} }
}, 150_000); }, 150_000);
test.concurrent('journey-code-review', async () => { testIfSelected('journey-code-review', async () => {
const tmpDir = createRoutingWorkDir('code-review'); const tmpDir = createRoutingWorkDir('code-review');
try { try {
const run = (cmd: string, args: string[]) => const run = (cmd: string, args: string[]) =>
@@ -411,7 +435,7 @@ export default app;
} }
}, 150_000); }, 150_000);
test.concurrent('journey-ship', async () => { testIfSelected('journey-ship', async () => {
const tmpDir = createRoutingWorkDir('ship'); const tmpDir = createRoutingWorkDir('ship');
try { try {
const run = (cmd: string, args: string[]) => const run = (cmd: string, args: string[]) =>
@@ -450,7 +474,7 @@ export default app;
} }
}, 150_000); }, 150_000);
test.concurrent('journey-docs', async () => { testIfSelected('journey-docs', async () => {
const tmpDir = createRoutingWorkDir('docs'); const tmpDir = createRoutingWorkDir('docs');
try { try {
const run = (cmd: string, args: string[]) => const run = (cmd: string, args: string[]) =>
@@ -487,7 +511,7 @@ export default app;
} }
}, 150_000); }, 150_000);
test.concurrent('journey-retro', async () => { testIfSelected('journey-retro', async () => {
const tmpDir = createRoutingWorkDir('retro'); const tmpDir = createRoutingWorkDir('retro');
try { try {
const run = (cmd: string, args: string[]) => const run = (cmd: string, args: string[]) =>
@@ -530,7 +554,7 @@ export default app;
} }
}, 150_000); }, 150_000);
test.concurrent('journey-design-system', async () => { testIfSelected('journey-design-system', async () => {
const tmpDir = createRoutingWorkDir('design-system'); const tmpDir = createRoutingWorkDir('design-system');
try { try {
@@ -559,7 +583,7 @@ export default app;
} }
}, 150_000); }, 150_000);
test.concurrent('journey-visual-qa', async () => { testIfSelected('journey-visual-qa', async () => {
const tmpDir = createRoutingWorkDir('visual-qa'); const tmpDir = createRoutingWorkDir('visual-qa');
try { try {
const run = (cmd: string, args: string[]) => const run = (cmd: string, args: string[]) =>