Merge remote-tracking branch 'origin/main' into garrytan/test-coverage-catalog

# Conflicts:
#	scripts/gen-skill-docs.ts
#	test/gen-skill-docs.test.ts
This commit is contained in:
Garry Tan
2026-03-20 16:18:28 -07:00
65 changed files with 3277 additions and 499 deletions

View File

@@ -2928,6 +2928,128 @@ Write the full output (including the GATE verdict) to ${codexDir}/codex-output.m
}, 360_000);
});
// --- Office Hours Spec Review E2E ---
describeIfSelected('Office Hours Spec Review E2E', ['office-hours-spec-review'], () => {
let ohDir: string;
beforeAll(() => {
ohDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-oh-spec-'));
const run = (cmd: string, args: string[]) =>
spawnSync(cmd, args, { cwd: ohDir, stdio: 'pipe', timeout: 5000 });
run('git', ['init', '-b', 'main']);
run('git', ['config', 'user.email', 'test@test.com']);
run('git', ['config', 'user.name', 'Test']);
fs.writeFileSync(path.join(ohDir, 'README.md'), '# Test Project\n');
run('git', ['add', '.']);
run('git', ['commit', '-m', 'init']);
// Copy office-hours skill
fs.mkdirSync(path.join(ohDir, 'office-hours'), { recursive: true });
fs.copyFileSync(
path.join(ROOT, 'office-hours', 'SKILL.md'),
path.join(ohDir, 'office-hours', 'SKILL.md'),
);
});
afterAll(() => {
try { fs.rmSync(ohDir, { recursive: true, force: true }); } catch {}
});
test('/office-hours SKILL.md contains spec review loop', async () => {
const result = await runSkillTest({
prompt: `Read office-hours/SKILL.md. I want to understand the spec review loop.
Summarize what the "Spec Review Loop" section does — specifically:
1. How many dimensions does the reviewer check?
2. What tool is used to dispatch the reviewer?
3. What's the maximum number of iterations?
4. What metrics are tracked?
Write your summary to ${ohDir}/spec-review-summary.md`,
workingDirectory: ohDir,
maxTurns: 8,
timeout: 120_000,
testName: 'office-hours-spec-review',
runId,
});
logCost('/office-hours spec review', result);
recordE2E('/office-hours-spec-review', 'Office Hours Spec Review E2E', result);
expect(result.exitReason).toBe('success');
const summaryPath = path.join(ohDir, 'spec-review-summary.md');
if (fs.existsSync(summaryPath)) {
const summary = fs.readFileSync(summaryPath, 'utf-8').toLowerCase();
// Verify the agent understood the key concepts
expect(summary).toMatch(/5.*dimension|dimension.*5|completeness|consistency|clarity|scope|feasibility/);
expect(summary).toMatch(/agent|subagent/);
expect(summary).toMatch(/3.*iteration|iteration.*3|maximum.*3/);
}
}, 180_000);
});
// --- Plan CEO Review Benefits-From E2E ---
describeIfSelected('Plan CEO Review Benefits-From E2E', ['plan-ceo-review-benefits'], () => {
let benefitsDir: string;
beforeAll(() => {
benefitsDir = fs.mkdtempSync(path.join(os.tmpdir(), 'skill-e2e-benefits-'));
const run = (cmd: string, args: string[]) =>
spawnSync(cmd, args, { cwd: benefitsDir, stdio: 'pipe', timeout: 5000 });
run('git', ['init', '-b', 'main']);
run('git', ['config', 'user.email', 'test@test.com']);
run('git', ['config', 'user.name', 'Test']);
fs.writeFileSync(path.join(benefitsDir, 'README.md'), '# Test Project\n');
run('git', ['add', '.']);
run('git', ['commit', '-m', 'init']);
// Copy plan-ceo-review skill
fs.mkdirSync(path.join(benefitsDir, 'plan-ceo-review'), { recursive: true });
fs.copyFileSync(
path.join(ROOT, 'plan-ceo-review', 'SKILL.md'),
path.join(benefitsDir, 'plan-ceo-review', 'SKILL.md'),
);
});
afterAll(() => {
try { fs.rmSync(benefitsDir, { recursive: true, force: true }); } catch {}
});
test('/plan-ceo-review SKILL.md contains prerequisite skill offer', async () => {
const result = await runSkillTest({
prompt: `Read plan-ceo-review/SKILL.md. Search for sections about "Prerequisite" or "office-hours" or "design doc found".
Summarize what happens when no design doc is found — specifically:
1. Is /office-hours offered as a prerequisite?
2. What options does the user get?
3. Is there a mid-session detection for when the user seems lost?
Write your summary to ${benefitsDir}/benefits-summary.md`,
workingDirectory: benefitsDir,
maxTurns: 8,
timeout: 120_000,
testName: 'plan-ceo-review-benefits',
runId,
});
logCost('/plan-ceo-review benefits-from', result);
recordE2E('/plan-ceo-review-benefits', 'Plan CEO Review Benefits-From E2E', result);
expect(result.exitReason).toBe('success');
const summaryPath = path.join(benefitsDir, 'benefits-summary.md');
if (fs.existsSync(summaryPath)) {
const summary = fs.readFileSync(summaryPath, 'utf-8').toLowerCase();
// Verify the agent understood the skill chaining
expect(summary).toMatch(/office.hours/);
expect(summary).toMatch(/design doc|no design/i);
}
}, 180_000);
});
// Module-level afterAll — finalize eval collector after all tests complete
afterAll(async () => {
if (evalCollector) {