mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-21 12:18:24 +08:00
feat: LLM-as-judge evals for SKILL.md documentation quality
4 eval tests using Anthropic API (claude-haiku, ~$0.01-0.03/run): - Command reference table: clarity/completeness/actionability >= 4/5 - Snapshot flags section: same thresholds - browse/SKILL.md overall quality - Regression: generated version must score >= hand-maintained baseline Requires ANTHROPIC_API_KEY. Auto-skips without it. Run: bun run test:eval (or ANTHROPIC_API_KEY=sk-... bun test test/skill-llm-eval.test.ts)
This commit is contained in:
@@ -12,9 +12,10 @@
|
|||||||
"gen:skill-docs": "bun run scripts/gen-skill-docs.ts",
|
"gen:skill-docs": "bun run scripts/gen-skill-docs.ts",
|
||||||
"dev": "bun run browse/src/cli.ts",
|
"dev": "bun run browse/src/cli.ts",
|
||||||
"server": "bun run browse/src/server.ts",
|
"server": "bun run browse/src/server.ts",
|
||||||
"test": "bun test browse/test/ test/ --ignore test/skill-e2e.test.ts",
|
"test": "bun test browse/test/ test/ --ignore test/skill-e2e.test.ts --ignore test/skill-llm-eval.test.ts",
|
||||||
"test:e2e": "SKILL_E2E=1 bun test test/skill-e2e.test.ts",
|
"test:e2e": "SKILL_E2E=1 bun test test/skill-e2e.test.ts",
|
||||||
"test:all": "bun test browse/test/ test/ --ignore test/skill-e2e.test.ts && SKILL_E2E=1 bun test test/skill-e2e.test.ts",
|
"test:eval": "bun test test/skill-llm-eval.test.ts",
|
||||||
|
"test:all": "bun test browse/test/ test/ --ignore test/skill-e2e.test.ts --ignore test/skill-llm-eval.test.ts && SKILL_E2E=1 bun test test/skill-e2e.test.ts",
|
||||||
"skill:check": "bun run scripts/skill-check.ts",
|
"skill:check": "bun run scripts/skill-check.ts",
|
||||||
"dev:skill": "bun run scripts/dev-skill.ts",
|
"dev:skill": "bun run scripts/dev-skill.ts",
|
||||||
"start": "bun run browse/src/server.ts"
|
"start": "bun run browse/src/server.ts"
|
||||||
@@ -37,6 +38,7 @@
|
|||||||
"devtools"
|
"devtools"
|
||||||
],
|
],
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"@anthropic-ai/claude-agent-sdk": "^0.2.75"
|
"@anthropic-ai/claude-agent-sdk": "^0.2.75",
|
||||||
|
"@anthropic-ai/sdk": "^0.78.0"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
194
test/skill-llm-eval.test.ts
Normal file
194
test/skill-llm-eval.test.ts
Normal file
@@ -0,0 +1,194 @@
|
|||||||
|
/**
|
||||||
|
* LLM-as-a-Judge evals for generated SKILL.md quality.
|
||||||
|
*
|
||||||
|
* Uses the Anthropic API directly (not Agent SDK) to evaluate whether
|
||||||
|
* generated command docs are clear, complete, and actionable for an AI agent.
|
||||||
|
*
|
||||||
|
* Requires: ANTHROPIC_API_KEY env var
|
||||||
|
* Run: ANTHROPIC_API_KEY=sk-... bun test test/skill-llm-eval.test.ts
|
||||||
|
*
|
||||||
|
* Cost: ~$0.01-0.03 per run (haiku)
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { describe, test, expect } from 'bun:test';
|
||||||
|
import Anthropic from '@anthropic-ai/sdk';
|
||||||
|
import * as fs from 'fs';
|
||||||
|
import * as path from 'path';
|
||||||
|
|
||||||
|
// Repository root: one directory above the folder that contains this test file.
const ROOT = path.resolve(import.meta.dir, '..');

// The evals call the Anthropic API, so they only run when a key is configured.
const hasApiKey = Boolean(process.env.ANTHROPIC_API_KEY);

// Resolves to `describe.skip` without a key, so the whole suite is reported as skipped.
const describeEval = hasApiKey ? describe : describe.skip;
|
||||||
|
|
||||||
|
/** Verdict produced by the judge model for one documentation excerpt. */
interface JudgeScore {
  /** 1-5: can an agent understand what each command does? */
  clarity: number;
  /** 1-5: are all args, flags, valid values documented? */
  completeness: number;
  /** 1-5: can an agent use this to construct correct commands? */
  actionability: number;
  /** Why the scores were given. */
  reasoning: string;
}
|
||||||
|
|
||||||
|
/**
 * Read one score from the parsed verdict, insisting on a finite number in the 1-5 range.
 *
 * @param verdict - Object parsed from the judge's JSON reply.
 * @param dimension - Name of the score field to read.
 * @returns The validated score.
 * @throws Error when the field is missing, not a finite number, or outside 1-5.
 */
function readJudgeScore(verdict: Record<string, unknown>, dimension: string): number {
  const value = verdict[dimension];
  if (typeof value !== 'number' || !Number.isFinite(value) || value < 1 || value > 5) {
    throw new Error(`Judge returned invalid "${dimension}" score: ${JSON.stringify(value)}`);
  }
  return value;
}

/**
 * Extract and validate the JSON verdict contained in the judge's raw reply.
 *
 * @param text - Raw text returned by the model.
 * @returns The validated scores plus the judge's reasoning ('' when absent or not a string).
 * @throws Error when no JSON object is present, the JSON is malformed, or a score is invalid.
 */
function parseJudgeScore(text: string): JudgeScore {
  // Greedy match from the first "{" to the last "}" so a reply wrapped in a
  // markdown code fence still yields the JSON object.
  const jsonMatch = text.match(/\{[\s\S]*\}/);
  if (!jsonMatch) throw new Error(`Judge returned non-JSON: ${text.slice(0, 200)}`);

  let parsed: unknown;
  try {
    parsed = JSON.parse(jsonMatch[0]);
  } catch (err: unknown) {
    // Surface the offending output instead of a bare SyntaxError.
    const detail = err instanceof Error ? err.message : String(err);
    throw new Error(`Judge returned malformed JSON (${detail}): ${jsonMatch[0].slice(0, 200)}`);
  }
  if (typeof parsed !== 'object' || parsed === null) {
    throw new Error(`Judge returned a non-object verdict: ${jsonMatch[0].slice(0, 200)}`);
  }

  const verdict = parsed as Record<string, unknown>;
  return {
    clarity: readJudgeScore(verdict, 'clarity'),
    completeness: readJudgeScore(verdict, 'completeness'),
    actionability: readJudgeScore(verdict, 'actionability'),
    reasoning: typeof verdict.reasoning === 'string' ? verdict.reasoning : '',
  };
}

/**
 * Ask the judge model to rate a documentation excerpt on clarity, completeness
 * and actionability (1-5 each).
 *
 * Sends a single user message through the Anthropic Messages API and parses the
 * JSON verdict out of the reply.
 *
 * @param section - Human-readable name of the excerpt, interpolated into the prompt.
 * @param prompt - The documentation text to evaluate.
 * @returns The validated scores and the judge's reasoning.
 * @throws Error when the reply contains no usable JSON verdict or a score is invalid;
 *   errors raised by the API client propagate unchanged.
 */
async function judge(section: string, prompt: string): Promise<JudgeScore> {
  const client = new Anthropic();

  const response = await client.messages.create({
    model: 'claude-haiku-4-5-20251001',
    max_tokens: 1024,
    messages: [{
      role: 'user',
      content: `You are evaluating documentation quality for an AI coding agent's CLI tool reference.

The agent reads this documentation to learn how to use a headless browser CLI. It needs to:
1. Understand what each command does
2. Know what arguments to pass
3. Know valid values for enum-like parameters
4. Construct correct command invocations without guessing

Rate the following ${section} on three dimensions (1-5 scale):

- **clarity** (1-5): Can an agent understand what each command/flag does from the description alone?
- **completeness** (1-5): Are arguments, valid values, and important behaviors documented? Would an agent need to guess anything?
- **actionability** (1-5): Can an agent construct correct command invocations from this reference alone?

Scoring guide:
- 5: Excellent — no ambiguity, all info present
- 4: Good — minor gaps an experienced agent could infer
- 3: Adequate — some guessing required
- 2: Poor — significant info missing
- 1: Unusable — agent would fail without external help

Respond with ONLY valid JSON in this exact format:
{"clarity": N, "completeness": N, "actionability": N, "reasoning": "brief explanation"}

Here is the ${section} to evaluate:

${prompt}`,
    }],
  });

  // Use the first text block; an empty or text-free reply becomes '' and is
  // rejected by the parser with a descriptive error.
  let text = '';
  for (const block of response.content) {
    if (block.type === 'text') {
      text = block.text;
      break;
    }
  }

  return parseJudgeScore(text);
}
|
||||||
|
|
||||||
|
describeEval('LLM-as-judge quality evals', () => {
  /** Lowest score (out of 5) accepted on any judged dimension. */
  const minScore = 4;

  /**
   * Return the part of `content` that starts at `startMarker` and stops just
   * before `endMarker`, or runs to the end when no end marker is given.
   *
   * A missing marker throws instead of letting `indexOf`'s -1 reach `slice`,
   * which would silently hand the judge an empty or wrong excerpt.
   */
  const extractSection = (content: string, startMarker: string, endMarker?: string): string => {
    const start = content.indexOf(startMarker);
    if (start === -1) {
      throw new Error(`Section start marker not found: ${startMarker}`);
    }
    if (endMarker === undefined) return content.slice(start);

    // Look for the end marker only after the start marker.
    const end = content.indexOf(endMarker, start + startMarker.length);
    if (end === -1) {
      throw new Error(`Section end marker not found after "${startMarker}": ${endMarker}`);
    }
    return content.slice(start, end);
  };

  /** Assert that every judged dimension meets the minimum score. */
  const expectPassingScores = (scores: JudgeScore): void => {
    expect(scores.clarity).toBeGreaterThanOrEqual(minScore);
    expect(scores.completeness).toBeGreaterThanOrEqual(minScore);
    expect(scores.actionability).toBeGreaterThanOrEqual(minScore);
  };

  test('command reference table scores >= 4 on all dimensions', async () => {
    const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
    // Extract just the command reference section
    const section = extractSection(content, '## Command Reference', '## Tips');

    const scores = await judge('command reference table', section);
    console.log('Command reference scores:', JSON.stringify(scores, null, 2));

    expectPassingScores(scores);
  }, 30_000);

  test('snapshot flags section scores >= 4 on all dimensions', async () => {
    const content = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
    const section = extractSection(content, '## Snapshot System', '## Command Reference');

    const scores = await judge('snapshot flags reference', section);
    console.log('Snapshot flags scores:', JSON.stringify(scores, null, 2));

    expectPassingScores(scores);
  }, 30_000);

  test('browse/SKILL.md overall scores >= 4', async () => {
    const content = fs.readFileSync(path.join(ROOT, 'browse', 'SKILL.md'), 'utf-8');
    // Just the reference sections (skip examples/patterns)
    const section = extractSection(content, '## Snapshot Flags');

    const scores = await judge('browse skill reference (flags + commands)', section);
    console.log('Browse SKILL.md scores:', JSON.stringify(scores, null, 2));

    expectPassingScores(scores);
  }, 30_000);

  test('regression check: compare branch vs baseline quality', async () => {
    // This test compares the generated output against the hand-maintained
    // baseline from main. The generated version should score equal or higher.
    const generated = fs.readFileSync(path.join(ROOT, 'SKILL.md'), 'utf-8');
    const genSection = extractSection(generated, '## Command Reference', '## Tips');

    const baseline = `## Command Reference

### Navigation
| Command | Description |
|---------|-------------|
| \`goto <url>\` | Navigate to URL |
| \`back\` / \`forward\` | History navigation |
| \`reload\` | Reload page |
| \`url\` | Print current URL |

### Interaction
| Command | Description |
|---------|-------------|
| \`click <sel>\` | Click element |
| \`fill <sel> <val>\` | Fill input |
| \`select <sel> <val>\` | Select dropdown |
| \`hover <sel>\` | Hover element |
| \`type <text>\` | Type into focused element |
| \`press <key>\` | Press key (Enter, Tab, Escape) |
| \`scroll [sel]\` | Scroll element into view |
| \`wait <sel>\` | Wait for element (max 10s) |
| \`wait --networkidle\` | Wait for network to be idle |
| \`wait --load\` | Wait for page load event |

### Inspection
| Command | Description |
|---------|-------------|
| \`js <expr>\` | Run JavaScript |
| \`css <sel> <prop>\` | Computed CSS |
| \`attrs <sel>\` | Element attributes |
| \`is <prop> <sel>\` | State check (visible/hidden/enabled/disabled/checked/editable/focused) |
| \`console [--clear\\|--errors]\` | Console messages (--errors filters to error/warning) |`;

    const client = new Anthropic();
    const response = await client.messages.create({
      model: 'claude-haiku-4-5-20251001',
      max_tokens: 1024,
      messages: [{
        role: 'user',
        content: `You are comparing two versions of CLI documentation for an AI coding agent.

VERSION A (baseline — hand-maintained):
${baseline}

VERSION B (auto-generated from source):
${genSection}

Which version is better for an AI agent trying to use these commands? Consider:
- Completeness (more commands documented? all args shown?)
- Clarity (descriptions helpful?)
- Coverage (missing commands in either version?)

Respond with ONLY valid JSON:
{"winner": "A" or "B" or "tie", "reasoning": "brief explanation", "a_score": N, "b_score": N}

Scores are 1-5 overall quality.`,
      }],
    });

    // Use the first text block; an empty or text-free reply becomes '' and
    // fails the JSON check below with a descriptive error.
    const textBlock = response.content.find((block) => block.type === 'text');
    const text = textBlock && textBlock.type === 'text' ? textBlock.text : '';
    const jsonMatch = text.match(/\{[\s\S]*\}/);
    if (!jsonMatch) throw new Error(`Judge returned non-JSON: ${text.slice(0, 200)}`);
    const result: unknown = JSON.parse(jsonMatch[0]);
    console.log('Regression comparison:', JSON.stringify(result, null, 2));

    // Check the verdict's shape before comparing, so a malformed reply fails
    // with a clear message rather than an obscure matcher error.
    if (typeof result !== 'object' || result === null) {
      throw new Error(`Judge returned a non-object verdict: ${jsonMatch[0].slice(0, 200)}`);
    }
    const { a_score: baselineScore, b_score: generatedScore } =
      result as { a_score?: unknown; b_score?: unknown };
    if (typeof baselineScore !== 'number' || typeof generatedScore !== 'number') {
      throw new Error(`Judge returned non-numeric scores: ${jsonMatch[0].slice(0, 200)}`);
    }

    // Generated version should be at least as good as hand-maintained
    expect(generatedScore).toBeGreaterThanOrEqual(baselineScore);
  }, 30_000);
});
|
||||||
Reference in New Issue
Block a user