Existing AskUserQuestion format-regression tests only regex-match
"Recommendation:[*\s]*Choose" — they confirm the line exists but say nothing
about whether the "because Y" clause is present, specific, or substantive.
Agents frequently produce the line with boilerplate reasoning ("because it's
better"), and the regex passes anyway.
Add judgeRecommendation:
- Deterministic regex parses present / commits / has_because — no LLM call
needed for booleans, and skipping the LLM when has_because is false avoids
burning tokens on cases that already failed the format spec.
- Haiku 4.5 grades reason_substance 1-5 on a tight rubric scoped to the
because-clause itself (not the surrounding pros/cons menu — that menu is
context only). 5 = specific tradeoff vs an alternative; 3 = generic
("because it's faster"); 1 = boilerplate ("because it's better").
- callJudge generalized with a model arg, default Sonnet for back-compat
with judge / outcomeJudge / judgePosture callers.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
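A minimal sketch of how a format-regression test might consume the new helper, assuming a vitest harness; the import path, capture helper, and substance threshold below are illustrative, not part of this change:

import { describe, it, expect } from 'vitest';
import { judgeRecommendation } from './judge';

// Hypothetical helper: however the suite captures the AskUserQuestion text.
declare function captureAskUserQuestion(): Promise<string>;

describe('AskUserQuestion recommendation substance', () => {
  it('commits to one option with a substantive because-clause', async () => {
    const askUserText = await captureAskUserQuestion();
    const score = await judgeRecommendation(askUserText);
    expect(score.present).toBe(true);       // deterministic, no LLM
    expect(score.commits).toBe(true);       // deterministic, no LLM
    expect(score.has_because).toBe(true);   // deterministic, no LLM
    expect(score.reason_substance).toBeGreaterThanOrEqual(3); // Haiku-judged; threshold illustrative
  }, 60_000);
});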
/**
 * Shared LLM-as-judge helpers for eval and E2E tests.
 *
 * Provides callJudge (generic JSON-from-LLM), judge (doc quality scorer),
 * outcomeJudge (planted-bug detection scorer), judgePosture (mode-posture
 * regression scorer), and judgeRecommendation (AskUserQuestion recommendation
 * substance scorer).
 *
 * Requires: ANTHROPIC_API_KEY env var
 */

import Anthropic from '@anthropic-ai/sdk';

export interface JudgeScore {
  clarity: number; // 1-5
  completeness: number; // 1-5
  actionability: number; // 1-5
  reasoning: string;
}

export interface OutcomeJudgeResult {
  detected: string[];
  missed: string[];
  false_positives: number;
  detection_rate: number;
  evidence_quality: number;
  reasoning: string;
}

export interface PostureScore {
  axis_a: number; // 1-5 — mode-specific primary rubric axis
  axis_b: number; // 1-5 — mode-specific secondary rubric axis
  reasoning: string;
}

export type PostureMode = 'expansion' | 'forcing' | 'builder';

export interface RecommendationScore {
  /** Deterministic: a "Recommendation:" / "RECOMMENDATION:" line is present. */
  present: boolean;
  /** Deterministic: the recommendation names exactly one option (no hedging). */
  commits: boolean;
  /** Deterministic: the literal token "because " follows the choice. */
  has_because: boolean;
  /** Haiku judge, 1-5: specificity of the because-clause. See rubric in judgeRecommendation. */
  reason_substance: number;
  /** Extracted because-clause text, for diagnostics in test output. */
  reason_text: string;
  /** Judge's brief explanation. Empty when judge was skipped (no because-clause). */
  reasoning: string;
}

/**
 * Call an Anthropic model with a prompt, extract JSON response.
 * Retries once on 429 rate limit errors. Defaults to Sonnet 4.6 for
 * existing callers; pass a model id (e.g. claude-haiku-4-5-20251001)
 * for cheaper bounded judgments like judgeRecommendation.
 */
export async function callJudge<T>(prompt: string, model: string = 'claude-sonnet-4-6'): Promise<T> {
  const client = new Anthropic();

  const makeRequest = () => client.messages.create({
    model,
    max_tokens: 1024,
    messages: [{ role: 'user', content: prompt }],
  });

  let response;
  try {
    response = await makeRequest();
  } catch (err: any) {
    if (err.status === 429) {
      await new Promise(r => setTimeout(r, 1000));
      response = await makeRequest();
    } else {
      throw err;
    }
  }

  const text = response.content[0].type === 'text' ? response.content[0].text : '';
  const jsonMatch = text.match(/\{[\s\S]*\}/);
  if (!jsonMatch) throw new Error(`Judge returned non-JSON: ${text.slice(0, 200)}`);
  return JSON.parse(jsonMatch[0]) as T;
}
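// Illustrative call shapes (sketch only, not exported fixtures):
//   await callJudge<JudgeScore>(prompt);                                    // default: Sonnet 4.6
//   await callJudge<{ ok: boolean }>(prompt, 'claude-haiku-4-5-20251001');  // cheap bounded judgment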

/**
 * Score documentation quality on clarity/completeness/actionability (1-5).
 */
export async function judge(section: string, content: string): Promise<JudgeScore> {
  return callJudge<JudgeScore>(`You are evaluating documentation quality for an AI coding agent's CLI tool reference.

The agent reads this documentation to learn how to use a headless browser CLI. It needs to:
1. Understand what each command does
2. Know what arguments to pass
3. Know valid values for enum-like parameters
4. Construct correct command invocations without guessing

Rate the following ${section} on three dimensions (1-5 scale):

- **clarity** (1-5): Can an agent understand what each command/flag does from the description alone?
- **completeness** (1-5): Are arguments, valid values, and important behaviors documented? Would an agent need to guess anything?
- **actionability** (1-5): Can an agent construct correct command invocations from this reference alone?

Scoring guide:
- 5: Excellent — no ambiguity, all info present
- 4: Good — minor gaps an experienced agent could infer
- 3: Adequate — some guessing required
- 2: Poor — significant info missing
- 1: Unusable — agent would fail without external help

Respond with ONLY valid JSON in this exact format:
{"clarity": N, "completeness": N, "actionability": N, "reasoning": "brief explanation"}

Here is the ${section} to evaluate:

${content}`);
}

/**
 * Evaluate a QA report against planted-bug ground truth.
 * Returns detection metrics for the planted bugs.
 */
export async function outcomeJudge(
  groundTruth: any,
  report: string,
): Promise<OutcomeJudgeResult> {
  return callJudge<OutcomeJudgeResult>(`You are evaluating a QA testing report against known ground truth bugs.

GROUND TRUTH (${groundTruth.total_bugs} planted bugs):
${JSON.stringify(groundTruth.bugs, null, 2)}

QA REPORT (generated by an AI agent):
${report}

For each planted bug, determine if the report identified it. A bug counts as
"detected" if the report describes the same defect, even if the wording differs.
Use the detection_hint keywords as guidance.

Also count false positives: issues in the report that don't correspond to any
planted bug AND aren't legitimate issues with the page.

Respond with ONLY valid JSON:
{
  "detected": ["bug-id-1", "bug-id-2"],
  "missed": ["bug-id-3"],
  "false_positives": 0,
  "detection_rate": 2,
  "evidence_quality": 4,
  "reasoning": "brief explanation"
}

Rules:
- "detected" and "missed" arrays must only contain IDs from the ground truth: ${groundTruth.bugs.map((b: any) => b.id).join(', ')}
- detection_rate = length of detected array
- evidence_quality (1-5): Do detected bugs have screenshots, repro steps, or specific element references?
  5 = excellent evidence for every bug, 1 = no evidence at all`);
}

/**
 * Score mode-specific prose posture on two mode-dependent axes (1-5 each).
 *
 * Used by mode-posture regression tests to detect whether V1's Writing Style
 * rules have flattened the distinctive energy of expansion / forcing / builder
 * modes. See docs/designs/PLAN_TUNING_V1.md and the V1.1 mode-posture fix.
 *
 * The generator model is whatever the skill runs with (often Opus for
 * plan-ceo-review). The judge is always Sonnet via callJudge() for cost.
 */
export async function judgePosture(mode: PostureMode, text: string): Promise<PostureScore> {
  const rubrics: Record<PostureMode, { axis_a: string; axis_b: string; context: string }> = {
    expansion: {
      context: 'This text is expansion proposals emitted by /plan-ceo-review in SCOPE EXPANSION or SELECTIVE EXPANSION mode. The skill is supposed to lead with felt-experience vision, then close with concrete effort and impact.',
      axis_a: 'surface_framing (1-5): Does each proposal lead with felt-experience framing ("imagine", "when the user sees", "the moment X happens", or equivalent) BEFORE closing with concrete metrics? Penalize pure feature bullets ("Add X. Improves Y by Z%").',
      axis_b: 'decision_preservation (1-5): Does each proposal contain the elements a scope-expansion decision needs — what to build (concrete shape), effort (ideally both human and CC scales), risk or integration note? Penalize pure prose with no actionable content.',
    },
    forcing: {
      context: 'This text is the Q3 Desperate Specificity question emitted by /office-hours startup mode. The skill is supposed to force the founder to name a specific person and consequence, stacking multiple pressures.',
      axis_a: 'stacking_preserved (1-5): Does the question include at least 3 distinct sub-pressures (e.g., title? promoted? fired? up at night? OR career? day? weekend?) rather than a single neutral ask? Penalize "Who is your target user?" style collapses.',
      axis_b: 'domain_matched_consequence (1-5): Does the named consequence match the domain context in the input (B2B → career impact, consumer → daily pain, hobby/open-source → weekend project)? Penalize one-size-fits-all B2B career framing for non-B2B ideas.',
    },
    builder: {
      context: 'This text is builder-mode response from /office-hours. The skill is supposed to riff creatively — "what if you also..." adjacent unlocks, cross-domain combinations, the "whoa" moment — not emit a structured product roadmap.',
      axis_a: 'unexpected_combinations (1-5): Does the output include at least 2 cross-domain or surprising adjacent unlocks ("what if you also...", "pipe it into X", etc.)? Penalize structured feature lists with no creative leaps.',
      axis_b: 'excitement_over_optimization (1-5): Does the output read as a creative riff (enthusiastic, opinionated, evocative) or as a PRD / product roadmap (structured, metric-driven, conservative)? Penalize PRD-voice language like "improve retention", "enable virality", "consider adding".',
    },
  };

  const r = rubrics[mode];
  return callJudge<PostureScore>(`You are evaluating prose quality for a mode-specific posture regression test.

Context: ${r.context}

Rate the following output on two dimensions (1-5 scale each):

- **axis_a** — ${r.axis_a}
- **axis_b** — ${r.axis_b}

Scoring guide:
- 5: Excellent — strong, unambiguous match for the posture
- 4: Good — matches posture with minor weakness
- 3: Adequate — partial match, noticeable flatness or structure
- 2: Poor — posture mostly flattened / collapsed
- 1: Fail — posture entirely missing, reads as the opposite mode

Respond with ONLY valid JSON in this exact format:
{"axis_a": N, "axis_b": N, "reasoning": "brief explanation naming specific phrases that drove the score"}

Here is the output to evaluate:

${text}`);
}
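// Illustrative call (variable name hypothetical; thresholds belong to the test, not this module):
//   const posture = await judgePosture('forcing', capturedQ3Question);
//   // posture.axis_a = stacking_preserved, posture.axis_b = domain_matched_consequence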

/**
 * Score the quality of an AskUserQuestion's recommendation line.
 *
 * Layered design:
 * 1. Deterministic regex parse for present / commits / has_because. These
 *    don't need an LLM.
 * 2. Haiku 4.5 judges only the 1-5 reason_substance axis on a tight rubric
 *    scoped to the because-clause itself (with the menu as context).
 *
 * Returns reason_substance = 1 with diagnostic reasoning when the because-clause
 * is missing — no LLM call needed; substance is implicitly absent.
 *
 * Format spec: scripts/resolvers/preamble/generate-ask-user-format.ts
 *   Recommendation: <choice> because <one-line reason>
 */
export async function judgeRecommendation(askUserText: string): Promise<RecommendationScore> {
  // Deterministic checks. The format spec requires:
  //   "Recommendation: <choice> because <reason>"
  // Match case-insensitive on the leading word, allow optional markdown
  // emphasis markers (** or __) the agent sometimes adds.
  const recLine = askUserText.match(
    /^[*_]*\s*recommendation\s*[*_]*\s*:\s*(.+)$/im,
  );
  const present = !!recLine;
  const recBody = recLine?.[1]?.trim() ?? '';

  // commits: reject obvious hedging language. The format-spec self-check
  // requires the recommendation to name ONE choice; "either", "depending on",
  // "if X then" all signal the model bailed on the commitment.
  const commits = present && !/\b(either|depends? on|depending|if .+ then|or maybe|whichever)\b/i.test(recBody);

  // has_because: literal "because" token in the body, per the format spec.
  const becauseMatch = recBody.match(/\bbecause\s+(.+?)$/i);
  const has_because = !!becauseMatch;
  const reason_text = becauseMatch?.[1]?.trim() ?? '';

  // If the because-clause is absent, the substance score is implicitly 1.
  // Skip the LLM call — there is nothing to grade.
  if (!present || !has_because || !reason_text) {
    return {
      present,
      commits,
      has_because,
      reason_substance: 1,
      reason_text,
      reasoning: present
        ? 'No "because <reason>" clause found in recommendation line — substance scored 1 by deterministic check.'
        : 'No "Recommendation:" line found in captured text — substance scored 1 by deterministic check.',
    };
  }

  // LLM judge: rate the because-clause specifically, 1-5.
  // The full askUserText is included as context so the judge can tell whether
  // the reason names a tradeoff specific to the chosen option vs an alternative,
  // but the score is about the because-clause itself, not the surrounding menu.
  const prompt = `You are scoring the quality of one specific line in an AskUserQuestion: the "Recommendation: <choice> because <reason>" line. Score the because-clause substance on a 1-5 scale.

Rubric:
- 5: Reason names a SPECIFIC TRADEOFF that distinguishes the chosen option from at least one alternative (e.g. "because hybrid ships V1 in gstack-only without blocking on cross-repo gbrain coordination", "because Postgres preserves ACID guarantees the workflow already depends on").
- 4: Reason is concrete and option-specific but does NOT explicitly compare against an alternative (e.g. "because Redis gives sub-millisecond reads under load", "because the new schema removes the JOIN we were paying for").
- 3: Reason is real but generic — could apply to many options ("because it's faster", "because it's simpler", "because it ships sooner").
- 2: Reason restates the option label or is near-tautological ("because it's the hybrid one", "because that's the recommended approach").
- 1: Reason is boilerplate / empty ("because it's better", "because it works", "because it's the right choice").

You are scoring the because-clause itself, not the surrounding pros/cons or option labels. The menu is context only.

Extracted because-clause:
${reason_text}

Full AskUserQuestion (context only — do NOT score this):
${askUserText.slice(0, 4000)}

Respond with ONLY valid JSON:
{"reason_substance": N, "reasoning": "one sentence explanation citing the specific words that drove the score"}`;

  const out = await callJudge<{ reason_substance: number; reasoning: string }>(
    prompt,
    'claude-haiku-4-5-20251001',
  );

  return {
    present,
    commits,
    has_because,
    reason_substance: out.reason_substance,
    reason_text,
    reasoning: out.reasoning,
  };
}
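// Illustrative behavior (hypothetical inputs; expectations sketched from the code above):
//   judgeRecommendation('Recommendation: hybrid because it ships V1 without cross-repo coordination')
//     → deterministic fields all true, then Haiku scores reason_substance (likely 4-5 on the rubric).
//   judgeRecommendation('Recommendation: hybrid')
//     → { present: true, commits: true, has_because: false, reason_substance: 1, ... } with no LLM call.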