mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-18 10:31:30 +08:00
Extends the v1.25.1.0 AskUserQuestion recommendation-quality coverage to the
cross-model synthesis surfaces that were previously emitting prose without a
structured recommendation:
- /codex review (Step 2A) — after presenting Codex output + GATE verdict,
must emit `Recommendation: <action> because <reason>` line. Reason must
compare against alternatives (other findings, fix-vs-ship, fix-order).
- /codex challenge (Step 2B) — same requirement after adversarial output.
- /codex consult (Step 2C) — same requirement after consult presentation,
with examples for plan-review consults that engage with specific Codex
insights.
- Claude adversarial subagent (scripts/resolvers/review.ts:446, used by
/ship Step 11 + standalone /review) — subagent prompt now ends with
"After listing findings, end your output with ONE line in the canonical
format Recommendation: <action> because <reason>". Codex adversarial
command (line 461) gets the same final-line requirement.
The same `judgeRecommendation` helper grades both AskUserQuestion and
cross-model synthesis — one rubric, two surfaces. Substance-5 cross-model
recommendations explicitly compare against alternatives (a different
finding, fix-vs-ship, fix-order). Generic synthesis ("because adversarial
review found things") fails at threshold ≥ 4.
Tests:
- test/llm-judge-recommendation.test.ts gains 5 cross-model fixtures (3
substance ≥ 4, 2 substance < 4). Existing rubric correctly grades them.
- test/skill-cross-model-recommendation-emit.test.ts (new, free-tier) —
static guard greps codex/SKILL.md.tmpl + scripts/resolvers/review.ts for
the canonical emit instruction. Trips before any paid eval if the
templates drift.
Touchfile: extended `llm-judge-recommendation` entry with codex/SKILL.md.tmpl
and scripts/resolvers/review.ts so synthesis-template edits invalidate the
fixture re-run.
Verified: free `bun test` exits 0 (5/5 static emit-guard tests pass), paid
fixture passes 45/45 expect calls in 24s with the cross-model substance-5
fixtures correctly judged at >= 4.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
186 lines
8.9 KiB
TypeScript
/**
 * Fixture-based sanity test for judgeRecommendation.
 *
 * Replaces the original "manually inject bad text into a captured file
 * and revert the SKILL template" sabotage step with deterministic
 * negative coverage: hand-graded good/bad recommendation strings, asserted
 * against the same threshold the production E2E tests use (>= 4).
 *
 * Originally costed at ~$0.04 per run (4 Haiku calls + 3 deterministic-only
 * fixtures). The cross-model synthesis and hedging fixtures added since then
 * make further judge calls, so treat that figure as a lower bound.
 * Touchfile-gated to test/helpers/llm-judge.ts so it fires on rubric
 * tweaks but not every test run. Runs only under EVALS=1 with an API key.
 */
|
|
|
|
import { expect } from 'bun:test';
|
|
import { judgeRecommendation } from './helpers/llm-judge';
|
|
import { describeIfSelected, testIfSelected } from './helpers/e2e-helpers';
|
|
|
|
/**
 * Wraps a recommendation line in a realistic AskUserQuestion body so the judge
 * sees the surrounding menu (question, stakes, options A/B/C) as context.
 * The because-clause of the recommendation is what gets graded.
 *
 * NOTE(review): any leading indentation inside the fixture text is not visible
 * in this view of the file — confirm against the canonical source if the judge
 * is sensitive to it.
 *
 * @param recommendation The full `Recommendation: ...` line to embed verbatim.
 * @returns The AskUserQuestion text with the recommendation placed between the
 *          stakes line and the completeness note.
 */
function buildAUQ(recommendation: string): string {
  return `D1 — Where should the retrieval smarts live?
ELI10: Two ways to ship the retrieval layer that powers cross-skill memory. The choice changes who else can use it and how fast we ship V1.
Stakes if we pick wrong: V1 ships months later, OR every other agent has to rebuild the same logic.
${recommendation}
Note: options differ in kind, not coverage — no completeness score.
Pros / cons:
A) Server-side (gbrain ships the smarts)
✅ Reusable across every agent that calls gbrain — Codex, Cursor, etc.
❌ Cross-repo work; gbrain release tied to gstack release; slower V1
B) Client-side (gstack ships the smarts) (recommended)
✅ Ships entirely in gstack — no gbrain release dependency; faster V1
❌ Every other agent has to rebuild the same logic; multi-call overhead
C) Hybrid — V1 client-side, V1.5 promotes to gbrain
✅ Ships V1 value without cross-repo coordination; clear migration path
❌ Two-phase shipping; V1.5 risks slipping if priorities shift
Net: optimize for V1 ship velocity vs long-term agent reusability.`;
}
|
|
|
|
// Rubric sanity suite: feeds hand-graded recommendation strings through
// judgeRecommendation and asserts the structured fields it returns
// (present / commits / has_because / reason_substance). Registered under the
// 'llm-judge-recommendation' selector. The trailing 240_000 is presumably the
// per-test timeout in milliseconds — confirm against testIfSelected.
describeIfSelected('judgeRecommendation rubric sanity', ['llm-judge-recommendation'], () => {
  testIfSelected('llm-judge-recommendation', async () => {
    // Run every fixture (inline, cross-model, hedging) sequentially in one test
    // entry so the eval-store sees a single result; individual assertions
    // surface as failed expectations.

    // SUBSTANCE 5: option-specific reason that contrasts an alternative.
    const good5 = await judgeRecommendation(buildAUQ(
      'Recommendation: Choose C because hybrid ships V1 in gstack-only without blocking on cross-repo gbrain coordination, and locks the migration path before other agents take a hard dependency.',
    ));
    expect(good5.present).toBe(true);
    expect(good5.commits).toBe(true);
    expect(good5.has_because).toBe(true);
    expect(
      good5.reason_substance,
      `expected >=4 for option-specific cross-alternative reason; got ${good5.reason_substance}: ${good5.reasoning}`,
    ).toBeGreaterThanOrEqual(4);

    // SUBSTANCE 4: concrete option-specific reason without alternative comparison.
    const good4 = await judgeRecommendation(buildAUQ(
      'Recommendation: Choose B because client-side composition uses MCP tools that already exist in gstack and avoids any gbrain release dependency for V1.',
    ));
    expect(good4.present).toBe(true);
    expect(
      good4.reason_substance,
      `expected >=4 for concrete option-specific reason; got ${good4.reason_substance}: ${good4.reasoning}`,
    ).toBeGreaterThanOrEqual(4);

    // SUBSTANCE ~1: boilerplate.
    const bad1 = await judgeRecommendation(buildAUQ(
      'Recommendation: Choose B because it is better.',
    ));
    expect(bad1.present).toBe(true);
    expect(bad1.has_because).toBe(true);
    expect(
      bad1.reason_substance,
      `expected <4 for boilerplate "because it is better"; got ${bad1.reason_substance}: ${bad1.reasoning}`,
    ).toBeLessThan(4);

    // SUBSTANCE ~3: generic.
    const bad3 = await judgeRecommendation(buildAUQ(
      'Recommendation: Choose B because it is faster.',
    ));
    expect(bad3.present).toBe(true);
    expect(bad3.has_because).toBe(true);
    expect(
      bad3.reason_substance,
      `expected <4 for generic "because it is faster"; got ${bad3.reason_substance}: ${bad3.reasoning}`,
    ).toBeLessThan(4);

    // NO BECAUSE: missing causal connective.
    const noBecause = await judgeRecommendation(buildAUQ(
      'Recommendation: Choose B (it has the best tradeoffs).',
    ));
    expect(noBecause.present).toBe(true);
    expect(noBecause.has_because).toBe(false);
    expect(noBecause.reason_substance).toBe(1);

    // NO RECOMMENDATION: line missing entirely.
    const noRec = await judgeRecommendation(`D1 — Where should the smarts live?
ELI10: ...
Pros / cons:
A) Server-side
B) Client-side
Net: ...`);
    expect(noRec.present).toBe(false);
    expect(noRec.has_because).toBe(false);
    expect(noRec.reason_substance).toBe(1);

    // CROSS-MODEL synthesis recommendations: when /codex or the Claude
    // adversarial subagent emit a synthesis Recommendation line, it follows
    // the same canonical shape and is graded by the same rubric. These
    // fixtures pin the v1.25.1.0+ cross-model-skill emit format documented
    // in codex/SKILL.md.tmpl Steps 2A/2B/2C and scripts/resolvers/review.ts.
    // Substance-5 cross-model fixtures explicitly compare against an
    // alternative (a different finding, a different recommended action, or
    // no-fix vs fix). The same rubric the AskUserQuestion judge uses applies:
    // strong reasons name a tradeoff distinguishing the chosen action from
    // at least one alternative. Cross-model synthesis has implicit
    // alternatives — different findings, different fix orders, ship-vs-fix —
    // so the same shape applies.
    //
    // Each case is [label, recommendation text, expect substance >= 4].
    // Declared with an annotation (not an `as` assertion) so the compiler
    // checks every tuple against the shape.
    const crossModelCases: Array<[string, string, boolean]> = [
      [
        'codex-review good',
        'Recommendation: Fix the SQL injection at users_controller.rb:42 first because its auth-bypass blast radius is higher than the LFI Codex also flagged, and the parameterized-query fix is three lines vs the LFI session-handling rewrite.',
        true, // expect substance >= 4
      ],
      [
        'adversarial good',
        'Recommendation: Fix the unbounded retry loop at queue.ts:78 because it DoSes the worker pool under sustained 429s, which is higher-blast-radius than the timing leak Codex also flagged that only touches a debug endpoint.',
        true,
      ],
      [
        'consult good',
        'Recommendation: Adopt the sharding approach Codex suggested because it eliminates the head-of-line blocking the current writer-pool has, while the cache-layer alternative Codex also floated still has a single-writer hot path.',
        true,
      ],
      // SUBSTANCE ~1-2: boilerplate cross-model synthesis.
      [
        'cross-model boilerplate',
        'Recommendation: Look at the findings because adversarial review found things.',
        false, // expect substance < 4
      ],
      [
        'cross-model generic',
        'Recommendation: Ship as-is because the diff is fine.',
        false,
      ],
    ];
    // Cross-model text is judged bare (no AskUserQuestion wrapper).
    for (const [label, text, shouldPass] of crossModelCases) {
      const score = await judgeRecommendation(text);
      expect(score.present, `[cross-model:${label}] present should be true`).toBe(true);
      expect(score.has_because, `[cross-model:${label}] has_because should be true`).toBe(true);
      if (shouldPass) {
        expect(
          score.reason_substance,
          `[cross-model:${label}] expected substance >=4; got ${score.reason_substance}: ${score.reasoning}`,
        ).toBeGreaterThanOrEqual(4);
      } else {
        expect(
          score.reason_substance,
          `[cross-model:${label}] expected substance <4; got ${score.reason_substance}: ${score.reasoning}`,
        ).toBeLessThan(4);
      }
    }

    // HEDGING: each alternate in the hedging regex is exercised separately.
    // Most are no-because forms that short-circuit the LLM call entirely (the
    // judge skips Haiku when has_because is false). The "either B or C
    // because..." form does call Haiku, but cost is bounded — total <$0.02.
    //
    // Each case is [label, recommendation text]; typed as tuples so the
    // destructured elements are `string`, not `string | undefined`.
    const hedgeForms: Array<[string, string]> = [
      ['either B or C', 'Recommendation: Choose either B or C because both ship faster than A.'],
      ['depends on traffic', 'Recommendation: A depends on traffic — pick B if read-heavy.'],
      ['depending on the team', 'Recommendation: depending on the team, A or B is fine.'],
      ['if X then Y', 'Recommendation: if low-traffic then A, otherwise B because both work.'],
      ['or maybe', 'Recommendation: A or maybe B because both ship in V1.'],
      ['whichever fits', 'Recommendation: whichever fits the team — A or B both work.'],
    ];
    for (const [label, text] of hedgeForms) {
      const score = await judgeRecommendation(buildAUQ(text));
      expect(score.present, `[hedge:${label}] present should be true`).toBe(true);
      expect(
        score.commits,
        `[hedge:${label}] expected commits=false; got ${score.commits}. text="${text}"`,
      ).toBe(false);
    }
  }, 240_000);
});
|