| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103 |
- #!/usr/bin/env node
- // Accuracy judge. For each run in results.jsonl:
- // - end-to-end: agent finalAnswer vs verified ground truth (all arms)
- // - fidelity: offload synthesized answer vs ground truth (offload arm only)
- // Judge = claude -p sonnet --effort high, no tools, run from a neutral cwd,
- // JSON-only verdicts. Writes judged.jsonl (one line per run, verdicts merged).
- //
- // Usage: judge.mjs --results <f> --truth <f> --out <f> [--concurrency 4]
import { execFile } from 'child_process';
import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'fs';
import { dirname } from 'path';
// ---- CLI arguments and input files ----
const USAGE = 'Usage: judge.mjs --results <f> --truth <f> --out <f> [--concurrency 4]';

// Print an error followed by the usage line, then exit with a failure status.
function die(msg) {
  console.error(`${msg}\n${USAGE}`);
  process.exit(1);
}

// Flags are consumed as `--name value` pairs starting at argv[2]; the leading
// `--` is stripped from the name. A later occurrence of a flag overwrites an earlier one.
const A = {};
for (let i = 2; i < process.argv.length; i += 2) {
  A[process.argv[i].replace(/^--/, '')] = process.argv[i + 1];
}
// Fail with a readable message instead of a TypeError out of readFileSync(undefined).
if (!A.results || !A.truth) die('missing required --results and/or --truth');

// results: JSONL, one run object per non-blank line. Lines are trimmed so CRLF
// files and whitespace-only lines do not crash the parse, and a malformed line
// is reported with its line number.
const results = readFileSync(A.results, 'utf8')
  .split(/\r?\n/)
  .map((text, idx) => ({ text: text.trim(), lineNo: idx + 1 }))
  .filter(({ text }) => text !== '')
  .map(({ text, lineNo }) => {
    try {
      return JSON.parse(text);
    } catch (err) {
      throw new Error(`${A.results}:${lineNo}: invalid JSON (${err.message})`, { cause: err });
    }
  });

// truth: a single JSON document; later code looks entries up by run.repo.
const truth = JSON.parse(readFileSync(A.truth, 'utf8'));
const OUT = A.out || '/tmp/cg-offload-eval/judged.jsonl';

// Number of parallel judge workers. Zero, negative or non-numeric values would
// start no workers and silently write an un-judged file, so reject them up front.
const CONC = Math.floor(Number(A.concurrency || 4));
if (!Number.isFinite(CONC) || CONC < 1) {
  die(`--concurrency must be a positive integer, got "${A.concurrency}"`);
}
/**
 * Turn the judge CLI's raw stdout into a verdict object.
 *
 * The text between the first `{` and the last `}` is parsed as JSON, which
 * tolerates code fences or prose around the object. The result is then
 * normalized so downstream code can rely on its shape:
 *   - `verdict` must be a string, otherwise an error verdict is returned;
 *   - `score` becomes a finite number (numeric strings are coerced) or null.
 *
 * @param {string} stdout - raw stdout of the judge process (may be empty).
 * @param {Error|null} err - exec error, if any; its message is folded into the note.
 * @returns {object} the parsed verdict, or `{ verdict: 'error', score: null, note }`
 *   with `note` truncated to 80 characters.
 */
function parseJudgeOutput(stdout, err) {
  const errorVerdict = (note) => ({ verdict: 'error', score: null, note: String(note).slice(0, 80) });

  const raw = (stdout || '').trim();
  const m = raw.match(/\{[\s\S]*\}/);
  if (!m) return errorVerdict(err ? 'exec ' + err.message : 'no json');

  let parsed;
  try {
    parsed = JSON.parse(m[0]);
  } catch {
    // Keep the exec error (e.g. a timeout that truncated the output) visible
    // instead of reporting a bare parse failure.
    return errorVerdict(err ? `parse fail; exec ${err.message}` : 'parse fail');
  }
  if (typeof parsed.verdict !== 'string') return errorVerdict('no verdict in judge json');

  // LLM output sometimes quotes numbers; coerce "85" -> 85 and drop anything non-numeric.
  let score = null;
  if (typeof parsed.score === 'number') score = parsed.score;
  else if (typeof parsed.score === 'string' && parsed.score.trim() !== '') score = Number(parsed.score);
  if (!Number.isFinite(score)) score = null;

  return { ...parsed, score };
}

/**
 * Ask the judge model to score one prompt.
 *
 * Spawns the `claude` CLI in print mode (`-p`) with an empty MCP server
 * config, a 120 s timeout and a 16 MiB stdout buffer. The returned promise
 * never rejects: spawn failures, timeouts and unparseable output all resolve
 * to an error verdict.
 *
 * @param {string} prompt - full judge prompt.
 * @returns {Promise<object>} verdict object (see parseJudgeOutput).
 */
function askJudge(prompt) {
  return new Promise((resolve) => {
    execFile(
      'claude',
      ['-p', prompt, '--model', 'sonnet', '--effort', 'high',
        '--max-budget-usd', '0.5', '--strict-mcp-config', '--mcp-config', '{"mcpServers":{}}'],
      // Run from a neutral dir (AGENT_EVAL_OUT, else /tmp) that is meant to hold
      // no repo files, so the judge can't "cheat" by reading source.
      { cwd: process.env.AGENT_EVAL_OUT || '/tmp', maxBuffer: 1 << 24, timeout: 120000 },
      (err, stdout) => resolve(parseJudgeOutput(stdout, err)),
    );
  });
}
/**
 * Build the end-to-end judge prompt: the agent's final answer scored against
 * the verified ground truth.
 *
 * @param {{question: *, truth: *}} gt - ground-truth entry; both fields are interpolated as strings.
 * @param {string} ans - the agent's final answer; any falsy value is rendered as "(empty)".
 * @returns {string} newline-joined prompt ending with the required JSON verdict schema.
 */
const e2ePrompt = (gt, ans) => {
  const answerText = ans || '(empty)';
  const sections = [
    'You are scoring whether an AI coding agent correctly answered a code-flow question about a repository. Judge ONLY against the verified ground truth. Do NOT use any tools.',
    `QUESTION: ${gt.question}`,
    'VERIFIED GROUND TRUTH (the actual call path + files):',
    `${gt.truth}`,
    "AGENT'S ANSWER:",
    `${answerText}`,
    `Score how correct the agent's answer is vs the ground truth. A "pass" means it identifies the core mechanism and the major hops with the right files/symbols and makes no materially wrong claim. "partial" = right area but misses major hops or has notable errors. "fail" = wrong layer, fabricated, or misses the mechanism.`,
    'Output ONLY minified JSON, no prose, no code fences:',
    '{"verdict":"pass|partial|fail","score":<0-100>,"missedHops":["..."],"wrongClaims":["..."],"note":"<=20 words"}',
  ];
  return sections.join('\n');
};
/**
 * Build the fidelity judge prompt: one machine-synthesized (offload) answer
 * scored against the verified ground truth, including whether it fabricated
 * anything and whether its own "Coverage:" claim was honest.
 *
 * @param {{question: *, truth: *}} gt - ground-truth entry; both fields are interpolated as strings.
 * @param {string} ans - the synthesized answer; any falsy value is rendered as "(empty)".
 * @returns {string} newline-joined prompt ending with the required JSON verdict schema.
 */
const fidPrompt = (gt, ans) => {
  const answerText = ans || '(empty)';
  const sections = [
    'You are scoring the FIDELITY of a machine-synthesized code-exploration answer against verified ground truth. The synthesized answer claims to trace a flow and cite file:line locations. Do NOT use any tools.',
    `QUESTION: ${gt.question}`,
    'VERIFIED GROUND TRUTH (the actual call path + files):',
    `${gt.truth}`,
    'SYNTHESIZED ANSWER (to score):',
    `${answerText}`,
    'Judge: (1) is the traced call path correct vs ground truth? (2) are the cited files/symbols correct (not fabricated)? (3) if it gave a "Coverage:" verdict, was that verdict honest about what it actually covered? A confident WRONG trace is the worst outcome — penalize it harder than an honest "partial/not found".',
    'Output ONLY minified JSON, no prose, no code fences:',
    '{"verdict":"pass|partial|fail","score":<0-100>,"fabrication":<true|false>,"coverageHonest":<true|false>,"missedHops":["..."],"note":"<=20 words"}',
  ];
  return sections.join('\n');
};
// Build the job list.
// Every run whose repo has a ground-truth entry gets one end-to-end job; runs
// from the offload arm additionally get one fidelity job per non-blank
// synthesized answer. Runs without ground truth are flagged `_nojudge` and skipped.
const jobs = [];
results.forEach((run) => {
  const gt = truth[run.repo];
  if (!gt) {
    run._nojudge = true;
    return;
  }
  jobs.push({ r: run, kind: 'e2e', prompt: e2ePrompt(gt, run.finalAnswer) });

  const isOffloadRun = run.arm === 'offload' && Array.isArray(run.offloadAnswers);
  if (!isOffloadRun) return;
  for (const [idx, ans] of run.offloadAnswers.entries()) {
    // Skip missing and whitespace-only answers; idx keeps the answer's original position.
    if (ans && ans.trim()) {
      jobs.push({ r: run, kind: 'fid', idx, prompt: fidPrompt(gt, ans) });
    }
  }
});
console.error(`judging ${jobs.length} verdicts across ${results.length} runs (concurrency ${CONC})...`);
// Number of verdicts finished so far; used only for the progress log.
let done = 0;

/**
 * Drain jobs from a shared queue, one judge call at a time.
 *
 * Several workers run over the same queue to get bounded concurrency. Each
 * verdict is stored on its job (`job.verdict`); end-to-end verdicts are also
 * attached to the run as `r.e2e` right away. Fidelity verdicts are NOT pushed
 * onto the run here: with concurrent workers they finish in arbitrary order,
 * which would make the per-run fidelity arrays nondeterministic.
 *
 * @param {Array<object>} queue - shared, mutable job queue (consumed via shift()).
 */
async function worker(queue) {
  while (queue.length) {
    const job = queue.shift();
    const v = await askJudge(job.prompt);
    job.verdict = v;
    if (job.kind === 'e2e') job.r.e2e = v;
    console.error(` [${++done}/${jobs.length}] ${job.r.repo}/${job.r.arm}#${job.r.rep} ${job.kind}: ${v.verdict}${v.score != null ? ' ' + v.score : ''}`);
  }
}

const q = [...jobs];
// Always start at least one worker (a zero or NaN concurrency would otherwise
// judge nothing), and never more workers than there are jobs.
const requestedWorkers = Math.floor(Number(CONC)) || 1;
const workerCount = Math.max(1, Math.min(requestedWorkers, Math.max(jobs.length, 1)));
await Promise.all(Array.from({ length: workerCount }, () => worker(q)));

// Collect fidelity verdicts in job-creation order, i.e. the order of the run's
// offloadAnswers, so the aggregated scores/verdicts are stable across reruns.
for (const job of jobs) {
  if (job.kind === 'fid' && job.verdict) (job.r._fid ??= []).push(job.verdict);
}
- // Aggregate per-answer fidelity verdicts into one fidelity object per offload run.
// Median of a list of numbers: the middle element of the ascending-sorted copy,
// or the mean of the two middle elements for an even count. Returns null for
// an empty list. The caller's array is not modified.
const medOf = (a) => {
  const sorted = Array.from(a);
  sorted.sort((lhs, rhs) => lhs - rhs);
  const count = sorted.length;
  if (count === 0) return null;
  const upper = Math.floor(count / 2);
  if (count % 2 === 1) return sorted[upper];
  return (sorted[upper - 1] + sorted[upper]) / 2;
};
// Coerce a judge-reported score to a finite number, or null if it isn't one.
// The judge is an LLM, so a score can arrive as "85" or "N/A"; letting strings
// through would make the median concatenate ("80" + "90" -> 4045) or turn
// max/min/median into NaN.
const toFiniteScore = (value) => {
  let num = NaN;
  if (typeof value === 'number') num = value;
  else if (typeof value === 'string' && value.trim() !== '') num = Number(value);
  return Number.isFinite(num) ? num : null;
};

// Collapse each run's per-answer fidelity verdicts (`_fid`) into one `fidelity`
// summary object, then drop the scratch `_fid` field so it is not written out.
for (const r of results) {
  const fid = r._fid;
  if (fid?.length) {
    const scores = fid.map((v) => toFiniteScore(v.score)).filter((s) => s !== null);
    const hasScores = scores.length > 0;
    r.fidelity = {
      // n counts every verdict, including error verdicts that carry no usable score.
      n: fid.length,
      scores,
      max: hasScores ? Math.max(...scores) : null,
      min: hasScores ? Math.min(...scores) : null,
      median: medOf(scores),
      anyFabrication: fid.some((v) => v.fabrication === true),
      // Only an explicit `false` counts as dishonest; a missing field (e.g. on an
      // error verdict) does not.
      allCoverageHonest: fid.every((v) => v.coverageHonest !== false),
      verdicts: fid.map((v) => v.verdict),
    };
  }
  delete r._fid;
}
// Write one JSON line per run (verdicts merged in), newline-terminated.
// Create the output directory first: by this point every judge call has been
// paid for, and a missing directory would otherwise throw ENOENT and discard
// all verdicts.
mkdirSync(dirname(OUT), { recursive: true });
writeFileSync(OUT, results.map(r => JSON.stringify(r)).join('\n') + '\n');
console.error(`wrote ${OUT}`);
|