offload-eval-judge.mjs 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103
  1. #!/usr/bin/env node
  2. // Accuracy judge. For each run in results.jsonl:
  3. // - end-to-end: agent finalAnswer vs verified ground truth (all arms)
  4. // - fidelity: offload synthesized answer vs ground truth (offload arm only)
  5. // Judge = claude -p sonnet --effort high, no tools, run from a neutral cwd,
  6. // JSON-only verdicts. Writes judged.jsonl (one line per run, verdicts merged).
  7. //
  8. // Usage: judge.mjs --results <f> --truth <f> --out <f> [--concurrency 4]
  import { execFile } from 'child_process';
  import { existsSync, mkdirSync, readFileSync, writeFileSync } from 'fs';
  import { dirname } from 'path';
  // ---- CLI flags and inputs ------------------------------------------------
  // Flags are read as `--name value` pairs starting after `node <script>`; a
  // flag given without a value is stored as undefined and rejected below.
  const A = {};
  for (let i = 2; i < process.argv.length; i += 2) A[process.argv[i].replace(/^--/, '')] = process.argv[i + 1];

  // Fail with a usage hint rather than the opaque TypeError that
  // readFileSync(undefined) would raise for a missing required flag.
  for (const flag of ['results', 'truth']) {
    if (typeof A[flag] !== 'string' || A[flag] === '') {
      throw new Error(`missing --${flag}; usage: judge.mjs --results <f> --truth <f> --out <f> [--concurrency 4]`);
    }
  }

  // results: JSONL, one run object per line. Whitespace-only lines (e.g. a bare
  // "\r" left behind by CRLF line endings) are skipped instead of being handed
  // to JSON.parse, which would throw on them.
  const results = readFileSync(A.results, 'utf8')
    .split('\n')
    .filter((line) => line.trim() !== '')
    .map((line) => JSON.parse(line));

  // truth: a single JSON document, indexed by each run's `repo` when jobs are built.
  const truth = JSON.parse(readFileSync(A.truth, 'utf8'));
  const OUT = A.out || '/tmp/cg-offload-eval/judged.jsonl';

  // A zero, negative or non-numeric concurrency would start no workers at all,
  // and the script would then write its output with no verdicts attached.
  const CONC = Number(A.concurrency || 4);
  if (!Number.isFinite(CONC) || CONC < 1) {
    throw new Error(`--concurrency must be a number >= 1, got ${JSON.stringify(A.concurrency)}`);
  }
  /**
   * Turn the judge CLI's raw stdout into a verdict object.
   *
   * The first `{` through the last `}` of stdout is parsed as JSON. Anything
   * that cannot be turned into a usable verdict becomes
   * `{ verdict: 'error', score: null, note }`.
   *
   * @param {string} stdout - raw stdout of the judge process (may be empty).
   * @param {Error|null} err - error reported by execFile, if any.
   * @returns {object} the parsed verdict with `score` normalised to a finite
   *   number or null, or an error verdict.
   */
  function parseJudgeOutput(stdout, err) {
    const raw = (stdout || '').trim();
    const m = raw.match(/\{[\s\S]*\}/);
    if (!m) return { verdict: 'error', score: null, note: (err ? `exec ${err.message}` : 'no json').slice(0, 80) };

    let parsed;
    try {
      parsed = JSON.parse(m[0]);
    } catch {
      return { verdict: 'error', score: null, note: 'parse fail' };
    }

    // The JSON is model-written: don't let an object without a verdict pass as a result.
    if (typeof parsed.verdict !== 'string') return { verdict: 'error', score: null, note: 'no verdict' };

    // Scores feed numeric aggregation (min/max/median) downstream; a string such
    // as "85" would be concatenated rather than averaged there, so coerce numeric
    // strings and drop anything that is not a finite number.
    const score = typeof parsed.score === 'string' && parsed.score.trim() !== '' ? Number(parsed.score) : parsed.score;
    parsed.score = typeof score === 'number' && Number.isFinite(score) ? score : null;
    return parsed;
  }

  /**
   * Ask the judge model (the `claude` CLI in print mode) to score one prompt.
   *
   * @param {string} prompt - full judging prompt, passed as the `-p` argument.
   * @returns {Promise<object>} always resolves, never rejects: either the judge's
   *   verdict object or `{ verdict: 'error', score: null, note }`.
   */
  function askJudge(prompt) {
    return new Promise((resolve) => {
      try {
        execFile('claude', ['-p', prompt, '--model', 'sonnet', '--effort', 'high',
          '--max-budget-usd', '0.5', '--strict-mcp-config', '--mcp-config', '{"mcpServers":{}}'],
          // Run from a neutral dir with no repo files so the judge can't "cheat" by reading source.
          { cwd: process.env.AGENT_EVAL_OUT || '/tmp', maxBuffer: 1 << 24, timeout: 120000 },
          (err, stdout) => resolve(parseJudgeOutput(stdout, err)));
      } catch (err) {
        // execFile can also throw synchronously on arguments it rejects (some Node
        // versions refuse strings containing NUL bytes). Report that as an error
        // verdict so one bad prompt cannot reject the promise and abort the run.
        resolve({ verdict: 'error', score: null, note: `exec ${err.message}`.slice(0, 80) });
      }
    });
  }
  /**
   * Build the end-to-end judging prompt: the agent's final answer scored
   * against the verified ground truth.
   *
   * @param {{question: *, truth: *}} gt - ground-truth entry; both fields are interpolated as text.
   * @param {*} ans - the agent's answer; any falsy value is rendered as "(empty)".
   * @returns {string} newline-separated prompt ending with the JSON output schema.
   */
  const e2ePrompt = (gt, ans) => {
    const sections = [
      'You are scoring whether an AI coding agent correctly answered a code-flow question about a repository. Judge ONLY against the verified ground truth. Do NOT use any tools.',
      `QUESTION: ${gt.question}`,
      'VERIFIED GROUND TRUTH (the actual call path + files):',
      `${gt.truth}`,
      "AGENT'S ANSWER:",
      `${ans || '(empty)'}`,
      `Score how correct the agent's answer is vs the ground truth. A "pass" means it identifies the core mechanism and the major hops with the right files/symbols and makes no materially wrong claim. "partial" = right area but misses major hops or has notable errors. "fail" = wrong layer, fabricated, or misses the mechanism.`,
      'Output ONLY minified JSON, no prose, no code fences:',
      '{"verdict":"pass|partial|fail","score":<0-100>,"missedHops":["..."],"wrongClaims":["..."],"note":"<=20 words"}',
    ];
    return sections.join('\n');
  };
  /**
   * Build the fidelity judging prompt: one machine-synthesized offload answer
   * scored against the verified ground truth, with extra weight on fabrication
   * and on whether its own "Coverage:" claim was honest.
   *
   * @param {{question: *, truth: *}} gt - ground-truth entry; both fields are interpolated as text.
   * @param {*} ans - the synthesized answer; any falsy value is rendered as "(empty)".
   * @returns {string} newline-separated prompt ending with the JSON output schema.
   */
  const fidPrompt = (gt, ans) => {
    const sections = [
      'You are scoring the FIDELITY of a machine-synthesized code-exploration answer against verified ground truth. The synthesized answer claims to trace a flow and cite file:line locations. Do NOT use any tools.',
      `QUESTION: ${gt.question}`,
      'VERIFIED GROUND TRUTH (the actual call path + files):',
      `${gt.truth}`,
      'SYNTHESIZED ANSWER (to score):',
      `${ans || '(empty)'}`,
      'Judge: (1) is the traced call path correct vs ground truth? (2) are the cited files/symbols correct (not fabricated)? (3) if it gave a "Coverage:" verdict, was that verdict honest about what it actually covered? A confident WRONG trace is the worst outcome — penalize it harder than an honest "partial/not found".',
      'Output ONLY minified JSON, no prose, no code fences:',
      '{"verdict":"pass|partial|fail","score":<0-100>,"fabrication":<true|false>,"coverageHonest":<true|false>,"missedHops":["..."],"note":"<=20 words"}',
    ];
    return sections.join('\n');
  };
  // Build the job list: one end-to-end job for every run that has a ground-truth
  // entry, plus one fidelity job per non-blank synthesized answer on offload-arm runs.
  const jobs = [];
  for (const run of results) {
    const groundTruth = truth[run.repo];
    if (!groundTruth) {
      // No truth entry for this repo: flag the run and queue nothing for it.
      run._nojudge = true;
      continue;
    }

    jobs.push({ r: run, kind: 'e2e', prompt: e2ePrompt(groundTruth, run.finalAnswer) });

    if (run.arm !== 'offload' || !Array.isArray(run.offloadAnswers)) continue;
    for (const [idx, answer] of run.offloadAnswers.entries()) {
      // Blank or missing answers are not judged.
      if (!answer || !answer.trim()) continue;
      jobs.push({ r: run, kind: 'fid', idx, prompt: fidPrompt(groundTruth, answer) });
    }
  }
  console.error(`judging ${jobs.length} verdicts across ${results.length} runs (concurrency ${CONC})...`);
  // Number of verdicts finished so far; only used for the progress log.
  let done = 0;

  /**
   * Drain the shared job queue, judging one job at a time. Several workers
   * run against the same array to give bounded concurrency.
   *
   * Side effects per job: sets `job.r.e2e` (end-to-end jobs) or fills a slot of
   * `job.r._fid` (fidelity jobs), and logs a progress line to stderr.
   *
   * @param {Array<object>} queue - shared, mutable list of pending jobs.
   * @returns {Promise<void>} resolves once the queue is empty.
   */
  async function worker(queue) {
    while (queue.length) {
      const job = queue.shift();
      // Reserve the fidelity slot now, synchronously with the dequeue. Jobs leave
      // the queue in answer order but finish in arbitrary order, so pushing the
      // verdict on completion would shuffle a run's verdicts from one execution
      // to the next. Reserving here keeps them in offloadAnswers order.
      const slot = job.kind === 'e2e' ? -1 : (job.r._fid ??= []).push(undefined) - 1;
      const v = await askJudge(job.prompt);
      if (slot < 0) job.r.e2e = v; else job.r._fid[slot] = v;
      console.error(` [${++done}/${jobs.length}] ${job.r.repo}/${job.r.arm}#${job.r.rep} ${job.kind}: ${v.verdict}${v.score != null ? ' ' + v.score : ''}`);
    }
  }

  // Work on a copy so `jobs` keeps its full length for the progress denominator.
  const q = [...jobs];
  await Promise.all(Array.from({ length: CONC }, () => worker(q)));
  70. // Aggregate per-answer fidelity verdicts into one fidelity object per offload run.
  /**
   * Median of a list of numbers, computed on a sorted copy (the input is not mutated).
   *
   * @param {Iterable<number>} a - values to summarise.
   * @returns {number|null} the middle value, the mean of the two middle values
   *   for an even count, or null when there are no values.
   */
  const medOf = (a) => {
    const sorted = [...a].sort((lo, hi) => lo - hi);
    const count = sorted.length;
    if (count === 0) return null;
    const mid = count >> 1;
    if (count % 2 === 1) return sorted[mid];
    return (sorted[mid - 1] + sorted[mid]) / 2;
  };
  // Collapse the per-answer fidelity verdicts gathered on r._fid into a single
  // r.fidelity summary, then remove the scratch field so it is not written out.
  for (const r of results) {
    if (r._fid?.length) {
      // `score` comes from model-written JSON, so guard its type here: numeric
      // strings are coerced and anything that is not a finite number is dropped.
      // Otherwise "85" would be string-concatenated inside the median, and a
      // non-numeric value would turn max/min into NaN.
      const scores = r._fid
        .map((v) => (typeof v.score === 'string' && v.score.trim() !== '' ? Number(v.score) : v.score))
        .filter((x) => typeof x === 'number' && Number.isFinite(x));
      r.fidelity = {
        // n counts every judged answer, including ones whose verdict carried no usable score.
        n: r._fid.length, scores,
        max: scores.length ? Math.max(...scores) : null,
        min: scores.length ? Math.min(...scores) : null,
        median: medOf(scores),
        anyFabrication: r._fid.some(v => v.fabrication === true),
        // Only an explicit `false` counts against honesty; a missing field does not.
        allCoverageHonest: r._fid.every(v => v.coverageHonest !== false),
        verdicts: r._fid.map(v => v.verdict),
      };
    }
    delete r._fid;
  }
  // Persist one JSON object per run (JSONL, trailing newline). The target
  // directory is created first: writeFileSync does not create parent directories,
  // and failing here would discard every verdict the judge calls just produced.
  mkdirSync(dirname(OUT), { recursive: true });
  writeFileSync(OUT, results.map(r => JSON.stringify(r)).join('\n') + '\n');
  console.error(`wrote ${OUT}`);