1
0

offload-eval-effort.mjs 6.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108
  1. #!/usr/bin/env node
  2. // Effort A/B — does CODEGRAPH_OFFLOAD_EFFORT=high improve offload SYNTHESIS FIDELITY vs low?
  3. // Probe-based (no agent): for each repo × effort × rep, run codegraph_explore with the offload
  4. // ON on the canonical question, capture the synthesized answer + AI tokens/cost/latency, then
  5. // Sonnet-judge that answer's fidelity vs source-verified ground truth. Isolates the synthesis
  6. // from agent/adoption noise. Requires `codegraph login` (managed offload) + indexed repos.
  7. //
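  8. // Env: REPS (default 3) · CG_ENGINE (engine repo) · AGENT_EVAL_OUT (output dir; repos under <out>/repos) · CONC (judge concurrency, default 4) · EFFORTS_FILTER (comma-separated efforts, default low,high) · REPOS_FILTER (comma-separated repo names to include)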
  9. import { pathToFileURL, fileURLToPath } from 'node:url';
  10. import { resolve, dirname, join } from 'node:path';
  11. import { readFileSync, writeFileSync, existsSync, rmSync } from 'node:fs';
  12. import { execFile } from 'node:child_process';
  13. import { tmpdir } from 'node:os';
  // Filesystem anchors: HERE is the directory holding this script; ENGINE
  // defaults to two levels above it unless CG_ENGINE overrides it.
  const scriptFile = fileURLToPath(import.meta.url);
  const HERE = dirname(scriptFile);
  const ENGINE = process.env.CG_ENGINE || resolve(HERE, '..', '..');

  // Output root; the indexed repos are looked up under <OUT>/repos.
  const OUT = process.env.AGENT_EVAL_OUT || '/tmp/cg-offload-eval';
  const REPOS = join(OUT, 'repos');

  // Ground truth, keyed by repo name, read from the JSON file next to this script.
  const groundTruthFile = resolve(HERE, 'offload-eval-ground-truth.json');
  const GT = JSON.parse(readFileSync(groundTruthFile, 'utf8'));
  /**
   * Read a positive-integer setting from the environment.
   *
   * An unset or empty variable yields `fallback`. Anything that is not a
   * positive integer (e.g. "abc", "0", "2.5") also yields `fallback`, with a
   * warning on stderr — otherwise a NaN/0 value would silently produce zero
   * reps or zero judge workers further down.
   *
   * @param {string} name - environment variable name
   * @param {number} fallback - value used when the variable is unset or invalid
   * @returns {number} a positive integer
   */
  const envPositiveInt = (name, fallback) => {
    const raw = process.env[name];
    if (raw == null || raw.trim() === '') return fallback;
    const parsed = Number(raw);
    if (Number.isInteger(parsed) && parsed > 0) return parsed;
    console.error(`ignoring invalid ${name}=${JSON.stringify(raw)}; using ${fallback}`);
    return fallback;
  };

  /**
   * Read a comma-separated list from the environment, trimming each entry and
   * dropping empty ones so that "low, high" and "low,high," both work.
   *
   * @param {string} name - environment variable name
   * @param {string} [fallback=''] - comma-separated default used when no entry remains
   * @returns {string[]} the parsed entries (possibly empty when there is no default)
   */
  const envList = (name, fallback = '') => {
    const parse = (text) => text.split(',').map((item) => item.trim()).filter(Boolean);
    const items = parse(process.env[name] || '');
    return items.length > 0 ? items : parse(fallback);
  };

  // Repetitions per repo × effort cell.
  const REPS = envPositiveInt('REPS', 3);
  // Number of judge workers running concurrently.
  const CONC = envPositiveInt('CONC', 4);
  // Effort levels to compare.
  const EFFORTS = envList('EFFORTS_FILTER', 'low,high');
  // Optional allow-list of repo names; null means "all repos in the ground truth".
  const repoFilter = envList('REPOS_FILTER');
  const ONLY = repoFilter.length > 0 ? new Set(repoFilter) : null;
  // Size tier label per known repo, carried into the records and the report.
  const TIER = { mtkruto: 'small', postybirb: 'medium', shapeshift: 'complex', trezor: 'large' };
  /** Dynamically import a module given its path relative to the engine root. */
  const load = async (rel) => import(pathToFileURL(resolve(ENGINE, rel)).href);

  /**
   * Report that the engine could not be loaded and terminate with exit code 2.
   * @param {string} [detail] - optional extra line describing what failed
   */
  const engineLoadFailed = (detail) => {
    console.error('could not load engine from', ENGINE);
    if (detail) console.error(`  ${detail}`);
    process.exit(2);
  };

  /**
   * Import an engine module, turning an import failure (e.g. the engine has not
   * been built, or CG_ENGINE points at the wrong directory) into the same
   * "could not load engine" exit as the export-shape check below, instead of an
   * unhandled rejection with a raw stack trace.
   */
  const loadOrExit = async (rel) => {
    try {
      return await load(rel);
    } catch (err) {
      return engineLoadFailed(`${rel}: ${err?.message ?? err}`);
    }
  };

  // Loaded one after the other, in the same order as before.
  const idx = await loadOrExit('dist/index.js');
  const toolsMod = await loadOrExit('dist/mcp/tools.js');

  // The exports may sit at different nesting levels depending on how the
  // module was interop-wrapped, so probe the known locations.
  const CodeGraph = idx.default?.default ?? idx.default ?? idx.CodeGraph;
  const ToolHandler = toolsMod.ToolHandler ?? toolsMod.default?.ToolHandler;
  if (typeof CodeGraph?.openSync !== 'function' || typeof ToolHandler !== 'function') {
    engineLoadFailed();
  }
  /**
   * Build the prompt handed to the judge model for one synthesized answer.
   *
   * @param {{question: string, truth: string}} gt - ground-truth entry for the repo
   * @param {string} ans - synthesized answer to score; a falsy value is shown as "(empty)"
   * @returns {string} the prompt, one section per line
   */
  const fidPrompt = (gt, ans) => {
    const sections = [
      'You are scoring the FIDELITY of a machine-synthesized code-exploration answer against verified ground truth. Do NOT use any tools.',
      `QUESTION: ${gt.question}`,
      'VERIFIED GROUND TRUTH (the actual call path + files):',
      `${gt.truth}`,
      'SYNTHESIZED ANSWER (to score):',
      `${ans || '(empty)'}`,
      'Judge: (1) is the traced call path correct vs ground truth? (2) are the cited files/symbols correct (not fabricated)? (3) if it gave a "Coverage:" verdict, was it honest? A confident WRONG trace is the worst outcome — penalize it harder than an honest partial.',
      'Output ONLY minified JSON: {"verdict":"pass|partial|fail","score":<0-100>,"fabrication":<true|false>,"coverageHonest":<true|false>,"note":"<=20 words"}',
    ];
    return sections.join('\n');
  };
  /**
   * Turn the judge's raw stdout into a verdict object.
   *
   * The widest `{ ... }` span is tried first. If that span is not valid JSON
   * (for instance because the reply contains another brace somewhere), each
   * flat `{...}` object in the output is tried, last one first, since the
   * requested verdict is a flat object and normally ends the reply.
   *
   * @param {string|undefined} stdout - captured standard output of the judge
   * @param {Error|null|undefined} err - error reported by execFile, if any
   * @returns {object} the parsed verdict, or `{ verdict: 'error', score: null, note }`
   */
  const parseJudgeReply = (stdout, err) => {
    const text = stdout || '';
    const widest = text.match(/\{[\s\S]*\}/);
    if (!widest) {
      const reason = err ? String(err.message) : 'no json';
      return { verdict: 'error', score: null, note: reason.slice(0, 60) };
    }
    const flatObjects = text.match(/\{[^{}]*\}/g) ?? [];
    for (const candidate of [widest[0], ...flatObjects.reverse()]) {
      try {
        return JSON.parse(candidate);
      } catch {
        // not valid JSON — fall through to the next candidate
      }
    }
    return { verdict: 'error', score: null, note: 'unparseable json' };
  };

  /**
   * Ask the judge (the `claude` CLI, model "sonnet", MCP servers disabled) to
   * score one prompt. The prompt is passed as an argv element, not via a shell.
   *
   * @param {string} prompt - full judge prompt
   * @returns {Promise<object>} always resolves; failures surface as `verdict: 'error'`
   */
  const askJudge = (prompt) => new Promise((res) => {
    const args = [
      '-p', prompt,
      '--model', 'sonnet',
      '--effort', 'high',
      '--max-budget-usd', '0.5',
      '--strict-mcp-config',
      '--mcp-config', '{"mcpServers":{}}',
    ];
    const options = { cwd: OUT, maxBuffer: 1 << 24, timeout: 120000 };
    execFile('claude', args, options, (err, stdout) => res(parseJudgeReply(stdout, err)));
  });
  // ---- 1. Probe: collect synthesized answers at each effort -------------------
  const records = [];

  /**
   * Sum the entries of one usage log (one JSON object per line).
   *
   * A missing file yields all-zero totals. Blank or unparseable lines (e.g. a
   * torn final line) are skipped so that one bad entry cannot abort the run.
   *
   * @param {string} logPath - path of the JSONL usage log
   * @returns {{tokens: number, cost: number, ms: number}} summed totalTokens / costUsd / ms
   */
  const sumUsage = (logPath) => {
    const totals = { tokens: 0, cost: 0, ms: 0 };
    if (!existsSync(logPath)) return totals;
    for (const line of readFileSync(logPath, 'utf8').split('\n')) {
      if (!line.trim()) continue;
      let entry;
      try {
        entry = JSON.parse(line);
      } catch {
        continue;
      }
      totals.tokens += entry?.totalTokens || 0;
      totals.cost += entry?.costUsd || 0;
      totals.ms += entry?.ms || 0;
    }
    return totals;
  };

  for (const repo of Object.keys(GT)) {
    if (ONLY && !ONLY.has(repo)) continue;
    const dir = join(REPOS, repo);
    if (!existsSync(join(dir, '.codegraph'))) { console.error('skip (not indexed):', repo); continue; }
    const cg = CodeGraph.openSync(dir);
    try {
      const h = new ToolHandler(cg);
      for (const effort of EFFORTS) {
        for (let rep = 1; rep <= REPS; rep++) {
          // Effort and usage-log path are handed over through the environment.
          process.env.CODEGRAPH_OFFLOAD_EFFORT = effort;
          const usageLog = join(tmpdir(), `effort-${repo}-${effort}-${rep}.jsonl`);
          // Start from a clean log so totals cover this probe only.
          try { rmSync(usageLog); } catch { /* none */ }
          process.env.CODEGRAPH_OFFLOAD_USAGE_LOG = usageLog;

          let answer = '';
          try {
            const result = await h.execute('codegraph_explore', { query: GT[repo].question });
            answer = result?.content?.[0]?.text ?? '';
          } catch (e) {
            // A failed probe is still recorded, with an empty answer.
            console.error(` ${repo}/${effort}#${rep} explore failed: ${e?.message}`);
          }

          // The offload counts as "fired" when the answer carries its marker text.
          const fired = /Synthesized by CodeGraph/.test(answer);
          const ai = sumUsage(usageLog);
          records.push({ repo, tier: TIER[repo], effort, rep, fired, ai, answer });
          console.error(` ${repo}/${effort}#${rep}: fired=${fired} ${ai.tokens}tok $${ai.cost.toFixed(4)} ${ai.ms}ms`);
        }
      }
    } finally {
      // Release the graph handle even when a probe throws unexpectedly.
      try { cg.close?.(); } catch { /* none */ }
    }
  }
  // ---- 2. Judge fidelity (concurrency) ---------------------------------------
  console.error(`\njudging ${records.length} answers (concurrency ${CONC})...`);
  let done = 0;
  const q = [...records];

  /**
   * Drain the shared queue: take the next record, have it judged, attach the
   * verdict as `fid`, and log progress. Several of these run side by side.
   */
  async function worker() {
    while (q.length) {
      const record = q.shift();
      const prompt = fidPrompt(GT[record.repo], record.answer);
      record.fid = await askJudge(prompt);
      done += 1;
      const label = `${record.repo}/${record.effort}#${record.rep}`;
      console.error(` [${done}/${records.length}] ${label}: ${record.fid.verdict} ${record.fid.score ?? ''}`);
    }
  }

  const judgeWorkers = Array.from({ length: CONC }, () => worker());
  await Promise.all(judgeWorkers);

  // Persist every record (answer, usage, verdict) as one JSON object per line.
  const serialized = records.map((r) => JSON.stringify(r));
  writeFileSync(join(OUT, 'effort-results.jsonl'), `${serialized.join('\n')}\n`);
  84. // ---- 3. Aggregate: low vs high per repo ------------------------------------
  /**
   * Median of the non-null entries of `a`.
   *
   * @param {Array<number|null|undefined>} a - values; null/undefined are ignored
   * @returns {number|null} the median (mean of the two middle values for an
   *   even count), or null when nothing remains. The input is not modified.
   */
  const med = (a) => {
    const sorted = a.filter((v) => v != null).sort((lo, hi) => lo - hi);
    const count = sorted.length;
    if (count === 0) return null;
    const mid = Math.floor(count / 2);
    if (count % 2 === 1) return sorted[mid];
    return (sorted[mid - 1] + sorted[mid]) / 2;
  };
  // Report: one row per repo × effort with median fidelity, fabrication rate
  // and median AI usage. Cells are joined by single spaces.
  const rule = '='.repeat(80);
  console.log(`\n${rule}\nEFFORT A/B — offload synthesis fidelity (probe, n=${REPS}/cell)\n${rule}`);
  const headerCells = [
    'repo'.padEnd(11),
    'tier'.padEnd(8),
    'effort'.padEnd(6),
    'fired',
    'fid(med)'.padStart(8),
    'fab%'.padStart(5),
    'AItok'.padStart(7),
    'AIcost'.padStart(8),
    'ms(med)'.padStart(8),
  ];
  console.log(headerCells.join(' '));

  for (const repo of Object.keys(GT)) {
    for (const effort of EFFORTS) {
      const rs = records.filter((r) => r.repo === repo && r.effort === effort);
      if (!rs.length) continue;

      const fids = rs.map((r) => r.fid?.score).filter((x) => x != null);
      const fab = rs.filter((r) => r.fid?.fabrication === true).length;
      const firedCount = rs.filter((r) => r.fired).length;
      const tokensMed = med(rs.map((r) => r.ai.tokens));
      const costMed = med(rs.map((r) => r.ai.cost)) ?? 0;
      const msMed = med(rs.map((r) => r.ai.ms));

      const rowCells = [
        repo.padEnd(11),
        // A repo present in the ground truth but absent from TIER has no tier
        // label; print a placeholder rather than throwing on undefined.
        (TIER[repo] ?? '—').padEnd(8),
        effort.padEnd(6),
        `${firedCount}/${rs.length}`,
        String(med(fids) ?? '—').padStart(8),
        `${Math.round(100 * fab / rs.length)}%`.padStart(5),
        `${Math.round(tokensMed / 1000)}k`.padStart(7),
        `$${costMed.toFixed(4)}`.padStart(8),
        String(msMed ?? '—').padStart(8),
      ];
      console.log(rowCells.join(' '));
    }
  }
  console.log('');