#!/usr/bin/env node
// Effort A/B — does CODEGRAPH_OFFLOAD_EFFORT=high improve offload SYNTHESIS FIDELITY vs low?
// Probe-based (no agent): for each repo × effort × rep, run codegraph_explore with the offload
// ON on the canonical question, capture the synthesized answer + AI tokens/cost/latency, then
// Sonnet-judge that answer's fidelity vs source-verified ground truth. Isolates the synthesis
// from agent/adoption noise. Requires `codegraph login` (managed offload) + indexed repos.
//
// Env: REPS (default 3) · CG_ENGINE (engine repo) · AGENT_EVAL_OUT (repos under $AGENT_EVAL_OUT/repos)
//      · CONC (judge concurrency, default 4) · EFFORTS_FILTER (comma list, default "low,high")
//      · REPOS_FILTER (comma list of ground-truth repo keys, default: all)
import { pathToFileURL, fileURLToPath } from 'node:url';
import { resolve, dirname, join } from 'node:path';
import { readFileSync, writeFileSync, existsSync, rmSync } from 'node:fs';
import { execFile } from 'node:child_process';
import { tmpdir } from 'node:os';

/**
 * Parse a positive integer from an env value.
 * @param {string | undefined} raw - raw environment value
 * @param {number} fallback - value used when `raw` is unset, non-numeric, or < 1
 * @returns {number} a positive integer
 */
const positiveInt = (raw, fallback) => {
  const n = Number.parseInt(raw ?? '', 10);
  return Number.isInteger(n) && n > 0 ? n : fallback;
};

/**
 * Split a comma-separated env value into trimmed, non-empty entries.
 * @param {string} raw - comma-separated list
 * @returns {string[]} the entries, in order
 */
const csv = (raw) => raw.split(',').map((s) => s.trim()).filter(Boolean);

const HERE = dirname(fileURLToPath(import.meta.url));
const ENGINE = process.env.CG_ENGINE || resolve(HERE, '..', '..');
const OUT = process.env.AGENT_EVAL_OUT || '/tmp/cg-offload-eval';
const REPOS = join(OUT, 'repos');
const GT = JSON.parse(readFileSync(resolve(HERE, 'offload-eval-ground-truth.json'), 'utf8'));
// A zero/NaN REPS or CONC would silently produce no probes / no judge workers, so fall back.
const REPS = positiveInt(process.env.REPS, 3);
const CONC = positiveInt(process.env.CONC, 4);
const EFFORTS = csv(process.env.EFFORTS_FILTER || 'low,high');
const ONLY = process.env.REPOS_FILTER ? new Set(csv(process.env.REPOS_FILTER)) : null;
const TIER = { mtkruto: 'small', postybirb: 'medium', shapeshift: 'complex', trezor: 'large' };

// Load the built engine from ENGINE; tolerate several export shapes for both modules.
const load = async (rel) => import(pathToFileURL(resolve(ENGINE, rel)).href);
const idx = await load('dist/index.js');
const toolsMod = await load('dist/mcp/tools.js');
const CodeGraph = idx.default?.default ?? idx.default ?? idx.CodeGraph;
const ToolHandler = toolsMod.ToolHandler ?? toolsMod.default?.ToolHandler;
if (typeof CodeGraph?.openSync !== 'function' || typeof ToolHandler !== 'function') {
  console.error('could not load engine from', ENGINE);
  process.exit(2);
}

/**
 * Build the judge prompt for one synthesized answer.
 * @param {{question: string, truth: string}} gt - ground-truth entry for the repo
 * @param {string} ans - synthesized answer text ('' is shown to the judge as "(empty)")
 * @returns {string} the full prompt
 */
const fidPrompt = (gt, ans) => [
  'You are scoring the FIDELITY of a machine-synthesized code-exploration answer against verified ground truth. Do NOT use any tools.',
  '',
  `QUESTION: ${gt.question}`,
  '',
  'VERIFIED GROUND TRUTH (the actual call path + files):',
  `${gt.truth}`,
  '',
  'SYNTHESIZED ANSWER (to score):',
  `${ans || '(empty)'}`,
  '',
  'Judge: (1) is the traced call path correct vs ground truth? (2) are the cited files/symbols correct (not fabricated)? (3) if it gave a "Coverage:" verdict, was it honest? A confident WRONG trace is the worst outcome — penalize it harder than an honest partial.',
  '',
  // `fabrication` must come back as a JSON boolean: the report below counts `fabrication === true`.
  'Output ONLY minified JSON: {"verdict":"pass|partial|fail","score":<0-100>,"fabrication":<true|false>,"coverageHonest":<true|false>,"note":"<=20 words"}',
].join('\n');

/**
 * Run the `claude` CLI as judge and parse the first-`{`-to-last-`}` span of its stdout as JSON.
 * Never rejects: a spawn failure, timeout, or unparseable reply resolves to
 * `{ verdict: 'error', score: null, note }` so one bad judgement cannot abort the run.
 * @param {string} prompt - prompt produced by `fidPrompt`
 * @returns {Promise<object>} the judge's parsed JSON, or the error placeholder
 */
const askJudge = (prompt) => new Promise((res) => {
  const args = [
    '-p', prompt,
    '--model', 'sonnet',
    '--effort', 'high',
    '--max-budget-usd', '0.5',
    '--strict-mcp-config',
    '--mcp-config', '{"mcpServers":{}}',
  ];
  execFile('claude', args, { cwd: OUT, maxBuffer: 1 << 24, timeout: 120000 }, (err, stdout) => {
    const m = (stdout || '').match(/\{[\s\S]*\}/);
    if (!m) {
      res({ verdict: 'error', score: null, note: (err ? err.message : 'no json').slice(0, 60) });
      return;
    }
    try {
      res(JSON.parse(m[0]));
    } catch {
      res({ verdict: 'error', score: null, note: 'bad json' });
    }
  });
});

/**
 * Sum token/cost/latency over a JSONL usage log.
 * Blank or malformed lines are skipped rather than aborting the whole run.
 * @param {string} logPath - path of the usage log; may not exist
 * @returns {{tokens: number, cost: number, ms: number}} totals (all zero when the log is absent)
 */
const readUsage = (logPath) => {
  const ai = { tokens: 0, cost: 0, ms: 0 };
  if (!existsSync(logPath)) return ai;
  for (const line of readFileSync(logPath, 'utf8').split('\n')) {
    if (!line.trim()) continue;
    let entry;
    try {
      entry = JSON.parse(line);
    } catch {
      continue; // truncated / partial line
    }
    ai.tokens += entry?.totalTokens || 0;
    ai.cost += entry?.costUsd || 0;
    ai.ms += entry?.ms || 0;
  }
  return ai;
};

// ---- 1. Probe: collect synthesized answers at each effort -------------------
const records = [];
for (const repo of Object.keys(GT)) {
  if (ONLY && !ONLY.has(repo)) continue;
  const dir = join(REPOS, repo);
  if (!existsSync(join(dir, '.codegraph'))) {
    console.error('skip (not indexed):', repo);
    continue;
  }
  const cg = CodeGraph.openSync(dir);
  try {
    const h = new ToolHandler(cg);
    for (const effort of EFFORTS) {
      for (let rep = 1; rep <= REPS; rep++) {
        // The probe is configured through env vars, set fresh for every cell.
        process.env.CODEGRAPH_OFFLOAD_EFFORT = effort;
        const usageLog = join(tmpdir(), `effort-${repo}-${effort}-${rep}.jsonl`);
        try { rmSync(usageLog); } catch { /* none */ }
        process.env.CODEGRAPH_OFFLOAD_USAGE_LOG = usageLog;

        let answer = '';
        try {
          const out = await h.execute('codegraph_explore', { query: GT[repo].question });
          answer = out?.content?.[0]?.text ?? '';
        } catch (e) {
          console.error(` ${repo}/${effort}#${rep} explore failed: ${e?.message}`);
        }

        // The marker text in the answer is how this script detects that the offload fired.
        const fired = /Synthesized by CodeGraph/.test(answer);
        const ai = readUsage(usageLog);
        records.push({ repo, tier: TIER[repo], effort, rep, fired, ai, answer });
        console.error(` ${repo}/${effort}#${rep}: fired=${fired} ${ai.tokens}tok $${ai.cost.toFixed(4)} ${ai.ms}ms`);
      }
    }
  } finally {
    // Release the graph handle even if a probe iteration threw.
    try { cg.close?.(); } catch { /* none */ }
  }
}

// ---- 2. Judge fidelity (concurrency) ---------------------------------------
console.error(`\njudging ${records.length} answers (concurrency ${CONC})...`);
let done = 0;
const q = [...records];
// Each worker drains the shared queue; shift() is synchronous, so no record is judged twice.
async function worker() {
  while (q.length) {
    const r = q.shift();
    r.fid = await askJudge(fidPrompt(GT[r.repo], r.answer));
    console.error(` [${++done}/${records.length}] ${r.repo}/${r.effort}#${r.rep}: ${r.fid.verdict} ${r.fid.score ?? ''}`);
  }
}
await Promise.all(Array.from({ length: Math.min(CONC, records.length) }, () => worker()));
writeFileSync(join(OUT, 'effort-results.jsonl'), records.map((r) => JSON.stringify(r)).join('\n') + '\n');

// ---- 3. Aggregate: low vs high per repo ------------------------------------
/**
 * Median of the non-null entries of `a`.
 * @param {Array<number | null | undefined>} a - samples
 * @returns {number | null} the median, or null when no samples remain
 */
const med = (a) => {
  const xs = a.filter((x) => x != null).sort((x, y) => x - y);
  if (!xs.length) return null;
  const mid = xs.length / 2;
  return xs.length % 2 ? xs[(xs.length - 1) / 2] : (xs[mid - 1] + xs[mid]) / 2;
};

console.log(`\n${'='.repeat(80)}\nEFFORT A/B — offload synthesis fidelity (probe, n=${REPS}/cell)\n${'='.repeat(80)}`);
console.log(`${'repo'.padEnd(11)} ${'tier'.padEnd(8)} ${'effort'.padEnd(6)} fired ${'fid(med)'.padStart(8)} ${'fab%'.padStart(5)} ${'AItok'.padStart(7)} ${'AIcost'.padStart(8)} ${'ms(med)'.padStart(8)}`);
for (const repo of Object.keys(GT)) {
  for (const effort of EFFORTS) {
    const rs = records.filter((r) => r.repo === repo && r.effort === effort);
    if (!rs.length) continue;
    const fids = rs.map((r) => r.fid?.score);
    const fab = rs.filter((r) => r.fid?.fabrication === true).length;
    const cells = [
      repo.padEnd(11),
      // A ground-truth repo without a TIER entry must not crash the report.
      (TIER[repo] ?? '?').padEnd(8),
      effort.padEnd(6),
      `${rs.filter((r) => r.fired).length}/${rs.length}`.padEnd(5),
      String(med(fids) ?? '—').padStart(8),
      `${Math.round((100 * fab) / rs.length)}%`.padStart(5),
      `${Math.round(med(rs.map((r) => r.ai.tokens)) / 1000)}k`.padStart(7),
      `$${(med(rs.map((r) => r.ai.cost)) ?? 0).toFixed(4)}`.padStart(8),
      String(med(rs.map((r) => r.ai.ms)) ?? '—').padStart(8),
    ];
    console.log(cells.join(' '));
  }
}
console.log('');