haiany
/
codegraph
огледало од https://github.com/colbymchenry/codegraph.git


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116
							#!/usr/bin/env node
// Analyze the tool-surface ablation (/tmp/arms/<repo>/<arm>-r<n>.jsonl).
// Compares arms A–E on trace adoption, Read/Grep fallback, codegraph payload,
// round-trips, and duration — averaged across runs per arm.
//
// The decisive signal is READS: if removing a tool raises Reads on a question
// class, that tool was load-bearing for it (not redundant). If removing it
// changes nothing, it was redundant.
//
//   A control       all tools            no steering   (baseline)
//   B steer         all tools            trace-first   (adoption)
//   C no-explore    hide explore         trace-first   (is explore redundant?)
//   D trace-centric hide explore+context trace-first   (is the survey pair redundant?)
//   E control-probe hide explore+context trace-first   (NON-flow Q — should degrade)
//
// Usage: node scripts/agent-eval/parse-arms.mjs [/tmp/arms]
import { readFileSync, readdirSync, existsSync, statSync } from 'fs';
import { join } from 'path';

const ROOT = process.argv[2] || '/tmp/arms';
const cgShort = (n) => n.replace('mcp__codegraph__codegraph_', '').replace('mcp__codegraph__', '');

function parse(file) {
  if (!existsSync(file)) return null;
  const lines = readFileSync(file, 'utf8').split('\n').filter(Boolean);
  const calls = []; let result = null, initCg = 0;
  for (const l of lines) {
    let ev; try { ev = JSON.parse(l); } catch { continue; }
    if (ev.type === 'system' && ev.subtype === 'init') initCg = (ev.tools || []).filter(t => /codegraph/.test(t)).length;
    if (ev.type === 'assistant') for (const b of (ev.message?.content || [])) if (b.type === 'tool_use')
      calls.push({ id: b.id, name: b.name, out: 0 });
    if (ev.type === 'user') for (const b of (ev.message?.content || [])) if (b.type === 'tool_result') {
      const c = b.content;
      const txt = typeof c === 'string' ? c : Array.isArray(c) ? c.map(x => x?.text || '').join('') : '';
      const call = calls.find(k => k.id === b.tool_use_id); if (call) call.out = txt.length;
    }
    if (ev.type === 'result') result = ev;
  }
  const cg = calls.filter(c => c.name.includes('codegraph'));
  return {
    initCg,
    reads: calls.filter(c => c.name === 'Read').length,
    greps: calls.filter(c => c.name === 'Grep').length + calls.filter(c => c.name === 'Glob').length,
    cgCalls: cg.length,
    cgSeq: cg.map(c => cgShort(c.name)),
    cgOut: cg.reduce((s, c) => s + c.out, 0),
    traceUsed: cg.some(c => c.name.includes('trace')),
    turns: result?.num_turns ?? null,
    dur: result?.duration_ms ? Math.round(result.duration_ms / 1000) : null,
    cost: result?.total_cost_usd || 0,
    ok: result?.subtype === 'success',
  };
}

// repo -> arm -> [runs]
const data = {};
if (!existsSync(ROOT)) { console.error(`no ${ROOT}`); process.exit(1); }
for (const repo of readdirSync(ROOT)) {
  const rdir = join(ROOT, repo);
  if (!statSync(rdir).isDirectory()) continue;
  for (const f of readdirSync(rdir)) {
    const m = f.match(/^([A-I])-r(\d+)\.jsonl$/); if (!m) continue;
    const p = parse(join(rdir, f)); if (!p || !p.ok) continue;
    (((data[repo] ??= {})[m[1]]) ??= []).push(p);
  }
}

const avg = (a, f) => a.length ? a.reduce((s, x) => s + (f(x) || 0), 0) / a.length : 0;
const k = (n) => (n / 1000).toFixed(1);
const pad = (s, n) => String(s).padEnd(n);
const ARMS = ['A', 'H', 'I', 'B', 'F', 'G', 'C', 'D', 'E'];
const LABEL = { A: 'A all/none(old)', H: 'H body-trace/none', I: 'I bodytrace+dest', B: 'B all/steer(thin)', F: 'F all/steer(body)', G: 'G ported(noprompt)', C: 'C no-explore', D: 'D trace-centric', E: 'E nonflow-probe' };

// ---- per repo × arm ----
console.log('\n=== PER REPO × ARM (avg over runs) ===');
console.log(pad('repo', 22), pad('arm', 16), 'tools', 'trace', pad('reads', 6), pad('cgOutK', 7), pad('turns', 6), 'dur');
for (const repo of Object.keys(data).sort()) {
  for (const arm of ARMS) {
    const runs = data[repo][arm]; if (!runs?.length) continue;
    console.log(
      pad(repo, 22), pad(LABEL[arm], 16),
      pad(runs[0].initCg, 5),
      pad(runs.filter(r => r.traceUsed).length + '/' + runs.length, 5),
      pad(avg(runs, r => r.reads).toFixed(1), 6),
      pad(k(avg(runs, r => r.cgOut)), 7),
      pad(avg(runs, r => r.turns).toFixed(1), 6),
      avg(runs, r => r.dur).toFixed(0) + 's',
    );
  }
}

// ---- aggregate per arm (flow arms A–D over the flow repos; E shown apart) ----
console.log('\n=== AGGREGATE PER ARM (mean across repos) ===');
console.log(pad('arm', 16), pad('adoption', 9), pad('reads', 7), pad('greps', 7), pad('cgOutK', 8), pad('turns', 7), pad('dur', 6), 'cost');
for (const arm of ARMS) {
  const all = [];
  for (const repo of Object.keys(data)) for (const r of (data[repo][arm] || [])) all.push({ ...r, repo });
  if (!all.length) continue;
  const repos = new Set(all.map(r => r.repo)).size;
  const adopt = all.filter(r => r.traceUsed).length;
  console.log(
    pad(LABEL[arm], 16),
    pad(`${adopt}/${all.length}`, 9),
    pad(avg(all, r => r.reads).toFixed(2), 7),
    pad(avg(all, r => r.greps).toFixed(2), 7),
    pad(k(avg(all, r => r.cgOut)), 8),
    pad(avg(all, r => r.turns).toFixed(1), 7),
    pad(avg(all, r => r.dur).toFixed(0) + 's', 6),
    '$' + avg(all, r => r.cost).toFixed(3),
    `  (${repos} repos)`,
  );
}

console.log('\nRead the signal: B vs A = does steering alone fix adoption + cut payload.');
console.log('C vs B = is explore redundant (reads should NOT jump). D vs C = is context redundant.');
console.log('E = non-flow under trace-centric; reads SHOULD jump (proves survey tools are load-bearing).');