1
0

parse-arms.mjs 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116
  1. #!/usr/bin/env node
  2. // Analyze the tool-surface ablation (/tmp/arms/<repo>/<arm>-r<n>.jsonl).
  3. // Compares arms A–I (A–E described below; F–I: see LABEL) on trace adoption, Read/Grep fallback, codegraph payload,
  4. // round-trips, and duration — averaged across runs per arm.
  5. //
  6. // The decisive signal is READS: if removing a tool raises Reads on a question
  7. // class, that tool was load-bearing for it (not redundant). If removing it
  8. // changes nothing, it was redundant.
  9. //
  10. // A control all tools no steering (baseline)
  11. // B steer all tools trace-first (adoption)
  12. // C no-explore hide explore trace-first (is explore redundant?)
  13. // D trace-centric hide explore+context trace-first (is the survey pair redundant?)
  14. // E control-probe hide explore+context trace-first (NON-flow Q — should degrade)
  15. //
  16. // Usage: node scripts/agent-eval/parse-arms.mjs [/tmp/arms]
  17. import { readFileSync, readdirSync, existsSync, statSync } from 'fs';
  18. import { join } from 'path';
  19. const ROOT = process.argv[2] || '/tmp/arms';
  20. const cgShort = (n) => n.replace('mcp__codegraph__codegraph_', '').replace('mcp__codegraph__', '');
  21. function parse(file) {
  22. if (!existsSync(file)) return null;
  23. const lines = readFileSync(file, 'utf8').split('\n').filter(Boolean);
  24. const calls = []; let result = null, initCg = 0;
  25. for (const l of lines) {
  26. let ev; try { ev = JSON.parse(l); } catch { continue; }
  27. if (ev.type === 'system' && ev.subtype === 'init') initCg = (ev.tools || []).filter(t => /codegraph/.test(t)).length;
  28. if (ev.type === 'assistant') for (const b of (ev.message?.content || [])) if (b.type === 'tool_use')
  29. calls.push({ id: b.id, name: b.name, out: 0 });
  30. if (ev.type === 'user') for (const b of (ev.message?.content || [])) if (b.type === 'tool_result') {
  31. const c = b.content;
  32. const txt = typeof c === 'string' ? c : Array.isArray(c) ? c.map(x => x?.text || '').join('') : '';
  33. const call = calls.find(k => k.id === b.tool_use_id); if (call) call.out = txt.length;
  34. }
  35. if (ev.type === 'result') result = ev;
  36. }
  37. const cg = calls.filter(c => c.name.includes('codegraph'));
  38. return {
  39. initCg,
  40. reads: calls.filter(c => c.name === 'Read').length,
  41. greps: calls.filter(c => c.name === 'Grep').length + calls.filter(c => c.name === 'Glob').length,
  42. cgCalls: cg.length,
  43. cgSeq: cg.map(c => cgShort(c.name)),
  44. cgOut: cg.reduce((s, c) => s + c.out, 0),
  45. traceUsed: cg.some(c => c.name.includes('trace')),
  46. turns: result?.num_turns ?? null,
  47. dur: result?.duration_ms ? Math.round(result.duration_ms / 1000) : null,
  48. cost: result?.total_cost_usd || 0,
  49. ok: result?.subtype === 'success',
  50. };
  51. }
  52. // repo -> arm -> [runs]
  53. const data = {};
  54. if (!existsSync(ROOT)) { console.error(`no ${ROOT}`); process.exit(1); }
  55. for (const repo of readdirSync(ROOT)) {
  56. const rdir = join(ROOT, repo);
  57. if (!statSync(rdir).isDirectory()) continue;
  58. for (const f of readdirSync(rdir)) {
  59. const m = f.match(/^([A-I])-r(\d+)\.jsonl$/); if (!m) continue;
  60. const p = parse(join(rdir, f)); if (!p || !p.ok) continue;
  61. (((data[repo] ??= {})[m[1]]) ??= []).push(p);
  62. }
  63. }
  64. const avg = (a, f) => a.length ? a.reduce((s, x) => s + (f(x) || 0), 0) / a.length : 0;
  65. const k = (n) => (n / 1000).toFixed(1);
  66. const pad = (s, n) => String(s).padEnd(n);
  67. const ARMS = ['A', 'H', 'I', 'B', 'F', 'G', 'C', 'D', 'E'];
  68. const LABEL = { A: 'A all/none(old)', H: 'H body-trace/none', I: 'I bodytrace+dest', B: 'B all/steer(thin)', F: 'F all/steer(body)', G: 'G ported(noprompt)', C: 'C no-explore', D: 'D trace-centric', E: 'E nonflow-probe' };
  69. // ---- per repo × arm ----
  70. console.log('\n=== PER REPO × ARM (avg over runs) ===');
  71. console.log(pad('repo', 22), pad('arm', 16), 'tools', 'trace', pad('reads', 6), pad('cgOutK', 7), pad('turns', 6), 'dur');
  72. for (const repo of Object.keys(data).sort()) {
  73. for (const arm of ARMS) {
  74. const runs = data[repo][arm]; if (!runs?.length) continue;
  75. console.log(
  76. pad(repo, 22), pad(LABEL[arm], 16),
  77. pad(runs[0].initCg, 5),
  78. pad(runs.filter(r => r.traceUsed).length + '/' + runs.length, 5),
  79. pad(avg(runs, r => r.reads).toFixed(1), 6),
  80. pad(k(avg(runs, r => r.cgOut)), 7),
  81. pad(avg(runs, r => r.turns).toFixed(1), 6),
  82. avg(runs, r => r.dur).toFixed(0) + 's',
  83. );
  84. }
  85. }
  86. // ---- aggregate per arm (flow arms A–D over the flow repos; E shown apart) ----
  87. console.log('\n=== AGGREGATE PER ARM (mean across repos) ===');
  88. console.log(pad('arm', 16), pad('adoption', 9), pad('reads', 7), pad('greps', 7), pad('cgOutK', 8), pad('turns', 7), pad('dur', 6), 'cost');
  89. for (const arm of ARMS) {
  90. const all = [];
  91. for (const repo of Object.keys(data)) for (const r of (data[repo][arm] || [])) all.push({ ...r, repo });
  92. if (!all.length) continue;
  93. const repos = new Set(all.map(r => r.repo)).size;
  94. const adopt = all.filter(r => r.traceUsed).length;
  95. console.log(
  96. pad(LABEL[arm], 16),
  97. pad(`${adopt}/${all.length}`, 9),
  98. pad(avg(all, r => r.reads).toFixed(2), 7),
  99. pad(avg(all, r => r.greps).toFixed(2), 7),
  100. pad(k(avg(all, r => r.cgOut)), 8),
  101. pad(avg(all, r => r.turns).toFixed(1), 7),
  102. pad(avg(all, r => r.dur).toFixed(0) + 's', 6),
  103. '$' + avg(all, r => r.cost).toFixed(3),
  104. ` (${repos} repos)`,
  105. );
  106. }
  107. console.log('\nRead the signal: B vs A = does steering alone fix adoption + cut payload.');
  108. console.log('C vs B = is explore redundant (reads should NOT jump). D vs C = is context redundant.');
  109. console.log('E = non-flow under trace-centric; reads SHOULD jump (proves survey tools are load-bearing).');