parse-bench-readme.mjs 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384
  1. #!/usr/bin/env node
  // Aggregate the README A/B benchmark (bench-readme.sh output): per repo, median
  // of N runs per arm → time, tool calls, tokens, cost, and % saved. Plus an
  // average row.
  //
  // Tokens = SUM of per-turn assistant `usage` (input + output + cache read +
  // cache creation) — the cumulative "total tokens processed". NOTE: `result.usage`
  // is last-turn-only in current Claude Code, so it under-counts badly; don't use it.
  // `total_cost_usd` and `duration_ms` are already cumulative.
  //
  // Usage: node parse-bench-readme.mjs [/tmp/ab-readme]
  11. import { readFileSync, existsSync, readdirSync } from 'fs';
  12. import { join } from 'path';
  // Root directory holding the benchmark output: first CLI argument when given
  // (and non-empty), otherwise the default scratch location.
  const ROOT = process.argv[2] ? process.argv[2] : '/tmp/ab-readme';

  // Repositories to report on, in output order; each is looked up as a
  // sub-directory of ROOT.
  const REPOS = [
    'vscode',
    'excalidraw',
    'django',
    'tokio',
    'okhttp',
    'gin',
    'alamofire',
  ];
  /**
   * Parse one headless-run transcript (JSONL: one JSON event per line) into
   * summary metrics.
   *
   * - `tokens` is the sum over every assistant event's `message.usage`
   *   (input + output + cache read + cache creation).
   * - `tools` counts assistant `tool_use` blocks, skipping `ToolSearch`;
   *   `reads`, `grep` (Grep or Glob) and `cg` (name matches /codegraph/) are
   *   sub-counts of it.
   * - `raced` is true when any tool result contains "No such tool available".
   *
   * Parsing is best-effort: lines that are not valid JSON, JSON values that are
   * not objects (e.g. `null`), and `null` entries inside content arrays are
   * skipped instead of aborting the whole report.
   *
   * @param {string} file - Path to the `.jsonl` transcript.
   * @returns {{dur: number, tools: number, reads: number, grep: number,
   *   cg: number, tokens: number, cost: number, raced: boolean} | null}
   *   `null` when the file does not exist, contains no `result` event, or the
   *   last `result` event's `subtype` is not `'success'`.
   */
  function parse(file) {
    if (!existsSync(file)) return null;

    const lines = readFileSync(file, 'utf8').split('\n').filter(Boolean);
    let tools = 0;
    let reads = 0;
    let grep = 0;
    let cg = 0;
    let tokens = 0;
    let raced = false;
    let result = null;

    for (const line of lines) {
      let event;
      try {
        event = JSON.parse(line);
      } catch {
        continue; // best-effort: ignore lines that are not valid JSON
      }
      // Valid JSON that is not an object (`null`, a number, ...) carries no
      // event fields; `null` in particular would throw on `.type` below.
      if (event === null || typeof event !== 'object') continue;

      if (event.type === 'assistant') {
        const usage = event.message?.usage;
        if (usage) {
          tokens +=
            (usage.input_tokens || 0) +
            (usage.output_tokens || 0) +
            (usage.cache_read_input_tokens || 0) +
            (usage.cache_creation_input_tokens || 0);
        }
        for (const block of event.message?.content || []) {
          if (block?.type !== 'tool_use') continue;
          const name = block.name;
          if (name === 'ToolSearch') continue; // excluded from the tool-call count
          tools++;
          if (name === 'Read') reads++;
          else if (name === 'Grep' || name === 'Glob') grep++;
          else if (/codegraph/.test(name)) cg++;
        }
      }

      // MCP cold-start race: the headless agent fired before `codegraph serve --mcp`
      // finished registering its tools, so early calls returned "No such tool
      // available" and the agent floundered into grep/Read. That measures CodeGraph's
      // startup latency, NOT its steady-state value — flag the run so the aggregate
      // can exclude it (an artifact of headless first-turn timing, not the tool).
      if (event.type === 'user') {
        const content = event.message?.content;
        for (const block of Array.isArray(content) ? content : []) {
          if (block?.type !== 'tool_result') continue;
          // Tool results carry either an array of parts or a plain value.
          const text = Array.isArray(block.content)
            ? block.content.map((part) => part?.text || '').join('')
            : block.content || '';
          if (/No such tool available/.test(text)) raced = true;
        }
      }

      if (event.type === 'result') result = event; // the last result event wins
    }

    if (!result || result.subtype !== 'success') return null;
    return {
      dur: result.duration_ms / 1000, // milliseconds → seconds
      tools,
      reads,
      grep,
      cg,
      tokens,
      cost: result.total_cost_usd || 0,
      raced,
    };
  }
  /**
   * Median of a list of numbers; the input is not modified.
   *
   * @param {number[]} arr - Values in any order.
   * @returns {number} The middle value, the mean of the two middle values for
   *   an even count, or 0 for an empty list.
   */
  const median = (arr) => {
    const sorted = Array.from(arr).sort((x, y) => x - y);
    const count = sorted.length;
    if (count === 0) return 0;
    const mid = Math.floor(count / 2);
    if (count % 2 === 1) return sorted[mid];
    return (sorted[mid - 1] + sorted[mid]) / 2;
  };
  /**
   * Format a duration in seconds as `Ns` (under a minute) or `Mm Ss`.
   *
   * The value is rounded to whole seconds BEFORE splitting into minutes and
   * seconds, so the seconds field can never read 60 (119.6 → "2m 0s", not
   * "1m 60s") and 59.6 rolls over to "1m 0s" instead of "60s".
   *
   * @param {number} s - Duration in seconds (may be fractional).
   * @returns {string} Human-readable duration.
   */
  const fmtTime = (s) => {
    const total = Math.round(s);
    // Written as a negated `>=` so a NaN input still yields "NaNs".
    if (!(total >= 60)) return `${total}s`;
    return `${Math.floor(total / 60)}m ${total % 60}s`;
  };
  /**
   * Format a token count compactly: whole thousands (`12k`) or millions with
   * one decimal (`1.5M`).
   *
   * The unit is chosen from the ROUNDED number of thousands, so counts just
   * under a million (e.g. 999 600) print as "1.0M" rather than "1000k".
   *
   * @param {number} t - Token count.
   * @returns {string} Compact token count.
   */
  const fmtTok = (t) => {
    const thousands = Math.round(t / 1000);
    return thousands >= 1000 ? `${(t / 1e6).toFixed(1)}M` : `${thousands}k`;
  };
  /**
   * Percentage saved by the WITH arm relative to the WITHOUT arm, rounded to a
   * whole number. Negative when the WITH arm used more.
   *
   * @param {number} w - WITH-arm value.
   * @param {number} wo - WITHOUT-arm value (the baseline).
   * @returns {number} Whole-number percentage, or 0 when the baseline is not
   *   greater than zero.
   */
  const pct = (w, wo) => {
    if (!(wo > 0)) return 0;
    const ratio = w / wo;
    return Math.round((1 - ratio) * 100);
  };
  // ---- Report: one row per repo (median of each arm), then the average saving ----
  console.log('repo n(w/wo) time WITH→WITHOUT tools W→WO tokens W→WO (saved) cost W→WO (saved)');

  // Exclude MCP-cold-start-raced WITH runs by default — they measure a startup
  // race, not steady-state value. `CG_INCLUDE_RACED=1` keeps them (to see the raw
  // distribution). The WITHOUT arm has no MCP, so it's never raced.
  // Read once up front: the setting is the same for every repo.
  const includeRaced = process.env.CG_INCLUDE_RACED === '1';

  // Per-repo "% saved" figures, collected for the final average row.
  const savings = { cost: [], tokens: [], time: [], tools: [] };

  // Median of one metric (`key`) across a list of parsed runs.
  const medianOf = (runs, key) => median(runs.map((run) => run[key]));

  // Suffix reporting how many raced WITH runs were dropped (empty when none).
  const racedNote = (count) =>
    count ? ` [${count} raced run${count === 1 ? '' : 's'} excluded]` : '';

  for (const repo of REPOS) {
    const dir = join(ROOT, repo);
    const runDirs = existsSync(dir) ? readdirSync(dir).filter((d) => /^run\d+$/.test(d)) : [];

    const W = [];
    const WO = [];
    let racedExcluded = 0;
    for (const rd of runDirs) {
      const w = parse(join(dir, rd, 'run-headless-with.jsonl'));
      if (w) {
        if (w.raced && !includeRaced) racedExcluded++;
        else W.push(w);
      }
      const wo = parse(join(dir, rd, 'run-headless-without.jsonl'));
      if (wo) WO.push(wo);
    }

    if (!W.length || !WO.length) {
      // Include the raced note here too: otherwise a repo whose WITH runs were
      // all excluded as raced shows an unexplained `w=0`.
      console.log(`${repo.padEnd(11)} (incomplete: w=${W.length} wo=${WO.length})${racedNote(racedExcluded)}`);
      continue;
    }

    const wT = medianOf(W, 'dur');
    const woT = medianOf(WO, 'dur');
    const wTok = medianOf(W, 'tokens');
    const woTok = medianOf(WO, 'tokens');
    const wC = medianOf(W, 'cost');
    const woC = medianOf(WO, 'cost');
    const wTl = medianOf(W, 'tools');
    const woTl = medianOf(WO, 'tools');

    const tokensSaved = pct(wTok, woTok);
    const costSaved = pct(wC, woC);
    savings.time.push(pct(wT, woT));
    savings.tokens.push(tokensSaved);
    savings.cost.push(costSaved);
    savings.tools.push(pct(wTl, woTl));

    const timeCol = `${fmtTime(wT)}→${fmtTime(woT)}`.padEnd(22);
    const toolsCol = `${Math.round(wTl)}→${Math.round(woTl)}`.padEnd(12);
    const tokensCol = `${fmtTok(wTok)}→${fmtTok(woTok)} (${tokensSaved}%)`.padEnd(24);
    const costCol = `$${wC.toFixed(2)}→$${woC.toFixed(2)} (${costSaved}%)`;
    console.log(
      `${repo.padEnd(11)} ${W.length}/${WO.length} ` +
        timeCol +
        toolsCol +
        tokensCol +
        costCol +
        racedNote(racedExcluded)
    );
  }

  // Rounded mean of a list of percentages; 0 when no repo produced a row.
  const avg = (a) => (a.length ? Math.round(a.reduce((s, x) => s + x, 0) / a.length) : 0);
  console.log(`\nAVERAGE saved: cost ${avg(savings.cost)}% · tokens ${avg(savings.tokens)}% · time ${avg(savings.time)}% · tool calls ${avg(savings.tools)}%`);