seq-matrix.mjs 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137
  1. #!/usr/bin/env node
  2. // Mine the surviving A/B stream-json logs (/tmp/ab-matrix/<Cell>/run-headless-*.jsonl)
  3. // for what the aggregate matrix can't see: the call SEQUENCE and per-call output SIZE.
  4. //
  5. // Answers three questions:
  6. // 1. Trace adoption — on a flow question, does the with-arm actually call codegraph_trace?
  7. // 2. Payload size vs repo size — is trace path-scoped (tiny, size-independent) while
  8. // explore is breadth-scoped (grows with the repo / over-returns on small repos)?
  9. // 3. Round-trips — num_turns with vs without (the real wall-clock driver).
  10. //
  11. // Usage: node scripts/agent-eval/seq-matrix.mjs [/tmp/ab-matrix]
  import { readFileSync, readdirSync, existsSync } from 'fs';
  import { join } from 'path';
  import { fileURLToPath } from 'url';
  // Root directory of the A/B logs; overridable via the first CLI argument.
  const AB = process.argv[2] || '/tmp/ab-matrix';
  // fileURLToPath (rather than URL#pathname) decodes percent-escapes and handles
  // Windows drive letters, so a checkout under e.g. "My Projects/" still resolves.
  const MD = fileURLToPath(new URL('../../docs/benchmarks/codegraph-ab-matrix.md', import.meta.url));

  /**
   * Extract per-repo metadata from the markdown text of the published matrix.
   *
   * A row is recognised when it starts with four pipe-delimited cells of the form
   * `| <lang> | S|M|L | `<repo>` | <digits> |`; every other line is ignored.
   *
   * @param {string} markdown - Full text of the matrix markdown file.
   * @returns {Object<string, {lang: string, size: string, files: number}>}
   *   Map keyed by the back-ticked repo name.
   */
  function parseRepoMeta(markdown) {
    const meta = {};
    for (const line of markdown.split('\n')) {
      const m = line.match(/^\|\s*([^|]+?)\s*\|\s*(S|M|L)\s*\|\s*`([^`]+)`\s*\|\s*(\d+)\s*\|/);
      if (m) meta[m[3]] = { lang: m[1].trim(), size: m[2], files: +m[4] };
    }
    return meta;
  }

  // repo -> {lang,size,files} from the published matrix table
  // (empty when the markdown file is not present next to this checkout)
  const repoMeta = existsSync(MD) ? parseRepoMeta(readFileSync(MD, 'utf8')) : {};
  // Prefixes stripped from codegraph tool names, in the order they are tried.
  const CG_PREFIXES = ['mcp__codegraph__codegraph_', 'mcp__codegraph__'];
  // Fixed abbreviations for the built-in tools shown in call sequences.
  const TOOL_TAGS = new Map([
    ['Read', 'R'],
    ['Grep', 'G'],
    ['Glob', 'Gl'],
    ['Bash', 'B'],
    ['Task', 'Ag'],
    ['ToolSearch', 'TS'],
  ]);

  /** Strip the MCP codegraph prefix(es) from a tool name; other names pass through. */
  const cgShort = (n) => CG_PREFIXES.reduce((name, prefix) => name.replace(prefix, ''), n);

  /**
   * Short label for a tool name: a fixed abbreviation for known built-ins,
   * the prefix-stripped name for anything containing "codegraph", else the name itself.
   */
  const tag = (n) => {
    const known = TOOL_TAGS.get(n);
    if (known !== undefined) return known;
    return n.includes('codegraph') ? cgShort(n) : n;
  };
  /**
   * Parse one stream-json run log into call-sequence and payload statistics.
   *
   * Each non-empty line is treated as one JSON event. Lines that are not valid
   * JSON, or that decode to something other than an object, are skipped so a
   * truncated or partially written log does not abort the whole report.
   *
   * @param {string} file - Path to a run-headless-*.jsonl log.
   * @returns {null | {
   *   initCg: number, cg: Array<{id: *, name: string, q: string, out: number}>,
   *   perTool: Object<string, {n: number, out: number}>, cgSeq: string[], seq: string[],
   *   reads: number, greps: number, cgOut: number, traceUsed: boolean,
   *   afterTrace: string[] | null, turns: number | null, dur: number | null, cost: number
   * }} `null` when the file does not exist, otherwise the collected stats.
   */
  function parse(file) {
    if (!existsSync(file)) return null;
    const lines = readFileSync(file, 'utf8').split('\n').filter(Boolean);
    const calls = [];
    // tool_use id -> call record, so each tool_result is matched without rescanning `calls`.
    const callsById = new Map();
    let result = null;
    let initCg = 0;
    for (const l of lines) {
      let ev;
      try { ev = JSON.parse(l); } catch { continue; }
      // Valid JSON that is not an object (e.g. `null`) carries no event fields.
      if (ev === null || typeof ev !== 'object') continue;
      if (ev.type === 'system' && ev.subtype === 'init') {
        // Number of codegraph tools advertised to the agent at session start.
        initCg = (ev.tools || []).filter(t => /codegraph/.test(t)).length;
      }
      if (ev.type === 'assistant') {
        for (const b of (ev.message?.content || [])) {
          if (b.type !== 'tool_use') continue;
          const i = b.input || {};
          // Short human-readable argument: first of query/symbol/task, else from->to, else path/command.
          const q = i.query ?? i.symbol ?? i.task ?? (i.from && i.to ? `${i.from}->${i.to}` : (i.file_path || i.command || ''));
          const call = { id: b.id, name: b.name, q: String(q ?? '').slice(0, 38), out: 0 };
          calls.push(call);
          // Keep the first call seen for an id, matching a front-to-back search.
          if (!callsById.has(b.id)) callsById.set(b.id, call);
        }
      }
      if (ev.type === 'user') {
        for (const b of (ev.message?.content || [])) {
          if (b.type !== 'tool_result') continue;
          const c = b.content;
          // Content is either a plain string or an array of blocks whose `text` fields are concatenated.
          const txt = typeof c === 'string' ? c : Array.isArray(c) ? c.map(x => x?.text || '').join('') : '';
          const call = callsById.get(b.tool_use_id);
          if (call) call.out = txt.length;
        }
      }
      // The last `result` event wins.
      if (ev.type === 'result') result = ev;
    }
    const cg = calls.filter(c => c.name.includes('codegraph'));
    const perTool = {};
    for (const c of cg) {
      const stats = (perTool[cgShort(c.name)] ??= { n: 0, out: 0 });
      stats.n++;
      stats.out += c.out;
    }
    const traceIdx = cg.findIndex(c => c.name.includes('trace'));
    return {
      initCg, cg, perTool,
      cgSeq: cg.map(c => cgShort(c.name)),
      seq: calls.map(c => tag(c.name)),
      reads: calls.filter(c => c.name === 'Read').length,
      greps: calls.filter(c => c.name === 'Grep').length,
      cgOut: cg.reduce((s, c) => s + c.out, 0),
      traceUsed: traceIdx >= 0,
      // codegraph calls made after the first trace call; null when trace was never called
      afterTrace: traceIdx >= 0 ? cg.slice(traceIdx + 1).map(c => cgShort(c.name)) : null,
      turns: result?.num_turns ?? null,
      dur: result?.duration_ms ? Math.round(result.duration_ms / 1000) : null,
      cost: result?.total_cost_usd || 0,
    };
  }
  // One entry per cell directory under AB that still has a with-arm log.
  const cells = [];
  if (!existsSync(AB)) {
    // Fail with a readable message instead of an uncaught ENOENT from readdirSync.
    console.error(`seq-matrix: log directory not found: ${AB}`);
    process.exit(1);
  }
  for (const d of readdirSync(AB)) {
    const dir = join(AB, d);
    // Entries without a with-arm log (including plain files such as <cell>.log) are skipped.
    if (!existsSync(join(dir, 'run-headless-with.jsonl'))) continue;
    const logPath = join(AB, `${d}.log`);
    const log = existsSync(logPath) ? readFileSync(logPath, 'utf8') : '';
    // Repo name = last path segment after "repo:" in the cell log; falls back to the directory name.
    const repo = log.match(/repo:\s*\S*\/([^\s/]+)/)?.[1] || d;
    const question = log.match(/question:\s*(.+)/)?.[1] || '';
    cells.push({
      cell: d, repo, question, ...(repoMeta[repo] || {}),
      with: parse(join(dir, 'run-headless-with.jsonl')),
      // null when the without-arm log is missing
      without: parse(join(dir, 'run-headless-without.jsonl')),
    });
  }
  // Ascending by file count; repos with no metadata sort first (treated as 0).
  cells.sort((a, b) => (a.files || 0) - (b.files || 0));
  /** Format a character count as thousands with one decimal place (e.g. 1500 -> "1.5"). */
  const k = (n) => {
    const thousands = n / 1000;
    return thousands.toFixed(1);
  };
  /** Stringify a value and right-pad it with spaces to a width of `n` characters. */
  const pad = (s, n) => {
    const text = String(s);
    return text.padEnd(n);
  };
  // ---- per-cell sequence table ----
  // One row per cell, in the order `cells` was sorted: repo, file count, whether
  // trace was called, the with-arm codegraph call sequence, its total payload in
  // thousands of characters, and turns for the with/without arms.
  console.log('\n=== PER-CELL: with-arm codegraph sequence + payload (sorted by repo size) ===');
  console.log(pad('repo', 22), pad('files', 6), 'trace', pad('cg-call sequence', 40), pad('cgOutK', 7), 'turns(w/wo)');
  cells.forEach(({ repo, files, with: withArm, without }) => {
    const traceFlag = withArm.traceUsed ? 'YES' : 'no';
    const sequence = withArm.cgSeq.join(',') || '(none)';
    const turnsPair = `${withArm.turns}/${without?.turns}`;
    console.log(
      pad(repo, 22),
      pad(files ?? '?', 6),
      pad(traceFlag, 5),
      pad(sequence, 40),
      pad(k(withArm.cgOut), 7),
      turnsPair,
    );
  });
  // ---- trace adoption ----
  // every matrix question is a canonical flow question by design, so the
  // denominator is simply all cells
  const flow = cells;
  const used = flow.filter(({ with: withArm }) => withArm.traceUsed);
  console.log(`\n=== TRACE ADOPTION (all ${flow.length} cells are flow questions) ===`);
  console.log(`trace called in ${used.length}/${flow.length} cells`);
  {
    const repoNames = used.map(({ repo }) => repo);
    console.log('used trace:', repoNames.join(', ') || '(none)');
  }
  if (used.length) {
    // For each adopting cell, list the codegraph calls that followed the first trace.
    const followUps = used.map(({ repo, with: withArm }) => {
      const after = withArm.afterTrace.join(',') || 'none';
      return `${repo}[${after}]`;
    });
    console.log('after-trace follow-ups:', followUps.join(' '));
  }
  // ---- payload size by repo-size tier ----
  // Repos absent from the published matrix have no file count. They get their own
  // bucket rather than being coerced to 0, which would count them as small repos
  // and skew the S-tier average.
  const tier = (f) => !Number.isFinite(f) ? '?(unknown)'
    : f < 200 ? 'S(<200)' : f < 2000 ? 'M(<2000)' : 'L(>=2000)';
  const byTier = {};
  for (const c of cells) { (byTier[tier(c.files)] ??= []).push(c.with.cgOut); }
  console.log('\n=== with-arm TOTAL codegraph payload by repo-size tier ===');
  for (const t of ['S(<200)', 'M(<2000)', 'L(>=2000)', '?(unknown)']) {
    const a = byTier[t] || [];
    if (!a.length) continue; // tiers with no cells are omitted from the report
    const avg = a.reduce((s, x) => s + x, 0) / a.length;
    console.log(` ${pad(t, 10)} n=${a.length} avg cgOut=${k(avg)}K range ${k(Math.min(...a))}-${k(Math.max(...a))}K`);
  }
  // ---- per-tool usage + avg payload (breadth vs path evidence) ----
  // Aggregate each with-arm's per-tool call count and output size across all cells.
  const tot = {};
  for (const { with: withArm } of cells) {
    for (const [name, v] of Object.entries(withArm.perTool)) {
      const agg = (tot[name] ??= { n: 0, out: 0 });
      agg.n += v.n;
      agg.out += v.out;
    }
  }
  console.log('\n=== codegraph tool usage across all cells (n calls, avg payload/call) ===');
  // Most-called tool first.
  Object.entries(tot)
    .sort(([, left], [, right]) => right.n - left.n)
    .forEach(([name, v]) => {
      const avgPerCall = k(v.out / v.n);
      console.log(` ${pad(name, 10)} calls=${pad(v.n, 4)} avg=${avgPerCall}K/call total=${k(v.out)}K`);
    });
  // ---- round-trips ----
  const sum = (arr, f) => arr.reduce((s, x) => s + (f(x) || 0), 0);
  // Compare turns only over cells where BOTH arms reported num_turns. Summing the
  // with-arm over every cell but the without-arm over the surviving subset would
  // overstate the with-arm total and understate the without-arm average.
  const paired = cells.filter(c => c.with.turns != null && c.without?.turns != null);
  const wTurns = sum(paired, c => c.with.turns);
  const woTurns = sum(paired, c => c.without.turns);
  const wCalls = sum(cells, c => c.with.cg.length);
  // every() is vacuously true on an empty list, so require at least one cell.
  const tsAll = cells.length > 0 && cells.every(c => c.with.seq[0] === 'TS');
  // Averages and percentages print "n/a" instead of NaN when the denominator is zero.
  const perCell = (total, n) => (n ? (total / n).toFixed(1) : 'n/a');
  const fewerPct = woTurns ? `${((1 - wTurns / woTurns) * 100).toFixed(0)}%` : 'n/a';
  console.log('\n=== ROUND-TRIPS ===');
  console.log(`turns: with=${wTurns} without=${woTurns} (${fewerPct} fewer with; ${paired.length}/${cells.length} cells have both arms)`);
  console.log(`avg turns/cell: with=${perCell(wTurns, paired.length)} without=${perCell(woTurns, paired.length)}`);
  console.log(`total codegraph calls=${wCalls} (avg ${perCell(wCalls, cells.length)}/cell)`);
  console.log(`every with-arm opens with a ToolSearch round-trip (deferred tools): ${tsAll ? 'YES — 1 fixed tax/run' : 'no'}`);