1
0

offload-eval-summarize.mjs 4.5 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768
  1. #!/usr/bin/env node
  2. // Aggregate judged.jsonl (or results.jsonl) into a per-repo, per-arm report:
  3. // time, main tokens/cost, AI tokens/cost, total cost, tool mix, accuracy.
  4. // Usage: summarize.mjs <judged-or-results.jsonl>
  5. import { readFileSync } from 'fs';
  // Load the JSONL input named by the first CLI argument: one JSON value per
  // non-blank line. Fail with a usage message when the argument is missing and
  // report the offending line number when a line is not valid JSON.
  const inputPath = process.argv[2];
  if (!inputPath) {
    console.error('Usage: summarize.mjs <judged-or-results.jsonl>');
    process.exit(1);
  }
  const rows = readFileSync(inputPath, 'utf8')
    .split('\n')
    .map((text, idx) => ({ text: text.trim(), lineNo: idx + 1 }))
    // Skip blank and whitespace-only lines (e.g. a trailing newline or stray "\r").
    .filter(({ text }) => text !== '')
    .map(({ text, lineNo }) => {
      try {
        return JSON.parse(text);
      } catch (err) {
        throw new Error(`${inputPath}:${lineNo}: invalid JSON (${err.message})`, { cause: err });
      }
    });
  /**
   * Median of the non-null entries of `xs`.
   *
   * `null` and `undefined` entries are dropped first; the remaining values are
   * sorted numerically on a copy, so the caller's array is left untouched.
   *
   * @param {Array} xs - values to summarise; may contain null/undefined.
   * @returns {number|null} the middle value (mean of the two middle values for
   *   an even count), or `null` when nothing remains after filtering.
   */
  const med = (xs) => {
    const sorted = xs.filter((value) => value != null);
    sorted.sort((left, right) => left - right);
    const count = sorted.length;
    if (count === 0) {
      return null;
    }
    const mid = Math.floor(count / 2);
    if (count % 2 === 1) {
      return sorted[mid];
    }
    return (sorted[mid - 1] + sorted[mid]) / 2;
  };
  /**
   * Format the smallest and largest non-null entries of `xs` as "min–max".
   *
   * The extremes are found in a single pass rather than by spreading the array
   * into `Math.min(...)` / `Math.max(...)`, which throws a RangeError once the
   * array is larger than the engine's argument limit. Values go through
   * `Math.min` / `Math.max` one at a time, so numeric coercion and NaN
   * propagation are the same as for the spread form.
   *
   * @param {Array} xs - values to summarise; null/undefined entries are ignored.
   * @returns {string} "min–max", or "—" when there are no non-null entries.
   */
  const rng = (xs) => {
    let lo = Infinity;
    let hi = -Infinity;
    let seen = false;
    for (const x of xs) {
      if (x == null) continue;
      seen = true;
      lo = Math.min(lo, x);
      hi = Math.max(hi, x);
    }
    return seen ? `${lo}–${hi}` : '—';
  };
  /**
   * Build a formatter that renders a value with a fixed number of decimals.
   * The returned function yields "—" for null/undefined; any other input is
   * coerced with unary plus and passed through `toFixed(digits)`.
   */
  const fixedOrDash = (digits) => (value) => {
    if (value == null) {
      return '—';
    }
    return (+value).toFixed(digits);
  };
  // Two-, three- and four-decimal formatters.
  const d2 = fixedOrDash(2);
  const d3 = fixedOrDash(3);
  const d4 = fixedOrDash(4);
  // Arms are reported in this order; the report loops iterate ARM_ORDER, so
  // rows whose arm is not listed here are grouped but never printed.
  const ARM_ORDER = ['frontload', 'offload', 'raw', 'nocg'];
  // repo -> arm -> rows[]. Both levels are null-prototype maps because the keys
  // come straight from the input file: with a plain object, a repo or arm named
  // "__proto__", "constructor" or "toString" would resolve to an inherited
  // member instead of creating a fresh bucket.
  const byRepo = Object.create(null);
  for (const r of rows) {
    const arms = (byRepo[r.repo] ??= Object.create(null));
    (arms[r.arm] ??= []).push(r);
  }
  /**
   * Count the verdicts found at `row[field].verdict` across `rs`.
   *
   * Only the four known verdicts are counted; rows without the field or with
   * any other verdict are ignored. The check is an own-property test rather
   * than `in`, so a verdict that matches an inherited member name (for example
   * "toString") is ignored instead of corrupting the tally.
   *
   * @param {Array<object>} rs - rows to tally.
   * @param {string} field - name of the property holding the judged result.
   * @returns {{pass: number, partial: number, fail: number, error: number}}
   */
  const verdictTally = (rs, field) => {
    const t = { pass: 0, partial: 0, fail: 0, error: 0 };
    for (const r of rs) {
      const v = r[field]?.verdict;
      if (Object.prototype.hasOwnProperty.call(t, v)) {
        t[v] += 1;
      }
    }
    return t;
  };
  // Per-repo report: a banner, one median row per arm (in ARM_ORDER), then one
  // min–max "ranges" line per arm. Missing medians are rendered as "—" rather
  // than the literal "null" (or a misleading "0k" for token counts).
  for (const repo of Object.keys(byRepo)) {
    const arms = byRepo[repo];
    const orDash = (x) => (x == null ? '—' : String(x));
    const kTok = (x) => (x == null ? '—' : `${Math.round(x / 1000)}k`);
    // The tier is read from the first row of the first arm seen for this repo.
    const tier = Object.values(arms)[0][0].tier;
    const rule = '='.repeat(78);
    console.log(`\n${rule}\n${repo} [${tier}]\n${rule}`);
    console.log(`${'arm'.padEnd(9)} n ${'time(s)'.padStart(9)} ${'mainCost'.padStart(9)} ${'aiCost'.padStart(8)} ${'totCost'.padStart(8)} ${'mainTok'.padStart(8)} ${'aiTok'.padStart(7)} ${'rd'.padStart(3)} ${'gr'.padStart(3)} ${'exp'.padStart(3)} ${'off'.padStart(3)} e2e(P/p/F) fidScore`);
    const present = ARM_ORDER.filter((arm) => arms[arm]);
    for (const arm of present) {
      const rs = arms[arm];
      const medOf = (pick) => med(rs.map(pick));
      const mainCost = medOf((r) => r.costUsdMain);
      // Rows without an `ai` section count as zero AI cost / zero AI tokens.
      const aiCost = medOf((r) => r.ai?.costUsd ?? 0);
      const totCost = (mainCost ?? 0) + (aiCost ?? 0);
      const e2e = verdictTally(rs, 'e2e');
      // Fidelity scores and the fabrication flag are only reported for the offload arm.
      const isOffload = arm === 'offload';
      const fidScores = isOffload ? rs.flatMap((r) => r.fidelity?.scores ?? []) : [];
      const fid = fidScores.length ? med(fidScores) : null;
      const fab = isOffload && rs.some((r) => r.fidelity?.anyFabrication);
      // med() already ignores null/undefined entries, so no pre-filter is needed.
      const e2eScore = medOf((r) => r.e2e?.score);
      const cells = [
        arm.padEnd(9),
        String(rs.length).padStart(1),
        orDash(medOf((r) => r.durationSec)).padStart(9),
        `$${d3(mainCost)}`.padStart(9),
        `$${d3(aiCost)}`.padStart(8),
        `$${d3(totCost)}`.padStart(8),
        kTok(medOf((r) => r.tokBillable)).padStart(8),
        kTok(medOf((r) => r.ai?.totalTokens ?? 0)).padStart(7),
        orDash(medOf((r) => r.read)).padStart(3),
        orDash(medOf((r) => r.grep)).padStart(3),
        orDash(medOf((r) => r.explore)).padStart(3),
        orDash(medOf((r) => r.offloadFired)).padStart(3),
        `${e2e.pass}/${e2e.partial}/${e2e.fail}`.padStart(9),
        e2eScore != null ? `e2e=${e2eScore}` : '',
        fid != null ? `fid=${fid}${fab ? ' FAB!' : ''}` : '',
      ];
      console.log(cells.join(' '));
    }
    // Min–max ranges per arm (time, main cost, read, explore, offloadFired) to expose run-to-run variance.
    for (const arm of present) {
      const rs = arms[arm];
      const span = (pick) => rng(rs.map(pick));
      console.log(` ${arm} ranges: time ${span((r) => r.durationSec)}s · mainCost $${span((r) => r.costUsdMain)} · read ${span((r) => r.read)} · explore ${span((r) => r.explore)} · offloadFired ${span((r) => r.offloadFired)}`);
    }
  }
  // Cross-repo roll-up: one line per (repo, arm) with the median time, median
  // total cost, median read count, e2e pass rate and median fidelity score.
  // No averaging across repos and no deltas are computed here, so the heading
  // describes the table as per-repo, per-arm medians.
  console.log(`\n${'='.repeat(78)}\nCROSS-REPO SUMMARY (medians per repo and arm)\n${'='.repeat(78)}`);
  console.log(`${'repo'.padEnd(12)} ${'arm'.padEnd(8)} ${'time'.padStart(7)} ${'totCost'.padStart(8)} ${'read'.padStart(5)} ${'e2e pass%'.padStart(9)} ${'fid'.padStart(5)}`);
  for (const repo of Object.keys(byRepo)) {
    for (const arm of ARM_ORDER) {
      const rs = byRepo[repo][arm];
      if (!rs) continue;
      const medOf = (pick) => med(rs.map(pick));
      const e2e = verdictTally(rs, 'e2e');
      // With unjudged input no row carries a verdict; show "—" instead of a
      // misleading 0%. Otherwise the rate is passes over all rows of the arm.
      const judged = e2e.pass + e2e.partial + e2e.fail + e2e.error;
      const passCell = judged === 0 ? '—' : `${Math.round((100 * e2e.pass) / rs.length)}%`;
      const totCost = (medOf((r) => r.costUsdMain) ?? 0) + (medOf((r) => r.ai?.costUsd ?? 0) ?? 0);
      const fid = arm === 'offload' ? med(rs.flatMap((r) => r.fidelity?.scores ?? [])) : null;
      const time = medOf((r) => r.durationSec);
      const timeCell = time == null ? '—' : `${time}s`;
      const readCell = String(medOf((r) => r.read) ?? '—');
      console.log(`${repo.padEnd(12)} ${arm.padEnd(8)} ${timeCell.padStart(7)} ${`$${d3(totCost)}`.padStart(8)} ${readCell.padStart(5)} ${passCell.padStart(9)} ${String(fid ?? '—').padStart(5)}`);
    }
  }
  console.log('');