offload-eval-metrics.mjs 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990
  1. #!/usr/bin/env node
  2. // Extract one eval run's metrics from its Claude stream-json transcript + the
  3. // offload usage sidecar log, emit ONE merged JSON line.
  4. //
  5. // Usage: extract-metrics.mjs --run <run.jsonl> --usage <usage.jsonl|-> \
  6. // --arm <a> --rep <n> --repo <r> --tier <t> --q <question>
  7. import { readFileSync, existsSync } from 'fs';
  8. const args = {};
  9. for (let i = 2; i < process.argv.length; i += 2) args[process.argv[i].replace(/^--/, '')] = process.argv[i + 1];
  10. const runFile = args.run;
  11. const lines = existsSync(runFile) ? readFileSync(runFile, 'utf8').split('\n').filter(Boolean) : [];
  12. const toolCounts = {};
  13. let result = null;
  14. const tok = { gen: 0, fresh: 0, cached: 0 };
  15. const offloadAnswers = [];
  16. let exploreResults = 0; // tool_results from explore (offload or raw)
  17. let lastAssistantText = '';
  18. for (const line of lines) {
  19. let ev; try { ev = JSON.parse(line); } catch { continue; }
  20. // per-turn token usage (authoritative token measure; result.usage is last-turn only)
  21. const u = ev.message?.usage;
  22. if (u) {
  23. tok.gen += u.output_tokens || 0;
  24. tok.fresh += (u.input_tokens || 0) + (u.cache_creation_input_tokens || 0);
  25. tok.cached += u.cache_read_input_tokens || 0;
  26. }
  27. if (ev.type === 'assistant' && Array.isArray(ev.message?.content)) {
  28. for (const b of ev.message.content) {
  29. if (b.type === 'tool_use') toolCounts[b.name] = (toolCounts[b.name] || 0) + 1;
  30. if (b.type === 'text' && b.text?.trim()) lastAssistantText = b.text.trim();
  31. }
  32. }
  33. // tool_results arrive in user messages
  34. if (ev.type === 'user' && Array.isArray(ev.message?.content)) {
  35. for (const b of ev.message.content) {
  36. if (b.type !== 'tool_result') continue;
  37. const text = Array.isArray(b.content)
  38. ? b.content.map(c => (typeof c === 'string' ? c : c.text || '')).join('')
  39. : (typeof b.content === 'string' ? b.content : '');
  40. if (/Synthesized by CodeGraph/.test(text)) { offloadAnswers.push(text); exploreResults++; }
  41. else if (/Found \d+ symbols? across|## Exploration:/.test(text)) exploreResults++;
  42. }
  43. }
  44. if (ev.type === 'result') result = ev;
  45. }
  46. // offload usage sidecar (CodeGraph AI tokens + cost) — one JSON line per offload call
  47. const ai = { calls: 0, promptTokens: 0, completionTokens: 0, totalTokens: 0, credits: 0, costUsd: 0, ms: 0 };
  48. if (args.usage && args.usage !== '-' && existsSync(args.usage)) {
  49. for (const line of readFileSync(args.usage, 'utf8').split('\n').filter(Boolean)) {
  50. let e; try { e = JSON.parse(line); } catch { continue; }
  51. ai.calls++;
  52. ai.promptTokens += e.promptTokens || 0;
  53. ai.completionTokens += e.completionTokens || 0;
  54. ai.totalTokens += e.totalTokens || 0;
  55. ai.credits += e.creditsCharged || 0;
  56. ai.costUsd += e.costUsd || 0;
  57. ai.ms += e.ms || 0;
  58. }
  59. }
  60. // front-load hook fired iff its injected header appears in the transcript
  61. const frontload = lines.some(l => l.includes('auto-retrieved for this question'));
  62. const get = (n) => toolCounts[n] || 0;
  63. const read = get('Read');
  64. const grep = get('Grep') + get('Bash') + get('Glob');
  65. const explore = get('mcp__codegraph__codegraph_explore');
  66. const cgAny = Object.keys(toolCounts).filter(k => /mcp__codegraph__/.test(k)).reduce((s, k) => s + toolCounts[k], 0);
  67. const out = {
  68. repo: args.repo, tier: args.tier, arm: args.arm, rep: Number(args.rep), question: args.q,
  69. ok: result?.subtype === 'success',
  70. durationSec: result ? +(result.duration_ms / 1000).toFixed(1) : null,
  71. numTurns: result?.num_turns ?? null,
  72. costUsdMain: result ? +(result.total_cost_usd || 0).toFixed(4) : null,
  73. tokGen: tok.gen, tokFresh: tok.fresh, tokCached: tok.cached, tokBillable: tok.gen + tok.fresh,
  74. read, grep, explore, cgAny, frontload,
  75. offloadFired: offloadAnswers.length,
  76. ai,
  77. // text payloads for the accuracy judge (kept separate; large)
  78. finalAnswer: (result?.result || lastAssistantText || '').slice(0, 8000),
  79. offloadAnswers: offloadAnswers.map(a => a.slice(0, 6000)),
  80. };
  81. process.stdout.write(JSON.stringify(out) + '\n');