1
0

offload-eval-metrics.mjs 4.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394
  1. #!/usr/bin/env node
// Extract one eval run's metrics from its Claude stream-json transcript + the
// offload usage sidecar log, emit ONE merged JSON line.
//
// Usage: extract-metrics.mjs --run <run.jsonl> --usage <usage.jsonl|-> \
//          --arm <a> --rep <n> --repo <r> --tier <t> --q <question>
// Pass `-` as --usage when the run has no usage sidecar log to merge.
  7. import { readFileSync, existsSync } from 'fs';
  8. const args = {};
  9. for (let i = 2; i < process.argv.length; i += 2) args[process.argv[i].replace(/^--/, '')] = process.argv[i + 1];
  10. const runFile = args.run;
  11. const lines = existsSync(runFile) ? readFileSync(runFile, 'utf8').split('\n').filter(Boolean) : [];
  12. const toolCounts = {};
  13. let result = null;
  14. const tok = { gen: 0, fresh: 0, cached: 0 };
  15. const offloadAnswers = [];
  16. let exploreResults = 0; // tool_results from explore (offload or raw)
  17. let lastAssistantText = '';
  18. for (const line of lines) {
  19. let ev; try { ev = JSON.parse(line); } catch { continue; }
  20. // per-turn token usage (authoritative token measure; result.usage is last-turn only)
  21. const u = ev.message?.usage;
  22. if (u) {
  23. tok.gen += u.output_tokens || 0;
  24. tok.fresh += (u.input_tokens || 0) + (u.cache_creation_input_tokens || 0);
  25. tok.cached += u.cache_read_input_tokens || 0;
  26. }
  27. if (ev.type === 'assistant' && Array.isArray(ev.message?.content)) {
  28. for (const b of ev.message.content) {
  29. if (b.type === 'tool_use') toolCounts[b.name] = (toolCounts[b.name] || 0) + 1;
  30. if (b.type === 'text' && b.text?.trim()) lastAssistantText = b.text.trim();
  31. }
  32. }
  33. // tool_results arrive in user messages
  34. if (ev.type === 'user' && Array.isArray(ev.message?.content)) {
  35. for (const b of ev.message.content) {
  36. if (b.type !== 'tool_result') continue;
  37. const text = Array.isArray(b.content)
  38. ? b.content.map(c => (typeof c === 'string' ? c : c.text || '')).join('')
  39. : (typeof b.content === 'string' ? b.content : '');
  40. // An offload answer is either the 'plain'/'report' synthesis (carries the
  41. // "Synthesized by CodeGraph" footer) or a 'refs' answer (carries the re-expanded
  42. // "### Referenced source — verbatim" appendix). A refs call that cited nothing
  43. // valid falls back to RAW source, which is correctly counted as a raw explore below.
  44. if (/Synthesized by CodeGraph|### Referenced source — verbatim/.test(text)) { offloadAnswers.push(text); exploreResults++; }
  45. else if (/Found \d+ symbols? across|\*\*Exploration:/.test(text)) exploreResults++;
  46. }
  47. }
  48. if (ev.type === 'result') result = ev;
  49. }
  50. // offload usage sidecar (CodeGraph AI tokens + cost) — one JSON line per offload call
  51. const ai = { calls: 0, promptTokens: 0, completionTokens: 0, totalTokens: 0, credits: 0, costUsd: 0, ms: 0 };
  52. if (args.usage && args.usage !== '-' && existsSync(args.usage)) {
  53. for (const line of readFileSync(args.usage, 'utf8').split('\n').filter(Boolean)) {
  54. let e; try { e = JSON.parse(line); } catch { continue; }
  55. ai.calls++;
  56. ai.promptTokens += e.promptTokens || 0;
  57. ai.completionTokens += e.completionTokens || 0;
  58. ai.totalTokens += e.totalTokens || 0;
  59. ai.credits += e.creditsCharged || 0;
  60. ai.costUsd += e.costUsd || 0;
  61. ai.ms += e.ms || 0;
  62. }
  63. }
  64. // front-load hook fired iff its injected header appears in the transcript
  65. const frontload = lines.some(l => l.includes('auto-retrieved for this question'));
  66. const get = (n) => toolCounts[n] || 0;
  67. const read = get('Read');
  68. const grep = get('Grep') + get('Bash') + get('Glob');
  69. const explore = get('mcp__codegraph__codegraph_explore');
  70. const cgAny = Object.keys(toolCounts).filter(k => /mcp__codegraph__/.test(k)).reduce((s, k) => s + toolCounts[k], 0);
  71. const out = {
  72. repo: args.repo, tier: args.tier, arm: args.arm, rep: Number(args.rep), question: args.q,
  73. ok: result?.subtype === 'success',
  74. durationSec: result ? +(result.duration_ms / 1000).toFixed(1) : null,
  75. numTurns: result?.num_turns ?? null,
  76. costUsdMain: result ? +(result.total_cost_usd || 0).toFixed(4) : null,
  77. tokGen: tok.gen, tokFresh: tok.fresh, tokCached: tok.cached, tokBillable: tok.gen + tok.fresh,
  78. read, grep, explore, cgAny, frontload,
  79. offloadFired: offloadAnswers.length,
  80. ai,
  81. // text payloads for the accuracy judge (kept separate; large)
  82. finalAnswer: (result?.result || lastAssistantText || '').slice(0, 8000),
  83. offloadAnswers: offloadAnswers.map(a => a.slice(0, 6000)),
  84. };
  85. process.stdout.write(JSON.stringify(out) + '\n');