| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394 |
- #!/usr/bin/env node
- // Extract one eval run's metrics from its Claude stream-json transcript + the
- // offload usage sidecar log, emit ONE merged JSON line.
- //
- // Usage: extract-metrics.mjs --run <run.jsonl> --usage <usage.jsonl|-> \
- // --arm <a> --rep <n> --repo <r> --tier <t> --q <question>
- import { readFileSync, existsSync } from 'fs';
- const args = {};
- for (let i = 2; i < process.argv.length; i += 2) args[process.argv[i].replace(/^--/, '')] = process.argv[i + 1];
- const runFile = args.run;
- const lines = existsSync(runFile) ? readFileSync(runFile, 'utf8').split('\n').filter(Boolean) : [];
- const toolCounts = {};
- let result = null;
- const tok = { gen: 0, fresh: 0, cached: 0 };
- const offloadAnswers = [];
- let exploreResults = 0; // tool_results from explore (offload or raw)
- let lastAssistantText = '';
- for (const line of lines) {
- let ev; try { ev = JSON.parse(line); } catch { continue; }
- // per-turn token usage (authoritative token measure; result.usage is last-turn only)
- const u = ev.message?.usage;
- if (u) {
- tok.gen += u.output_tokens || 0;
- tok.fresh += (u.input_tokens || 0) + (u.cache_creation_input_tokens || 0);
- tok.cached += u.cache_read_input_tokens || 0;
- }
- if (ev.type === 'assistant' && Array.isArray(ev.message?.content)) {
- for (const b of ev.message.content) {
- if (b.type === 'tool_use') toolCounts[b.name] = (toolCounts[b.name] || 0) + 1;
- if (b.type === 'text' && b.text?.trim()) lastAssistantText = b.text.trim();
- }
- }
- // tool_results arrive in user messages
- if (ev.type === 'user' && Array.isArray(ev.message?.content)) {
- for (const b of ev.message.content) {
- if (b.type !== 'tool_result') continue;
- const text = Array.isArray(b.content)
- ? b.content.map(c => (typeof c === 'string' ? c : c.text || '')).join('')
- : (typeof b.content === 'string' ? b.content : '');
- // An offload answer is either the 'plain'/'report' synthesis (carries the
- // "Synthesized by CodeGraph" footer) or a 'refs' answer (carries the re-expanded
- // "### Referenced source — verbatim" appendix). A refs call that cited nothing
- // valid falls back to RAW source, which is correctly counted as a raw explore below.
- if (/Synthesized by CodeGraph|### Referenced source — verbatim/.test(text)) { offloadAnswers.push(text); exploreResults++; }
- else if (/Found \d+ symbols? across|\*\*Exploration:/.test(text)) exploreResults++;
- }
- }
- if (ev.type === 'result') result = ev;
- }
- // offload usage sidecar (CodeGraph AI tokens + cost) — one JSON line per offload call
- const ai = { calls: 0, promptTokens: 0, completionTokens: 0, totalTokens: 0, credits: 0, costUsd: 0, ms: 0 };
- if (args.usage && args.usage !== '-' && existsSync(args.usage)) {
- for (const line of readFileSync(args.usage, 'utf8').split('\n').filter(Boolean)) {
- let e; try { e = JSON.parse(line); } catch { continue; }
- ai.calls++;
- ai.promptTokens += e.promptTokens || 0;
- ai.completionTokens += e.completionTokens || 0;
- ai.totalTokens += e.totalTokens || 0;
- ai.credits += e.creditsCharged || 0;
- ai.costUsd += e.costUsd || 0;
- ai.ms += e.ms || 0;
- }
- }
- // front-load hook fired iff its injected header appears in the transcript
- const frontload = lines.some(l => l.includes('auto-retrieved for this question'));
- const get = (n) => toolCounts[n] || 0;
- const read = get('Read');
- const grep = get('Grep') + get('Bash') + get('Glob');
- const explore = get('mcp__codegraph__codegraph_explore');
- const cgAny = Object.keys(toolCounts).filter(k => /mcp__codegraph__/.test(k)).reduce((s, k) => s + toolCounts[k], 0);
- const out = {
- repo: args.repo, tier: args.tier, arm: args.arm, rep: Number(args.rep), question: args.q,
- ok: result?.subtype === 'success',
- durationSec: result ? +(result.duration_ms / 1000).toFixed(1) : null,
- numTurns: result?.num_turns ?? null,
- costUsdMain: result ? +(result.total_cost_usd || 0).toFixed(4) : null,
- tokGen: tok.gen, tokFresh: tok.fresh, tokCached: tok.cached, tokBillable: tok.gen + tok.fresh,
- read, grep, explore, cgAny, frontload,
- offloadFired: offloadAnswers.length,
- ai,
- // text payloads for the accuracy judge (kept separate; large)
- finalAnswer: (result?.result || lastAssistantText || '').slice(0, 8000),
- offloadAnswers: offloadAnswers.map(a => a.slice(0, 6000)),
- };
- process.stdout.write(JSON.stringify(out) + '\n');
|