feat: multi-provider model benchmark (boil the ocean)

Adds the full spec Codex asked for: real provider adapters with auth detection, normalized RunResult, pricing tables, tool compatibility maps, parallel execution with error isolation, and table/JSON/markdown output. Judge stays on Anthropic SDK as the single stable source of quality scoring, gated behind --judge.

Codex flagged the original plan as massively under-scoped — the existing runner is Claude-only and the judge is Anthropic-only. You can't benchmark GPT or Gemini without real provider infrastructure. This commit ships it.

New architecture:
  test/helpers/providers/types.ts    ProviderAdapter interface
  test/helpers/providers/claude.ts   wraps `claude -p --output-format json`
  test/helpers/providers/gpt.ts      wraps `codex exec --json`
  test/helpers/providers/gemini.ts   wraps `gemini -p --output-format stream-json --yolo`
  test/helpers/pricing.ts            per-model USD cost tables (quarterly)
  test/helpers/tool-map.ts           which tools each CLI exposes
  test/helpers/benchmark-runner.ts   orchestrator (Promise.allSettled)
  test/helpers/benchmark-judge.ts    Anthropic SDK quality scorer
  bin/gstack-model-benchmark         CLI entry
  test/benchmark-runner.test.ts      9 unit tests (cost math, formatters, tool-map)

Per-provider error isolation:
  - auth → record reason, don't abort batch
  - timeout → record reason, don't abort batch
  - rate_limit → record reason, don't abort batch
  - binary_missing → record in available() check, skip if --skip-unavailable

Pricing correction: cached input tokens are disjoint from uncached input tokens (Anthropic/OpenAI report them separately). Original math subtracted them, producing negative costs. Now adds cached at the 10% discount alongside the full uncached input cost.

CLI:
  gstack-model-benchmark --prompt "..." --models claude,gpt,gemini
  gstack-model-benchmark ./prompt.txt --output json --judge
  gstack-model-benchmark ./prompt.txt --models claude --timeout-ms 60000

Output formats: table (default), json, markdown. Each shows model, latency, in→out tokens, cost, quality (when --judge used), tool calls, and any errors.

Known limitations for v1:
  - Claude adapter approximates toolCalls as num_turns (stream-json would give exact counts; v2 can upgrade).
  - Live E2E tests (test/providers.e2e.test.ts) not included — they require CI secrets for all three providers. Unit tests cover the shape and math.
  - Provider CLIs sometimes return non-JSON error text to stdout; the parsers fall back to treating raw output as plain text in that case.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-21 12:18:24 +08:00 · 2026-04-17 06:16:42 +08:00
parent 9e95a9dc50
commit 614354fc41
10 changed files with 1092 additions and 0 deletions
--- a/test/helpers/pricing.ts
+++ b/test/helpers/pricing.ts
@@ -0,0 +1,61 @@
+/**
+ * Per-model pricing tables.
+ *
+ * Prices are USD per million tokens as of `as_of`. Update quarterly.
+ * Link to provider pricing pages:
+ *   - Anthropic: https://www.anthropic.com/pricing#api
+ *   - OpenAI: https://openai.com/api/pricing/
+ *   - Google AI: https://ai.google.dev/pricing
+ *
+ * When a model isn't in the table, estimateCostUsd returns 0 and logs a warning via console.error (once per model).
+ * Prefer adding a new row to the table over guessing.
+ */
+
+export interface ModelPricing { // One pricing-table row: the rates for a single model.
+  input_per_mtok: number; // USD per million input tokens
+  output_per_mtok: number; // USD per million output tokens
+  as_of: string; // YYYY-MM — the month these prices are "as of"
+}
+
+export const PRICING: Record<string, ModelPricing> = { // Keyed by model id; estimateCostUsd looks ids up verbatim (no prefix or alias matching).
+  // Claude (Anthropic)
+  'claude-opus-4-7':    { input_per_mtok: 15.00, output_per_mtok: 75.00, as_of: '2026-04' },
+  'claude-sonnet-4-6':  { input_per_mtok: 3.00,  output_per_mtok: 15.00, as_of: '2026-04' },
+  'claude-haiku-4-5':   { input_per_mtok: 1.00,  output_per_mtok: 5.00,  as_of: '2026-04' },
+
+  // OpenAI (GPT + o-series)
+  'gpt-5.4':            { input_per_mtok: 2.50,  output_per_mtok: 10.00, as_of: '2026-04' },
+  'gpt-5.4-mini':       { input_per_mtok: 0.60,  output_per_mtok: 2.40,  as_of: '2026-04' },
+  'o3':                 { input_per_mtok: 15.00, output_per_mtok: 60.00, as_of: '2026-04' },
+  'o4-mini':            { input_per_mtok: 1.10,  output_per_mtok: 4.40,  as_of: '2026-04' },
+
+  // Google
+  'gemini-2.5-pro':     { input_per_mtok: 1.25,  output_per_mtok: 5.00,  as_of: '2026-04' },
+  'gemini-2.5-flash':   { input_per_mtok: 0.30,  output_per_mtok: 1.20,  as_of: '2026-04' },
+};
+
+const WARNED = new Set<string>(); // model ids already warned about, so each unknown model is logged only once
+
+export function estimateCostUsd( // Estimate the USD cost of one run from its token counts and model id.
+  tokens: { input: number; output: number; cached?: number }, // cached is optional and counts as 0 when absent
+  model: string | undefined // used as-is as the key into PRICING
+): number { // returns input + cached + output cost in USD, rounded to 6 decimal places
+  if (!model) return 0; // no model id (undefined or empty string): report 0 without warning
+  const row = PRICING[model];
+  if (!row) {
+    if (!WARNED.has(model)) {
+      WARNED.add(model);
+      console.error(`WARN: no pricing for model ${model}; returning 0. Add it to test/helpers/pricing.ts.`);
+    }
+    return 0; // unknown model: return 0 rather than a guessed price
+  }
+  // Anthropic and OpenAI report cached tokens as a separate (disjoint) field from
+  // uncached input tokens. tokens.input is already the uncached portion; tokens.cached
+  // is the cache-read count billed at 10% of the regular input rate. Do NOT subtract
+  // cached from input — they don't overlap. NOTE(review): the same 10% rate is applied to every row, including the Google models — confirm against each provider's cache pricing.
+  const cachedDiscount = 0.1; // fraction of the regular input rate charged for cache reads
+  const inputCost = tokens.input * row.input_per_mtok / 1_000_000; // rates are per million tokens, hence the divisor
+  const cachedCost = (tokens.cached ?? 0) * row.input_per_mtok * cachedDiscount / 1_000_000;
+  const outputCost = tokens.output * row.output_per_mtok / 1_000_000;
+  return +(inputCost + cachedCost + outputCost).toFixed(6); // toFixed(6) rounds to six decimals; unary + converts the string back to a number
+}