perf: add model pinning infrastructure + rate-limit telemetry to E2E runner

Default E2E model changed from Opus to Sonnet (5x faster, 5x cheaper).
Session runner now accepts a `model` option, with an EVALS_MODEL env var override.
Added per-test diagnostics (model, first_response_ms, max_inter_turn_ms) and a
run-level wall_clock_ms to eval-store for diagnosing rate-limit impact. Added
EVALS_FAST test filtering.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-03-21 13:12:17 -07:00
parent d68a70d351
commit d442aadf4a
3 changed files with 52 additions and 2 deletions

View File

@@ -42,6 +42,11 @@ export interface EvalTestEntry {
timeout_at_turn?: number; // which turn was active when timeout hit
last_tool_call?: string; // e.g. "Write(review-output.md)"
// Model + timing diagnostics (added for Sonnet/Opus split)
model?: string; // e.g. 'claude-sonnet-4-6' or 'claude-opus-4-6'
first_response_ms?: number; // time from spawn to first NDJSON line
max_inter_turn_ms?: number; // peak latency between consecutive tool calls
// Outcome eval
detection_rate?: number;
false_positives?: number;
@@ -65,6 +70,7 @@ export interface EvalResult {
failed: number;
total_cost_usd: number;
total_duration_ms: number;
wall_clock_ms?: number; // wall-clock from collector creation to finalization (shows parallelism)
tests: EvalTestEntry[];
_partial?: boolean; // true for incremental saves, absent in final
}
@@ -546,6 +552,7 @@ export class EvalCollector {
private tests: EvalTestEntry[] = [];
private finalized = false;
private evalDir: string;
private createdAt = Date.now();
constructor(tier: 'e2e' | 'llm-judge', evalDir?: string) {
this.tier = tier;
@@ -615,6 +622,7 @@ export class EvalCollector {
failed: this.tests.length - passed,
total_cost_usd: Math.round(totalCost * 100) / 100,
total_duration_ms: totalDuration,
wall_clock_ms: Date.now() - this.createdAt,
tests: this.tests,
};