perf: add model pinning infrastructure + rate-limit telemetry to E2E runner

Default E2E model changed from Opus to Sonnet (5x faster, 5x cheaper).
Session runner now accepts `model` option with EVALS_MODEL env var override.
Added timing telemetry (first_response_ms, max_inter_turn_ms) and wall_clock_ms
to eval-store for diagnosing rate-limit impact. Added EVALS_FAST test filtering.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-03-21 13:12:17 -07:00
parent d68a70d351
commit d442aadf4a
3 changed files with 52 additions and 2 deletions

View File

@@ -41,6 +41,12 @@ export interface SkillTestResult {
output: string;
costEstimate: CostEstimate;
transcript: any[];
/** Which model was used for this test (added for Sonnet/Opus split diagnostics) */
model: string;
/** Time from spawn to first NDJSON line, in ms (added for rate-limit diagnostics) */
firstResponseMs: number;
/** Peak latency between consecutive tool calls, in ms */
maxInterTurnMs: number;
}
const BROWSE_ERROR_PATTERNS = [
@@ -116,6 +122,8 @@ export async function runSkillTest(options: {
timeout?: number;
testName?: string;
runId?: string;
/** Model to use. Defaults to claude-sonnet-4-6 (overridable via EVALS_MODEL env). */
model?: string;
}): Promise<SkillTestResult> {
const {
prompt,
@@ -126,6 +134,7 @@ export async function runSkillTest(options: {
testName,
runId,
} = options;
const model = options.model ?? process.env.EVALS_MODEL ?? 'claude-sonnet-4-6';
const startTime = Date.now();
const startedAt = new Date().toISOString();
@@ -144,6 +153,7 @@ export async function runSkillTest(options: {
// avoid shell escaping issues. --verbose is required for stream-json mode.
const args = [
'-p',
'--model', model,
'--output-format', 'stream-json',
'--verbose',
'--dangerously-skip-permissions',
@@ -175,6 +185,9 @@ export async function runSkillTest(options: {
const collectedLines: string[] = [];
let liveTurnCount = 0;
let liveToolCount = 0;
let firstResponseMs = 0;
let lastToolTime = 0;
let maxInterTurnMs = 0;
const stderrPromise = new Response(proc.stderr).text();
const reader = proc.stdout.getReader();
@@ -201,7 +214,15 @@ export async function runSkillTest(options: {
for (const item of content) {
if (item.type === 'tool_use') {
liveToolCount++;
const elapsed = Math.round((Date.now() - startTime) / 1000);
const now = Date.now();
const elapsed = Math.round((now - startTime) / 1000);
// Track timing telemetry
if (firstResponseMs === 0) firstResponseMs = now - startTime;
if (lastToolTime > 0) {
const interTurn = now - lastToolTime;
if (interTurn > maxInterTurnMs) maxInterTurnMs = interTurn;
}
lastToolTime = now;
const progressLine = ` [${elapsed}s] turn ${liveTurnCount} tool #${liveToolCount}: ${item.name}(${truncate(JSON.stringify(item.input || {}), 80)})\n`;
process.stderr.write(progressLine);
@@ -330,5 +351,5 @@ export async function runSkillTest(options: {
turnsUsed,
};
return { toolCalls, browseErrors, exitReason, duration, output: resultLine?.result || '', costEstimate, transcript };
return { toolCalls, browseErrors, exitReason, duration, output: resultLine?.result || '', costEstimate, transcript, model, firstResponseMs, maxInterTurnMs };
}