mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-20 03:12:28 +08:00
perf: add model pinning infrastructure + rate-limit telemetry to E2E runner
Default E2E model changed from Opus to Sonnet (5x faster, 5x cheaper). The session runner now accepts a `model` option, with an EVALS_MODEL env var override. Added per-test timing telemetry (first_response_ms, max_inter_turn_ms) and a run-level wall_clock_ms to eval-store for diagnosing rate-limit impact. Added EVALS_FAST test filtering.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -42,6 +42,11 @@ export interface EvalTestEntry {
|
||||
timeout_at_turn?: number; // which turn was active when timeout hit
|
||||
last_tool_call?: string; // e.g. "Write(review-output.md)"
|
||||
|
||||
// Model + timing diagnostics (added for Sonnet/Opus split)
|
||||
model?: string; // e.g. 'claude-sonnet-4-6' or 'claude-opus-4-6'
|
||||
first_response_ms?: number; // time from spawn to first NDJSON line
|
||||
max_inter_turn_ms?: number; // peak latency between consecutive tool calls
|
||||
|
||||
// Outcome eval
|
||||
detection_rate?: number;
|
||||
false_positives?: number;
|
||||
@@ -65,6 +70,7 @@ export interface EvalResult {
|
||||
failed: number;
|
||||
total_cost_usd: number;
|
||||
total_duration_ms: number;
|
||||
wall_clock_ms?: number; // wall-clock from collector creation to finalization (shows parallelism)
|
||||
tests: EvalTestEntry[];
|
||||
_partial?: boolean; // true for incremental saves, absent in final
|
||||
}
|
||||
@@ -546,6 +552,7 @@ export class EvalCollector {
|
||||
private tests: EvalTestEntry[] = [];
|
||||
private finalized = false;
|
||||
private evalDir: string;
|
||||
private createdAt = Date.now();
|
||||
|
||||
constructor(tier: 'e2e' | 'llm-judge', evalDir?: string) {
|
||||
this.tier = tier;
|
||||
@@ -615,6 +622,7 @@ export class EvalCollector {
|
||||
failed: this.tests.length - passed,
|
||||
total_cost_usd: Math.round(totalCost * 100) / 100,
|
||||
total_duration_ms: totalDuration,
|
||||
wall_clock_ms: Date.now() - this.createdAt,
|
||||
tests: this.tests,
|
||||
};
|
||||
|
||||
|
||||
Reference in New Issue
Block a user