Merge branch 'main' into garrytan/team-supabase-store

Resolved 15 conflicts:
- session-runner.ts: kept both the costs array (ours) and the model/timing telemetry (main); renamed the loop variable to avoid shadowing the model param
- skill-e2e.test.ts: accepted deletion (main split it into separate files)
- gen-skill-docs.ts: kept our codex review flow with $PROJECTS_DIR paths
- plan review templates: took main + re-applied $PROJECTS_DIR paths, replaced gstack-review-log with inline approach
- TODOS.md: took main's shipped status for E2E model pinning
- Generated SKILL.md + .agents/ files: regenerated from resolved templates

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-20 03:12:28 +08:00 · 2026-03-21 16:28:27 -07:00
parent 2769cd043d 2c0d4b39c7
commit 8444626c6a
91 changed files with 12161 additions and 3312 deletions
--- a/test/helpers/session-runner.ts
+++ b/test/helpers/session-runner.ts
@@ -37,6 +37,12 @@ export interface SkillTestResult {
  costEstimate: CostEstimate;
  transcript: any[];
  costs: CostEntry[];
+  /** Which model was used for this test (added for Sonnet/Opus split diagnostics) */
+  model: string;
+  /** Time from test start to the first observed tool call, in ms; stays 0 if no tool call occurs (added for rate-limit diagnostics) */
+  firstResponseMs: number;
+  /** Peak latency between consecutive tool calls, in ms */
+  maxInterTurnMs: number;
 }

 const BROWSE_ERROR_PATTERNS = [
@@ -112,6 +118,8 @@ export async function runSkillTest(options: {
  timeout?: number;
  testName?: string;
  runId?: string;
+  /** Model to use. Defaults to claude-sonnet-4-6 (overridable via EVALS_MODEL env). */
+  model?: string;
 }): Promise<SkillTestResult> {
  const {
    prompt,
@@ -122,6 +130,7 @@ export async function runSkillTest(options: {
    testName,
    runId,
  } = options;
+  const model = options.model ?? process.env.EVALS_MODEL ?? 'claude-sonnet-4-6';

  const startTime = Date.now();
  const startedAt = new Date().toISOString();
@@ -142,7 +151,7 @@ export async function runSkillTest(options: {
  const evalModel = tierToModel(resolveTier());
  const args = [
    '-p',
-    '--model', evalModel,
+    '--model', model,
    '--output-format', 'stream-json',
    '--verbose',
    '--dangerously-skip-permissions',
@@ -150,8 +159,10 @@ export async function runSkillTest(options: {
    '--allowed-tools', ...allowedTools,
  ];

-  // Write prompt to a temp file and pipe it via shell to avoid stdin buffering issues
-  const promptFile = path.join(workingDirectory, '.prompt-tmp');
+  // Write prompt to a temp file OUTSIDE workingDirectory to avoid race conditions
+  // where afterAll cleanup deletes the dir before cat reads the file (especially
+  // with --concurrent --retry). Using os.tmpdir() + unique suffix keeps it stable.
+  const promptFile = path.join(os.tmpdir(), `.prompt-${process.pid}-${Date.now()}-${Math.random().toString(36).slice(2)}`);
  fs.writeFileSync(promptFile, prompt);

  const proc = Bun.spawn(['sh', '-c', `cat "${promptFile}" | claude ${args.map(a => `"${a}"`).join(' ')}`], {
@@ -174,6 +185,9 @@ export async function runSkillTest(options: {
  const collectedLines: string[] = [];
  let liveTurnCount = 0;
  let liveToolCount = 0;
+  let firstResponseMs = 0;
+  let lastToolTime = 0;
+  let maxInterTurnMs = 0;
  const stderrPromise = new Response(proc.stderr).text();

  const reader = proc.stdout.getReader();
@@ -200,7 +214,15 @@ export async function runSkillTest(options: {
            for (const item of content) {
              if (item.type === 'tool_use') {
                liveToolCount++;
-                const elapsed = Math.round((Date.now() - startTime) / 1000);
+                const now = Date.now();
+                const elapsed = Math.round((now - startTime) / 1000);
+                // Track timing telemetry
+                if (firstResponseMs === 0) firstResponseMs = now - startTime;
+                if (lastToolTime > 0) {
+                  const interTurn = now - lastToolTime;
+                  if (interTurn > maxInterTurnMs) maxInterTurnMs = interTurn;
+                }
+                lastToolTime = now;
                const progressLine = `  [${elapsed}s] turn ${liveTurnCount} tool #${liveToolCount}: ${item.name}(${truncate(JSON.stringify(item.input || {}), 80)})\n`;
                process.stderr.write(progressLine);

@@ -332,9 +354,9 @@ export async function runSkillTest(options: {
  // Extract per-model costs from resultLine.modelUsage (camelCase → snake_case)
  const costs: CostEntry[] = [];
  if (resultLine?.modelUsage) {
-    for (const [model, usage] of Object.entries(resultLine.modelUsage as Record<string, any>)) {
+    for (const [modelName, usage] of Object.entries(resultLine.modelUsage as Record<string, any>)) {
      costs.push({
-        model,
+        model: modelName,
        calls: 1,
        input_tokens: usage.inputTokens || 0,
        output_tokens: usage.outputTokens || 0,
@@ -345,5 +367,5 @@ export async function runSkillTest(options: {
    }
  }

-  return { toolCalls, browseErrors, exitReason, duration, output: resultLine?.result || '', costEstimate, transcript, costs };
+  return { toolCalls, browseErrors, exitReason, duration, output: resultLine?.result || '', costEstimate, transcript, costs, model, firstResponseMs, maxInterTurnMs };
 }