mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-18 02:22:04 +08:00
merge: resolve conflicts with origin/main
Accept main's generated SKILL.md files (will be regenerated by bun run build). Resolve gen-skill-docs.ts: keep community tier 3-option prompt from branch, keep error context fields from branch, add PLAN MODE EXCEPTION from main. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -41,6 +41,12 @@ export interface SkillTestResult {
|
||||
output: string;
|
||||
costEstimate: CostEstimate;
|
||||
transcript: any[];
|
||||
/** Which model was used for this test (added for Sonnet/Opus split diagnostics) */
|
||||
model: string;
|
||||
/** Time from spawn to first NDJSON line, in ms (added for rate-limit diagnostics) */
|
||||
firstResponseMs: number;
|
||||
/** Peak latency between consecutive tool calls, in ms */
|
||||
maxInterTurnMs: number;
|
||||
}
|
||||
|
||||
const BROWSE_ERROR_PATTERNS = [
|
||||
@@ -116,6 +122,8 @@ export async function runSkillTest(options: {
|
||||
timeout?: number;
|
||||
testName?: string;
|
||||
runId?: string;
|
||||
/** Model to use. Defaults to claude-sonnet-4-6 (overridable via EVALS_MODEL env). */
|
||||
model?: string;
|
||||
}): Promise<SkillTestResult> {
|
||||
const {
|
||||
prompt,
|
||||
@@ -126,6 +134,7 @@ export async function runSkillTest(options: {
|
||||
testName,
|
||||
runId,
|
||||
} = options;
|
||||
const model = options.model ?? process.env.EVALS_MODEL ?? 'claude-sonnet-4-6';
|
||||
|
||||
const startTime = Date.now();
|
||||
const startedAt = new Date().toISOString();
|
||||
@@ -144,6 +153,7 @@ export async function runSkillTest(options: {
|
||||
// avoid shell escaping issues. --verbose is required for stream-json mode.
|
||||
const args = [
|
||||
'-p',
|
||||
'--model', model,
|
||||
'--output-format', 'stream-json',
|
||||
'--verbose',
|
||||
'--dangerously-skip-permissions',
|
||||
@@ -151,8 +161,10 @@ export async function runSkillTest(options: {
|
||||
'--allowed-tools', ...allowedTools,
|
||||
];
|
||||
|
||||
// Write prompt to a temp file and pipe it via shell to avoid stdin buffering issues
|
||||
const promptFile = path.join(workingDirectory, '.prompt-tmp');
|
||||
// Write prompt to a temp file OUTSIDE workingDirectory to avoid race conditions
|
||||
// where afterAll cleanup deletes the dir before cat reads the file (especially
|
||||
// with --concurrent --retry). Using os.tmpdir() + unique suffix keeps it stable.
|
||||
const promptFile = path.join(os.tmpdir(), `.prompt-${process.pid}-${Date.now()}-${Math.random().toString(36).slice(2)}`);
|
||||
fs.writeFileSync(promptFile, prompt);
|
||||
|
||||
// Isolate telemetry: E2E tests use a temp state dir so they don't pollute
|
||||
@@ -181,6 +193,9 @@ export async function runSkillTest(options: {
|
||||
const collectedLines: string[] = [];
|
||||
let liveTurnCount = 0;
|
||||
let liveToolCount = 0;
|
||||
let firstResponseMs = 0;
|
||||
let lastToolTime = 0;
|
||||
let maxInterTurnMs = 0;
|
||||
const stderrPromise = new Response(proc.stderr).text();
|
||||
|
||||
const reader = proc.stdout.getReader();
|
||||
@@ -207,7 +222,15 @@ export async function runSkillTest(options: {
|
||||
for (const item of content) {
|
||||
if (item.type === 'tool_use') {
|
||||
liveToolCount++;
|
||||
const elapsed = Math.round((Date.now() - startTime) / 1000);
|
||||
const now = Date.now();
|
||||
const elapsed = Math.round((now - startTime) / 1000);
|
||||
// Track timing telemetry
|
||||
if (firstResponseMs === 0) firstResponseMs = now - startTime;
|
||||
if (lastToolTime > 0) {
|
||||
const interTurn = now - lastToolTime;
|
||||
if (interTurn > maxInterTurnMs) maxInterTurnMs = interTurn;
|
||||
}
|
||||
lastToolTime = now;
|
||||
const progressLine = ` [${elapsed}s] turn ${liveTurnCount} tool #${liveToolCount}: ${item.name}(${truncate(JSON.stringify(item.input || {}), 80)})\n`;
|
||||
process.stderr.write(progressLine);
|
||||
|
||||
@@ -336,5 +359,5 @@ export async function runSkillTest(options: {
|
||||
turnsUsed,
|
||||
};
|
||||
|
||||
return { toolCalls, browseErrors, exitReason, duration, output: resultLine?.result || '', costEstimate, transcript };
|
||||
return { toolCalls, browseErrors, exitReason, duration, output: resultLine?.result || '', costEstimate, transcript, model, firstResponseMs, maxInterTurnMs };
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user