mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-21 12:18:24 +08:00
feat: add E2E observability — heartbeat, progress.log, NDJSON persistence, savePartial()
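session-runner: atomic heartbeat file (~/.gstack-dev/e2e-live.json), per-run log directory
(~/.gstack-dev/e2e-runs/{runId}/), progress.log + per-test NDJSON persistence,
failure transcripts written to the persistent run dir instead of tmpdir (falling back to
the working directory's .gstack/test-transcripts when no runId is given).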
eval-store: 3 new diagnostic fields (exit_reason, timeout_at_turn, last_tool_call),
savePartial() writes _partial-e2e.json after each addTest() for crash resilience,
finalize() cleans up partial file.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -37,6 +37,11 @@ export interface EvalTestEntry {
|
|||||||
judge_scores?: Record<string, number>;
|
judge_scores?: Record<string, number>;
|
||||||
judge_reasoning?: string;
|
judge_reasoning?: string;
|
||||||
|
|
||||||
|
// Machine-readable diagnostics
|
||||||
|
exit_reason?: string; // 'success' | 'timeout' | 'error_max_turns' | 'exit_code_N'
|
||||||
|
timeout_at_turn?: number; // which turn was active when timeout hit
|
||||||
|
last_tool_call?: string; // e.g. "Write(review-output.md)"
|
||||||
|
|
||||||
// Outcome eval
|
// Outcome eval
|
||||||
detection_rate?: number;
|
detection_rate?: number;
|
||||||
false_positives?: number;
|
false_positives?: number;
|
||||||
@@ -61,6 +66,7 @@ export interface EvalResult {
|
|||||||
total_cost_usd: number;
|
total_cost_usd: number;
|
||||||
total_duration_ms: number;
|
total_duration_ms: number;
|
||||||
tests: EvalTestEntry[];
|
tests: EvalTestEntry[];
|
||||||
|
_partial?: boolean; // true for incremental saves, absent in final
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface TestDelta {
|
export interface TestDelta {
|
||||||
@@ -374,6 +380,41 @@ export class EvalCollector {
|
|||||||
|
|
||||||
/**
 * Record one finished test and immediately persist an incremental snapshot.
 *
 * The entry is appended to the in-memory list first; savePartial() then
 * rewrites the partial results file so everything collected so far is on
 * disk before the next test runs.
 *
 * @param entry - Result record for the test that just completed.
 */
addTest(entry: EvalTestEntry): void {
  this.tests.push(entry);
  this.savePartial();
}
|
||||||
|
|
||||||
|
/**
 * Write a snapshot of every test recorded so far to `_partial-e2e.json`
 * in the eval directory.
 *
 * The snapshot has the same shape as a final result plus `_partial: true`.
 * It is written to a `.tmp` sibling first and then renamed into place.
 * Any failure is swallowed on purpose: partial saves are best-effort and
 * must never break the run that is being recorded.
 */
savePartial(): void {
  try {
    const gitInfo = getGitInfo();
    const currentVersion = getVersion();

    // Single pass over the recorded tests to build the aggregate figures.
    let costSum = 0;
    let durationSum = 0;
    let passCount = 0;
    for (const test of this.tests) {
      costSum += test.cost_usd;
      durationSum += test.duration_ms;
      if (test.passed) passCount += 1;
    }
    const testCount = this.tests.length;

    const snapshot: EvalResult = {
      schema_version: SCHEMA_VERSION,
      version: currentVersion,
      branch: gitInfo.branch,
      git_sha: gitInfo.sha,
      timestamp: new Date().toISOString(),
      hostname: os.hostname(),
      tier: this.tier,
      total_tests: testCount,
      passed: passCount,
      failed: testCount - passCount,
      // Rounded to whole cents.
      total_cost_usd: Math.round(costSum * 100) / 100,
      total_duration_ms: durationSum,
      tests: this.tests,
      _partial: true,
    };

    fs.mkdirSync(this.evalDir, { recursive: true });
    const destination = path.join(this.evalDir, '_partial-e2e.json');
    const staging = destination + '.tmp';
    // Write the staging file, then rename it over the destination.
    fs.writeFileSync(staging, JSON.stringify(snapshot, null, 2) + '\n');
    fs.renameSync(staging, destination);
  } catch { /* non-fatal — partial saves are best-effort */ }
}
|
||||||
|
|
||||||
async finalize(): Promise<string> {
|
async finalize(): Promise<string> {
|
||||||
@@ -403,6 +444,9 @@ export class EvalCollector {
|
|||||||
tests: this.tests,
|
tests: this.tests,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Delete partial file now that we're writing the final
|
||||||
|
try { fs.unlinkSync(path.join(this.evalDir, '_partial-e2e.json')); } catch { /* may not exist */ }
|
||||||
|
|
||||||
// Write eval file
|
// Write eval file
|
||||||
fs.mkdirSync(this.evalDir, { recursive: true });
|
fs.mkdirSync(this.evalDir, { recursive: true });
|
||||||
const dateStr = timestamp.replace(/[:.]/g, '').replace('T', '-').slice(0, 15);
|
const dateStr = timestamp.replace(/[:.]/g, '').replace('T', '-').slice(0, 15);
|
||||||
|
|||||||
@@ -8,6 +8,22 @@
|
|||||||
|
|
||||||
import * as fs from 'fs';
|
import * as fs from 'fs';
|
||||||
import * as path from 'path';
|
import * as path from 'path';
|
||||||
|
import * as os from 'os';
|
||||||
|
|
||||||
|
// Base directory (~/.gstack-dev) under which the heartbeat file and the per-run log directories are written.
const GSTACK_DEV_DIR = path.join(os.homedir(), '.gstack-dev');
|
||||||
|
// Heartbeat file rewritten during a run with the current test's live status (only when both runId and testName are supplied).
const HEARTBEAT_PATH = path.join(GSTACK_DEV_DIR, 'e2e-live.json');
|
||||||
|
|
||||||
|
/**
 * Turn a test name into a single, flat filename segment.
 *
 * Leading path separators are stripped and every remaining separator is
 * replaced with `-`, so `/qa/quick` becomes `qa-quick`. Both the forward
 * slash and the backslash count as separators: a backslash left in place
 * would still split the name into directories on Windows, and the per-test
 * log writes that use this name would then fail.
 *
 * @param name - Test name, possibly path-like (e.g. `/review`).
 * @returns The name with no forward slashes or backslashes left in it.
 */
export function sanitizeTestName(name: string): string {
  return name.replace(/^[\\/]+/, '').replace(/[\\/]/g, '-');
}
|
||||||
|
|
||||||
|
/**
 * Atomic write: write `data` to a sibling `.tmp` file, then rename it over
 * `filePath`, so the destination is only ever replaced by a fully written file.
 *
 * This function does NOT swallow errors. Callers that treat the write as
 * best-effort must wrap the call in their own try/catch.
 *
 * @param filePath - Final destination path; the temp file is `filePath + '.tmp'`.
 * @param data - Full contents to write.
 * @throws Rethrows whatever `fs.writeFileSync` or `fs.renameSync` throws,
 *   after removing the temp file.
 */
function atomicWriteSync(filePath: string, data: string): void {
  const tmp = filePath + '.tmp';
  try {
    fs.writeFileSync(tmp, data);
    fs.renameSync(tmp, filePath);
  } catch (err) {
    // Don't leave a stale temp file next to the destination when the write
    // or the rename fails.
    try { fs.unlinkSync(tmp); } catch { /* temp file may never have been created */ }
    throw err;
  }
}
|
||||||
|
|
||||||
export interface CostEstimate {
|
export interface CostEstimate {
|
||||||
inputChars: number;
|
inputChars: number;
|
||||||
@@ -98,6 +114,8 @@ export async function runSkillTest(options: {
|
|||||||
maxTurns?: number;
|
maxTurns?: number;
|
||||||
allowedTools?: string[];
|
allowedTools?: string[];
|
||||||
timeout?: number;
|
timeout?: number;
|
||||||
|
testName?: string;
|
||||||
|
runId?: string;
|
||||||
}): Promise<SkillTestResult> {
|
}): Promise<SkillTestResult> {
|
||||||
const {
|
const {
|
||||||
prompt,
|
prompt,
|
||||||
@@ -105,9 +123,22 @@ export async function runSkillTest(options: {
|
|||||||
maxTurns = 15,
|
maxTurns = 15,
|
||||||
allowedTools = ['Bash', 'Read', 'Write'],
|
allowedTools = ['Bash', 'Read', 'Write'],
|
||||||
timeout = 120_000,
|
timeout = 120_000,
|
||||||
|
testName,
|
||||||
|
runId,
|
||||||
} = options;
|
} = options;
|
||||||
|
|
||||||
const startTime = Date.now();
|
const startTime = Date.now();
|
||||||
|
const startedAt = new Date().toISOString();
|
||||||
|
|
||||||
|
// Set up per-run log directory if runId is provided
|
||||||
|
let runDir: string | null = null;
|
||||||
|
const safeName = testName ? sanitizeTestName(testName) : null;
|
||||||
|
if (runId) {
|
||||||
|
try {
|
||||||
|
runDir = path.join(GSTACK_DEV_DIR, 'e2e-runs', runId);
|
||||||
|
fs.mkdirSync(runDir, { recursive: true });
|
||||||
|
} catch { /* non-fatal */ }
|
||||||
|
}
|
||||||
|
|
||||||
// Spawn claude -p with streaming NDJSON output. Prompt piped via stdin to
|
// Spawn claude -p with streaming NDJSON output. Prompt piped via stdin to
|
||||||
// avoid shell escaping issues. --verbose is required for stream-json mode.
|
// avoid shell escaping issues. --verbose is required for stream-json mode.
|
||||||
@@ -161,7 +192,7 @@ export async function runSkillTest(options: {
|
|||||||
if (!line.trim()) continue;
|
if (!line.trim()) continue;
|
||||||
collectedLines.push(line);
|
collectedLines.push(line);
|
||||||
|
|
||||||
// Real-time progress to stderr
|
// Real-time progress to stderr + persistent logs
|
||||||
try {
|
try {
|
||||||
const event = JSON.parse(line);
|
const event = JSON.parse(line);
|
||||||
if (event.type === 'assistant') {
|
if (event.type === 'assistant') {
|
||||||
@@ -171,13 +202,40 @@ export async function runSkillTest(options: {
|
|||||||
if (item.type === 'tool_use') {
|
if (item.type === 'tool_use') {
|
||||||
liveToolCount++;
|
liveToolCount++;
|
||||||
const elapsed = Math.round((Date.now() - startTime) / 1000);
|
const elapsed = Math.round((Date.now() - startTime) / 1000);
|
||||||
process.stderr.write(
|
const progressLine = ` [${elapsed}s] turn ${liveTurnCount} tool #${liveToolCount}: ${item.name}(${truncate(JSON.stringify(item.input || {}), 80)})\n`;
|
||||||
` [${elapsed}s] turn ${liveTurnCount} tool #${liveToolCount}: ${item.name}(${truncate(JSON.stringify(item.input || {}), 80)})\n`
|
process.stderr.write(progressLine);
|
||||||
);
|
|
||||||
|
// Persist progress.log
|
||||||
|
if (runDir) {
|
||||||
|
try { fs.appendFileSync(path.join(runDir, 'progress.log'), progressLine); } catch { /* non-fatal */ }
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write heartbeat (atomic)
|
||||||
|
if (runId && testName) {
|
||||||
|
try {
|
||||||
|
const toolDesc = `${item.name}(${truncate(JSON.stringify(item.input || {}), 60)})`;
|
||||||
|
atomicWriteSync(HEARTBEAT_PATH, JSON.stringify({
|
||||||
|
runId,
|
||||||
|
startedAt,
|
||||||
|
currentTest: testName,
|
||||||
|
status: 'running',
|
||||||
|
turn: liveTurnCount,
|
||||||
|
toolCount: liveToolCount,
|
||||||
|
lastTool: toolDesc,
|
||||||
|
lastToolAt: new Date().toISOString(),
|
||||||
|
elapsedSec: elapsed,
|
||||||
|
}, null, 2) + '\n');
|
||||||
|
} catch { /* non-fatal */ }
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch { /* skip — parseNDJSON will handle it later */ }
|
} catch { /* skip — parseNDJSON will handle it later */ }
|
||||||
|
|
||||||
|
// Append raw NDJSON line to per-test transcript file
|
||||||
|
if (runDir && safeName) {
|
||||||
|
try { fs.appendFileSync(path.join(runDir, `${safeName}.ndjson`), line + '\n'); } catch { /* non-fatal */ }
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch { /* stream read error — fall through to exit code handling */ }
|
} catch { /* stream read error — fall through to exit code handling */ }
|
||||||
@@ -226,19 +284,24 @@ export async function runSkillTest(options: {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Save transcript on failure
|
// Save failure transcript to persistent run directory (or fallback to workingDirectory)
|
||||||
if (browseErrors.length > 0 || exitReason !== 'success') {
|
if (browseErrors.length > 0 || exitReason !== 'success') {
|
||||||
try {
|
try {
|
||||||
const transcriptDir = path.join(workingDirectory, '.gstack', 'test-transcripts');
|
const failureDir = runDir || path.join(workingDirectory, '.gstack', 'test-transcripts');
|
||||||
fs.mkdirSync(transcriptDir, { recursive: true });
|
fs.mkdirSync(failureDir, { recursive: true });
|
||||||
const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
|
const failureName = safeName
|
||||||
|
? `${safeName}-failure.json`
|
||||||
|
: `e2e-${new Date().toISOString().replace(/[:.]/g, '-')}.json`;
|
||||||
fs.writeFileSync(
|
fs.writeFileSync(
|
||||||
path.join(transcriptDir, `e2e-${timestamp}.json`),
|
path.join(failureDir, failureName),
|
||||||
JSON.stringify({
|
JSON.stringify({
|
||||||
prompt: prompt.slice(0, 500),
|
prompt: prompt.slice(0, 500),
|
||||||
|
testName: testName || 'unknown',
|
||||||
exitReason,
|
exitReason,
|
||||||
browseErrors,
|
browseErrors,
|
||||||
duration,
|
duration,
|
||||||
|
turnAtTimeout: timedOut ? liveTurnCount : undefined,
|
||||||
|
lastToolCall: liveToolCount > 0 ? `tool #${liveToolCount}` : undefined,
|
||||||
stderr: stderr.slice(0, 2000),
|
stderr: stderr.slice(0, 2000),
|
||||||
result: resultLine ? { type: resultLine.type, subtype: resultLine.subtype, result: resultLine.result?.slice?.(0, 500) } : null,
|
result: resultLine ? { type: resultLine.type, subtype: resultLine.subtype, result: resultLine.result?.slice?.(0, 500) } : null,
|
||||||
}, null, 2),
|
}, null, 2),
|
||||||
|
|||||||
Reference in New Issue
Block a user