mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-08 21:49:45 +08:00
Gap caught by user: the review-flow tests verified the decision path (POST, file write, agent_error emission) but not the actual security property — that Block stops subsequent tool calls and Allow lets them continue. Mock-claude tool_result_injection scenario now emits a second tool_use ~8s after the injected tool_result, targeting post-block-followup.example.com. If block really blocks, that event never reaches the chat feed (SIGTERM killed the subprocess before it emitted). If allow really allows, it does. Allow test asserts the followup tool_use DOES appear → session lives. Block test asserts the followup tool_use does NOT appear after 12s → kill actually stopped further work. Both tests previously proved the control plane (decision file → agent poll → agent_error); they now prove the data plane too. Test timeout bumped 60s → 90s to accommodate the 12s quiet window. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
406 lines
16 KiB
TypeScript
406 lines
16 KiB
TypeScript
/**
|
|
* Full-stack review-flow E2E with the real classifier.
|
|
*
|
|
* Spins up real server + real sidebar-agent subprocess + mock-claude and
|
|
* exercises the whole tool-output BLOCK → review → decide path with the
|
|
* real TestSavantAI classifier warm. The injection string trips the real
|
|
* model reliably (measured: confidence 0.9999 on classic DAN-style text).
|
|
*
|
|
* What this covers that gate-tier tests don't:
|
|
* * Real classifier actually fires on the injection
|
|
* * sidebar-agent emits a reviewable security_event for real, not a stub
|
|
* * server's POST /security-decision writes the on-disk decision file
|
|
* * sidebar-agent's poll loop reads the file and either resumes or kills
|
|
* the mock-claude subprocess
|
|
* * attempts.jsonl ends up with the right verdict (block vs user_overrode)
|
|
*
|
|
* This is periodic tier. First run warms the ~112MB classifier from
|
|
* HuggingFace — ~30s cold. Subsequent runs use the cached model under
|
|
* ~/.gstack/models/testsavant-small/ and complete in ~5s.
|
|
*
|
|
* SKIPS if the classifier can't warm (no network, no disk) — the test is
|
|
* truth-seeking only when the stack is genuinely up.
|
|
*/
|
|
|
|
import { describe, test, expect, beforeAll, afterAll } from 'bun:test';
|
|
import { spawn, type Subprocess } from 'bun';
|
|
import * as fs from 'fs';
|
|
import * as os from 'os';
|
|
import * as path from 'path';
|
|
|
|
/** Fixture directory that startStack prepends to the agent's PATH. */
const MOCK_CLAUDE_DIR = path.resolve(import.meta.dir, 'fixtures', 'mock-claude');

/** Budget for a first-run classifier download, in milliseconds. */
const WARMUP_TIMEOUT_MS = 90 * 1000;

/** On-disk location of the cached classifier model under the real HOME. */
const CLASSIFIER_CACHE = path.join(
  os.homedir(),
  '.gstack',
  'models',
  'testsavant-small',
);
|
|
|
|
// --- Per-stack mutable state, (re)assigned by startStack / stopStack ---

// Handles of the spawned server and sidebar-agent subprocesses.
let serverProc: Subprocess | null = null;
let agentProc: Subprocess | null = null;

// Connection details read back from the server's state file.
let serverPort = 0;
let authToken = '';

// Scratch directory for the current stack and the files that live in it.
let tmpDir = '';
let stateFile = '';
let queueFile = '';

// Location of attempts.jsonl under the re-rooted HOME.
let attemptsPath = '';
|
|
|
|
/**
 * Synchronous, load-time probe: is the classifier model already cached on
 * disk?
 *
 * `test.skipIf()` is evaluated while the file is being registered, i.e.
 * before `beforeAll` runs, so the condition has to be known at module load.
 * A flag computed later at runtime would leave every test registered as
 * skipped. Same pattern as security-sidepanel-dom.test.ts uses for chromium.
 */
const CLASSIFIER_READY = (() => {
  // At minimum we need the tokenizer config + the onnx model directory.
  const requiredArtifacts = ['tokenizer.json', 'onnx'];
  try {
    return (
      fs.existsSync(CLASSIFIER_CACHE) &&
      requiredArtifacts.every((name) =>
        fs.existsSync(path.join(CLASSIFIER_CACHE, name)),
      )
    );
  } catch {
    return false;
  }
})();
|
|
|
|
/**
 * Issues an authenticated JSON request against the test server.
 *
 * @param pathname Path (with leading slash) appended to the server origin.
 * @param opts     Standard fetch options; caller-supplied headers win over
 *                 the JSON content type and bearer token set here.
 * @returns The raw fetch Response.
 */
async function apiFetch(pathname: string, opts: RequestInit = {}): Promise<Response> {
  const url = `http://127.0.0.1:${serverPort}${pathname}`;
  const callerHeaders = opts.headers as Record<string, string> | undefined;
  const headers = {
    'Content-Type': 'application/json',
    Authorization: `Bearer ${authToken}`,
    ...callerHeaders,
  };
  return fetch(url, { ...opts, headers });
}
|
|
|
|
/**
 * Polls GET /sidebar-chat every 250ms until a `security_event` entry
 * satisfying `predicate` shows up.
 *
 * @param predicate Filter applied to each security_event entry.
 * @param timeoutMs How long to keep polling before giving up.
 * @returns The first matching entry, or null if none appeared in time.
 */
async function waitForSecurityEntry(
  predicate: (entry: any) => boolean,
  timeoutMs: number,
): Promise<any | null> {
  const giveUpAt = Date.now() + timeoutMs;
  while (Date.now() < giveUpAt) {
    const response = await apiFetch('/sidebar-chat');
    const payload: any = await response.json();
    const entries: any[] = payload.entries ?? [];
    const hit = entries.find(
      (candidate) => candidate.type === 'security_event' && predicate(candidate),
    );
    // Truthiness check keeps the original semantics: a falsy entry never
    // matches because reading `.type` on it would not equal 'security_event'.
    if (hit) return hit;
    await new Promise((resolve) => setTimeout(resolve, 250));
  }
  return null;
}
|
|
|
|
/**
 * Polls a subprocess every 100ms until it reports an exit code.
 *
 * @param proc      Subprocess to watch.
 * @param timeoutMs Maximum time to wait.
 * @returns The exit code once `proc.exitCode` is non-null, or null when the
 *          timeout elapses first.
 */
async function waitForProcessExit(proc: Subprocess, timeoutMs: number): Promise<number | null> {
  const giveUpAt = Date.now() + timeoutMs;
  for (;;) {
    if (Date.now() >= giveUpAt) return null;
    const code = proc.exitCode;
    if (code !== null) return code;
    await new Promise((resolve) => setTimeout(resolve, 100));
  }
}
|
|
|
|
/**
 * Loads the attempts.jsonl audit log for the current stack.
 *
 * @returns One parsed record per non-empty line. Lines that fail to parse
 *          (and records that parse to a falsy value) are dropped. Returns an
 *          empty array when the file does not exist.
 */
async function readAttempts(): Promise<any[]> {
  if (!fs.existsSync(attemptsPath)) return [];

  const records: any[] = [];
  for (const line of fs.readFileSync(attemptsPath, 'utf-8').split('\n')) {
    if (!line) continue;
    let parsed: any;
    try {
      parsed = JSON.parse(line);
    } catch {
      // Tolerate a malformed / half-written line.
      continue;
    }
    if (parsed) records.push(parsed);
  }
  return records;
}
|
|
|
|
/**
 * Boots a fresh server + sidebar-agent pair for one test.
 *
 * Side effects: assigns the module-level `tmpDir`, `stateFile`, `queueFile`,
 * `serverProc`, `agentProc`, `serverPort`, `authToken` and `attemptsPath`.
 *
 * @param scenario    Value exported to the agent as MOCK_CLAUDE_SCENARIO.
 * @param attemptsDir Directory used as HOME for both subprocesses.
 * @throws Error when the server has not published a port + token in its
 *         state file within 15s.
 */
async function startStack(scenario: string, attemptsDir: string): Promise<void> {
  // Drop connection details left over from a previous stack. Without this
  // the "did not start" guard below can never fire after the first test and
  // requests would silently target the previous (dead) server.
  serverPort = 0;
  authToken = '';

  tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'security-review-fullstack-'));
  stateFile = path.join(tmpDir, 'browse.json');
  queueFile = path.join(tmpDir, 'sidebar-queue.jsonl');
  fs.mkdirSync(path.dirname(queueFile), { recursive: true });

  // Re-root HOME for both server and agent so:
  //   - server.ts's SESSIONS_DIR doesn't load pre-existing chat history
  //     from ~/.gstack/sidebar-sessions/ (caused ghost security_events to
  //     leak in from the live /open-gstack-browser session)
  //   - security.ts's attempts.jsonl writes land in a test-owned dir
  //   - session-state.json, chromium-profile, etc. stay isolated
  fs.mkdirSync(path.join(attemptsDir, '.gstack'), { recursive: true });

  // Symlink the models dir through to the real cache — without it the
  // sidebar-agent would try to re-download 112MB every test run.
  const testModelsDir = path.join(attemptsDir, '.gstack', 'models');
  const realModelsDir = path.join(os.homedir(), '.gstack', 'models');
  try {
    if (fs.existsSync(realModelsDir) && !fs.existsSync(testModelsDir)) {
      fs.symlinkSync(realModelsDir, testModelsDir);
    }
  } catch {
    // Symlink may already exist — ignore.
  }

  const serverScript = path.resolve(import.meta.dir, '..', 'src', 'server.ts');
  const agentScript = path.resolve(import.meta.dir, '..', 'src', 'sidebar-agent.ts');

  // stdout/stderr are piped but not consumed anywhere in this file.
  serverProc = spawn(['bun', 'run', serverScript], {
    env: {
      ...process.env,
      BROWSE_STATE_FILE: stateFile,
      BROWSE_HEADLESS_SKIP: '1',
      BROWSE_PORT: '0',
      SIDEBAR_QUEUE_PATH: queueFile,
      BROWSE_IDLE_TIMEOUT: '300',
      HOME: attemptsDir,
    },
    stdio: ['ignore', 'pipe', 'pipe'],
  });

  // Wait (up to 15s) for the server to publish its port + token.
  const deadline = Date.now() + 15000;
  while (Date.now() < deadline) {
    if (fs.existsSync(stateFile)) {
      try {
        const state = JSON.parse(fs.readFileSync(stateFile, 'utf-8'));
        if (state.port && state.token) {
          serverPort = state.port;
          authToken = state.token;
          break;
        }
      } catch {
        // State file may be mid-write — retry on the next tick.
      }
    }
    await new Promise((r) => setTimeout(r, 100));
  }
  if (!serverPort) throw new Error('Server did not start in time');

  // Put the mock-claude fixture dir first on PATH for the agent.
  const shimmedPath = `${MOCK_CLAUDE_DIR}:${process.env.PATH ?? ''}`;
  agentProc = spawn(['bun', 'run', agentScript], {
    env: {
      ...process.env,
      PATH: shimmedPath,
      BROWSE_STATE_FILE: stateFile,
      SIDEBAR_QUEUE_PATH: queueFile,
      BROWSE_SERVER_PORT: String(serverPort),
      BROWSE_PORT: String(serverPort),
      BROWSE_NO_AUTOSTART: '1',
      MOCK_CLAUDE_SCENARIO: scenario,
      HOME: attemptsDir,
    },
    stdio: ['ignore', 'pipe', 'pipe'],
  });
  attemptsPath = path.join(attemptsDir, '.gstack', 'security', 'attempts.jsonl');

  // Give the agent a moment to establish its poll loop + warmup the model.
  await new Promise((r) => setTimeout(r, 500));
}
|
|
|
|
/**
 * Tears down the current stack: terminates the server and agent
 * subprocesses, waits for them to exit, then removes the scratch directory.
 *
 * Each process gets SIGTERM and a 1.5s grace period; if it is still alive it
 * gets SIGKILL and one more bounded wait. Waiting (instead of scheduling a
 * fire-and-forget SIGKILL timer) means the scratch directory is removed only
 * after the bounded waits, and no stray timers outlive the test.
 *
 * Safe to call repeatedly: once the handles and `tmpDir` are cleared a
 * second call is a no-op.
 */
async function stopStack(): Promise<void> {
  const GRACE_MS = 1500;

  const running = [serverProc, agentProc].filter(
    (proc): proc is Subprocess => proc !== null,
  );
  serverProc = null;
  agentProc = null;

  // Resolves true if `proc` exits within `ms`, false on timeout.
  const exitedWithin = async (proc: Subprocess, ms: number): Promise<boolean> => {
    let timer: ReturnType<typeof setTimeout> | undefined;
    const timedOut = new Promise<boolean>((resolve) => {
      timer = setTimeout(() => resolve(false), ms);
    });
    const exited = proc.exited.then(() => true, () => true);
    try {
      return await Promise.race([exited, timedOut]);
    } finally {
      clearTimeout(timer);
    }
  };

  await Promise.all(
    running.map(async (proc) => {
      try { proc.kill('SIGTERM'); } catch {}
      if (await exitedWithin(proc, GRACE_MS)) return;
      try { proc.kill('SIGKILL'); } catch {}
      await exitedWithin(proc, GRACE_MS);
    }),
  );

  if (tmpDir) {
    try { fs.rmSync(tmpDir, { recursive: true, force: true }); } catch {}
    tmpDir = '';
  }
}
|
|
|
|
beforeAll(async () => {
  // Nothing to set up here: each test boots its own stack via startStack.
  // CLASSIFIER_READY cannot be toggled after registration, so when the cache
  // is missing the tests are already skipped and this hook simply bails out.
  if (!CLASSIFIER_READY) {
    return;
  }
});
|
|
|
|
// Final safety net: tear down whatever stack is still registered once the
// whole file has finished.
afterAll(() => stopStack());
|
|
|
|
describe('review-flow full-stack E2E', () => {
|
|
// ALLOW path: the classifier flags the injected tool_result, the user
// overrides the block, the audit log records both verdicts, and the session
// keeps running (the follow-up tool_use reaches the chat feed).
test.skipIf(!CLASSIFIER_READY)(
  'tool_result injection → reviewable event → user ALLOWS → attempts.jsonl has user_overrode',
  async () => {
    const attemptsDir = fs.mkdtempSync(path.join(os.tmpdir(), 'attempts-allow-'));
    try {
      await startStack('tool_result_injection', attemptsDir);

      // Fire the message that will cause mock-claude to emit the
      // injection-laden tool_result.
      const resp = await apiFetch('/sidebar-command', {
        method: 'POST',
        body: JSON.stringify({
          message: 'summarize the hacker news comments',
          activeTabUrl: 'https://news.ycombinator.com/item?id=42',
        }),
      });
      expect(resp.status).toBe(200);

      // Wait for the real classifier to fire and emit a reviewable
      // security_event. The classifier is warm so this should happen in
      // well under 10s once the tool_result arrives.
      const reviewable = await waitForSecurityEntry(
        (e) => e.verdict === 'block' && e.reviewable === true,
        30_000,
      );
      expect(reviewable).not.toBeNull();
      expect(reviewable.reason).toBe('tool_result_ml');
      expect(reviewable.tool).toBe('Bash');
      expect(String(reviewable.suspected_text ?? '')).toContain('IGNORE ALL PREVIOUS');

      // User clicks Allow via the banner → sidepanel POSTs to server.
      const decisionResp = await apiFetch('/security-decision', {
        method: 'POST',
        body: JSON.stringify({
          tabId: reviewable.tabId,
          decision: 'allow',
          reason: 'user',
        }),
      });
      expect(decisionResp.status).toBe(200);

      // Wait for sidebar-agent's poll loop to consume the decision and
      // emit a follow-up user_overrode security_event.
      const overrode = await waitForSecurityEntry(
        (e) => e.verdict === 'user_overrode',
        10_000,
      );
      expect(overrode).not.toBeNull();

      // Audit log must capture both the block and the override, in that
      // order. Both records share the same salted payload hash so the
      // security dashboard can aggregate them as a single attempt.
      const attempts = await readAttempts();
      const blockLog = attempts.find(
        (a) => a.verdict === 'block' && a.layer === 'testsavant_content',
      );
      const overrodeLog = attempts.find(
        (a) => a.verdict === 'user_overrode' && a.layer === 'testsavant_content',
      );
      expect(blockLog).toBeTruthy();
      expect(overrodeLog).toBeTruthy();
      // Ordering: the block record is written before the override record.
      expect(attempts.indexOf(blockLog)).toBeLessThan(attempts.indexOf(overrodeLog));
      expect(overrodeLog.payloadHash).toBe(blockLog.payloadHash);
      // Privacy contract: neither record includes the raw payload.
      expect(JSON.stringify(blockLog)).not.toContain('IGNORE ALL PREVIOUS');
      expect(JSON.stringify(overrodeLog)).not.toContain('IGNORE ALL PREVIOUS');

      // Liveness: session must actually KEEP RUNNING after Allow. Mock-claude
      // emits a second tool_use to post-block-followup.example.com ~8s
      // after the tool_result. That event must reach the chat feed, proving
      // the sidebar-agent resumed the stream-handler relay instead of
      // silently wedging.
      const followupDeadline = Date.now() + 20_000;
      let followup: any = null;
      while (Date.now() < followupDeadline && !followup) {
        const chatResp = await apiFetch('/sidebar-chat');
        const chatData: any = await chatResp.json();
        for (const entry of chatData.entries ?? []) {
          const input = String((entry as any).input ?? '');
          if (
            entry.type === 'tool_use' &&
            input.includes('post-block-followup.example.com')
          ) {
            followup = entry;
            break;
          }
        }
        if (!followup) await new Promise((r) => setTimeout(r, 300));
      }
      expect(followup).not.toBeNull();
    } finally {
      await stopStack();
      try { fs.rmSync(attemptsDir, { recursive: true, force: true }); } catch {}
    }
  },
  90_000,
);
|
|
|
|
// BLOCK path: the classifier flags the injected tool_result, the user
// confirms the block, the agent reports "Session terminated", no override is
// logged, and no further tool calls reach the chat feed.
test.skipIf(!CLASSIFIER_READY)(
  'tool_result injection → reviewable event → user BLOCKS → agent session terminates',
  async () => {
    const attemptsDir = fs.mkdtempSync(path.join(os.tmpdir(), 'attempts-block-'));
    try {
      await startStack('tool_result_injection', attemptsDir);

      const resp = await apiFetch('/sidebar-command', {
        method: 'POST',
        body: JSON.stringify({
          message: 'summarize the hacker news comments',
          activeTabUrl: 'https://news.ycombinator.com/item?id=42',
        }),
      });
      expect(resp.status).toBe(200);

      const reviewable = await waitForSecurityEntry(
        (e) => e.verdict === 'block' && e.reviewable === true,
        30_000,
      );
      expect(reviewable).not.toBeNull();

      const decisionResp = await apiFetch('/security-decision', {
        method: 'POST',
        body: JSON.stringify({
          tabId: reviewable.tabId,
          decision: 'block',
          reason: 'user',
        }),
      });
      expect(decisionResp.status).toBe(200);

      // Wait for the agent_error that the sidebar-agent emits when it
      // kills the claude subprocess after a user-confirmed block. This
      // is the sidepanel's "Session terminated" signal.
      const deadline = Date.now() + 15_000;
      let errorEntry: any = null;
      while (Date.now() < deadline && !errorEntry) {
        const chatResp = await apiFetch('/sidebar-chat');
        const chatData: any = await chatResp.json();
        for (const entry of chatData.entries ?? []) {
          if (
            entry.type === 'agent_error' &&
            String(entry.error ?? '').includes('Session terminated')
          ) {
            errorEntry = entry;
            break;
          }
        }
        if (!errorEntry) await new Promise((r) => setTimeout(r, 200));
      }
      expect(errorEntry).not.toBeNull();

      // attempts.jsonl must NOT have a user_overrode entry for this run.
      const attempts = await readAttempts();
      const overrodeLog = attempts.find((a) => a.verdict === 'user_overrode');
      expect(overrodeLog).toBeFalsy();

      // The real security property: after Block, NO FURTHER tool calls
      // reach the chat feed. Mock-claude would have emitted a tool_use
      // to post-block-followup.example.com ~8s after the tool_result if
      // the session had kept running. Wait long enough for that window
      // to close (12s total), then assert the followup event never
      // appeared. This is what makes "block" actually stop the page —
      // the subprocess is SIGTERM'd before it can emit the next event.
      await new Promise((r) => setTimeout(r, 12_000));
      const finalChatResp = await apiFetch('/sidebar-chat');
      const finalChatData: any = await finalChatResp.json();
      const followupAttempted = (finalChatData.entries ?? []).some(
        (entry: any) =>
          entry.type === 'tool_use' &&
          String(entry.input ?? '').includes('post-block-followup.example.com'),
      );
      expect(followupAttempted).toBe(false);

      // The server itself must survive the kill: the chat channel still
      // answers. (The mock-claude process is not directly observable from
      // this test; the missing follow-up above is the evidence it stopped.)
      const serverStillUp = (await apiFetch('/sidebar-chat')).ok;
      expect(serverStillUp).toBe(true);
    } finally {
      await stopStack();
      try { fs.rmSync(attemptsDir, { recursive: true, force: true }); } catch {}
    }
  },
  90_000,
);
|
|
|
|
// Spec marker only. Exercising this end-to-end would take 60s+; the poll
// loop's timeout → default-block behaviour is covered by the unit suite
// (security-review-flow.test.ts). Registered as a todo so the scenario stays
// documented in the full-stack file without reporting a vacuous pass.
test.todo('no decision within 60s → timeout auto-blocks');
|
|
});
|