mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-18 18:32:28 +08:00
feat(security): wire TestSavantAI + ensemble into sidebar-agent pre-spawn scan
The sidebar-agent now runs an ML security check on the user message BEFORE
spawning claude. If the content classifier and (gated) transcript classifier
ensemble returns BLOCK, the session is refused with a security_event +
agent_error — the sidepanel renders the approved banner.
Two pieces:
1. On agent startup, loadTestsavant() warms the classifier in the background.
First run triggers a 112MB model download from HuggingFace (~30s on
average broadband). Non-blocking — sidebar stays functional during
cold-start, shield just reports 'off' until warmed.
2. preSpawnSecurityCheck() runs the ensemble against the user message:
- L4 (testsavant_content) always runs
- L4b (transcript_classifier via Haiku) runs only if L4 flagged at
>= LOG_ONLY — plan §E1 gating optimization, saves ~70% of Haiku spend
combineVerdict() applies the BLOCK-requires-both-layers rule, which
downgrades any single-layer high confidence to WARN. False positives that
TestSavantAI alone raises on Stack Overflow-style, instruction-heavy writing
are caught by this downgrade — Haiku corrects them when called.
Fail-open everywhere: any subprocess/load/inference error returns confidence=0
so the sidebar keeps working on architectural controls alone. Shield icon
reflects degraded state via getClassifierStatus().
BLOCK path emits both:
- security_event {verdict, reason, layer, confidence, domain} (for the
approved canary-leak banner UX mockup — variant A)
- agent_error "Session blocked — prompt injection detected..."
(backward-compat with existing error surface)
Regression test suite still passes (12/12 sidebar-security tests).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -13,7 +13,15 @@ import { spawn } from 'child_process';
|
|||||||
import * as fs from 'fs';
|
import * as fs from 'fs';
|
||||||
import * as path from 'path';
|
import * as path from 'path';
|
||||||
import { safeUnlink } from './error-handling';
|
import { safeUnlink } from './error-handling';
|
||||||
import { checkCanaryInStructure, logAttempt, hashPayload, extractDomain } from './security';
|
import {
|
||||||
|
checkCanaryInStructure, logAttempt, hashPayload, extractDomain,
|
||||||
|
combineVerdict, type LayerSignal,
|
||||||
|
} from './security';
|
||||||
|
import {
|
||||||
|
loadTestsavant, scanPageContent, checkTranscript,
|
||||||
|
shouldRunTranscriptCheck, getClassifierStatus,
|
||||||
|
type ToolCallInput,
|
||||||
|
} from './security-classifier';
|
||||||
|
|
||||||
const QUEUE = process.env.SIDEBAR_QUEUE_PATH || path.join(process.env.HOME || '/tmp', '.gstack', 'sidebar-agent-queue.jsonl');
|
const QUEUE = process.env.SIDEBAR_QUEUE_PATH || path.join(process.env.HOME || '/tmp', '.gstack', 'sidebar-agent-queue.jsonl');
|
||||||
const KILL_FILE = path.join(path.dirname(QUEUE), 'sidebar-agent-kill');
|
const KILL_FILE = path.join(path.dirname(QUEUE), 'sidebar-agent-kill');
|
||||||
@@ -370,6 +378,68 @@ async function onCanaryLeaked(params: {
|
|||||||
}, tabId);
|
}, tabId);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Pre-spawn ML scan of the user message. If the classifier ensemble returns
 * a BLOCK verdict, we log the attempt, emit a security_event (plus an
 * agent_error for the existing error surface) to the sidepanel, and DO NOT
 * spawn claude.
 *
 * Fail-open: an empty message, a non-block verdict, or any error thrown while
 * classifying returns false (safe) so the sidebar keeps working. The
 * architectural controls (XML framing + command allowlist, which live in
 * server.ts:554-577) still defend.
 *
 * @param entry - Queue entry whose `message` is scanned. `pageUrl` and
 *   `tabId` are used only for logging and event routing.
 * @returns true if the scan blocked the session (the caller must not spawn),
 *   false otherwise.
 */
async function preSpawnSecurityCheck(entry: QueueEntry): Promise<boolean> {
  const { message, pageUrl, tabId } = entry;
  if (!message || message.length === 0) return false;
  const tid = tabId ?? 0;

  const signals: LayerSignal[] = [];
  let result: ReturnType<typeof combineVerdict>;
  try {
    // L4: scan the user message for direct injection patterns.
    signals.push(await scanPageContent(message));

    // L4b: only bother with Haiku if L4 already lit up at >= LOG_ONLY.
    // Saves ~70% of Haiku calls per plan §E1 "gating optimization".
    if (shouldRunTranscriptCheck(signals)) {
      signals.push(
        await checkTranscript({
          user_message: message,
          tool_calls: [], // no tool calls yet at session start
        }),
      );
    }

    result = combineVerdict(signals);
  } catch (err: unknown) {
    // Backstop for the fail-open contract: the caller awaits this function
    // after marking the tab as processing, so an unexpected throw here would
    // reject the caller instead of letting the session proceed.
    const detail = err instanceof Error ? err.message : String(err);
    console.warn(`[sidebar-agent] Pre-spawn scan failed for tab ${tid}, failing open: ${detail}`);
    return false;
  }

  if (result.verdict !== 'block') return false;

  // BLOCK verdict. Log + emit + refuse to spawn.
  const domain = extractDomain(pageUrl ?? '');
  // Report the layer with the highest confidence. `signals` always holds at
  // least the L4 signal at this point, so reduce() without a seed is safe.
  const leaderSignal = signals.reduce((a, b) => (a.confidence > b.confidence ? a : b));

  logAttempt({
    ts: new Date().toISOString(),
    urlDomain: domain,
    payloadHash: hashPayload(message),
    confidence: result.confidence,
    layer: leaderSignal.layer,
    verdict: 'block',
  });

  console.warn(`[sidebar-agent] Pre-spawn BLOCK (${result.reason}) for tab ${tid}, confidence=${result.confidence.toFixed(3)}`);

  await sendEvent({
    type: 'security_event',
    verdict: 'block',
    reason: result.reason ?? 'ml_classifier',
    layer: leaderSignal.layer,
    confidence: result.confidence,
    domain,
  }, tid);
  await sendEvent({
    type: 'agent_error',
    error: `Session blocked — prompt injection detected${domain ? ` from ${domain}` : ' in your message'}`,
  }, tid);

  return true;
}
|
||||||
|
|
||||||
async function askClaude(queueEntry: QueueEntry): Promise<void> {
|
async function askClaude(queueEntry: QueueEntry): Promise<void> {
|
||||||
const { prompt, args, stateFile, cwd, tabId, canary, pageUrl } = queueEntry;
|
const { prompt, args, stateFile, cwd, tabId, canary, pageUrl } = queueEntry;
|
||||||
const tid = tabId ?? 0;
|
const tid = tabId ?? 0;
|
||||||
@@ -377,6 +447,13 @@ async function askClaude(queueEntry: QueueEntry): Promise<void> {
|
|||||||
processingTabs.add(tid);
|
processingTabs.add(tid);
|
||||||
await sendEvent({ type: 'agent_start' }, tid);
|
await sendEvent({ type: 'agent_start' }, tid);
|
||||||
|
|
||||||
|
// Pre-spawn ML scan: if the user message trips the ensemble, refuse to
|
||||||
|
// spawn claude. Fail-open on classifier errors.
|
||||||
|
if (await preSpawnSecurityCheck(queueEntry)) {
|
||||||
|
processingTabs.delete(tid);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
return new Promise((resolve) => {
|
return new Promise((resolve) => {
|
||||||
// Canary context is set after proc is spawned (needs proc reference for kill).
|
// Canary context is set after proc is spawned (needs proc reference for kill).
|
||||||
let canaryCtx: CanaryContext | undefined;
|
let canaryCtx: CanaryContext | undefined;
|
||||||
@@ -616,6 +693,16 @@ async function main() {
|
|||||||
console.log(`[sidebar-agent] Server: ${SERVER_URL}`);
|
console.log(`[sidebar-agent] Server: ${SERVER_URL}`);
|
||||||
console.log(`[sidebar-agent] Browse binary: ${B}`);
|
console.log(`[sidebar-agent] Browse binary: ${B}`);
|
||||||
|
|
||||||
|
// Warm up the ML classifier in the background. First call triggers a 112MB
|
||||||
|
// download (~30s on average broadband). Non-blocking — the sidebar stays
|
||||||
|
// functional on cold start; classifier just reports 'off' until warmed.
|
||||||
|
loadTestsavant((msg) => console.log(`[security-classifier] ${msg}`))
|
||||||
|
.then(() => {
|
||||||
|
const s = getClassifierStatus();
|
||||||
|
console.log(`[sidebar-agent] Classifier warmup complete: ${JSON.stringify(s)}`);
|
||||||
|
})
|
||||||
|
.catch((err) => console.warn('[sidebar-agent] Classifier warmup failed (degraded mode):', err?.message));
|
||||||
|
|
||||||
setInterval(poll, POLL_MS);
|
setInterval(poll, POLL_MS);
|
||||||
setInterval(pollKillFile, POLL_MS);
|
setInterval(pollKillFile, POLL_MS);
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user