feat(security): wire TestSavantAI + ensemble into sidebar-agent pre-spawn scan

The sidebar-agent now runs an ML security check on the user message BEFORE spawning claude. If the ensemble of the content classifier and the (gated) transcript classifier returns BLOCK, the session is refused with a security_event + agent_error — the sidepanel renders the approved banner. Two pieces: 1. On agent startup, loadTestsavant() warms the classifier in the background. First run triggers a 112MB model download from HuggingFace (~30s on average broadband). Non-blocking — sidebar stays functional during cold-start, shield just reports 'off' until warmed. 2. preSpawnSecurityCheck() runs the ensemble against the user message: - L4 (testsavant_content) always runs - L4b (transcript_classifier via Haiku) runs only if L4 flagged at >= LOG_ONLY — plan §E1 gating optimization, saves ~70% of Haiku spend. combineVerdict() applies the BLOCK-requires-both-layers rule, which downgrades any single-layer high confidence to WARN. False positives that TestSavantAI alone raises on Stack Overflow-style, instruction-heavy writing are caught by this downgrade — Haiku corrects them when called. Fail-open everywhere: any subprocess/load/inference error returns confidence=0 so the sidebar keeps working on architectural controls alone. Shield icon reflects degraded state via getClassifierStatus(). BLOCK path emits both: - security_event {verdict, reason, layer, confidence, domain} (for the approved canary-leak banner UX mockup — variant A) - agent_error "Session blocked — prompt injection detected..." (backward-compat with existing error surface) Regression test suite still passes (12/12 sidebar-security tests). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 18:32:28 +08:00 · 2026-04-19 19:05:37 +08:00
parent 63a56e6789
commit 750161bbbe
1 changed files with 88 additions and 1 deletions
--- a/browse/src/sidebar-agent.ts
+++ b/browse/src/sidebar-agent.ts
@@ -13,7 +13,15 @@ import { spawn } from 'child_process';
 import * as fs from 'fs';
 import * as path from 'path';
 import { safeUnlink } from './error-handling';
-import { checkCanaryInStructure, logAttempt, hashPayload, extractDomain } from './security';
+import {
  checkCanaryInStructure, logAttempt, hashPayload, extractDomain,
  combineVerdict, type LayerSignal,
 } from './security';
 import {
  loadTestsavant, scanPageContent, checkTranscript,
  shouldRunTranscriptCheck, getClassifierStatus,
  type ToolCallInput,
 } from './security-classifier';
 // Path of the JSONL queue file: SIDEBAR_QUEUE_PATH when it is set to a
 // non-empty value, otherwise <HOME>/.gstack/sidebar-agent-queue.jsonl
 // ('/tmp' stands in for HOME when HOME is unset or empty).
 const QUEUE = process.env.SIDEBAR_QUEUE_PATH || path.join(process.env.HOME || '/tmp', '.gstack', 'sidebar-agent-queue.jsonl');
 // Kill-file path; it lives in the same directory as the queue file.
 const KILL_FILE = path.join(path.dirname(QUEUE), 'sidebar-agent-kill');
@@ -370,6 +378,68 @@ async function onCanaryLeaked(params: {
  }, tabId);
 }
 /**
 * Pre-spawn ML scan of the user message.
 *
 * Runs the content classifier (L4) on `entry.message` and, when
 * shouldRunTranscriptCheck() says the L4 signal warrants it, the transcript
 * classifier (L4b). The collected signals are folded by combineVerdict().
 * On a 'block' verdict the attempt is logged, a security_event and an
 * agent_error are sent to the sidepanel, and the caller must NOT spawn claude.
 *
 * Fail-open: an empty message, a non-block verdict, or any error thrown
 * while classifying returns false so the sidebar keeps working. The
 * architectural controls (XML framing + command allowlist, live in
 * server.ts:554-577) still defend.
 *
 * @param entry Queue entry; only `message`, `pageUrl` and `tabId` are read.
 * @returns true when the session was blocked, false otherwise.
 */
 async function preSpawnSecurityCheck(entry: QueueEntry): Promise<boolean> {
  const { message, pageUrl, tabId } = entry;
  if (!message) return false;
  const tid = tabId ?? 0;
  const signals: LayerSignal[] = [];
  let result: ReturnType<typeof combineVerdict>;
  try {
    // L4: scan the user message for direct injection patterns.
    signals.push(await scanPageContent(message));
    // L4b: only bother with Haiku if L4 already lit up at >= LOG_ONLY.
    // Saves ~70% of Haiku calls per plan §E1 "gating optimization".
    if (shouldRunTranscriptCheck(signals)) {
      signals.push(await checkTranscript({
        user_message: message,
        tool_calls: [], // no tool calls yet at session start
      }));
    }
    result = combineVerdict(signals);
  } catch (err: unknown) {
    // Enforce the fail-open contract here instead of relying on every
    // classifier to swallow its own errors: a throw must not abort the
    // caller, it simply means "not blocked".
    const detail = err instanceof Error ? err.message : String(err);
    console.warn(`[sidebar-agent] Pre-spawn scan failed for tab ${tid} (fail-open): ${detail}`);
    return false;
  }
  if (result.verdict !== 'block') return false;
  // BLOCK verdict. Log + emit + refuse to spawn. This part is deliberately
  // outside the try above so a reporting failure can never be mistaken for
  // a fail-open "allow".
  const domain = extractDomain(pageUrl ?? '');
  // The highest-confidence signal names the layer reported for the block
  // (ties resolve to the later signal).
  const leaderSignal = signals.reduce((a, b) => (a.confidence > b.confidence ? a : b));
  logAttempt({
    ts: new Date().toISOString(),
    urlDomain: domain,
    payloadHash: hashPayload(message),
    confidence: result.confidence,
    layer: leaderSignal.layer,
    verdict: 'block',
  });
  console.warn(`[sidebar-agent] Pre-spawn BLOCK (${result.reason}) for tab ${tid}, confidence=${result.confidence.toFixed(3)}`);
  await sendEvent({
    type: 'security_event',
    verdict: 'block',
    reason: result.reason ?? 'ml_classifier',
    layer: leaderSignal.layer,
    confidence: result.confidence,
    domain,
  }, tid);
  // Also surface the block through the pre-existing agent_error channel.
  await sendEvent({
    type: 'agent_error',
    error: `Session blocked — prompt injection detected${domain ? ` from ${domain}` : ' in your message'}`,
  }, tid);
  return true;
 }
 async function askClaude(queueEntry: QueueEntry): Promise<void> {
  const { prompt, args, stateFile, cwd, tabId, canary, pageUrl } = queueEntry;
  const tid = tabId ?? 0;
@@ -377,6 +447,13 @@ async function askClaude(queueEntry: QueueEntry): Promise<void> {
  processingTabs.add(tid);
  await sendEvent({ type: 'agent_start' }, tid);
  // Pre-spawn ML scan: if the user message trips the ensemble, refuse to
  // spawn claude. Fail-open on classifier errors.
  if (await preSpawnSecurityCheck(queueEntry)) {
    processingTabs.delete(tid);
    return;
  }
  return new Promise((resolve) => {
    // Canary context is set after proc is spawned (needs proc reference for kill).
    let canaryCtx: CanaryContext | undefined;
@@ -616,6 +693,16 @@ async function main() {
  console.log(`[sidebar-agent] Server: ${SERVER_URL}`);
  console.log(`[sidebar-agent] Browse binary: ${B}`);
  // Warm up the ML classifier in the background. First call triggers a 112MB
  // download (~30s on average broadband). Non-blocking — the sidebar stays
  // functional on cold start; classifier just reports 'off' until warmed.
  loadTestsavant((msg) => console.log(`[security-classifier] ${msg}`))
    .then(() => {
      const s = getClassifierStatus();
      console.log(`[sidebar-agent] Classifier warmup complete: ${JSON.stringify(s)}`);
    })
    .catch((err) => console.warn('[sidebar-agent] Classifier warmup failed (degraded mode):', err?.message));
  setInterval(poll, POLL_MS);
  setInterval(pollKillFile, POLL_MS);
 }