mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-18 02:22:04 +08:00
feat(security): wait-for-decision instead of hard-kill on tool-output BLOCK
Was: tool-output BLOCK → immediate SIGTERM, session dies, user stranded. A false positive on benign content (e.g. HN comments discussing prompt injection) killed the session and lost the message. Now: tool-output BLOCK → emit security_event with reviewable:true + suspected_text + per-layer scores. Poll ~/.gstack/security/decisions/ for up to 60s. On "allow" — log the override to attempts.jsonl as verdict=user_overrode and let the session continue. On "block" or timeout — kill as before. Canary leaks stay hard-stop (no review path). User-input pre-spawn scans unchanged in this commit. Only tool-output scans gain review. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -16,6 +16,7 @@ import { safeUnlink } from './error-handling';
|
|||||||
import {
|
import {
|
||||||
checkCanaryInStructure, logAttempt, hashPayload, extractDomain,
|
checkCanaryInStructure, logAttempt, hashPayload, extractDomain,
|
||||||
combineVerdict, writeSessionState, readSessionState, THRESHOLDS,
|
combineVerdict, writeSessionState, readSessionState, THRESHOLDS,
|
||||||
|
readDecision, clearDecision, excerptForReview,
|
||||||
type LayerSignal,
|
type LayerSignal,
|
||||||
} from './security';
|
} from './security';
|
||||||
import {
|
import {
|
||||||
@@ -643,15 +644,26 @@ async function askClaude(queueEntry: QueueEntry): Promise<void> {
|
|||||||
if (result.verdict !== 'block') return;
|
if (result.verdict !== 'block') return;
|
||||||
toolResultBlockFired = true;
|
toolResultBlockFired = true;
|
||||||
const domain = extractDomain(pageUrl ?? '');
|
const domain = extractDomain(pageUrl ?? '');
|
||||||
|
const payloadHash = hashPayload(text.slice(0, 4096));
|
||||||
|
|
||||||
|
// Log the initial BLOCK verdict now — if the user overrides, a second
|
||||||
|
// entry (verdict=user_overrode) is appended. attempts.jsonl is append-only, so both entries survive.
|
||||||
logAttempt({
|
logAttempt({
|
||||||
ts: new Date().toISOString(),
|
ts: new Date().toISOString(),
|
||||||
urlDomain: domain,
|
urlDomain: domain,
|
||||||
payloadHash: hashPayload(text.slice(0, 4096)),
|
payloadHash,
|
||||||
confidence: result.confidence,
|
confidence: result.confidence,
|
||||||
layer: 'testsavant_content',
|
layer: 'testsavant_content',
|
||||||
verdict: 'block',
|
verdict: 'block',
|
||||||
});
|
});
|
||||||
console.warn(`[sidebar-agent] Tool-result BLOCK on ${toolName} for tab ${tid} (confidence=${result.confidence.toFixed(3)})`);
|
console.warn(`[sidebar-agent] Tool-result BLOCK on ${toolName} for tab ${tid} (confidence=${result.confidence.toFixed(3)}) — awaiting user decision`);
|
||||||
|
|
||||||
|
// Surface a REVIEWABLE block event. Sidepanel renders the suspected
|
||||||
|
// text + layer scores + [Allow and continue] / [Block session] buttons.
|
||||||
|
// The user has 60s to decide; default is BLOCK (safe fallback).
|
||||||
|
const layerScores = signals
|
||||||
|
.filter((s) => s.confidence > 0)
|
||||||
|
.map((s) => ({ layer: s.layer, confidence: s.confidence }));
|
||||||
await sendEvent({
|
await sendEvent({
|
||||||
type: 'security_event',
|
type: 'security_event',
|
||||||
verdict: 'block',
|
verdict: 'block',
|
||||||
@@ -660,10 +672,61 @@ async function askClaude(queueEntry: QueueEntry): Promise<void> {
|
|||||||
confidence: result.confidence,
|
confidence: result.confidence,
|
||||||
domain,
|
domain,
|
||||||
tool: toolName,
|
tool: toolName,
|
||||||
|
reviewable: true,
|
||||||
|
suspected_text: excerptForReview(text),
|
||||||
|
signals: layerScores,
|
||||||
}, tid);
|
}, tid);
|
||||||
|
|
||||||
|
// Poll for the user's decision. Default to BLOCK on timeout.
|
||||||
|
const REVIEW_TIMEOUT_MS = 60_000;
|
||||||
|
const POLL_MS = 500;
|
||||||
|
clearDecision(tid); // clear any stale decision from a prior session
|
||||||
|
const deadline = Date.now() + REVIEW_TIMEOUT_MS;
|
||||||
|
let decision: 'allow' | 'block' = 'block';
|
||||||
|
let decisionReason = 'timeout';
|
||||||
|
while (Date.now() < deadline) {
|
||||||
|
const rec = readDecision(tid);
|
||||||
|
if (rec?.decision === 'allow' || rec?.decision === 'block') {
|
||||||
|
decision = rec.decision;
|
||||||
|
decisionReason = rec.reason ?? 'user';
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
await new Promise((r) => setTimeout(r, POLL_MS));
|
||||||
|
}
|
||||||
|
clearDecision(tid);
|
||||||
|
|
||||||
|
if (decision === 'allow') {
|
||||||
|
// User overrode. Log the override so the audit trail captures it.
|
||||||
|
// toolResultBlockFired is reset to false before returning, so
|
||||||
|
// subsequent tool results are scanned fresh after this override.
|
||||||
|
logAttempt({
|
||||||
|
ts: new Date().toISOString(),
|
||||||
|
urlDomain: domain,
|
||||||
|
payloadHash,
|
||||||
|
confidence: result.confidence,
|
||||||
|
layer: 'testsavant_content',
|
||||||
|
verdict: 'user_overrode',
|
||||||
|
});
|
||||||
|
await sendEvent({
|
||||||
|
type: 'security_event',
|
||||||
|
verdict: 'user_overrode',
|
||||||
|
reason: 'tool_result_ml',
|
||||||
|
layer: 'testsavant_content',
|
||||||
|
confidence: result.confidence,
|
||||||
|
domain,
|
||||||
|
tool: toolName,
|
||||||
|
}, tid);
|
||||||
|
console.warn(`[sidebar-agent] Tab ${tid}: user overrode BLOCK — session continues`);
|
||||||
|
// Let the block stay consumed; reset the flag so subsequent tool
|
||||||
|
// results get scanned fresh.
|
||||||
|
toolResultBlockFired = false;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// User chose BLOCK (or timed out). Kill the session as before.
|
||||||
await sendEvent({
|
await sendEvent({
|
||||||
type: 'agent_error',
|
type: 'agent_error',
|
||||||
error: `Session terminated — prompt injection detected in ${toolName} output`,
|
error: `Session terminated — prompt injection detected in ${toolName} output${decisionReason === 'timeout' ? ' (review timeout)' : ''}`,
|
||||||
}, tid);
|
}, tid);
|
||||||
try { proc.kill('SIGTERM'); } catch (err: any) { if (err?.code !== 'ESRCH') throw err; }
|
try { proc.kill('SIGTERM'); } catch (err: any) { if (err?.code !== 'ESRCH') throw err; }
|
||||||
setTimeout(() => {
|
setTimeout(() => {
|
||||||
|
|||||||
Reference in New Issue
Block a user