mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-16 09:12:13 +08:00
feat(security): DeBERTa-v3 ensemble classifier (opt-in)
Adds ProtectAI DeBERTa-v3-base-injection-onnx as an optional L4c layer
for cross-model agreement. Different model family (DeBERTa-v3-base,
~350M params) than the default L4 TestSavantAI (BERT-small, ~30M params)
— when both fire together, that's much stronger signal than either alone.
Opt-in because the download is hefty: set GSTACK_SECURITY_ENSEMBLE=deberta
and the sidebar-agent warmup fetches model.onnx (721MB FP32) into
~/.gstack/models/deberta-v3-injection/ on first run. Subsequent runs are
cached.
Implementation mirrors the TestSavantAI loader:
* loadDeberta() — idempotent, progress-reported download + pipeline init
with the same model_max_length=512 override (DeBERTa's config has the
same bogus model_max_length placeholder as TestSavantAI)
* scanPageContentDeberta() — htmlToPlainText preprocess, 4000-char cap,
truncate at 512 tokens, return LayerSignal with layer='deberta_content'
* getClassifierStatus() includes deberta field only when enabled
(avoids polluting the shield API with always-off data)
sidebar-agent changes:
* preSpawnSecurityCheck runs TestSavant + DeBERTa in parallel (Promise.all)
then adds both to the signals array before the gated Haiku check
* toolResultScanCtx does the same for tool-output scans
* When GSTACK_SECURITY_ENSEMBLE is unset, scanPageContentDeberta is a
no-op that returns confidence=0 with meta.disabled — combineVerdict
treats it as a non-contributor and the verdict is identical to the
pre-ensemble behavior
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -59,6 +59,31 @@ const TESTSAVANT_FILES = [
|
||||
'vocab.txt',
|
||||
];
|
||||
|
||||
// DeBERTa-v3 (ProtectAI) — OPT-IN ensemble layer. Adds architectural
// diversity: TestSavantAI-small is BERT-small fine-tuned on injection +
// jailbreak; DeBERTa-v3-base is a separate model family trained on its
// own corpus. Agreement between the two is stronger evidence than either
// alone.
//
// Size: model.onnx is 721MB (FP32). Users opt in via
// GSTACK_SECURITY_ENSEMBLE=deberta. Not forced on every install because
// most users won't need the higher recall and 721MB download is a lot.

// Local staging directory for all DeBERTa artifacts (under MODELS_DIR).
const DEBERTA_DIR = path.join(MODELS_DIR, 'deberta-v3-injection');
// Raw-file download base for the Hugging Face repo ('resolve/main' serves file contents).
const DEBERTA_HF_URL = 'https://huggingface.co/protectai/deberta-v3-base-injection-onnx/resolve/main';
// Small config/tokenizer files fetched one by one into DEBERTA_DIR.
// model.onnx itself is NOT listed here — it is staged separately into the
// onnx/ subdirectory with its own progress message (see ensureDebertaStaged).
const DEBERTA_FILES = [
  'config.json',
  'tokenizer.json',
  'tokenizer_config.json',
  'special_tokens_map.json',
  'spm.model',
  'added_tokens.json',
];
|
||||
|
||||
function isDebertaEnabled(): boolean {
|
||||
const setting = (process.env.GSTACK_SECURITY_ENSEMBLE ?? '').toLowerCase();
|
||||
return setting.split(',').map(s => s.trim()).includes('deberta');
|
||||
}
|
||||
|
||||
// ─── Load state ──────────────────────────────────────────────

// Lifecycle of a lazily-loaded classifier:
// 'uninitialized' → 'loading' → 'loaded' | 'failed'.
type LoadState = 'uninitialized' | 'loading' | 'loaded' | 'failed';
|
||||
// Module-level state for the default L4 TestSavantAI classifier.
let testsavantState: LoadState = 'uninitialized';
let testsavantClassifier: any = null;
let testsavantLoadError: string | null = null;

// Module-level state for the opt-in DeBERTa ensemble classifier —
// mirrors the TestSavant fields above.
let debertaState: LoadState = 'uninitialized';
let debertaClassifier: any = null;
let debertaLoadError: string | null = null;

/**
 * Health summary exposed through the shield API.
 * 'ok' = loaded/available, 'degraded' = load or check failed, 'off' = not attempted.
 */
export interface ClassifierStatus {
  testsavant: 'ok' | 'degraded' | 'off';
  transcript: 'ok' | 'degraded' | 'off';
  deberta?: 'ok' | 'degraded' | 'off'; // only present when ensemble enabled
}
|
||||
|
||||
export function getClassifierStatus(): ClassifierStatus {
|
||||
@@ -77,11 +107,16 @@ export function getClassifierStatus(): ClassifierStatus {
|
||||
testsavantState === 'loaded' ? 'ok' :
|
||||
testsavantState === 'failed' ? 'degraded' :
|
||||
'off';
|
||||
// Transcript classifier has no persistent load state — it spawns claude-haiku
|
||||
// per-call. We report 'ok' if claude is on PATH (checked lazily on first call).
|
||||
const transcript = haikuAvailableCache === null ? 'off' :
|
||||
haikuAvailableCache ? 'ok' : 'degraded';
|
||||
return { testsavant, transcript };
|
||||
const status: ClassifierStatus = { testsavant, transcript };
|
||||
if (isDebertaEnabled()) {
|
||||
status.deberta =
|
||||
debertaState === 'loaded' ? 'ok' :
|
||||
debertaState === 'failed' ? 'degraded' :
|
||||
'off';
|
||||
}
|
||||
return status;
|
||||
}
|
||||
|
||||
// ─── Model download + staging ────────────────────────────────
|
||||
@@ -245,6 +280,89 @@ export async function scanPageContent(text: string): Promise<LayerSignal> {
|
||||
}
|
||||
}
|
||||
|
||||
// ─── L4c: DeBERTa-v3 ensemble (opt-in) ───────────────────────
|
||||
|
||||
async function ensureDebertaStaged(onProgress?: (msg: string) => void): Promise<void> {
|
||||
fs.mkdirSync(path.join(DEBERTA_DIR, 'onnx'), { recursive: true, mode: 0o700 });
|
||||
for (const f of DEBERTA_FILES) {
|
||||
const dst = path.join(DEBERTA_DIR, f);
|
||||
if (fs.existsSync(dst)) continue;
|
||||
onProgress?.(`deberta: downloading ${f}`);
|
||||
await downloadFile(`${DEBERTA_HF_URL}/${f}`, dst);
|
||||
}
|
||||
const modelDst = path.join(DEBERTA_DIR, 'onnx', 'model.onnx');
|
||||
if (!fs.existsSync(modelDst)) {
|
||||
onProgress?.('deberta: downloading model.onnx (721MB) — first run only');
|
||||
await downloadFile(`${DEBERTA_HF_URL}/model.onnx`, modelDst);
|
||||
}
|
||||
}
|
||||
|
||||
// Deduplicates concurrent load calls. Once set it is never cleared, so a
// failed load is permanent for the process lifetime — getClassifierStatus
// reports 'degraded' and scanPageContentDeberta returns a degraded signal.
// NOTE(review): intentional? A transient network failure during staging can
// only be recovered by restarting the process — confirm this matches the
// TestSavantAI loader this mirrors.
let debertaLoadPromise: Promise<void> | null = null;

/**
 * Idempotently stage and initialize the DeBERTa-v3 ensemble classifier.
 *
 * No-op (resolved promise) when the ensemble is disabled or already loaded;
 * concurrent callers share the single in-flight promise. Failures never
 * reject the returned promise — they are recorded in debertaState /
 * debertaLoadError and logged, so callers can fire-and-forget during warmup.
 *
 * @param onProgress optional callback for download/initialization progress messages
 */
export function loadDeberta(onProgress?: (msg: string) => void): Promise<void> {
  if (!isDebertaEnabled()) return Promise.resolve();
  if (debertaState === 'loaded') return Promise.resolve();
  if (debertaLoadPromise) return debertaLoadPromise;
  debertaState = 'loading';
  debertaLoadPromise = (async () => {
    try {
      await ensureDebertaStaged(onProgress);
      onProgress?.('deberta: initializing classifier');
      // Dynamic import keeps the heavy transformers runtime out of startup cost
      // for users who never enable the ensemble.
      const { pipeline, env } = await import('@huggingface/transformers');
      // Serve strictly from the local staging dir — never fetch remotely here;
      // all files were staged by ensureDebertaStaged above.
      env.allowLocalModels = true;
      env.allowRemoteModels = false;
      env.localModelPath = MODELS_DIR;
      debertaClassifier = await pipeline(
        'text-classification',
        'deberta-v3-injection',
        { dtype: 'fp32' },
      );
      // The upstream tokenizer_config ships a bogus model_max_length
      // placeholder (same issue as TestSavantAI); clamp to the real 512.
      const tok = debertaClassifier?.tokenizer as any;
      if (tok?._tokenizerConfig) {
        tok._tokenizerConfig.model_max_length = 512;
      }
      debertaState = 'loaded';
    } catch (err: any) {
      debertaState = 'failed';
      debertaLoadError = err?.message ?? String(err);
      console.error('[security-classifier] Failed to load DeBERTa-v3:', debertaLoadError);
    }
  })();
  return debertaLoadPromise;
}
|
||||
|
||||
/**
|
||||
* Scan text with the DeBERTa-v3 ensemble classifier. Returns a LayerSignal
|
||||
* with layer='deberta_content'. No-op when ensemble is disabled — returns
|
||||
* confidence=0 with meta.disabled=true so combineVerdict treats it as safe.
|
||||
*/
|
||||
export async function scanPageContentDeberta(text: string): Promise<LayerSignal> {
|
||||
if (!isDebertaEnabled()) {
|
||||
return { layer: 'deberta_content', confidence: 0, meta: { disabled: true } };
|
||||
}
|
||||
if (!text || text.length === 0) {
|
||||
return { layer: 'deberta_content', confidence: 0 };
|
||||
}
|
||||
if (debertaState !== 'loaded') {
|
||||
return { layer: 'deberta_content', confidence: 0, meta: { degraded: true } };
|
||||
}
|
||||
try {
|
||||
const plain = htmlToPlainText(text);
|
||||
const input = plain.slice(0, 4000);
|
||||
const raw = await debertaClassifier(input);
|
||||
const top = Array.isArray(raw) ? raw[0] : raw;
|
||||
const label = top?.label ?? 'SAFE';
|
||||
const score = Number(top?.score ?? 0);
|
||||
if (label === 'INJECTION') {
|
||||
return { layer: 'deberta_content', confidence: score, meta: { label } };
|
||||
}
|
||||
return { layer: 'deberta_content', confidence: 0, meta: { label, safeScore: score } };
|
||||
} catch (err: any) {
|
||||
debertaState = 'failed';
|
||||
debertaLoadError = err?.message ?? String(err);
|
||||
return { layer: 'deberta_content', confidence: 0, meta: { degraded: true, error: debertaLoadError } };
|
||||
}
|
||||
}
|
||||
|
||||
// ─── L4b: Claude Haiku transcript classifier ─────────────────
|
||||
|
||||
/**
|
||||
|
||||
Reference in New Issue
Block a user