mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-14 00:13:05 +08:00
merge: origin/main into garrytan/injection-tuning; bump v1.5.1.0 → v1.5.2.0
Main shipped v1.5.1.0 for /make-pdf entity + font fixes while this branch was in flight, creating a version collision. Resolving by bumping this branch's security tuning release to v1.5.2.0 (next PATCH after main's v1.5.1.0) and retaining both CHANGELOG entries: this branch's v1.5.2.0 on top, main's v1.5.1.0 below. Updated v1.5.1.0 → v1.5.2.0 references in security.ts, security-classifier.ts, adversarial.test.ts, bench-ensemble.test.ts, bench-ensemble-live.test.ts, bench.test.ts, and TODOS.md. Main's CHANGELOG entry left untouched. All 231 security tests + fixture-replay gate still pass: TP=146 FN=114 FP=55 TN=185 → 56.2% detection rate (TP / (TP + FN)) / 22.9% false-positive rate (FP / (FP + TN)) → GATE PASS. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -490,7 +490,7 @@ export async function checkTranscript(params: {
|
||||
// repo with a prompt-injection-defense CLAUDE.md (like gstack itself),
|
||||
// Haiku reads "we have a strict security classifier" and responds with
|
||||
// meta-commentary instead of classifying the input — we measured 100%
|
||||
// timeout rate in the v1.5.1.0 ensemble bench because of this, plus
|
||||
// timeout rate in the v1.5.2.0 ensemble bench because of this, plus
|
||||
// ~44k cache_creation tokens per call (massive cost inflation).
|
||||
// Using os.tmpdir() gives Haiku a clean context for pure classification.
|
||||
const p = spawn('claude', [
|
||||
@@ -539,7 +539,7 @@ export async function checkTranscript(params: {
|
||||
p.on('error', () => {
|
||||
finish({ layer: 'transcript_classifier', confidence: 0, meta: { degraded: true, reason: 'spawn_error' } });
|
||||
});
|
||||
// Hard timeout. Measured in v1.5.1.0 bench: `claude -p --model
|
||||
// Hard timeout. Measured in v1.5.2.0 bench: `claude -p --model
|
||||
// claude-haiku-4-5-20251001` takes 17-33s end-to-end even for trivial
|
||||
// prompts (CLI session startup + Haiku API). The v1 15s timeout caused
|
||||
// 100% timeout rate when re-measured in v2 — v1's ensemble was
|
||||
|
||||
@@ -88,7 +88,7 @@ export interface StatusDetail {
|
||||
|
||||
/**
|
||||
* Combine per-layer signals into a single verdict. Post-v2 ensemble rule
|
||||
* (v1.5.1.0+) is label-first for the transcript layer: Haiku's verdict
|
||||
* (v1.5.2.0+) is label-first for the transcript layer: Haiku's verdict
|
||||
* label is the primary signal, not its self-reported confidence. Other ML
|
||||
* layers (testsavant_content, deberta_content) remain confidence-based
|
||||
* because they emit only a scalar.
|
||||
@@ -205,7 +205,7 @@ export function combineVerdict(signals: LayerSignal[], opts: CombineVerdictOpts
|
||||
// Single-layer BLOCK. For tool-output, BLOCK directly; for user-input,
|
||||
// degrade to WARN (SO-FP mitigation).
|
||||
//
|
||||
// Asymmetric thresholds (v1.5.1.0+):
|
||||
// Asymmetric thresholds (v1.5.2.0+):
|
||||
// - Content classifiers (testsavant, deberta): require confidence
|
||||
// >= THRESHOLDS.SOLO_CONTENT_BLOCK (0.92). These are label-less so the
|
||||
// bar is higher — pattern-matching on "suspicious text" alone isn't
|
||||
|
||||
Reference in New Issue
Block a user