From b068069b9b36e184dcc92d8effd9a181d31fac48 Mon Sep 17 00:00:00 2001 From: Jamkris Date: Tue, 19 May 2026 09:20:15 +0900 Subject: [PATCH] fix(ci): cover other widely-cited invisible code points in check-unicode-safety MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend `isDangerousInvisibleCodePoint` with five additional code points / ranges that are routinely cited in invisible-character smuggling references but were not in the previous denylist: - **U+180E** MONGOLIAN VOWEL SEPARATOR. Classified as a space separator (Zs) until Unicode 6.3 reclassified it as Cf (format control). Renders as zero-width; widely abused for homograph attacks and prompt smuggling. - **U+115F** HANGUL CHOSEONG FILLER and **U+1160** HANGUL JUNGSEONG FILLER. Zero-width fillers used in Korean text shaping. Both are cited as common LLM-injection vectors in Korean / multilingual threat models. - **U+2061–U+2064** invisible math operators (FUNCTION APPLICATION, INVISIBLE TIMES, INVISIBLE SEPARATOR, INVISIBLE PLUS). Zero-width and only meaningful inside math typesetting. No legitimate Markdown or source code uses them. - **U+3164** HANGUL FILLER. Reported in real-world Discord and Twitter smuggling incidents; not used in legitimate Korean text. Reproduced before this commit: a file containing any one of these code points passed `check-unicode-safety.js` silently. After this commit each one is reported as `dangerous-invisible U+XXXX` (where `XXXX` is the code point in hex) and `--write` mode strips it. Verified by writing 8 single-character probe files (`probe-0x180E.md`, `probe-0x115F.md`, …) and confirming exit=1 with each violation listed. ECC repo self-scan reports only the pre-existing `U+2605` BLACK STAR warnings (unchanged) and exits with the same status (no new in-repo violations introduced). The 5 existing unicode-safety tests still pass; `yarn lint` clean. Regression coverage for both the previous commit's Tag block fix and this commit's additions lands in the next commit. 
--- scripts/ci/check-unicode-safety.js | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/scripts/ci/check-unicode-safety.js b/scripts/ci/check-unicode-safety.js index c4f1740c..96c9ba54 100644 --- a/scripts/ci/check-unicode-safety.js +++ b/scripts/ci/check-unicode-safety.js @@ -122,7 +122,23 @@ function isDangerousInvisibleCodePoint(codePoint) { // an attacker hides instructions inside ASCII-looking strings (PR // bodies, SKILL.md, frontmatter), the LLM consumes the tag bytes, // and the human reviewer sees nothing. - (codePoint >= 0xE0000 && codePoint <= 0xE007F) + (codePoint >= 0xE0000 && codePoint <= 0xE007F) || + // U+180E MONGOLIAN VOWEL SEPARATOR — formerly classified as a space + // separator, reclassified as a format control in Unicode 6.3; renders + // as zero-width and routinely abused for homograph / smuggling. + codePoint === 0x180E || + // U+115F / U+1160 HANGUL CHOSEONG/JUNGSEONG FILLER — zero-width fillers + // used in Korean text shaping; abused as invisible characters. + codePoint === 0x115F || + codePoint === 0x1160 || + // U+2061–U+2064 invisible math operators (FUNCTION APPLICATION, + // INVISIBLE TIMES, INVISIBLE SEPARATOR, INVISIBLE PLUS). Zero-width + // and not used outside math typesetting; legitimate Markdown / source + // does not contain them. + (codePoint >= 0x2061 && codePoint <= 0x2064) || + // U+3164 HANGUL FILLER — zero-width filler reportedly used in Discord + // / Twitter smuggling attacks; not used in legitimate Korean text. + codePoint === 0x3164 ); }