Files
everything-claude-code/scripts/ci/check-unicode-safety.js
Jamkris b068069b9b fix(ci): cover other widely-cited invisible code points in check-unicode-safety
Extend `isDangerousInvisibleCodePoint` with five additional code
points / ranges that are routinely cited in invisible-character
smuggling references but were not in the previous denylist:

- **U+180E** MONGOLIAN VOWEL SEPARATOR. Formerly classified as a
  space separator (Zs) until Unicode 6.3 reclassified it as Cf
  (Format control). Renders as zero-width; widely abused for
  homograph attacks and prompt smuggling.

- **U+115F** HANGUL CHOSEONG FILLER and **U+1160** HANGUL JUNGSEONG
  FILLER. Zero-width fillers used in Korean text shaping. Both are
  cited as common LLM-injection vectors in Korean / multilingual
  threat models.

- **U+2061–U+2064** invisible math operators (FUNCTION APPLICATION,
  INVISIBLE TIMES, INVISIBLE SEPARATOR, INVISIBLE PLUS). Zero-width
  and only meaningful inside math typesetting. No legitimate
  Markdown or source code uses them.

- **U+3164** HANGUL FILLER. Reported in real-world Discord and
  Twitter smuggling incidents; not used in legitimate Korean text.

Reproduced before this commit: a file containing any one of these
code points passed `check-unicode-safety.js` silently.

After this commit each one is reported as
`dangerous-invisible U+<HEX>` and `--write` mode strips it.

Verified by writing 8 single-character probe files
(`probe-0x180E.md`, `probe-0x115F.md`, …) and confirming exit=1 with
each violation listed.

ECC repo self-scan reports only the pre-existing `U+2605` BLACK
STAR warnings (unchanged) and exits with the same status (no new
in-repo violations introduced). Existing 5 unicode-safety tests
still pass; `yarn lint` clean.

Regression coverage for both the previous commit's Tag block fix
and this commit's additions lands in the next commit.
2026-05-18 21:20:36 -04:00

273 lines
7.3 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env node
const fs = require('fs');
const path = require('path');
// Directory tree to scan: ECC_UNICODE_SCAN_ROOT when it is set to a non-empty
// value, otherwise the directory two levels above this script.
const repoRoot = path.resolve(
  process.env.ECC_UNICODE_SCAN_ROOT || path.join(__dirname, '..', '..')
);
// `--write` anywhere on the command line turns on in-place sanitizing.
const writeMode = process.argv.indexOf('--write') !== -1;
const ignoredDirs = new Set([
'.git',
'node_modules',
'.dmux',
'.next',
'.venv',
'coverage',
'venv',
]);
const textExtensions = new Set([
'.md',
'.mdx',
'.txt',
'.js',
'.cjs',
'.mjs',
'.ts',
'.tsx',
'.jsx',
'.json',
'.toml',
'.yml',
'.yaml',
'.sh',
'.bash',
'.zsh',
'.ps1',
'.py',
'.rs',
]);
const writableExtensions = new Set([
'.md',
'.mdx',
'.txt',
]);
const writeModeSkip = new Set([
path.normalize('scripts/ci/check-unicode-safety.js'),
path.normalize('tests/scripts/check-unicode-safety.test.js'),
]);
const emojiRe = /(?:\p{Extended_Pictographic}|\p{Regional_Indicator})/gu;
const allowedSymbolCodePoints = new Set([
0x00A9,
0x00AE,
0x2122,
]);
const targetedReplacements = [
[new RegExp(`${String.fromCodePoint(0x26A0)}(?:\\uFE0F)?`, 'gu'), 'WARNING:'],
[new RegExp(`${String.fromCodePoint(0x23ED)}(?:\\uFE0F)?`, 'gu'), 'SKIPPED:'],
[new RegExp(String.fromCodePoint(0x2705), 'gu'), 'PASS:'],
[new RegExp(String.fromCodePoint(0x274C), 'gu'), 'FAIL:'],
[new RegExp(String.fromCodePoint(0x2728), 'gu'), ''],
];
/**
 * Reports whether any segment of `entryPath` is an ignored directory name.
 *
 * @param {string} entryPath - Path split on the platform separator.
 * @returns {boolean} True when at least one segment is in `ignoredDirs`.
 */
function shouldSkip(entryPath) {
  for (const segment of entryPath.split(path.sep)) {
    if (ignoredDirs.has(segment)) {
      return true;
    }
  }
  return false;
}
/**
 * Reports whether a file's extension is one of the scanned text extensions.
 * The comparison is case-insensitive.
 *
 * @param {string} filePath - Path or file name.
 * @returns {boolean} True when the lowercased extension is in `textExtensions`.
 */
function isTextFile(filePath) {
  const extension = path.extname(filePath).toLowerCase();
  return textExtensions.has(extension);
}
/**
 * Reports whether a file may be rewritten in place, judged by its extension
 * (case-insensitive).
 *
 * @param {string} relativePath - Path or file name.
 * @returns {boolean} True when the lowercased extension is in `writableExtensions`.
 */
function canAutoWrite(relativePath) {
  const extension = path.extname(relativePath).toLowerCase();
  return writableExtensions.has(extension);
}
/**
 * Recursively collects the scannable text files under `dirPath`.
 *
 * Ignore rules are applied to each entry's own name rather than to its full
 * path. Everything below `dirPath` is still filtered, because an ignored
 * directory is never descended into, but the location of the scan root no
 * longer matters: when the root itself lived under a directory named like an
 * ignored one (for example `coverage` or `venv`), every entry was skipped and
 * the check passed without scanning anything.
 *
 * @param {string} dirPath - Directory to walk.
 * @returns {string[]} Paths (joined onto `dirPath`) of regular files whose
 *   extension is scanned.
 */
function listFiles(dirPath) {
  const results = [];
  for (const entry of fs.readdirSync(dirPath, { withFileTypes: true })) {
    if (shouldSkip(entry.name)) continue;
    const entryPath = path.join(dirPath, entry.name);
    if (entry.isDirectory()) {
      // Append one at a time: spreading a very large array into push() can
      // exceed the engine's argument limit.
      for (const nested of listFiles(entryPath)) {
        results.push(nested);
      }
      continue;
    }
    if (entry.isFile() && isTextFile(entryPath)) {
      results.push(entryPath);
    }
  }
  return results;
}
/**
 * Converts a string offset into a 1-based line number and a column.
 *
 * The line is one more than the number of `\n` characters before `index`.
 * The column is the distance, in UTF-16 code units, from the nearest `\n`
 * before `index` (or from just before the start of the text).
 *
 * @param {string} text - Full text.
 * @param {number} index - UTF-16 offset into `text`.
 * @returns {{line: number, column: number}} Position of `index`.
 */
function lineAndColumn(text, index) {
  const before = text.slice(0, index);
  let line = 1;
  for (let at = before.indexOf('\n'); at !== -1; at = before.indexOf('\n', at + 1)) {
    line += 1;
  }
  const column = index - text.lastIndexOf('\n', index - 1);
  return { line, column };
}
/**
 * Reports whether the first code point of `char` is on the symbol allow-list.
 *
 * @param {string} char - Matched symbol.
 * @returns {boolean} True when its first code point is in `allowedSymbolCodePoints`.
 */
function isAllowedEmojiLikeSymbol(char) {
  const firstCodePoint = char.codePointAt(0);
  return allowedSymbolCodePoints.has(firstCodePoint);
}
/**
 * Reports whether a code point is on the invisible-character denylist.
 *
 * @param {number} codePoint - Unicode code point.
 * @returns {boolean} True when the code point is denied.
 */
function isDangerousInvisibleCodePoint(codePoint) {
  // Individually listed code points.
  switch (codePoint) {
    // U+115F / U+1160 HANGUL CHOSEONG / JUNGSEONG FILLER: fillers from Korean
    // text shaping, abused as invisible characters.
    case 0x115F:
    case 0x1160:
    // U+180E MONGOLIAN VOWEL SEPARATOR: formerly a space separator,
    // reclassified as a format control in Unicode 6.3; renders as zero-width
    // and is abused for homograph / smuggling attacks.
    case 0x180E:
    // U+2060 WORD JOINER.
    case 0x2060:
    // U+3164 HANGUL FILLER: reportedly used in Discord / Twitter smuggling
    // attacks; not used in legitimate Korean text.
    case 0x3164:
    // U+FEFF ZERO WIDTH NO-BREAK SPACE (byte order mark).
    case 0xFEFF:
      return true;
    default:
      break;
  }

  const inRange = (first, last) => codePoint >= first && codePoint <= last;
  return (
    // U+200B to U+200D: zero-width space, non-joiner and joiner.
    inRange(0x200B, 0x200D) ||
    // U+202A to U+202E: bidirectional embeddings and overrides.
    inRange(0x202A, 0x202E) ||
    // U+2061 to U+2064: invisible math operators (FUNCTION APPLICATION,
    // INVISIBLE TIMES, INVISIBLE SEPARATOR, INVISIBLE PLUS). Zero-width and
    // not used outside math typesetting.
    inRange(0x2061, 0x2064) ||
    // U+2066 to U+2069: bidirectional isolates.
    inRange(0x2066, 0x2069) ||
    // U+FE00 to U+FE0F: variation selectors.
    inRange(0xFE00, 0xFE0F) ||
    // U+E0000 to U+E007F: the Unicode Tag block. Tag characters were proposed
    // for language tagging in Unicode 3.1 and have been deprecated since
    // Unicode 5.1. They are the canonical vector for "ASCII smuggling" /
    // "Tag smuggling" prompt injection: instructions hidden inside
    // ASCII-looking strings are consumed by the LLM while the human reviewer
    // sees nothing.
    inRange(0xE0000, 0xE007F) ||
    // U+E0100 to U+E01EF: variation selectors supplement.
    inRange(0xE0100, 0xE01EF)
  );
}
/**
 * Returns `text` with every denied invisible code point removed.
 * The text is walked by code point, so characters outside the BMP are
 * tested as a whole.
 *
 * @param {string} text - Input text.
 * @returns {string} Text without the denied code points.
 */
function stripDangerousInvisibleChars(text) {
  return Array.from(text)
    .filter(symbol => !isDangerousInvisibleCodePoint(symbol.codePointAt(0)))
    .join('');
}
/**
 * Produces the auto-fixed form of `text`.
 *
 * Steps, in order: remove denied invisible characters, apply the targeted
 * symbol-to-word replacements, remove every remaining emoji-like symbol that
 * is not on the allow-list, then tidy line-level spacing (leading spaces
 * before `**`, padding after a leading `**`, `#`, `>`, `-` or `1.` marker,
 * and trailing spaces/tabs).
 *
 * @param {string} text - File contents.
 * @returns {string} Sanitized contents.
 */
function sanitizeText(text) {
  let next = stripDangerousInvisibleChars(text);
  for (const [pattern, replacement] of targetedReplacements) {
    next = next.replace(pattern, replacement);
  }
  next = next.replace(emojiRe, match => (isAllowedEmojiLikeSymbol(match) ? match : ''));
  next = next.replace(/^ +(?=\*\*)/gm, '');
  // The marker fixes below match horizontal whitespace only. `\s` also matches
  // line terminators, so a marker alone on its line (for example a bare `>`
  // followed by a blank line) would swallow the line breaks and be merged
  // with the following line.
  next = next.replace(/^(\*\*)[^\S\r\n\u2028\u2029]+/gm, '$1');
  next = next.replace(/^(#+)[^\S\r\n\u2028\u2029]{2,}/gm, '$1 ');
  next = next.replace(/^>[^\S\r\n\u2028\u2029]{2,}/gm, '> ');
  next = next.replace(/^-[^\S\r\n\u2028\u2029]{2,}/gm, '- ');
  next = next.replace(/^(\d+\.)[^\S\r\n\u2028\u2029]{2,}/gm, '$1 ');
  next = next.replace(/[ \t]+$/gm, '');
  return next;
}
function collectMatches(text, regex, kind) {
const matches = [];
for (const match of text.matchAll(regex)) {
const char = match[0];
if (kind === 'emoji' && isAllowedEmojiLikeSymbol(char)) {
continue;
}
const index = match.index ?? 0;
const { line, column } = lineAndColumn(text, index);
matches.push({
kind,
char,
codePoint: `U+${char.codePointAt(0).toString(16).toUpperCase()}`,
line,
column,
});
}
return matches;
}
/**
 * Builds one `dangerous-invisible` violation record for every denied
 * invisible code point in `text`.
 *
 * The text is walked by code point while the UTF-16 offset is tracked, so
 * the reported line/column refer to the offset of each offending character.
 *
 * @param {string} text - Text to inspect.
 * @returns {Array<{kind: string, char: string, codePoint: string, line: number, column: number}>}
 */
function collectDangerousInvisibleMatches(text) {
  const found = [];
  let offset = 0;
  while (offset < text.length) {
    const codePoint = text.codePointAt(offset);
    // Code points above the BMP occupy two UTF-16 code units.
    const width = codePoint > 0xFFFF ? 2 : 1;
    if (isDangerousInvisibleCodePoint(codePoint)) {
      const position = lineAndColumn(text, offset);
      found.push({
        kind: 'dangerous-invisible',
        char: text.slice(offset, offset + width),
        codePoint: `U+${codePoint.toString(16).toUpperCase()}`,
        line: position.line,
        column: position.column,
      });
    }
    offset += width;
  }
  return found;
}
// Entry point: scan every listed file, optionally sanitize it in place, then
// report violations and set the exit status.
const changedFiles = [];
const violations = [];

for (const filePath of listFiles(repoRoot)) {
  const relativePath = path.relative(repoRoot, filePath);

  let text;
  try {
    text = fs.readFileSync(filePath, 'utf8');
  } catch {
    // Files that cannot be read are left out of the scan.
    continue;
  }

  // Rewriting happens only in --write mode, never for the skip-listed files,
  // and only for extensions that are safe to auto-fix.
  const mayRewrite =
    writeMode &&
    !writeModeSkip.has(path.normalize(relativePath)) &&
    canAutoWrite(relativePath);

  if (mayRewrite) {
    const sanitized = sanitizeText(text);
    if (sanitized !== text) {
      fs.writeFileSync(filePath, sanitized, 'utf8');
      changedFiles.push(relativePath);
      // Report against what is now on disk.
      text = sanitized;
    }
  }

  const invisibleHits = collectDangerousInvisibleMatches(text);
  const emojiHits = collectMatches(text, emojiRe, 'emoji');
  invisibleHits.concat(emojiHits).forEach(hit => {
    violations.push({ file: relativePath, ...hit });
  });
}

if (changedFiles.length > 0) {
  console.log(`Sanitized ${changedFiles.length} files:`);
  changedFiles.forEach(file => {
    console.log(`- ${file}`);
  });
}

if (violations.length > 0) {
  console.error('Unicode safety violations detected:');
  violations.forEach(({ file, line, column, kind, codePoint }) => {
    console.error(`${file}:${line}:${column} ${kind} ${codePoint}`);
  });
  process.exit(1);
}

console.log('Unicode safety check passed.');