mirror of
https://github.com/affaan-m/everything-claude-code.git
synced 2026-05-20 19:29:58 +08:00
Extend `isDangerousInvisibleCodePoint` with five additional code points / ranges that are routinely cited in invisible-character smuggling references but were not in the previous denylist: - **U+180E** MONGOLIAN VOWEL SEPARATOR. Formerly classified as a space separator (Zs) until Unicode 6.3 reclassified it as Cf (Format control). Renders as zero-width; widely abused for homograph attacks and prompt smuggling. - **U+115F** HANGUL CHOSEONG FILLER and **U+1160** HANGUL JUNGSEONG FILLER. Zero-width fillers used in Korean text shaping. Both are cited as common LLM-injection vectors in Korean / multilingual threat models. - **U+2061–U+2064** invisible math operators (FUNCTION APPLICATION, INVISIBLE TIMES, INVISIBLE SEPARATOR, INVISIBLE PLUS). Zero-width and only meaningful inside math typesetting. No legitimate Markdown or source code uses them. - **U+3164** HANGUL FILLER. Reported in real-world Discord and Twitter smuggling incidents; not used in legitimate Korean text. Reproduced before this commit: a file containing any one of these code points passed `check-unicode-safety.js` silently. After this commit each one is reported as `dangerous-invisible U+<HEX>` and `--write` mode strips it. Verified by writing 8 single-character probe files (`probe-0x180E.md`, `probe-0x115F.md`, …) and confirming exit=1 with each violation listed. ECC repo self-scan reports only the pre-existing `U+2605` BLACK STAR warnings (unchanged) and exits with the same status (no new in-repo violations introduced). Existing 5 unicode-safety tests still pass; `yarn lint` clean. Regression coverage for both the previous commit's Tag block fix and this commit's additions lands in the next commit.
273 lines
7.3 KiB
JavaScript
273 lines
7.3 KiB
JavaScript
#!/usr/bin/env node
|
||
|
||
const fs = require('fs');
|
||
const path = require('path');
|
||
|
||
// Root directory to scan. ECC_UNICODE_SCAN_ROOT overrides the default when it
// is set to a non-empty value; otherwise the directory two levels above this
// script is used.
const repoRoot = path.resolve(
  process.env.ECC_UNICODE_SCAN_ROOT || path.join(__dirname, '..', '..')
);

// True when the script was invoked with the `--write` flag.
const writeMode = process.argv.indexOf('--write') !== -1;
|
||
|
||
// Path segment names that exclude an entry from the scan.
const ignoredDirs = new Set([
  // Version control and dependencies.
  '.git', 'node_modules',
  // Tooling / build output.
  '.dmux', '.next',
  // Virtual environments and coverage output.
  '.venv', 'coverage', 'venv',
]);
|
||
|
||
// Lower-case file extensions (with leading dot) that are scanned as text.
const textExtensions = new Set(
  [
    'md', 'mdx', 'txt',
    'js', 'cjs', 'mjs', 'ts', 'tsx', 'jsx',
    'json', 'toml', 'yml', 'yaml',
    'sh', 'bash', 'zsh', 'ps1',
    'py', 'rs',
  ].map(name => `.${name}`)
);
|
||
|
||
// Extensions that `--write` mode is allowed to rewrite in place.
const writableExtensions = new Set(['.md', '.mdx', '.txt']);
|
||
|
||
// Repo-relative paths (normalized for the host platform) that `--write` mode
// never rewrites.
const writeModeSkip = new Set(
  [
    'scripts/ci/check-unicode-safety.js',
    'tests/scripts/check-unicode-safety.test.js',
  ].map(relativePath => path.normalize(relativePath))
);
|
||
|
||
// Matches one pictographic or regional-indicator code point at a time.
const emojiRe = /[\p{Extended_Pictographic}\p{Regional_Indicator}]/gu;

// Code points matched by emojiRe that are nevertheless permitted:
// U+00A9 COPYRIGHT SIGN, U+00AE REGISTERED SIGN, U+2122 TRADE MARK SIGN.
const allowedSymbolCodePoints = new Set([0x00A9, 0x00AE, 0x2122]);
|
||
|
||
// [pattern, replacement] pairs applied in order by sanitizeText. Patterns use
// code point escapes so this file stays free of the symbols it replaces.
const targetedReplacements = [
  [/\u{26A0}\uFE0F?/gu, 'WARNING:'], // U+26A0, optionally followed by U+FE0F
  [/\u{23ED}\uFE0F?/gu, 'SKIPPED:'], // U+23ED, optionally followed by U+FE0F
  [/\u{2705}/gu, 'PASS:'],
  [/\u{274C}/gu, 'FAIL:'],
  [/\u{2728}/gu, ''],
];
|
||
|
||
/**
 * Report whether a path is, or lies beneath, an ignored directory.
 *
 * Only the part of the path below `repoRoot` is inspected. Checking the full
 * absolute path would skip every file whenever the scan root itself sits under
 * a directory named like an ignored one (for example a checkout inside
 * `.../venv/` or `.../node_modules/`), making the scan pass vacuously.
 *
 * @param {string} entryPath - Path of a file or directory found during the walk.
 * @returns {boolean} True when any segment below the scan root is ignored.
 */
function shouldSkip(entryPath) {
  const relativePath = path.relative(repoRoot, entryPath);
  return relativePath.split(path.sep).some(part => ignoredDirs.has(part));
}
|
||
|
||
/**
 * Check whether a file's extension is one of the scanned text extensions.
 * The comparison is case-insensitive.
 *
 * @param {string} filePath - Path of the candidate file.
 * @returns {boolean} True when the extension is in `textExtensions`.
 */
function isTextFile(filePath) {
  const extension = path.extname(filePath).toLowerCase();
  return textExtensions.has(extension);
}
|
||
|
||
/**
 * Check whether a file's extension is one that may be rewritten in place.
 * The comparison is case-insensitive.
 *
 * @param {string} relativePath - Path of the candidate file.
 * @returns {boolean} True when the extension is in `writableExtensions`.
 */
function canAutoWrite(relativePath) {
  const extension = path.extname(relativePath).toLowerCase();
  return writableExtensions.has(extension);
}
|
||
|
||
/**
 * Recursively collect the text files beneath a directory.
 *
 * Entries rejected by `shouldSkip` are omitted (directories are not descended
 * into). Results keep directory-listing order, depth first.
 *
 * @param {string} dirPath - Directory to walk.
 * @returns {string[]} Paths of regular files accepted by `isTextFile`.
 */
function listFiles(dirPath) {
  const entries = fs.readdirSync(dirPath, { withFileTypes: true });

  return entries.flatMap(entry => {
    const entryPath = path.join(dirPath, entry.name);

    if (shouldSkip(entryPath)) {
      return [];
    }
    if (entry.isDirectory()) {
      return listFiles(entryPath);
    }
    return entry.isFile() && isTextFile(entryPath) ? [entryPath] : [];
  });
}
|
||
|
||
/**
 * Convert a string offset into a 1-based line and column.
 *
 * Lines are delimited by `\n`; the column is counted in UTF-16 code units from
 * the start of the line. Newlines are counted with `indexOf` rather than by
 * slicing and splitting, and offset 0 always maps to column 1 (the previous
 * `lastIndexOf('\n', index - 1)` clamped -1 to 0 and reported column 0 when
 * the text began with a newline).
 *
 * @param {string} text - Full text being inspected.
 * @param {number} index - Zero-based UTF-16 offset into `text`.
 * @returns {{line: number, column: number}} 1-based position of `index`.
 */
function lineAndColumn(text, index) {
  let line = 1;
  let lineStart = 0;

  // Walk every newline located strictly before the offset.
  for (
    let newlineAt = text.indexOf('\n');
    newlineAt !== -1 && newlineAt < index;
    newlineAt = text.indexOf('\n', newlineAt + 1)
  ) {
    line += 1;
    lineStart = newlineAt + 1;
  }

  return { line, column: index - lineStart + 1 };
}
|
||
|
||
/**
 * Check whether the first code point of a string is an explicitly allowed
 * symbol.
 *
 * @param {string} char - Matched text; only its first code point is examined.
 * @returns {boolean} True when that code point is in `allowedSymbolCodePoints`.
 */
function isAllowedEmojiLikeSymbol(char) {
  const leadingCodePoint = char.codePointAt(0);
  return allowedSymbolCodePoints.has(leadingCodePoint);
}
|
||
|
||
// Inclusive [first, last] code point ranges treated as dangerous invisible
// characters, sorted by code point.
const dangerousInvisibleRanges = [
  // U+115F / U+1160 HANGUL CHOSEONG / JUNGSEONG FILLER: zero-width fillers
  // used in Korean text shaping; abused as invisible characters.
  [0x115F, 0x1160],
  // U+180E MONGOLIAN VOWEL SEPARATOR: formerly a space separator, a format
  // control since Unicode 6.3; renders zero-width and is abused for
  // homograph / smuggling attacks.
  [0x180E, 0x180E],
  // U+200B-U+200D.
  [0x200B, 0x200D],
  // U+202A-U+202E.
  [0x202A, 0x202E],
  // U+2060, plus the invisible math operators U+2061-U+2064 (FUNCTION
  // APPLICATION, INVISIBLE TIMES, INVISIBLE SEPARATOR, INVISIBLE PLUS), which
  // are zero-width and not used outside math typesetting.
  [0x2060, 0x2064],
  // U+2066-U+2069.
  [0x2066, 0x2069],
  // U+3164 HANGUL FILLER: zero-width filler reportedly used in Discord /
  // Twitter smuggling attacks; not used in legitimate Korean text.
  [0x3164, 0x3164],
  // U+FE00-U+FE0F.
  [0xFE00, 0xFE0F],
  // U+FEFF.
  [0xFEFF, 0xFEFF],
  // Unicode Tag block U+E0000-U+E007F: deprecated since Unicode 5.1 and the
  // canonical vector for "ASCII smuggling" / "Tag smuggling" prompt
  // injection, where hidden instructions ride inside ASCII-looking text that
  // a human reviewer cannot see.
  [0xE0000, 0xE007F],
  // U+E0100-U+E01EF.
  [0xE0100, 0xE01EF],
];

/**
 * Check whether a code point is on the dangerous-invisible denylist.
 *
 * @param {number} codePoint - Unicode code point to classify.
 * @returns {boolean} True when the code point falls inside a denied range.
 */
function isDangerousInvisibleCodePoint(codePoint) {
  return dangerousInvisibleRanges.some(
    ([first, last]) => codePoint >= first && codePoint <= last
  );
}
|
||
|
||
/**
 * Remove every dangerous invisible code point from a string.
 *
 * The text is walked by code point, so characters outside the BMP are tested
 * as a whole rather than as separate surrogates.
 *
 * @param {string} text - Input text.
 * @returns {string} The text with all denylisted code points dropped.
 */
function stripDangerousInvisibleChars(text) {
  return Array.from(text)
    .filter(char => !isDangerousInvisibleCodePoint(char.codePointAt(0)))
    .join('');
}
|
||
|
||
/**
 * Produce the auto-fixed form of a text file's content.
 *
 * Steps, in order: strip dangerous invisible characters, apply the targeted
 * symbol-to-word replacements, delete remaining emoji that are not explicitly
 * allowed, then tidy the spacing left behind around Markdown markers and trim
 * trailing spaces and tabs.
 *
 * The marker clean-ups match horizontal whitespace only (`[^\S\r\n]`). The
 * previous `\s` also matched line breaks, so an empty `>`, `-`, `#`, `1.` or
 * `**` line followed by a blank line was merged with the next paragraph.
 *
 * @param {string} text - Original file content.
 * @returns {string} Sanitized content.
 */
function sanitizeText(text) {
  let next = stripDangerousInvisibleChars(text);

  for (const [pattern, replacement] of targetedReplacements) {
    next = next.replace(pattern, replacement);
  }

  next = next.replace(emojiRe, match => (isAllowedEmojiLikeSymbol(match) ? match : ''));

  // Spaces before a line-leading bold marker.
  next = next.replace(/^ +(?=\*\*)/gm, '');
  // Whitespace directly after a line-leading bold marker.
  next = next.replace(/^(\*\*)[^\S\r\n]+/gm, '$1');
  // Collapse runs of whitespace after heading, quote, bullet and ordered-list
  // markers down to a single space.
  next = next.replace(/^(#+)[^\S\r\n]{2,}/gm, '$1 ');
  next = next.replace(/^>[^\S\r\n]{2,}/gm, '> ');
  next = next.replace(/^-[^\S\r\n]{2,}/gm, '- ');
  next = next.replace(/^(\d+\.)[^\S\r\n]{2,}/gm, '$1 ');
  // Trailing spaces and tabs on every line.
  next = next.replace(/[ \t]+$/gm, '');

  return next;
}
|
||
|
||
/**
 * Build a violation record for every regex match in a text.
 *
 * When `kind` is `'emoji'`, matches accepted by `isAllowedEmojiLikeSymbol`
 * are left out.
 *
 * @param {string} text - Text to scan.
 * @param {RegExp} regex - Global regular expression to apply.
 * @param {string} kind - Label stored on each record.
 * @returns {Array<{kind: string, char: string, codePoint: string, line: number, column: number}>}
 *   One record per reported match, in match order.
 */
function collectMatches(text, regex, kind) {
  const honoursAllowList = kind === 'emoji';

  return Array.from(text.matchAll(regex))
    .filter(found => !(honoursAllowList && isAllowedEmojiLikeSymbol(found[0])))
    .map(found => {
      const char = found[0];
      const position = lineAndColumn(text, found.index ?? 0);
      const hex = char.codePointAt(0).toString(16).toUpperCase();

      return {
        kind,
        char,
        codePoint: `U+${hex}`,
        line: position.line,
        column: position.column,
      };
    });
}
|
||
|
||
/**
 * Build a violation record for every dangerous invisible code point in a text.
 *
 * The text is walked one code point at a time while tracking the UTF-16
 * offset, which is what `lineAndColumn` expects.
 *
 * @param {string} text - Text to scan.
 * @returns {Array<{kind: string, char: string, codePoint: string, line: number, column: number}>}
 *   One `'dangerous-invisible'` record per offending code point, in order.
 */
function collectDangerousInvisibleMatches(text) {
  const found = [];
  let offset = 0;

  while (offset < text.length) {
    const codePoint = text.codePointAt(offset);
    const char = String.fromCodePoint(codePoint);

    if (isDangerousInvisibleCodePoint(codePoint)) {
      const position = lineAndColumn(text, offset);
      found.push({
        kind: 'dangerous-invisible',
        char,
        codePoint: `U+${codePoint.toString(16).toUpperCase()}`,
        line: position.line,
        column: position.column,
      });
    }

    // Astral code points occupy two UTF-16 units.
    offset += char.length;
  }

  return found;
}
|
||
|
||
// Repo-relative paths rewritten in --write mode, and every violation found.
const changedFiles = [];
const violations = [];

for (const filePath of listFiles(repoRoot)) {
  const relativePath = path.relative(repoRoot, filePath);

  let text;
  try {
    text = fs.readFileSync(filePath, 'utf8');
  } catch {
    // Files that cannot be read are left out of the scan.
    continue;
  }

  // Only --write mode rewrites files, and only those with a writable
  // extension that are not on the skip list.
  const mayRewrite =
    writeMode &&
    canAutoWrite(relativePath) &&
    !writeModeSkip.has(path.normalize(relativePath));

  if (mayRewrite) {
    const sanitized = sanitizeText(text);
    if (sanitized !== text) {
      fs.writeFileSync(filePath, sanitized, 'utf8');
      changedFiles.push(relativePath);
      // Report only what is still present after the rewrite.
      text = sanitized;
    }
  }

  const invisibleHits = collectDangerousInvisibleMatches(text);
  const emojiHits = collectMatches(text, emojiRe, 'emoji');

  for (const hit of invisibleHits.concat(emojiHits)) {
    violations.push({ file: relativePath, ...hit });
  }
}
|
||
|
||
// List the files rewritten by --write mode, if any.
if (changedFiles.length > 0) {
  console.log(`Sanitized ${changedFiles.length} files:`);
  changedFiles.forEach(file => {
    console.log(`- ${file}`);
  });
}
|
||
|
||
// Report the outcome. On failure the exit status is set through
// process.exitCode rather than process.exit(1): process.exit() terminates
// immediately and can drop stderr output that is still queued when the stream
// is asynchronous, which would truncate the violation list. Nothing else is
// pending at this point, so the process ends on its own with status 1.
if (violations.length > 0) {
  console.error('Unicode safety violations detected:');
  for (const { file, line, column, kind, codePoint } of violations) {
    console.error(`${file}:${line}:${column} ${kind} ${codePoint}`);
  }
  process.exitCode = 1;
} else {
  console.log('Unicode safety check passed.');
}
|