Files
everything-claude-code/scripts/ci/check-unicode-safety.js
Jamkris e3483fda15 fix(ci): cover Unicode Tag block (U+E0000–U+E007F) in check-unicode-safety
`isDangerousInvisibleCodePoint` enumerated seven ranges of invisible/
bidi/variation-selector code points but omitted the Unicode Tag block
(U+E0000–U+E007F). Tag characters were proposed for language tagging
in Unicode 3.1 and have been deprecated since Unicode 5.1, so no
legitimate text uses them. They are the canonical vector for
"ASCII Smuggling" / "Tag Smuggling" LLM prompt injection: an attacker
hides instructions inside an ASCII-looking string, the model reads
the tag bytes, the human reviewer sees nothing. Demonstrated against
multiple LLM assistants during 2024–2025.

`check-unicode-safety.js` is the repo's last line of defence before
contributor content reaches agent context; the same script also runs
in `--write` auto-sanitize mode on `.md` / `.mdx` / `.txt`. Today it
silently passes tag-block characters through unchanged in both
detection mode and `--write` mode.

Reproduced before this commit:

  $ mkdir -p /tmp/uni-test && node -e "
      const fs = require('fs');
      const hidden = [...Array(5)].map((_,i) =>
        String.fromCodePoint(0xE0041 + i)).join('');
      fs.writeFileSync('/tmp/uni-test/innocent.md',
        '# Title\\n\\nBenign text' + hidden + ' more.\\n');"

  $ ECC_UNICODE_SCAN_ROOT=/tmp/uni-test \
      node scripts/ci/check-unicode-safety.js
  Unicode safety check passed.
  $ echo $?
  0

Expected: tag-block characters reported as `dangerous-invisible`
violations (exit 1) and stripped under `--write`.
Actual: validator passes, `--write` leaves the bytes intact.

Fix: extend the denylist with one new range
`(codePoint >= 0xE0000 && codePoint <= 0xE007F)`. The change is
purely additive; the existing seven ranges are untouched.

After this commit the same reproduction returns:

  $ ECC_UNICODE_SCAN_ROOT=/tmp/uni-test \
      node scripts/ci/check-unicode-safety.js
  Unicode safety violations detected:
  innocent.md:3:12 dangerous-invisible U+E0041
  innocent.md:3:14 dangerous-invisible U+E0042
  innocent.md:3:16 dangerous-invisible U+E0043
  innocent.md:3:18 dangerous-invisible U+E0044
  innocent.md:3:20 dangerous-invisible U+E0045
  exit=1

`--write` mode also strips the bytes (verified: file length 47 → 42
after sanitize, regex `/[\u{E0000}-\u{E007F}]/u` no longer matches).

Existing 5 unicode-safety tests still pass; `yarn lint` clean. The
ECC repo's own self-scan (`node scripts/ci/check-unicode-safety.js`
with no `ECC_UNICODE_SCAN_ROOT`) reports the same warnings as before
this commit and exits with the same status (no regressions on
in-repo content).

A handful of other widely-cited invisible code points are missing
from the denylist (`U+180E`, `U+115F`, `U+1160`, `U+2061–U+2064`,
`U+3164`); those are addressed in the next commit so each fix
remains independently reviewable. Regression coverage for both
fixes lands two commits later.
2026-05-18 21:20:36 -04:00

257 lines
6.4 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env node
const fs = require('fs');
const path = require('path');
// Directory tree to scan. A non-empty ECC_UNICODE_SCAN_ROOT environment
// variable overrides the default, which is two levels above this script.
const repoRoot = path.resolve(
  process.env.ECC_UNICODE_SCAN_ROOT || path.join(__dirname, '..', '..')
);

// True when `--write` appears anywhere on the command line.
const writeMode = process.argv.includes('--write');
// Path components that cause an entry to be skipped during the directory walk.
const ignoredDirs = new Set([
  '.git', 'node_modules', '.dmux', '.next', '.venv', 'coverage', 'venv',
]);

// Extensions that `--write` mode is allowed to rewrite in place.
const writableExtensions = new Set(['.md', '.mdx', '.txt']);

// Extensions that are scanned at all: the writable ones first, then
// source, config and shell formats (reported only, never rewritten).
const textExtensions = new Set([
  ...writableExtensions,
  '.js', '.cjs', '.mjs', '.ts', '.tsx', '.jsx',
  '.json', '.toml', '.yml', '.yaml',
  '.sh', '.bash', '.zsh', '.ps1',
  '.py', '.rs',
]);

// Repo-relative files that `--write` mode never rewrites: this script and
// its test file. Entries are normalized so they compare equal to
// normalized relative paths.
const writeModeSkip = new Set(
  [
    'scripts/ci/check-unicode-safety.js',
    'tests/scripts/check-unicode-safety.test.js',
  ].map(entry => path.normalize(entry))
);
// Matches a single Extended_Pictographic or Regional_Indicator code point.
const emojiRe = /(?:\p{Extended_Pictographic}|\p{Regional_Indicator})/gu;

// Code points exempt from the emoji rule: copyright sign, registered sign
// and trade mark sign.
const allowedSymbolCodePoints = new Set([0x00A9, 0x00AE, 0x2122]);

// Regex fragment for an optional trailing U+FE0F variation selector.
const optionalVariationSelector = '(?:\\uFE0F)?';

// Ordered [pattern, replacement] pairs that swap specific status symbols
// for plain-text labels. Each row is: symbol code point, regex suffix,
// replacement text.
const targetedReplacements = [
  [0x26A0, optionalVariationSelector, 'WARNING:'],
  [0x23ED, optionalVariationSelector, 'SKIPPED:'],
  [0x2705, '', 'PASS:'],
  [0x274C, '', 'FAIL:'],
  [0x2728, '', ''],
].map(([symbolCodePoint, suffix, replacement]) => [
  new RegExp(`${String.fromCodePoint(symbolCodePoint)}${suffix}`, 'gu'),
  replacement,
]);
/**
 * Reports whether a path lies inside one of the ignored directories.
 *
 * Only the path components below `repoRoot` are compared against
 * `ignoredDirs`. Checking the absolute path would also match ancestors of
 * the scan root, so a checkout located under a directory named e.g.
 * `coverage` or `venv` would have every file skipped and the check would
 * pass without scanning anything.
 *
 * @param {string} entryPath - Path of a file or directory under `repoRoot`.
 * @returns {boolean} true when any component below the root is ignored.
 */
function shouldSkip(entryPath) {
  const relativePath = path.relative(repoRoot, entryPath);
  return relativePath.split(path.sep).some(part => ignoredDirs.has(part));
}
/**
 * Reports whether a file's extension is one of the scanned text extensions.
 * The extension is lower-cased before the lookup.
 *
 * @param {string} filePath - Path whose extension is inspected.
 * @returns {boolean} true when the extension is in `textExtensions`.
 */
function isTextFile(filePath) {
  const extension = path.extname(filePath).toLowerCase();
  return textExtensions.has(extension);
}
/**
 * Reports whether a file's extension is one that `--write` mode may rewrite.
 * The extension is lower-cased before the lookup.
 *
 * @param {string} relativePath - Path whose extension is inspected.
 * @returns {boolean} true when the extension is in `writableExtensions`.
 */
function canAutoWrite(relativePath) {
  const extension = path.extname(relativePath).toLowerCase();
  return writableExtensions.has(extension);
}
/**
 * Recursively collects the text files under a directory.
 *
 * Entries rejected by `shouldSkip` are ignored, directories are descended
 * into, and regular files are kept when `isTextFile` accepts them. Results
 * are in depth-first order, following the order `fs.readdirSync` returns.
 *
 * All levels append to one shared array. Spreading each subtree into
 * `push(...)` passes every path as a call argument, which throws a
 * RangeError once a subtree is large enough and recopies results at every
 * level of nesting.
 *
 * @param {string} dirPath - Directory to walk.
 * @returns {string[]} Paths of the matching files.
 */
function listFiles(dirPath) {
  const results = [];
  const walk = currentDir => {
    for (const entry of fs.readdirSync(currentDir, { withFileTypes: true })) {
      const entryPath = path.join(currentDir, entry.name);
      if (shouldSkip(entryPath)) {
        continue;
      }
      if (entry.isDirectory()) {
        walk(entryPath);
      } else if (entry.isFile() && isTextFile(entryPath)) {
        results.push(entryPath);
      }
    }
  };
  walk(dirPath);
  return results;
}
/**
 * Converts a string offset into a 1-based line number and a column.
 *
 * The line is one more than the number of '\n' characters before `index`.
 * The column is the distance from the closest preceding '\n' (or from just
 * before the start of the text), measured in UTF-16 code units.
 *
 * @param {string} text - Full text being inspected.
 * @param {number} index - UTF-16 offset into `text`.
 * @returns {{line: number, column: number}} Position of the offset.
 */
function lineAndColumn(text, index) {
  const before = text.slice(0, index);
  let line = 1;
  for (
    let newlineAt = before.indexOf('\n');
    newlineAt !== -1;
    newlineAt = before.indexOf('\n', newlineAt + 1)
  ) {
    line += 1;
  }
  const column = index - text.lastIndexOf('\n', index - 1);
  return { line, column };
}
/**
 * Reports whether the first code point of `char` is on the allowlist of
 * symbols exempt from the emoji rule.
 *
 * @param {string} char - Matched text; only its first code point is checked.
 * @returns {boolean} true when that code point is in `allowedSymbolCodePoints`.
 */
function isAllowedEmojiLikeSymbol(char) {
  const firstCodePoint = char.codePointAt(0);
  return allowedSymbolCodePoints.has(firstCodePoint);
}
/**
 * Reports whether a code point is on the denylist of invisible, bidi-control,
 * variation-selector and tag characters. Entries are listed in ascending
 * code point order.
 *
 * @param {number} codePoint - Unicode code point to classify.
 * @returns {boolean} true when the code point must be reported or stripped.
 */
function isDangerousInvisibleCodePoint(codePoint) {
  return (
    // Hangul choseong and jungseong fillers: render as blank.
    codePoint === 0x115F ||
    codePoint === 0x1160 ||
    // Mongolian vowel separator: no visible glyph.
    codePoint === 0x180E ||
    // Zero-width space, zero-width non-joiner, zero-width joiner.
    (codePoint >= 0x200B && codePoint <= 0x200D) ||
    // Bidirectional embedding and override controls.
    (codePoint >= 0x202A && codePoint <= 0x202E) ||
    // Word joiner (U+2060) and the invisible math operators (U+2061-U+2064).
    (codePoint >= 0x2060 && codePoint <= 0x2064) ||
    // Bidirectional isolate controls.
    (codePoint >= 0x2066 && codePoint <= 0x2069) ||
    // Hangul filler: renders as blank.
    codePoint === 0x3164 ||
    // Variation selectors 1-16.
    (codePoint >= 0xFE00 && codePoint <= 0xFE0F) ||
    // Zero-width no-break space / byte order mark.
    codePoint === 0xFEFF ||
    // Unicode Tag block (U+E0000-U+E007F). Tag characters were proposed
    // for language tagging in Unicode 3.1 and have been deprecated since
    // Unicode 5.1, so no legitimate text uses them. They are the canonical
    // vector for "ASCII smuggling" / "Tag smuggling" prompt injection:
    // an attacker hides instructions inside ASCII-looking strings (PR
    // bodies, SKILL.md, frontmatter), the LLM consumes the tag bytes,
    // and the human reviewer sees nothing.
    (codePoint >= 0xE0000 && codePoint <= 0xE007F) ||
    // Variation selectors supplement (17-256).
    (codePoint >= 0xE0100 && codePoint <= 0xE01EF)
  );
}
/**
 * Returns `text` with every code point rejected by
 * `isDangerousInvisibleCodePoint` removed. The text is walked by code
 * point, so characters outside the BMP are classified as a whole.
 *
 * @param {string} text - Text to filter.
 * @returns {string} The remaining code points, in their original order.
 */
function stripDangerousInvisibleChars(text) {
  return Array.from(text)
    .filter(char => !isDangerousInvisibleCodePoint(char.codePointAt(0)))
    .join('');
}
/**
 * Produces the auto-sanitized form of a document.
 *
 * Steps, in order:
 *   1. remove dangerous invisible code points;
 *   2. apply `targetedReplacements` (status symbols become text labels);
 *   3. delete remaining `emojiRe` matches unless allowlisted;
 *   4. tidy the spacing that removals leave behind at the start of
 *      bold, heading, blockquote, bullet and numbered-list lines;
 *   5. strip trailing spaces and tabs from every line.
 *
 * The spacing patterns in step 4 match horizontal whitespace only. Plain
 * `\s` also matches line breaks, so a marker followed by whitespace and a
 * newline (for example "> " on an otherwise empty quote line) used to
 * swallow the line break and fuse the marker with the following line.
 *
 * @param {string} text - Original file contents.
 * @returns {string} Sanitized contents.
 */
function sanitizeText(text) {
  let next = stripDangerousInvisibleChars(text);
  for (const [pattern, replacement] of targetedReplacements) {
    next = next.replace(pattern, replacement);
  }
  next = next.replace(emojiRe, match => (isAllowedEmojiLikeSymbol(match) ? match : ''));
  // Leading spaces in front of a line that starts with bold text.
  next = next.replace(/^ +(?=\*\*)/gm, '');
  // `[^\S\r\n\u2028\u2029]` below means "whitespace except line terminators".
  next = next.replace(/^(\*\*)[^\S\r\n\u2028\u2029]+/gm, '$1');
  next = next.replace(/^(#+)[^\S\r\n\u2028\u2029]{2,}/gm, '$1 ');
  next = next.replace(/^>[^\S\r\n\u2028\u2029]{2,}/gm, '> ');
  next = next.replace(/^-[^\S\r\n\u2028\u2029]{2,}/gm, '- ');
  next = next.replace(/^(\d+\.)[^\S\r\n\u2028\u2029]{2,}/gm, '$1 ');
  next = next.replace(/[ \t]+$/gm, '');
  return next;
}
/**
 * Builds one violation record per regex match in `text`.
 *
 * When `kind` is 'emoji', matches accepted by `isAllowedEmojiLikeSymbol`
 * are left out.
 *
 * @param {string} text - Text to search.
 * @param {RegExp} regex - Global regex handed to `String.prototype.matchAll`.
 * @param {string} kind - Label stored on every record.
 * @returns {Array<{kind: string, char: string, codePoint: string, line: number, column: number}>}
 *   Records in match order.
 */
function collectMatches(text, regex, kind) {
  const exemptAllowedSymbols = kind === 'emoji';
  return [...text.matchAll(regex)]
    .filter(match => !(exemptAllowedSymbols && isAllowedEmojiLikeSymbol(match[0])))
    .map(match => {
      const char = match[0];
      const position = lineAndColumn(text, match.index ?? 0);
      return {
        kind,
        char,
        codePoint: `U+${char.codePointAt(0).toString(16).toUpperCase()}`,
        line: position.line,
        column: position.column,
      };
    });
}
/**
 * Builds one 'dangerous-invisible' violation record for every code point
 * in `text` that `isDangerousInvisibleCodePoint` rejects.
 *
 * The text is walked code point by code point while tracking the UTF-16
 * offset, which is what `lineAndColumn` expects.
 *
 * @param {string} text - Text to inspect.
 * @returns {Array<{kind: string, char: string, codePoint: string, line: number, column: number}>}
 *   Records in order of appearance.
 */
function collectDangerousInvisibleMatches(text) {
  const hits = [];
  let offset = 0;
  while (offset < text.length) {
    const codePoint = text.codePointAt(offset);
    // Code points above U+FFFF occupy two UTF-16 code units.
    const width = codePoint > 0xFFFF ? 2 : 1;
    if (isDangerousInvisibleCodePoint(codePoint)) {
      hits.push({
        kind: 'dangerous-invisible',
        char: text.slice(offset, offset + width),
        codePoint: `U+${codePoint.toString(16).toUpperCase()}`,
        ...lineAndColumn(text, offset),
      });
    }
    offset += width;
  }
  return hits;
}
// Main pass: walk every text file under repoRoot, optionally sanitize it
// in place, then collect whatever violations remain.
const changedFiles = [];
const violations = [];
for (const filePath of listFiles(repoRoot)) {
  const relativePath = path.relative(repoRoot, filePath);
  let text;
  try {
    text = fs.readFileSync(filePath, 'utf8');
  } catch (error) {
    // The scan stays best-effort, but a file that could not be read was
    // not checked. Report it so it is not mistaken for a clean file.
    console.error(`Warning: skipping unreadable file ${relativePath}: ${error.message}`);
    continue;
  }
  // Rewrite only in --write mode, only for writable extensions, and never
  // the files listed in writeModeSkip.
  if (
    writeMode &&
    !writeModeSkip.has(path.normalize(relativePath)) &&
    canAutoWrite(relativePath)
  ) {
    const sanitized = sanitizeText(text);
    if (sanitized !== text) {
      fs.writeFileSync(filePath, sanitized, 'utf8');
      changedFiles.push(relativePath);
      // Validate the sanitized content, not the original.
      text = sanitized;
    }
  }
  const fileViolations = [
    ...collectDangerousInvisibleMatches(text),
    ...collectMatches(text, emojiRe, 'emoji'),
  ];
  for (const violation of fileViolations) {
    violations.push({
      file: relativePath,
      ...violation,
    });
  }
}
if (changedFiles.length > 0) {
  console.log(`Sanitized ${changedFiles.length} files:`);
  for (const file of changedFiles) {
    console.log(`- ${file}`);
  }
}
if (violations.length > 0) {
  console.error('Unicode safety violations detected:');
  for (const violation of violations) {
    console.error(
      `${violation.file}:${violation.line}:${violation.column} ${violation.kind} ${violation.codePoint}`
    );
  }
  // Set the exit code instead of calling process.exit(1): the process then
  // ends on its own once pending stdio writes have been flushed, so a long
  // violation list is not cut short.
  process.exitCode = 1;
} else {
  console.log('Unicode safety check passed.');
}