| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469 |
- /**
- * Per-language comment stripper for framework route extractors.
- *
- * Replaces comment characters and string-literal contents that hide
- * routing-shaped text with spaces (NOT removal) so that source offsets
- * are preserved. This means `match.index` from a regex run on the
- * stripped output still maps to the same line in the original source.
- *
- * Example:
- * Input: "x = 1 # path('/fake/', V)\n real = 2"
- * Output: "x = 1 \n real = 2"
- *
- * Why strip strings/docstrings as well as comments? Python module/class
- * docstrings are a common source of false positives — they often contain
- * `path('/example/', View)` examples in usage docs. We treat triple-quoted
- * strings the same as comments. Single-line strings stay intact (a `#`
- * inside a Python string is NOT a comment).
- *
- * Scope: this is a pragmatic, regex-supporting helper, not a full parser.
- * It does NOT try to detect JS regex literals, Python f-string expressions,
- * or shell-style heredocs. Those edge cases are not load-bearing for the
- * `path(...)`, `Route::get(...)`, `app.get(...)` style patterns that
- * framework extractors scan for.
- */
- export type CommentLang =
- | 'python'
- | 'javascript'
- | 'typescript'
- | 'php'
- | 'ruby'
- | 'java'
- | 'csharp'
- | 'swift'
- | 'go'
- | 'rust';
- export function stripCommentsForRegex(content: string, lang: CommentLang): string {
- switch (lang) {
- case 'python':
- return stripPython(content);
- case 'ruby':
- return stripRuby(content);
- case 'rust':
- return stripRust(content);
- case 'php':
- return stripPhp(content);
- case 'go':
- return stripGo(content);
- case 'javascript':
- case 'typescript':
- case 'java':
- case 'csharp':
- case 'swift':
- return stripCStyle(content, /* allowSingleQuoteStrings */ lang === 'javascript' || lang === 'typescript');
- default:
- return content;
- }
- }
- /**
- * Replace every char in a slice with spaces, but keep newlines so line
- * numbers computed downstream remain valid.
- */
- function blankRange(buf: string[], start: number, end: number, src: string): void {
- for (let i = start; i < end; i++) {
- buf[i] = src[i] === '\n' ? '\n' : ' ';
- }
- }
- // ---------- Python ----------
- function stripPython(src: string): string {
- const out = src.split('');
- let i = 0;
- const n = src.length;
- while (i < n) {
- const c = src[i]!;
- const c2 = src[i + 1] ?? '';
- const c3 = src[i + 2] ?? '';
- // Triple-quoted string: """...""" or '''...'''
- if ((c === '"' || c === "'") && c2 === c && c3 === c) {
- const quote = c;
- const start = i;
- i += 3;
- while (i < n) {
- if (src[i] === '\\' && i + 1 < n) {
- i += 2;
- continue;
- }
- if (src[i] === quote && src[i + 1] === quote && src[i + 2] === quote) {
- i += 3;
- break;
- }
- i++;
- }
- blankRange(out, start, i, src);
- continue;
- }
- // Single-line string: '...' or "..."
- if (c === '"' || c === "'") {
- const quote = c;
- i++;
- while (i < n && src[i] !== quote) {
- if (src[i] === '\\' && i + 1 < n) {
- i += 2;
- continue;
- }
- if (src[i] === '\n') break; // unterminated
- i++;
- }
- if (i < n && src[i] === quote) i++;
- continue;
- }
- // Line comment
- if (c === '#') {
- const start = i;
- while (i < n && src[i] !== '\n') i++;
- blankRange(out, start, i, src);
- continue;
- }
- i++;
- }
- return out.join('');
- }
- // ---------- Ruby ----------
- function stripRuby(src: string): string {
- const out = src.split('');
- let i = 0;
- const n = src.length;
- let atLineStart = true;
- while (i < n) {
- const c = src[i]!;
- // =begin / =end block comments must be at start of line (after optional whitespace)
- if (atLineStart && c === '=' && src.startsWith('=begin', i)) {
- const start = i;
- // consume to matching =end at line start
- i += '=begin'.length;
- while (i < n) {
- if (src[i] === '\n') {
- // check next line for =end
- let j = i + 1;
- while (j < n && (src[j] === ' ' || src[j] === '\t')) j++;
- if (src.startsWith('=end', j)) {
- i = j + '=end'.length;
- // consume rest of that line
- while (i < n && src[i] !== '\n') i++;
- break;
- }
- }
- i++;
- }
- blankRange(out, start, i, src);
- atLineStart = i > 0 && src[i - 1] === '\n';
- continue;
- }
- // String literals
- if (c === '"' || c === "'") {
- const quote = c;
- i++;
- while (i < n && src[i] !== quote) {
- if (src[i] === '\\' && i + 1 < n) {
- i += 2;
- continue;
- }
- if (src[i] === '\n') break;
- i++;
- }
- if (i < n && src[i] === quote) i++;
- atLineStart = false;
- continue;
- }
- // Line comment
- if (c === '#') {
- const start = i;
- while (i < n && src[i] !== '\n') i++;
- blankRange(out, start, i, src);
- atLineStart = false;
- continue;
- }
- if (c === '\n') {
- atLineStart = true;
- i++;
- continue;
- }
- if (c === ' ' || c === '\t') {
- // whitespace doesn't change atLineStart
- i++;
- continue;
- }
- atLineStart = false;
- i++;
- }
- return out.join('');
- }
- // ---------- C-style (JS/TS/Java/C#/Swift) ----------
- function stripCStyle(src: string, allowSingleQuoteStrings: boolean): string {
- const out = src.split('');
- let i = 0;
- const n = src.length;
- while (i < n) {
- const c = src[i]!;
- const c2 = src[i + 1] ?? '';
- // Block comment
- if (c === '/' && c2 === '*') {
- const start = i;
- i += 2;
- while (i < n && !(src[i] === '*' && src[i + 1] === '/')) i++;
- if (i < n) i += 2;
- blankRange(out, start, i, src);
- continue;
- }
- // Line comment
- if (c === '/' && c2 === '/') {
- const start = i;
- while (i < n && src[i] !== '\n') i++;
- blankRange(out, start, i, src);
- continue;
- }
- // String literals
- if (c === '"' || (allowSingleQuoteStrings && c === "'") || c === '`') {
- const quote = c;
- i++;
- while (i < n && src[i] !== quote) {
- if (src[i] === '\\' && i + 1 < n) {
- i += 2;
- continue;
- }
- // Template literal can span lines; regular strings break on newline (treat as unterminated)
- if (quote !== '`' && src[i] === '\n') break;
- i++;
- }
- if (i < n && src[i] === quote) i++;
- continue;
- }
- i++;
- }
- return out.join('');
- }
- // ---------- PHP ----------
- function stripPhp(src: string): string {
- const out = src.split('');
- let i = 0;
- const n = src.length;
- while (i < n) {
- const c = src[i]!;
- const c2 = src[i + 1] ?? '';
- // Block comment
- if (c === '/' && c2 === '*') {
- const start = i;
- i += 2;
- while (i < n && !(src[i] === '*' && src[i + 1] === '/')) i++;
- if (i < n) i += 2;
- blankRange(out, start, i, src);
- continue;
- }
- // // line comment
- if (c === '/' && c2 === '/') {
- const start = i;
- while (i < n && src[i] !== '\n') i++;
- blankRange(out, start, i, src);
- continue;
- }
- // # line comment (PHP supports both)
- if (c === '#') {
- const start = i;
- while (i < n && src[i] !== '\n') i++;
- blankRange(out, start, i, src);
- continue;
- }
- // String literals: ', ", ` (PHP doesn't really use backticks for strings,
- // but it does have shell-exec backticks; treating as a string is fine here)
- if (c === '"' || c === "'" || c === '`') {
- const quote = c;
- i++;
- while (i < n && src[i] !== quote) {
- if (src[i] === '\\' && i + 1 < n) {
- i += 2;
- continue;
- }
- if (src[i] === '\n') break;
- i++;
- }
- if (i < n && src[i] === quote) i++;
- continue;
- }
- i++;
- }
- return out.join('');
- }
- // ---------- Go ----------
- function stripGo(src: string): string {
- const out = src.split('');
- let i = 0;
- const n = src.length;
- while (i < n) {
- const c = src[i]!;
- const c2 = src[i + 1] ?? '';
- // Block comment
- if (c === '/' && c2 === '*') {
- const start = i;
- i += 2;
- while (i < n && !(src[i] === '*' && src[i + 1] === '/')) i++;
- if (i < n) i += 2;
- blankRange(out, start, i, src);
- continue;
- }
- // Line comment
- if (c === '/' && c2 === '/') {
- const start = i;
- while (i < n && src[i] !== '\n') i++;
- blankRange(out, start, i, src);
- continue;
- }
- // Raw string with backticks (no escapes, can span lines)
- if (c === '`') {
- i++;
- while (i < n && src[i] !== '`') i++;
- if (i < n) i++;
- continue;
- }
- // Interpreted string with double quotes
- if (c === '"') {
- i++;
- while (i < n && src[i] !== '"') {
- if (src[i] === '\\' && i + 1 < n) {
- i += 2;
- continue;
- }
- if (src[i] === '\n') break;
- i++;
- }
- if (i < n && src[i] === '"') i++;
- continue;
- }
- // Rune literal with single quotes (handle as a tiny string)
- if (c === "'") {
- i++;
- while (i < n && src[i] !== "'") {
- if (src[i] === '\\' && i + 1 < n) {
- i += 2;
- continue;
- }
- if (src[i] === '\n') break;
- i++;
- }
- if (i < n && src[i] === "'") i++;
- continue;
- }
- i++;
- }
- return out.join('');
- }
- // ---------- Rust ----------
- function stripRust(src: string): string {
- const out = src.split('');
- let i = 0;
- const n = src.length;
- while (i < n) {
- const c = src[i]!;
- const c2 = src[i + 1] ?? '';
- // Nested block comment /* ... /* ... */ ... */
- if (c === '/' && c2 === '*') {
- const start = i;
- i += 2;
- let depth = 1;
- while (i < n && depth > 0) {
- if (src[i] === '/' && src[i + 1] === '*') {
- depth++;
- i += 2;
- } else if (src[i] === '*' && src[i + 1] === '/') {
- depth--;
- i += 2;
- } else {
- i++;
- }
- }
- blankRange(out, start, i, src);
- continue;
- }
- // Line comment
- if (c === '/' && c2 === '/') {
- const start = i;
- while (i < n && src[i] !== '\n') i++;
- blankRange(out, start, i, src);
- continue;
- }
- // String literals
- if (c === '"') {
- i++;
- while (i < n && src[i] !== '"') {
- if (src[i] === '\\' && i + 1 < n) {
- i += 2;
- continue;
- }
- i++;
- }
- if (i < n && src[i] === '"') i++;
- continue;
- }
- // Char literal — keep simple: skip 'x' or '\x'
- if (c === "'") {
- // Could be a lifetime, e.g. 'a, but those don't contain routing text
- i++;
- while (i < n && src[i] !== "'") {
- if (src[i] === '\\' && i + 1 < n) {
- i += 2;
- continue;
- }
- if (src[i] === '\n') break;
- i++;
- }
- if (i < n && src[i] === "'") i++;
- continue;
- }
- i++;
- }
- return out.join('');
- }
|