haiany
/
codegraph
зеркало из https://github.com/colbymchenry/codegraph.git


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469
							/**
 * Per-language comment stripper for framework route extractors.
 *
 * Replaces comments — and, for Python, triple-quoted strings, which often
 * hide routing-shaped text — with spaces instead of removing them, so that
 * source offsets are preserved. This means `match.index` from a regex run
 * on the stripped output still maps to the same offset, and therefore the
 * same line, in the original source.
 *
 * Example:
 *   Input:  "x = 1  # path('/fake/', V)\n real = 2"
 *   Output: "x = 1                       \n real = 2"
 *
 * Why strip strings/docstrings as well as comments? Python module/class
 * docstrings are a common source of false positives — they often contain
 * `path('/example/', View)` examples in usage docs. We treat triple-quoted
 * strings the same as comments. Single-line strings stay intact (a `#`
 * inside a Python string is NOT a comment).
 *
 * Scope: this is a pragmatic, regex-supporting helper, not a full parser.
 * It does NOT try to detect JS regex literals, Python f-string expressions,
 * or shell-style heredocs. Those edge cases are not load-bearing for the
 * `path(...)`, `Route::get(...)`, `app.get(...)` style patterns that
 * framework extractors scan for.
 */

/**
 * Language identifiers accepted by `stripCommentsForRegex`.
 *
 * `javascript`, `typescript`, `java`, `csharp` and `swift` are all routed to
 * the shared C-style stripper; every other member has a dedicated stripper.
 */
export type CommentLang =
  | 'python'
  | 'javascript'
  | 'typescript'
  | 'php'
  | 'ruby'
  | 'java'
  | 'csharp'
  | 'swift'
  | 'go'
  | 'rust';

/**
 * Entry point: blanks out comments in `content` using the stripper that
 * matches `lang`, keeping every character offset and newline in place.
 *
 * @param content - Source text to process.
 * @param lang - Language whose comment syntax should be applied.
 * @returns The stripped text, or `content` unchanged when `lang` is not one
 *   of the known languages.
 */
export function stripCommentsForRegex(content: string, lang: CommentLang): string {
  // Languages with a dedicated stripper.
  if (lang === 'python') return stripPython(content);
  if (lang === 'ruby') return stripRuby(content);
  if (lang === 'rust') return stripRust(content);
  if (lang === 'php') return stripPhp(content);
  if (lang === 'go') return stripGo(content);

  // C-family languages share one stripper; only JS/TS treat '...' as a string.
  const singleQuotesAreStrings = lang === 'javascript' || lang === 'typescript';
  if (singleQuotesAreStrings || lang === 'java' || lang === 'csharp' || lang === 'swift') {
    return stripCStyle(content, singleQuotesAreStrings);
  }

  // Unknown language value at runtime: leave the text untouched.
  return content;
}

/**
 * Overwrites `buf[start..end)` with spaces, except that positions holding a
 * newline in `src` are written as newlines so downstream line numbers stay
 * valid.
 *
 * @param buf - Per-character output buffer, mutated in place.
 * @param start - First index to overwrite (inclusive).
 * @param end - Index at which to stop (exclusive).
 * @param src - Original text, consulted to decide where newlines belong.
 */
function blankRange(buf: string[], start: number, end: number, src: string): void {
  let pos = start;
  while (pos < end) {
    if (src[pos] !== '\n') {
      buf[pos] = ' ';
    } else {
      buf[pos] = '\n';
    }
    pos += 1;
  }
}

// ---------- Python ----------

/**
 * Python stripper: blanks `#` line comments and triple-quoted strings
 * (docstrings), while stepping over ordinary one-line string literals so a
 * `#` inside them is not mistaken for a comment.
 *
 * @param src - Python source text.
 * @returns `src` with the blanked regions replaced by spaces; newlines and
 *   all other characters are left where they were.
 */
function stripPython(src: string): string {
  const chars = src.split('');
  const len = src.length;
  let pos = 0;

  while (pos < len) {
    const ch = src[pos]!;

    // `#` outside a string: comment runs up to (not including) the newline.
    if (ch === '#') {
      const from = pos;
      const eol = src.indexOf('\n', pos);
      pos = eol === -1 ? len : eol;
      blankRange(chars, from, pos, src);
      continue;
    }

    // Anything that is not a quote is plain code.
    if (ch !== '"' && ch !== "'") {
      pos += 1;
      continue;
    }

    // Three identical quotes in a row open a triple-quoted string, which is
    // blanked in full (an unterminated one is blanked to end of input).
    if (src[pos + 1] === ch && src[pos + 2] === ch) {
      const from = pos;
      pos += 3;
      while (pos < len) {
        const cur = src[pos];
        if (cur === '\\' && pos + 1 < len) {
          pos += 2; // backslash escape: skip the escaped character too
        } else if (cur === ch && src[pos + 1] === ch && src[pos + 2] === ch) {
          pos += 3; // closing delimiter
          break;
        } else {
          pos += 1;
        }
      }
      blankRange(chars, from, pos, src);
      continue;
    }

    // One-line string: left intact, just stepped over.
    pos += 1;
    while (pos < len) {
      const cur = src[pos];
      if (cur === ch) {
        pos += 1; // consume the closing quote
        break;
      }
      if (cur === '\n') break; // unterminated: stop at the line end
      pos += cur === '\\' && pos + 1 < len ? 2 : 1;
    }
  }

  return chars.join('');
}

// ---------- Ruby ----------

/**
 * Ruby stripper: blanks `#` line comments and `=begin` … `=end` block
 * comments, stepping over quoted strings so a `#` inside them survives.
 *
 * A `=begin` is only honoured when nothing but spaces/tabs precede it on its
 * line; the matching `=end` is likewise looked for at the start of a later
 * line (after optional spaces/tabs), and the rest of that line is blanked too.
 *
 * @param src - Ruby source text.
 * @returns `src` with comment regions replaced by spaces, newlines preserved.
 */
function stripRuby(src: string): string {
  const BLOCK_OPEN = '=begin';
  const BLOCK_CLOSE = '=end';
  const chars = src.split('');
  const len = src.length;
  let pos = 0;
  // True while only spaces/tabs have been seen since the last newline.
  let onlyWhitespaceSoFar = true;

  while (pos < len) {
    const ch = src[pos]!;

    switch (ch) {
      case '\n':
        onlyWhitespaceSoFar = true;
        pos += 1;
        break;

      case ' ':
      case '\t':
        // Leading whitespace does not end the "start of line" state.
        pos += 1;
        break;

      case '#': {
        const from = pos;
        const eol = src.indexOf('\n', pos);
        pos = eol === -1 ? len : eol;
        blankRange(chars, from, pos, src);
        onlyWhitespaceSoFar = false;
        break;
      }

      case '"':
      case "'": {
        // Quoted string: kept as-is, only stepped over.
        pos += 1;
        while (pos < len) {
          const cur = src[pos];
          if (cur === ch) {
            pos += 1; // consume the closing quote
            break;
          }
          if (cur === '\n') break; // unterminated: stop at the line end
          pos += cur === '\\' && pos + 1 < len ? 2 : 1;
        }
        onlyWhitespaceSoFar = false;
        break;
      }

      default: {
        if (!(onlyWhitespaceSoFar && src.startsWith(BLOCK_OPEN, pos))) {
          onlyWhitespaceSoFar = false;
          pos += 1;
          break;
        }

        // Block comment: walk newline by newline until a line that starts
        // with `=end`; if none exists the comment extends to end of input.
        const from = pos;
        let stop = len;
        let nl = src.indexOf('\n', pos + BLOCK_OPEN.length);
        while (nl !== -1) {
          let k = nl + 1;
          while (k < len && (src[k] === ' ' || src[k] === '\t')) k += 1;
          if (src.startsWith(BLOCK_CLOSE, k)) {
            const eol = src.indexOf('\n', k + BLOCK_CLOSE.length);
            stop = eol === -1 ? len : eol;
            break;
          }
          nl = src.indexOf('\n', nl + 1);
        }
        pos = stop;
        blankRange(chars, from, pos, src);
        onlyWhitespaceSoFar = pos > 0 && src[pos - 1] === '\n';
        break;
      }
    }
  }

  return chars.join('');
}

// ---------- C-style (JS/TS/Java/C#/Swift) ----------

/**
 * C-family stripper (JS/TS/Java/C#/Swift): blanks block comments and `//`
 * line comments, stepping over string literals so comment markers inside
 * them are ignored. String contents are left intact.
 *
 * @param src - Source text.
 * @param allowSingleQuoteStrings - When `true` (JS/TS), `'...'` is a full
 *   string literal. When `false` (Java/C#/Swift), a single quote is only
 *   recognised as the start of a character literal (`'x'`, `'\n'`, `'"'`),
 *   which is skipped as a unit. Without that, the `"` in `'"'` would open a
 *   bogus string and hide a trailing comment on the same line from the
 *   stripper.
 * @returns `src` with comment regions replaced by spaces, newlines preserved.
 */
function stripCStyle(src: string, allowSingleQuoteStrings: boolean): string {
  const out = src.split('');
  let i = 0;
  const n = src.length;

  while (i < n) {
    const c = src[i]!;
    const c2 = src[i + 1] ?? '';

    // Block comment (an unterminated one is blanked to end of input)
    if (c === '/' && c2 === '*') {
      const start = i;
      i += 2;
      while (i < n && !(src[i] === '*' && src[i + 1] === '/')) i++;
      if (i < n) i += 2;
      blankRange(out, start, i, src);
      continue;
    }

    // Line comment
    if (c === '/' && c2 === '/') {
      const start = i;
      while (i < n && src[i] !== '\n') i++;
      blankRange(out, start, i, src);
      continue;
    }

    // Character literal in languages where single quotes are not strings.
    if (c === "'" && !allowSingleQuoteStrings) {
      let end = i + 1; // default: a lone quote, just step over it
      if (c2 === '\\') {
        // Escaped form ('\n', '\'', '\u0041'): skip the backslash and the
        // escaped character, then find the closing quote on the same line.
        let j = i + 3;
        while (j < n && src[j] !== "'" && src[j] !== '\n') j++;
        if (src[j] === "'") end = j + 1;
      } else if (c2 !== '' && c2 !== '\n' && c2 !== "'" && src[i + 2] === "'") {
        // Plain form: exactly one character between the quotes.
        end = i + 3;
      }
      i = end;
      continue;
    }

    // String literals
    if (c === '"' || (allowSingleQuoteStrings && c === "'") || c === '`') {
      const quote = c;
      i++;
      while (i < n && src[i] !== quote) {
        if (src[i] === '\\' && i + 1 < n) {
          i += 2;
          continue;
        }
        // Template literal can span lines; regular strings break on newline (treat as unterminated)
        if (quote !== '`' && src[i] === '\n') break;
        i++;
      }
      if (i < n && src[i] === quote) i++;
      continue;
    }

    i++;
  }

  return out.join('');
}

// ---------- PHP ----------

/**
 * PHP stripper: blanks block comments and `//` / `#` line comments, stepping
 * over quoted strings so comment markers inside them are ignored.
 *
 * `#[` is deliberately NOT treated as a comment: since PHP 8 it opens an
 * attribute (e.g. `#[Route('/users')]`), which is exactly the kind of text
 * route extractors need to see. The trade-off is that a pre-PHP-8 comment
 * that happens to start with `#[` is left in place.
 *
 * @param src - PHP source text.
 * @returns `src` with comment regions replaced by spaces, newlines preserved.
 */
function stripPhp(src: string): string {
  const out = src.split('');
  let i = 0;
  const n = src.length;

  while (i < n) {
    const c = src[i]!;
    const c2 = src[i + 1] ?? '';

    // Block comment (an unterminated one is blanked to end of input)
    if (c === '/' && c2 === '*') {
      const start = i;
      i += 2;
      while (i < n && !(src[i] === '*' && src[i + 1] === '/')) i++;
      if (i < n) i += 2;
      blankRange(out, start, i, src);
      continue;
    }

    // // line comment
    if (c === '/' && c2 === '/') {
      const start = i;
      while (i < n && src[i] !== '\n') i++;
      blankRange(out, start, i, src);
      continue;
    }

    // # line comment (PHP supports both) — but `#[` starts a PHP 8 attribute,
    // which must survive; it falls through and is scanned as ordinary code.
    if (c === '#' && c2 !== '[') {
      const start = i;
      while (i < n && src[i] !== '\n') i++;
      blankRange(out, start, i, src);
      continue;
    }

    // String literals: ', ", ` (PHP doesn't really use backticks for strings,
    // but it does have shell-exec backticks; treating as a string is fine here)
    if (c === '"' || c === "'" || c === '`') {
      const quote = c;
      i++;
      while (i < n && src[i] !== quote) {
        if (src[i] === '\\' && i + 1 < n) {
          i += 2;
          continue;
        }
        if (src[i] === '\n') break; // unterminated: stop at the line end
        i++;
      }
      if (i < n && src[i] === quote) i++;
      continue;
    }

    i++;
  }

  return out.join('');
}

// ---------- Go ----------

/**
 * Go stripper: blanks block comments and `//` line comments, stepping over
 * raw strings (backticks), interpreted strings (double quotes) and rune
 * literals (single quotes) so comment markers inside them are ignored.
 *
 * @param src - Go source text.
 * @returns `src` with comment regions replaced by spaces, newlines preserved.
 */
function stripGo(src: string): string {
  const chars = src.split('');
  const len = src.length;
  let pos = 0;

  while (pos < len) {
    const ch = src[pos]!;
    const next = src[pos + 1];

    // Comments: `/* ... */` (to end of input if unterminated) or `// ...`.
    if (ch === '/' && (next === '*' || next === '/')) {
      const from = pos;
      if (next === '*') {
        const close = src.indexOf('*/', pos + 2);
        pos = close === -1 ? len : close + 2;
      } else {
        const eol = src.indexOf('\n', pos);
        pos = eol === -1 ? len : eol;
      }
      blankRange(chars, from, pos, src);
      continue;
    }

    // Raw string: no escapes, may span lines; ends at the next backtick.
    if (ch === '`') {
      const close = src.indexOf('`', pos + 1);
      pos = close === -1 ? len : close + 1;
      continue;
    }

    // Interpreted string or rune literal: backslash escapes, single line.
    if (ch === '"' || ch === "'") {
      pos += 1;
      while (pos < len) {
        const cur = src[pos];
        if (cur === ch) {
          pos += 1; // consume the closing quote
          break;
        }
        if (cur === '\n') break; // unterminated: stop at the line end
        pos += cur === '\\' && pos + 1 < len ? 2 : 1;
      }
      continue;
    }

    pos += 1;
  }

  return chars.join('');
}

// ---------- Rust ----------

/**
 * Rust stripper: blanks (nestable) block comments and `//` line comments,
 * stepping over string and char literals so comment markers inside them are
 * ignored.
 *
 * A single quote is only treated as a char literal when it really looks like
 * one (`'x'` or an escaped form such as `'\n'`). Otherwise it is assumed to be
 * a lifetime or loop label (`'a`, `'static`, `'outer:`) and only the quote
 * itself is stepped over. Scanning from a lifetime quote to the next quote or
 * newline would otherwise hide real comments on that line from the stripper.
 *
 * Raw strings (`r"..."`, `r#"..."#`) get no special handling here.
 *
 * @param src - Rust source text.
 * @returns `src` with comment regions replaced by spaces, newlines preserved.
 */
function stripRust(src: string): string {
  const out = src.split('');
  let i = 0;
  const n = src.length;

  while (i < n) {
    const c = src[i]!;
    const c2 = src[i + 1] ?? '';

    // Nested block comment /* ... /* ... */ ... */
    if (c === '/' && c2 === '*') {
      const start = i;
      i += 2;
      let depth = 1;
      while (i < n && depth > 0) {
        if (src[i] === '/' && src[i + 1] === '*') {
          depth++;
          i += 2;
        } else if (src[i] === '*' && src[i + 1] === '/') {
          depth--;
          i += 2;
        } else {
          i++;
        }
      }
      blankRange(out, start, i, src);
      continue;
    }

    // Line comment
    if (c === '/' && c2 === '/') {
      const start = i;
      while (i < n && src[i] !== '\n') i++;
      blankRange(out, start, i, src);
      continue;
    }

    // String literals (may span lines)
    if (c === '"') {
      i++;
      while (i < n && src[i] !== '"') {
        if (src[i] === '\\' && i + 1 < n) {
          i += 2;
          continue;
        }
        i++;
      }
      if (i < n && src[i] === '"') i++;
      continue;
    }

    // Single quote: char literal, or a lifetime / loop label.
    if (c === "'") {
      if (c2 === '\\') {
        // Escaped char literal ('\n', '\'', '\u{1F600}'): skip the backslash
        // and the escaped character, then find the closing quote on this line.
        let j = i + 3;
        while (j < n && src[j] !== "'" && src[j] !== '\n') j++;
        i = src[j] === "'" ? j + 1 : i + 1;
        continue;
      }

      // Plain char literal: exactly one code point between the quotes. A
      // code point outside the BMP occupies two UTF-16 units in the string.
      const width = c2 >= '\uD800' && c2 <= '\uDBFF' ? 2 : 1;
      if (c2 !== '' && c2 !== '\n' && c2 !== "'" && src[i + 1 + width] === "'") {
        i += width + 2;
      } else {
        // Lifetime or label: step over the quote only so the rest of the
        // line is still scanned for comments and strings.
        i++;
      }
      continue;
    }

    i++;
  }

  return out.join('');
}