grammars.ts 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333
  1. /**
  2. * Grammar Loading and Caching
  3. *
  4. * Uses web-tree-sitter (WASM) for universal cross-platform support.
  5. * Grammars are loaded lazily — only languages actually present in the project
  6. * are compiled, keeping V8 WASM memory pressure low on large codebases.
  7. */
  8. import * as path from 'path';
  9. import { Parser, Language as WasmLanguage } from 'web-tree-sitter';
  10. import { Language } from '../types';
  11. export type GrammarLanguage = Exclude<Language, 'svelte' | 'vue' | 'liquid' | 'yaml' | 'twig' | 'unknown'>;
  12. /**
  13. * WASM filename map — maps each language to its .wasm grammar file
  14. * in the tree-sitter-wasms package.
  15. */
  16. const WASM_GRAMMAR_FILES: Record<GrammarLanguage, string> = {
  17. typescript: 'tree-sitter-typescript.wasm',
  18. tsx: 'tree-sitter-tsx.wasm',
  19. javascript: 'tree-sitter-javascript.wasm',
  20. jsx: 'tree-sitter-javascript.wasm',
  21. python: 'tree-sitter-python.wasm',
  22. go: 'tree-sitter-go.wasm',
  23. rust: 'tree-sitter-rust.wasm',
  24. java: 'tree-sitter-java.wasm',
  25. c: 'tree-sitter-c.wasm',
  26. cpp: 'tree-sitter-cpp.wasm',
  27. csharp: 'tree-sitter-c_sharp.wasm',
  28. php: 'tree-sitter-php.wasm',
  29. ruby: 'tree-sitter-ruby.wasm',
  30. swift: 'tree-sitter-swift.wasm',
  31. kotlin: 'tree-sitter-kotlin.wasm',
  32. dart: 'tree-sitter-dart.wasm',
  33. pascal: 'tree-sitter-pascal.wasm',
  34. scala: 'tree-sitter-scala.wasm',
  35. lua: 'tree-sitter-lua.wasm',
  36. luau: 'tree-sitter-luau.wasm',
  37. };
  38. /**
  39. * File extension to Language mapping
  40. */
  41. export const EXTENSION_MAP: Record<string, Language> = {
  42. '.ts': 'typescript',
  43. '.tsx': 'tsx',
  44. '.js': 'javascript',
  45. '.mjs': 'javascript',
  46. '.cjs': 'javascript',
  47. '.jsx': 'jsx',
  48. '.py': 'python',
  49. '.pyw': 'python',
  50. '.go': 'go',
  51. '.rs': 'rust',
  52. '.java': 'java',
  53. '.c': 'c',
  54. '.h': 'c', // Could also be C++, defaulting to C
  55. '.cpp': 'cpp',
  56. '.cc': 'cpp',
  57. '.cxx': 'cpp',
  58. '.hpp': 'cpp',
  59. '.hxx': 'cpp',
  60. '.cs': 'csharp',
  61. '.php': 'php',
  62. // Drupal-specific PHP file extensions
  63. '.module': 'php',
  64. '.install': 'php',
  65. '.theme': 'php',
  66. '.inc': 'php',
  67. // YAML (used for Drupal routing files; no symbol extraction, file-level tracking only)
  68. '.yml': 'yaml',
  69. '.yaml': 'yaml',
  70. // Twig templates (file-level tracking only, no symbol extraction)
  71. '.twig': 'twig',
  72. '.rb': 'ruby',
  73. '.rake': 'ruby',
  74. '.swift': 'swift',
  75. '.kt': 'kotlin',
  76. '.kts': 'kotlin',
  77. '.dart': 'dart',
  78. '.liquid': 'liquid',
  79. '.svelte': 'svelte',
  80. '.vue': 'vue',
  81. '.pas': 'pascal',
  82. '.dpr': 'pascal',
  83. '.dpk': 'pascal',
  84. '.lpr': 'pascal',
  85. '.dfm': 'pascal',
  86. '.fmx': 'pascal',
  87. '.scala': 'scala',
  88. '.sc': 'scala',
  89. '.lua': 'lua',
  90. '.luau': 'luau',
  91. };
  92. /**
  93. * Whether a file is one CodeGraph can parse, based purely on its extension.
  94. * This is the single source of truth for "should we index this file" — derived
  95. * from EXTENSION_MAP so parser support and indexing selection never drift.
  96. */
  97. export function isSourceFile(filePath: string): boolean {
  98. const dot = filePath.lastIndexOf('.');
  99. if (dot < 0) return false;
  100. return filePath.slice(dot).toLowerCase() in EXTENSION_MAP;
  101. }
  102. /**
  103. * Caches for loaded grammars and parsers
  104. */
  105. const parserCache = new Map<Language, Parser>();
  106. const languageCache = new Map<Language, WasmLanguage>();
  107. const unavailableGrammarErrors = new Map<Language, string>();
  108. let parserInitialized = false;
  109. /**
  110. * Initialize the tree-sitter WASM runtime. Must be called before loading grammars.
  111. * Does NOT load any grammar WASM files — use loadGrammarsForLanguages() for that.
  112. * Idempotent — safe to call multiple times.
  113. */
  114. export async function initGrammars(): Promise<void> {
  115. if (parserInitialized) return;
  116. await Parser.init();
  117. parserInitialized = true;
  118. }
  119. /**
  120. * Load grammar WASM files for specific languages only.
  121. * Skips languages that are already loaded or have no WASM grammar.
  122. * Must be called after initGrammars().
  123. */
  124. export async function loadGrammarsForLanguages(languages: Language[]): Promise<void> {
  125. if (!parserInitialized) {
  126. await initGrammars();
  127. }
  128. // Deduplicate and filter to languages that have WASM grammars and aren't already loaded
  129. const toLoad = [...new Set(languages)].filter(
  130. (lang): lang is GrammarLanguage =>
  131. lang in WASM_GRAMMAR_FILES &&
  132. !languageCache.has(lang) &&
  133. !unavailableGrammarErrors.has(lang)
  134. );
  135. // Load grammars sequentially to avoid web-tree-sitter WASM race condition on Node 20+
  136. // See: https://github.com/tree-sitter/tree-sitter/issues/2338
  137. for (const lang of toLoad) {
  138. const wasmFile = WASM_GRAMMAR_FILES[lang];
  139. try {
  140. // Some grammars ship their own WASMs (not in tree-sitter-wasms, or the
  141. // tree-sitter-wasms build is too old). Lua: tree-sitter-wasms ships an
  142. // ABI-13 build that corrupts the shared WASM heap under web-tree-sitter
  143. // 0.25 (drops nested calls/imports on every file after the first); we
  144. // vendor the upstream ABI-15 wasm instead.
  145. const wasmPath = (lang === 'pascal' || lang === 'scala' || lang === 'lua' || lang === 'luau')
  146. ? path.join(__dirname, 'wasm', wasmFile)
  147. : require.resolve(`tree-sitter-wasms/out/${wasmFile}`);
  148. const language = await WasmLanguage.load(wasmPath);
  149. languageCache.set(lang, language);
  150. } catch (error) {
  151. const message = error instanceof Error ? error.message : String(error);
  152. console.warn(`[CodeGraph] Failed to load ${lang} grammar — parsing will be unavailable: ${message}`);
  153. unavailableGrammarErrors.set(lang, message);
  154. }
  155. }
  156. }
  157. /**
  158. * Load ALL grammar WASM files. Convenience function for tests and
  159. * backward compatibility. Prefer loadGrammarsForLanguages() in production.
  160. */
  161. export async function loadAllGrammars(): Promise<void> {
  162. const allLanguages = Object.keys(WASM_GRAMMAR_FILES) as GrammarLanguage[];
  163. await loadGrammarsForLanguages(allLanguages);
  164. }
  165. /**
  166. * Check if grammars have been initialized
  167. */
  168. export function isGrammarsInitialized(): boolean {
  169. return parserInitialized;
  170. }
  171. /**
  172. * Get a parser for the specified language.
  173. * Returns synchronously from pre-loaded cache.
  174. */
  175. export function getParser(language: Language): Parser | null {
  176. if (parserCache.has(language)) {
  177. return parserCache.get(language)!;
  178. }
  179. const lang = languageCache.get(language);
  180. if (!lang) {
  181. return null;
  182. }
  183. const parser = new Parser();
  184. parser.setLanguage(lang);
  185. parserCache.set(language, parser);
  186. return parser;
  187. }
  188. /**
  189. * Detect language from file extension
  190. */
  191. export function detectLanguage(filePath: string, source?: string): Language {
  192. const ext = filePath.substring(filePath.lastIndexOf('.')).toLowerCase();
  193. const lang = EXTENSION_MAP[ext] || 'unknown';
  194. // .h files could be C or C++ — check source content for C++ features
  195. if (lang === 'c' && ext === '.h' && source) {
  196. if (looksLikeCpp(source)) return 'cpp';
  197. }
  198. return lang;
  199. }
  200. /**
  201. * Heuristic: does a .h file contain C++ constructs?
  202. * Checks the first ~8KB for patterns that are unique to C++ and never valid C.
  203. */
  204. function looksLikeCpp(source: string): boolean {
  205. const sample = source.substring(0, 8192);
  206. return /\bnamespace\b|\bclass\s+\w+\s*[:{]|\btemplate\s*<|\b(?:public|private|protected)\s*:|\bvirtual\b|\busing\s+(?:namespace\b|\w+\s*=)/.test(sample);
  207. }
  208. /**
  209. * Check if a language is supported (has a grammar defined).
  210. * Returns true if the grammar exists, even if not yet loaded.
  211. */
  212. export function isLanguageSupported(language: Language): boolean {
  213. if (language === 'svelte') return true; // custom extractor (script block delegation)
  214. if (language === 'vue') return true; // custom extractor (script block delegation)
  215. if (language === 'liquid') return true; // custom regex extractor
  216. if (language === 'yaml') return true; // file-level tracking only; Drupal routing extraction via framework resolver
  217. if (language === 'twig') return true; // file-level tracking only
  218. if (language === 'unknown') return false;
  219. return language in WASM_GRAMMAR_FILES;
  220. }
  221. /**
  222. * Check if a grammar has been loaded and is ready for parsing.
  223. */
  224. export function isGrammarLoaded(language: Language): boolean {
  225. if (language === 'svelte' || language === 'vue' || language === 'liquid') return true;
  226. if (language === 'yaml' || language === 'twig') return true; // no WASM grammar needed
  227. return languageCache.has(language);
  228. }
  229. /**
  230. * Get all supported languages (those with grammar definitions).
  231. */
  232. export function getSupportedLanguages(): Language[] {
  233. return [...(Object.keys(WASM_GRAMMAR_FILES) as GrammarLanguage[]), 'svelte', 'vue', 'liquid'];
  234. }
  235. /**
  236. * Reset the cached parser for a language to reclaim WASM heap memory.
  237. * The tree-sitter WASM runtime accumulates fragmented memory over thousands
  238. * of parses. Deleting and recreating the Parser instance forces the WASM
  239. * heap to reset, preventing "memory access out of bounds" crashes in
  240. * large repos.
  241. */
  242. export function resetParser(language: Language): void {
  243. const old = parserCache.get(language);
  244. if (old) {
  245. old.delete();
  246. parserCache.delete(language);
  247. }
  248. }
  249. /**
  250. * Clear parser/grammar caches (useful for testing)
  251. */
  252. export function clearParserCache(): void {
  253. for (const parser of parserCache.values()) {
  254. parser.delete();
  255. }
  256. parserCache.clear();
  257. // Note: languageCache is NOT cleared — WASM languages persist.
  258. // To fully re-init, set parserInitialized = false and call initGrammars() again.
  259. unavailableGrammarErrors.clear();
  260. }
  261. /**
  262. * Report grammars that failed to load.
  263. */
  264. export function getUnavailableGrammarErrors(): Partial<Record<Language, string>> {
  265. const out: Partial<Record<Language, string>> = {};
  266. for (const [language, message] of unavailableGrammarErrors.entries()) {
  267. out[language] = message;
  268. }
  269. return out;
  270. }
  271. /**
  272. * Get language display name
  273. */
  274. export function getLanguageDisplayName(language: Language): string {
  275. const names: Record<Language, string> = {
  276. typescript: 'TypeScript',
  277. javascript: 'JavaScript',
  278. tsx: 'TypeScript (TSX)',
  279. jsx: 'JavaScript (JSX)',
  280. python: 'Python',
  281. go: 'Go',
  282. rust: 'Rust',
  283. java: 'Java',
  284. c: 'C',
  285. cpp: 'C++',
  286. csharp: 'C#',
  287. php: 'PHP',
  288. ruby: 'Ruby',
  289. swift: 'Swift',
  290. kotlin: 'Kotlin',
  291. dart: 'Dart',
  292. svelte: 'Svelte',
  293. vue: 'Vue',
  294. liquid: 'Liquid',
  295. pascal: 'Pascal / Delphi',
  296. scala: 'Scala',
  297. lua: 'Lua',
  298. luau: 'Luau',
  299. yaml: 'YAML',
  300. twig: 'Twig',
  301. unknown: 'Unknown',
  302. };
  303. return names[language] || language;
  304. }