grammars.ts 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442
  1. /**
  2. * Grammar Loading and Caching
  3. *
  4. * Uses web-tree-sitter (WASM) for universal cross-platform support.
  5. * Grammars are loaded lazily — only languages actually present in the project
  6. * are compiled, keeping V8 WASM memory pressure low on large codebases.
  7. */
  8. import * as path from 'path';
  9. import { Parser, Language as WasmLanguage } from 'web-tree-sitter';
  10. import { Language } from '../types';
  /**
   * Languages that are backed by a tree-sitter WASM grammar: every `Language`
   * except the ones listed here, which have no entry in the grammar table below.
   */
  export type GrammarLanguage = Exclude<
    Language,
    | 'svelte'
    | 'vue'
    | 'astro'
    | 'liquid'
    | 'razor'
    | 'yaml'
    | 'twig'
    | 'xml'
    | 'properties'
    | 'unknown'
  >;

  /**
   * Grammar file name (`.wasm`) for each grammar-backed language.
   *
   * Only the bare file name is stored here; the directory it is resolved
   * against is chosen by the loader. Note that `jsx` shares the JavaScript
   * grammar file.
   */
  const WASM_GRAMMAR_FILES: Record<GrammarLanguage, string> = {
    // TypeScript / JavaScript family
    typescript: 'tree-sitter-typescript.wasm',
    tsx: 'tree-sitter-tsx.wasm',
    javascript: 'tree-sitter-javascript.wasm',
    jsx: 'tree-sitter-javascript.wasm',

    python: 'tree-sitter-python.wasm',
    go: 'tree-sitter-go.wasm',
    rust: 'tree-sitter-rust.wasm',
    java: 'tree-sitter-java.wasm',

    // C family
    c: 'tree-sitter-c.wasm',
    cpp: 'tree-sitter-cpp.wasm',
    csharp: 'tree-sitter-c_sharp.wasm',

    php: 'tree-sitter-php.wasm',
    ruby: 'tree-sitter-ruby.wasm',
    swift: 'tree-sitter-swift.wasm',
    kotlin: 'tree-sitter-kotlin.wasm',
    dart: 'tree-sitter-dart.wasm',
    pascal: 'tree-sitter-pascal.wasm',
    scala: 'tree-sitter-scala.wasm',
    lua: 'tree-sitter-lua.wasm',
    r: 'tree-sitter-r.wasm',
    luau: 'tree-sitter-luau.wasm',
    objc: 'tree-sitter-objc.wasm',
  };
  /**
   * Built-in mapping from a lower-case file extension (leading dot included)
   * to the language used to process files with that extension.
   */
  export const EXTENSION_MAP: Record<string, Language> = {
    // --- TypeScript ---
    '.ts': 'typescript',
    '.tsx': 'tsx',
    // `.mts` / `.cts` are the ESM/CJS TypeScript module extensions; they are
    // parsed as plain TypeScript, without JSX. (#366)
    '.mts': 'typescript',
    '.cts': 'typescript',

    // --- JavaScript ---
    '.js': 'javascript',
    '.mjs': 'javascript',
    '.cjs': 'javascript',
    // Server-side JavaScript for SAP HANA XS Classic. (#556)
    '.xsjs': 'javascript',
    '.xsjslib': 'javascript',
    '.jsx': 'jsx',

    // --- Python / Go / Rust / Java ---
    '.py': 'python',
    '.pyw': 'python',
    '.go': 'go',
    '.rs': 'rust',
    '.java': 'java',

    // --- C / C++ ---
    '.c': 'c',
    '.h': 'c', // Ambiguous header extension (may be C++ too); C is the default
    '.cpp': 'cpp',
    '.cc': 'cpp',
    '.cxx': 'cpp',
    '.hpp': 'cpp',
    '.hxx': 'cpp',

    // --- C# and Razor ---
    '.cs': 'csharp',
    // ASP.NET Razor / Blazor markup is handled by the custom RazorExtractor,
    // which links @model/@inject/component tags to their C# types. The markup
    // itself is not a tree-sitter grammar.
    '.cshtml': 'razor',
    '.razor': 'razor',

    // --- PHP ---
    '.php': 'php',
    // PHP files that use Drupal-specific extensions
    '.module': 'php',
    '.install': 'php',
    '.theme': 'php',
    '.inc': 'php',

    // YAML — used for Drupal routing files. Tracked at file level only, with
    // no symbol extraction.
    '.yml': 'yaml',
    '.yaml': 'yaml',
    // Twig templates — tracked at file level only, with no symbol extraction.
    '.twig': 'twig',

    // --- Ruby / Swift / Kotlin / Dart ---
    '.rb': 'ruby',
    '.rake': 'ruby',
    '.swift': 'swift',
    '.kt': 'kotlin',
    '.kts': 'kotlin',
    '.dart': 'dart',

    // --- Template / single-file-component formats ---
    '.liquid': 'liquid',
    '.svelte': 'svelte',
    '.vue': 'vue',
    '.astro': 'astro',

    '.r': 'r',

    // --- Pascal / Delphi ---
    '.pas': 'pascal',
    '.dpr': 'pascal',
    '.dpk': 'pascal',
    '.lpr': 'pascal',
    '.dfm': 'pascal',
    '.fmx': 'pascal',

    // --- Scala / Lua / Luau / Objective-C ---
    '.scala': 'scala',
    '.sc': 'scala',
    '.lua': 'lua',
    '.luau': 'luau',
    '.m': 'objc',
    '.mm': 'objc',

    // XML is tracked at file level. The MyBatis extractor recognises the
    // `<mapper namespace="...">` shape and emits SQL-statement nodes; any other
    // XML yields nothing.
    '.xml': 'xml',
    // Spring configuration (`application.properties` and
    // `application-*.properties`). Treated like the `.yml` variants: the
    // YAML/properties extractor emits one node per leaf key, and the Spring
    // resolver links `@Value("${k}")` references to them.
    '.properties': 'properties',
  };
  /**
   * Decide, from the path alone, whether a file should be indexed.
   *
   * Built-in support is read straight from `EXTENSION_MAP`, so the set of
   * indexable extensions and the set of detectable languages cannot drift
   * apart. Two path shapes are accepted regardless of extension: Play routes
   * files and Shopify JSON templates / section groups.
   *
   * @param filePath  Path of the candidate file.
   * @param overrides Optional project-specific extension → language map (the
   *                  validated map from `codegraph.json`). Its extensions are
   *                  accepted in addition to the built-ins; leaving it out
   *                  gives exactly the zero-config behavior.
   * @returns `true` when the file is indexable.
   */
  export function isSourceFile(filePath: string, overrides?: Record<string, Language>): boolean {
    // Path shapes that are indexable no matter what the extension is:
    // Play's extensionless `conf/routes`, and Shopify OS 2.0 JSON files.
    if (isPlayRoutesFile(filePath) || isShopifyLiquidJson(filePath)) {
      return true;
    }

    const dotIndex = filePath.lastIndexOf('.');
    if (dotIndex === -1) {
      return false; // no extension at all
    }

    const extension = filePath.slice(dotIndex).toLowerCase();
    if (extension in EXTENSION_MAP) {
      return true;
    }
    if (!overrides) {
      return false;
    }
    return extension in overrides;
  }
  /**
   * Recognise a Shopify OS 2.0 JSON template (`templates/*.json`) or section
   * group (`sections/*.json`). These files reference sections by `"type"`, so
   * they are routed to the Liquid extractor. JSON under `config/` or `locales/`
   * carries no section references and is not matched.
   *
   * The match is case-insensitive and works on `/`-separated paths. The
   * `templates` / `sections` directory may sit at the start of the path or
   * below other directories, and the JSON file may be nested further inside it
   * (e.g. `templates/customers/login.json`).
   *
   * @param filePath Path of the candidate file.
   * @returns `true` when the path has the Shopify template/section JSON shape.
   */
  export function isShopifyLiquidJson(filePath: string): boolean {
    const shopifyJsonPattern = /(^|\/)(templates|sections)\/.+\.json$/i;
    return shopifyJsonPattern.test(filePath);
  }
  /**
   * Recognise a Play Framework routes file: the extensionless `conf/routes`,
   * plus included `*.routes` files. There is no grammar for these; route
   * extraction is left to the Play framework resolver, so they travel through
   * the no-grammar (`yaml`-style) path.
   *
   * Note that the `.routes` suffix check is not restricted to a `conf/`
   * directory: any path ending in `.routes` is accepted.
   *
   * @param filePath `/`-separated path of the candidate file.
   * @returns `true` when the path names a Play routes file.
   */
  export function isPlayRoutesFile(filePath: string): boolean {
    if (filePath === 'conf/routes') {
      return true;
    }
    const routeSuffixes = ['/conf/routes', '.routes'];
    return routeSuffixes.some((suffix) => filePath.endsWith(suffix));
  }
  /*
   * Module-level state for loaded grammars and parsers.
   */

  /** Parser instances handed out by `getParser()`, one per language. */
  const parserCache = new Map<Language, Parser>();

  /** Grammars that `loadGrammarsForLanguages()` loaded successfully. */
  const languageCache = new Map<Language, WasmLanguage>();

  /** Error message for each grammar whose load attempt failed. */
  const unavailableGrammarErrors = new Map<Language, string>();

  /** Flipped to `true` once `initGrammars()` has completed `Parser.init()`. */
  let parserInitialized = false;
  /**
   * Start the tree-sitter WASM runtime. Grammars cannot be loaded before this
   * has completed.
   *
   * No grammar `.wasm` file is loaded here — that is the job of
   * `loadGrammarsForLanguages()`. Once a call has completed, later calls return
   * without doing anything.
   *
   * @returns A promise that settles when the runtime is ready.
   */
  export async function initGrammars(): Promise<void> {
    if (!parserInitialized) {
      await Parser.init();
      parserInitialized = true;
    }
  }
  /**
   * Load the grammar WASM files for the given languages, and only those.
   *
   * The runtime is initialized first if that has not happened yet. Languages
   * with no WASM grammar, languages already loaded, and languages whose
   * previous load attempt is recorded as failed are skipped.
   *
   * A grammar that fails to load does not reject the returned promise: the
   * failure is logged with `console.warn` and recorded in
   * `unavailableGrammarErrors`.
   *
   * @param languages Languages present in the set of files to be parsed.
   * @returns A promise that settles once every load attempt has finished.
   */
  export async function loadGrammarsForLanguages(languages: Language[]): Promise<void> {
    if (!parserInitialized) {
      await initGrammars();
    }

    // Single-file-component languages (svelte/vue/astro) bring no grammar of
    // their own: their extractors hand <script>/frontmatter content to the
    // TS/JS extractor. The TS and JS grammars are therefore required even when
    // the index set contains no plain .ts/.js file (a pure-.astro content site,
    // for example).
    const needsScriptGrammars = languages.some(
      (l) => l === 'svelte' || l === 'vue' || l === 'astro'
    );
    const requested: Language[] = needsScriptGrammars
      ? [...languages, 'typescript', 'javascript']
      : languages;

    // Grammars whose WASM is shipped next to this module instead of being taken
    // from tree-sitter-wasms — either because that package does not carry them
    // or because its build is too old.
    //  - Lua: tree-sitter-wasms ships an ABI-13 build that corrupts the shared
    //    WASM heap under web-tree-sitter 0.25 (nested calls/imports are dropped
    //    on every file after the first), so the upstream ABI-15 wasm is
    //    vendored.
    //  - C#: the tree-sitter-wasms build (ABI 13) lacks primary-constructor
    //    support and parses `class Foo(...)` as an ERROR that swallows the
    //    whole class (#237); the vendored upstream ABI-15 tree-sitter-c-sharp
    //    0.23.5 wasm parses primary constructors natively.
    const vendored = new Set<GrammarLanguage>(['pascal', 'scala', 'lua', 'luau', 'csharp', 'r']);

    // De-duplicate, then keep only grammar-backed languages that are neither
    // loaded already nor known to be unavailable.
    const pending = Array.from(new Set(requested)).filter(
      (candidate): candidate is GrammarLanguage =>
        candidate in WASM_GRAMMAR_FILES &&
        !languageCache.has(candidate) &&
        !unavailableGrammarErrors.has(candidate)
    );

    // One grammar at a time: loading in parallel hits a web-tree-sitter WASM
    // race condition on Node 20+.
    // See: https://github.com/tree-sitter/tree-sitter/issues/2338
    for (const lang of pending) {
      const wasmFile = WASM_GRAMMAR_FILES[lang];
      try {
        let wasmPath: string;
        if (vendored.has(lang)) {
          wasmPath = path.join(__dirname, 'wasm', wasmFile);
        } else {
          wasmPath = require.resolve(`tree-sitter-wasms/out/${wasmFile}`);
        }
        languageCache.set(lang, await WasmLanguage.load(wasmPath));
      } catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        console.warn(`[CodeGraph] Failed to load ${lang} grammar — parsing will be unavailable: ${message}`);
        // Remember the failure so this grammar is not attempted again while
        // the record exists.
        unavailableGrammarErrors.set(lang, message);
      }
    }
  }
  /**
   * Load the grammar for every language in `WASM_GRAMMAR_FILES`.
   *
   * Intended for tests and for backward compatibility; production code should
   * call `loadGrammarsForLanguages()` with just the languages it needs.
   *
   * @returns A promise that settles once every load attempt has finished.
   */
  export async function loadAllGrammars(): Promise<void> {
    await loadGrammarsForLanguages(Object.keys(WASM_GRAMMAR_FILES) as GrammarLanguage[]);
  }
  /**
   * Report whether the tree-sitter WASM runtime has been initialized.
   *
   * @returns `true` once `initGrammars()` has completed, `false` before that.
   */
  export function isGrammarsInitialized(): boolean {
    return parserInitialized;
  }
  /**
   * Return a parser configured for `language`.
   *
   * This is synchronous: it only consults the caches, so the grammar must have
   * been loaded beforehand. The first request for a language creates a parser
   * and caches it; later requests return that same instance.
   *
   * @param language Language to obtain a parser for.
   * @returns The parser, or `null` when no grammar is loaded for `language`.
   */
  export function getParser(language: Language): Parser | null {
    const cached = parserCache.get(language);
    if (cached) {
      return cached;
    }

    const grammar = languageCache.get(language);
    if (!grammar) {
      return null;
    }

    const created = new Parser();
    created.setLanguage(grammar);
    parserCache.set(language, created);
    return created;
  }
  /**
   * Detect the language of a file from its path, refined by content for `.h`.
   *
   * Resolution order:
   *  1. Play routes files → `'yaml'` (no grammar; the Play framework resolver
   *     extracts route nodes from them).
   *  2. Shopify OS 2.0 JSON templates / section groups → `'liquid'` (the Liquid
   *     extractor links each section `"type"` to its `sections/<type>.liquid`).
   *  3. The lower-cased extension, looked up in `overrides` first and then in
   *     the built-in `EXTENSION_MAP`.
   *  4. `'unknown'` when nothing matches, including when the path contains no
   *     `.` at all.
   *
   * @param filePath  Path of the file.
   * @param source    Optional file content; only consulted to disambiguate
   *                  `.h` headers between C, C++ and Objective-C.
   * @param overrides Optional project-specific extension → language map (the
   *                  validated map from `codegraph.json`). Its mappings take
   *                  precedence over the built-ins; leaving it out gives
   *                  exactly the zero-config behavior.
   * @returns The detected language, or `'unknown'`.
   */
  export function detectLanguage(filePath: string, source?: string, overrides?: Record<string, Language>): Language {
    if (isPlayRoutesFile(filePath)) return 'yaml';
    if (isShopifyLiquidJson(filePath)) return 'liquid';

    // A path without any '.' has no extension. Without this guard the whole
    // path would be used as the lookup key (`substring(-1)` is `substring(0)`),
    // so a file literally named `constructor` or `__proto__` would resolve to
    // an inherited Object.prototype member instead of a language. With the
    // guard, every key looked up below starts with '.'.
    const dot = filePath.lastIndexOf('.');
    if (dot < 0) return 'unknown';
    const ext = filePath.slice(dot).toLowerCase();

    const lang = (overrides && overrides[ext]) || EXTENSION_MAP[ext] || 'unknown';

    // A `.h` header may be C, C++ or Objective-C; when the content is
    // available, let it decide. C++ is checked before Objective-C.
    if (lang === 'c' && ext === '.h' && source) {
      if (looksLikeCpp(source)) return 'cpp';
      if (looksLikeObjc(source)) return 'objc';
    }
    return lang;
  }
  /**
   * Heuristic used for `.h` files: does the content look like C++?
   *
   * Only the first 8192 characters are inspected. The content is considered
   * C++ when it contains any of: the word `namespace`, a `class Name` followed
   * by `:` or `{`, `template <`, an access specifier (`public:` / `private:` /
   * `protected:`), the word `virtual`, `using namespace`, or a `using X =`
   * alias.
   *
   * @param source Content of the header file.
   * @returns `true` when a C++ marker is found in the inspected prefix.
   */
  function looksLikeCpp(source: string): boolean {
    const cppMarkers = /\bnamespace\b|\bclass\s+\w+\s*[:{]|\btemplate\s*<|\b(?:public|private|protected)\s*:|\bvirtual\b|\busing\s+(?:namespace\b|\w+\s*=)/;
    const head = source.slice(0, 8192);
    return cppMarkers.test(head);
  }
  /**
   * Heuristic used for `.h` files: does the content look like Objective-C?
   *
   * Only the first 8192 characters are inspected, looking for one of the
   * directives `@interface`, `@implementation`, `@protocol` or `@synthesize`.
   *
   * @param source Content of the header file.
   * @returns `true` when one of those directives is found in the inspected prefix.
   */
  function looksLikeObjc(source: string): boolean {
    const objcDirectives = /@(?:interface|implementation|protocol|synthesize)\b/;
    const head = source.slice(0, 8192);
    return objcDirectives.test(head);
  }
  /**
   * Report whether a language can be processed at all.
   *
   * A language is supported when it has an entry in `WASM_GRAMMAR_FILES`
   * (whether or not that grammar has been loaded yet), or when it is one of the
   * languages handled without a grammar. `'unknown'` is never supported.
   *
   * @param language Language to check.
   * @returns `true` when the language is supported.
   */
  export function isLanguageSupported(language: Language): boolean {
    switch (language) {
      case 'svelte': // custom extractor (script block delegation)
      case 'vue': // custom extractor (script block delegation)
      case 'astro': // custom extractor (frontmatter/script block delegation)
      case 'liquid': // custom regex extractor
      case 'razor': // custom RazorExtractor (.cshtml/.razor markup)
      case 'yaml': // file-level tracking only; Drupal routing extraction via framework resolver
      case 'twig': // file-level tracking only
      case 'xml': // MyBatis mapper extractor
      case 'properties': // Spring config keys
        return true;
      case 'unknown':
        return false;
      default:
        return language in WASM_GRAMMAR_FILES;
    }
  }
  /**
   * Report whether a language is ready to be processed right now.
   *
   * Languages that need no WASM grammar are always ready. Every other language
   * is ready only once its grammar is present in `languageCache`.
   *
   * @param language Language to check.
   * @returns `true` when files of this language can be processed immediately.
   */
  export function isGrammarLoaded(language: Language): boolean {
    switch (language) {
      // Handled by custom extractors
      case 'svelte':
      case 'vue':
      case 'astro':
      case 'liquid':
      case 'razor':
      // No WASM grammar needed
      case 'yaml':
      case 'twig':
      case 'xml':
      case 'properties':
        return true;
      default:
        return languageCache.has(language);
    }
  }
  /**
   * Report whether a language is tracked at the file-record level only.
   *
   * For these languages parsing emits no symbol nodes, yet the file is still
   * stored, and framework resolvers may attach per-file references later (for
   * example Drupal routing yml, or Spring `@Value` against
   * application.properties). Callers use this to count such files as indexed
   * rather than skipped.
   *
   * This is the canonical set behind the no-symbol branch in `tree-sitter.ts`
   * and has to be kept in sync with it. `xml` is deliberately not part of the
   * set, because its MyBatis extractor emits a file node.
   *
   * @param language Language to check.
   * @returns `true` for `'yaml'`, `'twig'` and `'properties'`; `false` otherwise.
   */
  export function isFileLevelOnlyLanguage(language: Language): boolean {
    switch (language) {
      case 'yaml':
      case 'twig':
      case 'properties':
        return true;
      default:
        return false;
    }
  }
  /**
   * List the supported languages: every language with an entry in
   * `WASM_GRAMMAR_FILES`, followed by `svelte`, `vue`, `astro` and `liquid`.
   *
   * NOTE(review): `razor`, `yaml`, `twig`, `xml` and `properties` are accepted
   * by `isLanguageSupported()` but are not part of this list — confirm whether
   * that difference is intentional.
   *
   * @returns A new array on every call.
   */
  export function getSupportedLanguages(): Language[] {
    const grammarBacked = Object.keys(WASM_GRAMMAR_FILES) as GrammarLanguage[];
    const extractorBacked: Language[] = ['svelte', 'vue', 'astro', 'liquid'];
    return [...grammarBacked, ...extractorBacked];
  }
  /**
   * Dispose of the cached parser for a language so that the next `getParser()`
   * call builds a fresh one.
   *
   * Rationale, as recorded for this module: the tree-sitter WASM runtime
   * accumulates fragmented memory over thousands of parses, and replacing the
   * Parser instance is used to reclaim WASM heap memory and avoid "memory
   * access out of bounds" crashes in large repos.
   *
   * Does nothing when no parser is cached for `language`.
   *
   * @param language Language whose cached parser should be discarded.
   */
  export function resetParser(language: Language): void {
    const existing = parserCache.get(language);
    if (!existing) {
      return;
    }
    existing.delete();
    parserCache.delete(language);
  }
  /**
   * Dispose of every cached parser and forget recorded grammar load failures
   * (useful for testing).
   *
   * Loaded grammars are kept: `languageCache` is deliberately left alone, and
   * `parserInitialized` is not touched either. Because the failure records are
   * dropped, grammars that previously failed become eligible for another load
   * attempt.
   */
  export function clearParserCache(): void {
    parserCache.forEach((parser) => {
      parser.delete();
    });
    parserCache.clear();
    // languageCache is intentionally NOT cleared — WASM languages persist.
    unavailableGrammarErrors.clear();
  }
  /**
   * Report the grammars that failed to load.
   *
   * @returns A fresh object mapping each failed language to the error message
   *          recorded for it; empty when nothing has failed.
   */
  export function getUnavailableGrammarErrors(): Partial<Record<Language, string>> {
    const report: Partial<Record<Language, string>> = {};
    unavailableGrammarErrors.forEach((message, language) => {
      report[language] = message;
    });
    return report;
  }
  /**
   * Return the human-readable name of a language.
   *
   * @param language Language identifier.
   * @returns The display name, or the identifier itself when the table has no
   *          entry for it.
   */
  export function getLanguageDisplayName(language: Language): string {
    // Keys are listed alphabetically; one entry per `Language` member.
    const displayNames: Record<Language, string> = {
      astro: 'Astro',
      c: 'C',
      cpp: 'C++',
      csharp: 'C#',
      dart: 'Dart',
      go: 'Go',
      java: 'Java',
      javascript: 'JavaScript',
      jsx: 'JavaScript (JSX)',
      kotlin: 'Kotlin',
      liquid: 'Liquid',
      lua: 'Lua',
      luau: 'Luau',
      objc: 'Objective-C',
      pascal: 'Pascal / Delphi',
      php: 'PHP',
      properties: 'Java properties',
      python: 'Python',
      r: 'R',
      razor: 'Razor/Blazor',
      ruby: 'Ruby',
      rust: 'Rust',
      scala: 'Scala',
      svelte: 'Svelte',
      swift: 'Swift',
      tsx: 'TypeScript (TSX)',
      twig: 'Twig',
      typescript: 'TypeScript',
      unknown: 'Unknown',
      vue: 'Vue',
      xml: 'XML',
      yaml: 'YAML',
    };

    const label = displayNames[language];
    return label ? label : language;
  }