grammars.ts 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221
  1. /**
  2. * Grammar Loading and Caching
  3. *
  4. * Uses web-tree-sitter (WASM) for universal cross-platform support.
  5. * All grammars are pre-loaded asynchronously via initGrammars(), then
  6. * getParser() returns synchronously from cache.
  7. */
  8. import * as path from 'path';
  9. import { Parser, Language as WasmLanguage } from 'web-tree-sitter';
  10. import { Language } from '../types';
  /**
   * Languages that are parsed through a tree-sitter WASM grammar, i.e. every
   * Language except 'svelte', 'liquid' and 'unknown'.
   */
  type GrammarLanguage = Exclude<Language, 'unknown' | 'liquid' | 'svelte'>;

  /**
   * Grammar file lookup: each grammar-backed language is keyed to the name of
   * the `.wasm` file that provides its tree-sitter grammar.
   *
   * Key order is significant to code that iterates this table, so entries are
   * kept in their established order.
   */
  const WASM_GRAMMAR_FILES: Record<GrammarLanguage, string> = {
    // TypeScript / JavaScript family (jsx reuses the JavaScript grammar file)
    typescript: 'tree-sitter-typescript.wasm',
    tsx: 'tree-sitter-tsx.wasm',
    javascript: 'tree-sitter-javascript.wasm',
    jsx: 'tree-sitter-javascript.wasm',

    // General-purpose languages
    python: 'tree-sitter-python.wasm',
    go: 'tree-sitter-go.wasm',
    rust: 'tree-sitter-rust.wasm',
    java: 'tree-sitter-java.wasm',

    // C family
    c: 'tree-sitter-c.wasm',
    cpp: 'tree-sitter-cpp.wasm',
    csharp: 'tree-sitter-c_sharp.wasm',

    // Everything else
    php: 'tree-sitter-php.wasm',
    ruby: 'tree-sitter-ruby.wasm',
    swift: 'tree-sitter-swift.wasm',
    kotlin: 'tree-sitter-kotlin.wasm',
    dart: 'tree-sitter-dart.wasm',
    pascal: 'tree-sitter-pascal.wasm',
  };
  /**
   * Lookup table from a lower-case file extension (leading dot included) to
   * the Language assigned to files with that extension.
   */
  export const EXTENSION_MAP: Record<string, Language> = {
    // TypeScript / JavaScript
    '.ts': 'typescript',
    '.tsx': 'tsx',
    '.js': 'javascript',
    '.mjs': 'javascript',
    '.cjs': 'javascript',
    '.jsx': 'jsx',

    // Python
    '.py': 'python',
    '.pyw': 'python',

    // Go, Rust, Java
    '.go': 'go',
    '.rs': 'rust',
    '.java': 'java',

    // C and C++ — a `.h` header may really be C++, but it is treated as C here
    '.c': 'c',
    '.h': 'c',
    '.cpp': 'cpp',
    '.cc': 'cpp',
    '.cxx': 'cpp',
    '.hpp': 'cpp',
    '.hxx': 'cpp',

    // C#, PHP, Ruby
    '.cs': 'csharp',
    '.php': 'php',
    '.rb': 'ruby',
    '.rake': 'ruby',

    // Swift, Kotlin, Dart
    '.swift': 'swift',
    '.kt': 'kotlin',
    '.kts': 'kotlin',
    '.dart': 'dart',

    // Template / component formats
    '.liquid': 'liquid',
    '.svelte': 'svelte',

    // Pascal-family extensions
    '.pas': 'pascal',
    '.dpr': 'pascal',
    '.dpk': 'pascal',
    '.lpr': 'pascal',
    '.dfm': 'pascal',
    '.fmx': 'pascal',
  };
  /**
   * Module-level state shared by the grammar and parser functions in this file.
   */

  // Parser instances, one per language, created lazily by getParser().
  const parserCache: Map<Language, Parser> = new Map();

  // Loaded WASM grammars, filled in by initGrammars().
  const languageCache: Map<Language, WasmLanguage> = new Map();

  // Error message recorded for each grammar that failed to load.
  const unavailableGrammarErrors: Map<Language, string> = new Map();

  // Becomes true once initGrammars() has finished its loading pass.
  let grammarsInitialized = false;
  /**
   * Initialization currently in progress, shared by every concurrent
   * initGrammars() caller. `null` whenever no initialization is running.
   */
  let grammarsInitInFlight: Promise<void> | null = null;

  /**
   * Initialize the WASM runtime and pre-load all grammars. Must be called
   * before any parsing.
   *
   * Idempotent: once initialization has completed, further calls return
   * immediately. Calls made while an initialization is still running wait on
   * that same run instead of loading every grammar a second time.
   *
   * A grammar that fails to load does not fail initialization; it is logged
   * and recorded in `unavailableGrammarErrors`. If `Parser.init()` itself
   * rejects, the returned promise rejects and a later call will try again.
   *
   * @returns A promise that resolves once the loading pass has finished.
   */
  export async function initGrammars(): Promise<void> {
    if (grammarsInitialized) return;

    if (grammarsInitInFlight === null) {
      // Drop the shared promise when the run settles so that a failed run can
      // be retried and a completed run is governed by the flag alone.
      grammarsInitInFlight = loadAllGrammars().finally(() => {
        grammarsInitInFlight = null;
      });
    }

    await grammarsInitInFlight;
  }

  /** Runs one full initialization pass: WASM runtime first, then every grammar in parallel. */
  async function loadAllGrammars(): Promise<void> {
    await Parser.init();

    const entries = Object.entries(WASM_GRAMMAR_FILES) as [GrammarLanguage, string][];
    await Promise.allSettled(entries.map(([lang, wasmFile]) => loadGrammar(lang, wasmFile)));

    grammarsInitialized = true;
  }

  /** Loads a single grammar into `languageCache`; a failure is logged and recorded, never thrown. */
  async function loadGrammar(lang: GrammarLanguage, wasmFile: string): Promise<void> {
    try {
      // Pascal ships its own WASM (not in tree-sitter-wasms)
      const wasmPath = lang === 'pascal'
        ? path.join(__dirname, 'wasm', wasmFile)
        : require.resolve(`tree-sitter-wasms/out/${wasmFile}`);
      const language = await WasmLanguage.load(wasmPath);
      languageCache.set(lang, language);
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`[CodeGraph] Failed to load ${lang} grammar — parsing will be unavailable: ${message}`);
      unavailableGrammarErrors.set(lang, message);
    }
  }
  /**
   * Report whether grammar initialization has completed.
   *
   * @returns The current value of the module's `grammarsInitialized` flag.
   */
  export function isGrammarsInitialized(): boolean {
    const ready = grammarsInitialized;
    return ready;
  }
  /**
   * Look up (or lazily build) the parser for a language.
   *
   * Works synchronously: it only consults the in-memory caches and never
   * loads a grammar itself.
   *
   * @param language - Language whose parser is wanted.
   * @returns The cached parser, a newly created and cached parser when the
   *   grammar is loaded, or `null` when no grammar is loaded for `language`.
   */
  export function getParser(language: Language): Parser | null {
    const cached = parserCache.get(language);
    if (cached) {
      return cached;
    }

    const grammar = languageCache.get(language);
    if (!grammar) {
      return null;
    }

    // First request for this language: build the parser once and remember it.
    const created = new Parser();
    created.setLanguage(grammar);
    parserCache.set(language, created);
    return created;
  }
  /**
   * Detect the language of a file from its extension.
   *
   * The extension is everything from the last `.` in `filePath` onward,
   * lower-cased and looked up in EXTENSION_MAP.
   *
   * @param filePath - File name or path to classify.
   * @returns The mapped language, or `'unknown'` when the path contains no `.`
   *   or its extension has no entry in EXTENSION_MAP.
   */
  export function detectLanguage(filePath: string): Language {
    const dotIndex = filePath.lastIndexOf('.');

    // No dot means no extension. Without this guard the -1 index made the
    // whole path the lookup key, so a bare name such as "constructor" or
    // "__proto__" resolved to a member inherited from Object.prototype.
    if (dotIndex === -1) {
      return 'unknown';
    }

    const ext = filePath.slice(dotIndex).toLowerCase();
    return EXTENSION_MAP[ext] || 'unknown';
  }
  /**
   * Tell whether files of the given language can currently be processed.
   *
   * @param language - Language to check.
   * @returns `true` for 'svelte' and 'liquid', `false` for 'unknown', and for
   *   any other language whether its grammar is present in `languageCache`.
   */
  export function isLanguageSupported(language: Language): boolean {
    switch (language) {
      case 'svelte': // custom extractor (script block delegation)
      case 'liquid': // custom regex extractor
        return true;
      case 'unknown':
        return false;
      default:
        return languageCache.has(language);
    }
  }
  /**
   * List every language that can currently be processed.
   *
   * @returns The grammar-backed languages whose grammar is present in
   *   `languageCache`, in WASM_GRAMMAR_FILES key order, followed by 'svelte'
   *   and 'liquid'.
   */
  export function getSupportedLanguages(): Language[] {
    const supported: Language[] = [];

    for (const candidate of Object.keys(WASM_GRAMMAR_FILES) as GrammarLanguage[]) {
      if (languageCache.has(candidate)) {
        supported.push(candidate);
      }
    }

    supported.push('svelte', 'liquid');
    return supported;
  }
  /**
   * Reset the parser cache and the recorded grammar-load errors (useful for
   * testing).
   *
   * Loaded grammars are deliberately kept: `languageCache` is not touched, and
   * the module-private `grammarsInitialized` flag is not reset, so a later
   * initGrammars() call will not reload anything.
   */
  export function clearParserCache(): void {
    for (const cache of [parserCache, unavailableGrammarErrors]) {
      cache.clear();
    }
  }
  /**
   * Snapshot the grammars that failed to load.
   *
   * @returns A new plain object mapping each failed language to the error
   *   message recorded for it; empty when no failure is recorded.
   */
  export function getUnavailableGrammarErrors(): Partial<Record<Language, string>> {
    const report: Partial<Record<Language, string>> = {};

    unavailableGrammarErrors.forEach((message, language) => {
      report[language] = message;
    });

    return report;
  }
  /**
   * Translate a language identifier into its human-readable label.
   *
   * @param language - Language identifier to describe.
   * @returns The display label, or the identifier itself when the table has
   *   no label for it.
   */
  export function getLanguageDisplayName(language: Language): string {
    const labels: Record<Language, string> = {
      // Script and web languages
      javascript: 'JavaScript',
      jsx: 'JavaScript (JSX)',
      typescript: 'TypeScript',
      tsx: 'TypeScript (TSX)',
      svelte: 'Svelte',
      liquid: 'Liquid',
      php: 'PHP',
      python: 'Python',
      ruby: 'Ruby',

      // Compiled languages
      c: 'C',
      cpp: 'C++',
      csharp: 'C#',
      dart: 'Dart',
      go: 'Go',
      java: 'Java',
      kotlin: 'Kotlin',
      pascal: 'Pascal / Delphi',
      rust: 'Rust',
      swift: 'Swift',

      // Fallback bucket
      unknown: 'Unknown',
    };

    const label = labels[language];
    if (label) {
      return label;
    }
    return language;
  }