فهرست منبع

Merge pull request #61 from colbymchenry/fix/wasm-oom-lazy-grammars

fix: Lazy grammar loading to prevent V8 WASM OOM on large codebases
Colby Mchenry 3 ماه پیش
والد
کامیت
3ffcac781e
6 فایلهای تغییر یافته به همراه 81 افزوده شده و 28 حذف شده
  1. 2 1
      __tests__/extraction.test.ts
  2. 2 0
      __tests__/pr19-improvements.test.ts
  3. 58 22
      src/extraction/grammars.ts
  4. 15 3
      src/extraction/index.ts
  5. 1 1
      src/index.ts
  6. 3 1
      src/vectors/embedder.ts

+ 2 - 1
__tests__/extraction.test.ts

@@ -10,12 +10,13 @@ import * as path from 'path';
 import * as os from 'os';
 import { CodeGraph } from '../src';
 import { extractFromSource, scanDirectory, shouldIncludeFile } from '../src/extraction';
-import { detectLanguage, isLanguageSupported, getSupportedLanguages, initGrammars } from '../src/extraction/grammars';
+import { detectLanguage, isLanguageSupported, getSupportedLanguages, initGrammars, loadAllGrammars } from '../src/extraction/grammars';
 import { normalizePath } from '../src/utils';
 import { DEFAULT_CONFIG } from '../src/types';
 
 beforeAll(async () => {
   await initGrammars();
+  await loadAllGrammars();
 });
 
 // Create a temporary directory for each test

+ 2 - 0
__tests__/pr19-improvements.test.ts

@@ -25,10 +25,12 @@ import {
   clearParserCache,
   getUnavailableGrammarErrors,
   initGrammars,
+  loadAllGrammars,
 } from '../src/extraction/grammars';
 
 beforeAll(async () => {
   await initGrammars();
+  await loadAllGrammars();
 });
 
 // Create a temporary directory for each test

+ 58 - 22
src/extraction/grammars.ts

@@ -2,15 +2,15 @@
  * Grammar Loading and Caching
  *
  * Uses web-tree-sitter (WASM) for universal cross-platform support.
- * All grammars are pre-loaded asynchronously via initGrammars(), then
- * getParser() returns synchronously from cache.
+ * Grammars are loaded lazily — only languages actually present in the project
+ * are compiled, keeping V8 WASM memory pressure low on large codebases.
  */
 
 import * as path from 'path';
 import { Parser, Language as WasmLanguage } from 'web-tree-sitter';
 import { Language } from '../types';
 
-type GrammarLanguage = Exclude<Language, 'svelte' | 'liquid' | 'unknown'>;
+export type GrammarLanguage = Exclude<Language, 'svelte' | 'liquid' | 'unknown'>;
 
 /**
  * WASM filename map — maps each language to its .wasm grammar file
@@ -83,43 +83,72 @@ const parserCache = new Map<Language, Parser>();
 const languageCache = new Map<Language, WasmLanguage>();
 const unavailableGrammarErrors = new Map<Language, string>();
 
-let grammarsInitialized = false;
+let parserInitialized = false;
 
 /**
- * Initialize all WASM grammars. Must be called before any parsing.
+ * Initialize the tree-sitter WASM runtime. Must be called before loading grammars.
+ * Does NOT load any grammar WASM files — use loadGrammarsForLanguages() for that.
  * Idempotent — safe to call multiple times.
  */
 export async function initGrammars(): Promise<void> {
-  if (grammarsInitialized) return;
+  if (parserInitialized) return;
 
   await Parser.init();
 
+  parserInitialized = true;
+}
+
+/**
+ * Load grammar WASM files for specific languages only.
+ * Skips languages that are already loaded or have no WASM grammar.
+ * Must be called after initGrammars().
+ */
+export async function loadGrammarsForLanguages(languages: Language[]): Promise<void> {
+  if (!parserInitialized) {
+    await initGrammars();
+  }
+
+  // Deduplicate and filter to languages that have WASM grammars and aren't already loaded
+  const toLoad = [...new Set(languages)].filter(
+    (lang): lang is GrammarLanguage =>
+      lang in WASM_GRAMMAR_FILES &&
+      !languageCache.has(lang) &&
+      !unavailableGrammarErrors.has(lang)
+  );
+
   // Load grammars sequentially to avoid web-tree-sitter WASM race condition on Node 20+
   // See: https://github.com/tree-sitter/tree-sitter/issues/2338
-  const entries = Object.entries(WASM_GRAMMAR_FILES) as [GrammarLanguage, string][];
-  for (const [lang, wasmFile] of entries) {
+  for (const lang of toLoad) {
+    const wasmFile = WASM_GRAMMAR_FILES[lang];
     try {
-        // Pascal ships its own WASM (not in tree-sitter-wasms)
-        const wasmPath = lang === 'pascal'
-          ? path.join(__dirname, 'wasm', wasmFile)
-          : require.resolve(`tree-sitter-wasms/out/${wasmFile}`);
-        const language = await WasmLanguage.load(wasmPath);
-        languageCache.set(lang, language);
+      // Pascal ships its own WASM (not in tree-sitter-wasms)
+      const wasmPath = lang === 'pascal'
+        ? path.join(__dirname, 'wasm', wasmFile)
+        : require.resolve(`tree-sitter-wasms/out/${wasmFile}`);
+      const language = await WasmLanguage.load(wasmPath);
+      languageCache.set(lang, language);
     } catch (error) {
       const message = error instanceof Error ? error.message : String(error);
       console.warn(`[CodeGraph] Failed to load ${lang} grammar — parsing will be unavailable: ${message}`);
       unavailableGrammarErrors.set(lang, message);
     }
   }
+}
 
-  grammarsInitialized = true;
+/**
+ * Load ALL grammar WASM files. Convenience function for tests and
+ * backward compatibility. Prefer loadGrammarsForLanguages() in production.
+ */
+export async function loadAllGrammars(): Promise<void> {
+  const allLanguages = Object.keys(WASM_GRAMMAR_FILES) as GrammarLanguage[];
+  await loadGrammarsForLanguages(allLanguages);
 }
 
 /**
  * Check if grammars have been initialized
  */
 export function isGrammarsInitialized(): boolean {
-  return grammarsInitialized;
+  return parserInitialized;
 }
 
 /**
@@ -151,22 +180,29 @@ export function detectLanguage(filePath: string): Language {
 }
 
 /**
- * Check if a language is supported by currently available parsers.
+ * Check if a language is supported (has a grammar defined).
+ * Returns true if the grammar exists, even if not yet loaded.
  */
 export function isLanguageSupported(language: Language): boolean {
   if (language === 'svelte') return true; // custom extractor (script block delegation)
   if (language === 'liquid') return true; // custom regex extractor
   if (language === 'unknown') return false;
+  return language in WASM_GRAMMAR_FILES;
+}
+
+/**
+ * Check if a grammar has been loaded and is ready for parsing.
+ */
+export function isGrammarLoaded(language: Language): boolean {
+  if (language === 'svelte' || language === 'liquid') return true;
   return languageCache.has(language);
 }
 
 /**
- * Get all currently supported languages.
+ * Get all supported languages (those with grammar definitions).
  */
 export function getSupportedLanguages(): Language[] {
-  const available = (Object.keys(WASM_GRAMMAR_FILES) as GrammarLanguage[])
-    .filter((language) => languageCache.has(language));
-  return [...available, 'svelte', 'liquid'];
+  return [...(Object.keys(WASM_GRAMMAR_FILES) as GrammarLanguage[]), 'svelte', 'liquid'];
 }
 
 /**
@@ -175,7 +211,7 @@ export function getSupportedLanguages(): Language[] {
 export function clearParserCache(): void {
   parserCache.clear();
   // Note: languageCache is NOT cleared — WASM languages persist.
-  // To fully re-init, set grammarsInitialized = false and call initGrammars() again.
+  // To fully re-init, set parserInitialized = false and call initGrammars() again.
   unavailableGrammarErrors.clear();
 }
 

+ 15 - 3
src/extraction/index.ts

@@ -18,7 +18,7 @@ import {
 } from '../types';
 import { QueryBuilder } from '../db/queries';
 import { extractFromSource } from './tree-sitter';
-import { detectLanguage, isLanguageSupported, initGrammars } from './grammars';
+import { detectLanguage, isLanguageSupported, initGrammars, loadGrammarsForLanguages } from './grammars';
 import { logDebug, logWarn } from '../errors';
 import { captureException } from '../sentry';
 import { validatePathWithinRoot, normalizePath } from '../utils';
@@ -378,6 +378,12 @@ export class ExtractionOrchestrator {
       };
     }
 
+    // Load only the grammars needed for languages actually present in the project.
+    // This avoids compiling all 16+ WASM grammar modules upfront, which can cause
+    // V8 WASM Zone OOM on large codebases (see issue #54).
+    const neededLanguages = [...new Set(files.map((f) => detectLanguage(f)))];
+    await loadGrammarsForLanguages(neededLanguages);
+
     // Phase 2: Parse files (read in parallel batches, parse/store sequentially)
     const total = files.length;
     let processed = 0;
@@ -683,7 +689,7 @@ export class ExtractionOrchestrator {
    * Uses git status as a fast path when available, falling back to full scan.
    */
   async sync(onProgress?: (progress: IndexProgress) => void): Promise<SyncResult> {
-    await initGrammars();
+    await initGrammars(); // Initialize WASM runtime (grammars loaded lazily below)
     const startTime = Date.now();
     let filesChecked = 0;
     let filesAdded = 0;
@@ -794,6 +800,12 @@ export class ExtractionOrchestrator {
       }
     }
 
+    // Load only grammars needed for changed files
+    if (filesToIndex.length > 0) {
+      const neededLanguages = [...new Set(filesToIndex.map((f) => detectLanguage(f)))];
+      await loadGrammarsForLanguages(neededLanguages);
+    }
+
     // Index changed files
     const total = filesToIndex.length;
     for (let i = 0; i < filesToIndex.length; i++) {
@@ -920,4 +932,4 @@ export class ExtractionOrchestrator {
 
 // Re-export useful types and functions
 export { extractFromSource } from './tree-sitter';
-export { detectLanguage, isLanguageSupported, getSupportedLanguages, initGrammars } from './grammars';
+export { detectLanguage, isLanguageSupported, isGrammarLoaded, getSupportedLanguages, initGrammars, loadGrammarsForLanguages, loadAllGrammars } from './grammars';

+ 1 - 1
src/index.ts

@@ -61,7 +61,7 @@ export {
   CODEGRAPH_DIR,
 } from './directory';
 export { IndexProgress, IndexResult, SyncResult } from './extraction';
-export { detectLanguage, isLanguageSupported, getSupportedLanguages, initGrammars } from './extraction';
+export { detectLanguage, isLanguageSupported, isGrammarLoaded, getSupportedLanguages, initGrammars, loadGrammarsForLanguages, loadAllGrammars } from './extraction';
 export { ResolutionResult } from './resolution';
 export { EmbeddingProgress } from './vectors';
 export {

+ 3 - 1
src/vectors/embedder.ts

@@ -127,8 +127,10 @@ export class TextEmbedder {
       env.allowRemoteModels = false;
     }
 
-    // Load the pipeline
+    // Load the pipeline with quantized model to reduce WASM memory pressure.
+    // Quantized (int8/uint8) is ~4x smaller than FP32 with minimal quality loss.
     this.pipeline = await pipeline('feature-extraction', this.modelId, {
+      quantized: true,
       progress_callback: this.showProgress
         ? (progress: { status: string; file?: string; progress?: number }) => {
             if (progress.status === 'progress' && progress.file && progress.progress) {