Browse Source

Port bug fixes and stability improvements from PR #15

- Fix Float32Array embedder bug: the TypedArray-like branch returned a
  zero-filled array of the same length instead of copying the data
- Fix VSS search query: run vss_search with LIMIT inside a subquery so the
  limit applies before the JOIN with vss_map
- Pin tree-sitter versions: remove caret ranges for ABI stability, add
  overrides to lock tree-sitter core at 0.22.4
- Lazy grammar loading: load native bindings on first use per language
  instead of all at startup, so one missing grammar doesn't affect others
- Remove the stale step that copied src/extraction/queries into dist from
  the copy-assets script

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Colby McHenry 4 months ago
parent
commit
e07250ed60
4 changed files with 159 additions and 99 deletions
  1. 19 16
      package.json
  2. 131 75
      src/extraction/grammars.ts
  3. 1 1
      src/vectors/embedder.ts
  4. 8 7
      src/vectors/search.ts

+ 19 - 16
package.json

@@ -15,7 +15,7 @@
   "scripts": {
     "build": "tsc && npm run copy-assets",
     "postinstall": "node scripts/postinstall.js",
-    "copy-assets": "node -e \"const fs=require('fs'),p=require('path');function cpR(s,d){if(!fs.existsSync(s))return;fs.mkdirSync(d,{recursive:true});for(const f of fs.readdirSync(s)){const sp=p.join(s,f),dp=p.join(d,f);fs.statSync(sp).isDirectory()?cpR(sp,dp):fs.copyFileSync(sp,dp)}}cpR('src/extraction/queries','dist/extraction/queries');fs.mkdirSync('dist/db',{recursive:true});fs.copyFileSync('src/db/schema.sql','dist/db/schema.sql')\"",
+    "copy-assets": "node -e \"const fs=require('fs');fs.mkdirSync('dist/db',{recursive:true});fs.copyFileSync('src/db/schema.sql','dist/db/schema.sql')\"",
     "dev": "tsc --watch",
     "cli": "npm run build && node dist/bin/codegraph.js",
     "test": "vitest run",
@@ -38,22 +38,22 @@
     "commander": "^14.0.2",
     "figlet": "^1.8.0",
     "sqlite-vss": "^0.1.2",
-    "tree-sitter": "^0.22.4",
-    "tree-sitter-c": "^0.23.4",
-    "tree-sitter-c-sharp": "^0.23.1",
-    "tree-sitter-cpp": "^0.23.4",
-    "@sengac/tree-sitter-dart": "^1.1.6",
-    "tree-sitter-go": "^0.23.4",
-    "tree-sitter-java": "^0.23.5",
-    "tree-sitter-javascript": "^0.23.1",
-    "tree-sitter-kotlin": "^0.3.8",
+    "tree-sitter": "0.22.4",
+    "tree-sitter-c": "0.23.4",
+    "tree-sitter-c-sharp": "0.23.1",
+    "tree-sitter-cpp": "0.23.4",
+    "@sengac/tree-sitter-dart": "1.1.6",
+    "tree-sitter-go": "0.23.4",
+    "tree-sitter-java": "0.23.5",
+    "tree-sitter-javascript": "0.23.1",
+    "tree-sitter-kotlin": "0.3.8",
     "tree-sitter-liquid": "github:hankthetank27/tree-sitter-liquid",
-    "tree-sitter-php": "^0.23.11",
-    "tree-sitter-python": "^0.23.6",
-    "tree-sitter-ruby": "^0.23.1",
-    "tree-sitter-rust": "^0.23.2",
-    "tree-sitter-swift": "^0.7.1",
-    "tree-sitter-typescript": "^0.23.2"
+    "tree-sitter-php": "0.23.11",
+    "tree-sitter-python": "0.23.6",
+    "tree-sitter-ruby": "0.23.1",
+    "tree-sitter-rust": "0.23.2",
+    "tree-sitter-swift": "0.7.1",
+    "tree-sitter-typescript": "0.23.2"
   },
   "devDependencies": {
     "@types/better-sqlite3": "^7.6.0",
@@ -64,5 +64,8 @@
   },
   "engines": {
     "node": ">=18.0.0"
+  },
+  "overrides": {
+    "tree-sitter": "0.22.4"
   }
 }

+ 131 - 75
src/extraction/grammars.ts

@@ -1,73 +1,88 @@
 /**
  * Grammar Loading and Caching
  *
- * Manages tree-sitter language grammars.
+ * Uses lazy per-language loading so one missing native grammar does not
+ * break extraction for all other languages.
  */
 
 import Parser from 'tree-sitter';
 import { Language } from '../types';
 
-// Grammar module imports — wrapped in tryRequire so a missing native binding
-// (e.g. tree-sitter-kotlin on Windows) degrades gracefully instead of crashing.
-// eslint-disable-next-line @typescript-eslint/no-require-imports
-function tryRequire(id: string, prop?: string): unknown | null {
-  try {
-    // eslint-disable-next-line @typescript-eslint/no-require-imports
-    const mod = require(id);
-    return prop ? mod[prop] : mod;
-  } catch {
-    console.warn(`[CodeGraph] Failed to load ${id} — ${prop ?? id} parsing will be unavailable on this platform.`);
-    return null;
-  }
-}
-
-const TypeScript = tryRequire('tree-sitter-typescript', 'typescript');
-const TSX = tryRequire('tree-sitter-typescript', 'tsx');
-const JavaScript = tryRequire('tree-sitter-javascript');
-const Python = tryRequire('tree-sitter-python');
-const Go = tryRequire('tree-sitter-go');
-const Rust = tryRequire('tree-sitter-rust');
-const Java = tryRequire('tree-sitter-java');
-const C = tryRequire('tree-sitter-c');
-const Cpp = tryRequire('tree-sitter-cpp');
-const CSharp = tryRequire('tree-sitter-c-sharp');
-const PHP = tryRequire('tree-sitter-php', 'php');
-const Ruby = tryRequire('tree-sitter-ruby');
-const Swift = tryRequire('tree-sitter-swift');
-const Kotlin = tryRequire('tree-sitter-kotlin');
-const Dart = tryRequire('@sengac/tree-sitter-dart');
-// Note: tree-sitter-liquid has ABI compatibility issues with tree-sitter 0.22+
-// Liquid extraction is handled separately via regex in tree-sitter.ts
+type GrammarLoader = () => unknown;
+type GrammarLanguage = Exclude<Language, 'liquid' | 'unknown'>;
 
 /**
- * Mapping of Language to tree-sitter grammar.
- * Parsers that failed to load are excluded.
+ * Lazy grammar loaders — each language's native binding is only loaded
+ * on first use, so a failure in one grammar doesn't affect others.
  */
-const GRAMMAR_MAP: Record<string, unknown> = {};
-
-const grammarEntries: [string, unknown][] = [
-  ['typescript', TypeScript],
-  ['tsx', TSX],
-  ['javascript', JavaScript],
-  ['jsx', JavaScript], // JSX uses the JavaScript grammar
-  ['python', Python],
-  ['go', Go],
-  ['rust', Rust],
-  ['java', Java],
-  ['c', C],
-  ['cpp', Cpp],
-  ['csharp', CSharp],
-  ['php', PHP],
-  ['ruby', Ruby],
-  ['swift', Swift],
-  ['kotlin', Kotlin],
-  ['dart', Dart],
-  // liquid: uses custom regex-based extraction, not tree-sitter
-];
-
-for (const [lang, grammar] of grammarEntries) {
-  if (grammar) GRAMMAR_MAP[lang] = grammar;
-}
+const grammarLoaders: Record<GrammarLanguage, GrammarLoader> = {
+  typescript: () => {
+    // eslint-disable-next-line @typescript-eslint/no-require-imports
+    return require('tree-sitter-typescript').typescript;
+  },
+  tsx: () => {
+    // eslint-disable-next-line @typescript-eslint/no-require-imports
+    return require('tree-sitter-typescript').tsx;
+  },
+  javascript: () => {
+    // eslint-disable-next-line @typescript-eslint/no-require-imports
+    return require('tree-sitter-javascript');
+  },
+  jsx: () => {
+    // eslint-disable-next-line @typescript-eslint/no-require-imports
+    return require('tree-sitter-javascript');
+  },
+  python: () => {
+    // eslint-disable-next-line @typescript-eslint/no-require-imports
+    return require('tree-sitter-python');
+  },
+  go: () => {
+    // eslint-disable-next-line @typescript-eslint/no-require-imports
+    return require('tree-sitter-go');
+  },
+  rust: () => {
+    // eslint-disable-next-line @typescript-eslint/no-require-imports
+    return require('tree-sitter-rust');
+  },
+  java: () => {
+    // eslint-disable-next-line @typescript-eslint/no-require-imports
+    return require('tree-sitter-java');
+  },
+  c: () => {
+    // eslint-disable-next-line @typescript-eslint/no-require-imports
+    return require('tree-sitter-c');
+  },
+  cpp: () => {
+    // eslint-disable-next-line @typescript-eslint/no-require-imports
+    return require('tree-sitter-cpp');
+  },
+  csharp: () => {
+    // eslint-disable-next-line @typescript-eslint/no-require-imports
+    return require('tree-sitter-c-sharp');
+  },
+  php: () => {
+    // eslint-disable-next-line @typescript-eslint/no-require-imports
+    return require('tree-sitter-php').php;
+  },
+  ruby: () => {
+    // eslint-disable-next-line @typescript-eslint/no-require-imports
+    return require('tree-sitter-ruby');
+  },
+  swift: () => {
+    // eslint-disable-next-line @typescript-eslint/no-require-imports
+    return require('tree-sitter-swift');
+  },
+  kotlin: () => {
+    // eslint-disable-next-line @typescript-eslint/no-require-imports
+    return require('tree-sitter-kotlin');
+  },
+  dart: () => {
+    // eslint-disable-next-line @typescript-eslint/no-require-imports
+    return require('@sengac/tree-sitter-dart');
+  },
+  // Note: tree-sitter-liquid has ABI compatibility issues with tree-sitter 0.22+
+  // Liquid extraction is handled separately via regex in tree-sitter.ts
+};
 
 /**
  * File extension to Language mapping
@@ -103,30 +118,59 @@ export const EXTENSION_MAP: Record<string, Language> = {
 };
 
 /**
- * Cache for initialized parsers
+ * Caches for loaded grammars and parsers
  */
 const parserCache = new Map<Language, Parser>();
+const grammarCache = new Map<Language, unknown | null>();
+const unavailableGrammarErrors = new Map<Language, string>();
+
+/**
+ * Load a grammar on demand, caching the result.
+ * Returns null if the grammar is not available on this platform.
+ */
+function loadGrammar(language: Language): unknown | null {
+  if (grammarCache.has(language)) {
+    return grammarCache.get(language) ?? null;
+  }
+
+  const loader = grammarLoaders[language as GrammarLanguage];
+  if (!loader) {
+    grammarCache.set(language, null);
+    return null;
+  }
+
+  try {
+    const grammar = loader();
+    if (!grammar) {
+      throw new Error(`Grammar loader returned empty value for ${language}`);
+    }
+    grammarCache.set(language, grammar);
+    return grammar;
+  } catch (error) {
+    const message = error instanceof Error ? error.message : String(error);
+    console.warn(`[CodeGraph] Failed to load ${language} grammar — parsing will be unavailable: ${message}`);
+    unavailableGrammarErrors.set(language, message);
+    grammarCache.set(language, null);
+    return null;
+  }
+}
 
 /**
  * Get a parser for the specified language
  */
 export function getParser(language: Language): Parser | null {
-  // Check cache first
   if (parserCache.has(language)) {
     return parserCache.get(language)!;
   }
 
-  // Get grammar for language
-  const grammar = GRAMMAR_MAP[language];
+  const grammar = loadGrammar(language);
   if (!grammar) {
     return null;
   }
 
-  // Create and cache parser
   const parser = new Parser();
   parser.setLanguage(grammar as Parameters<typeof parser.setLanguage>[0]);
   parserCache.set(language, parser);
-
   return parser;
 }
 
@@ -139,29 +183,41 @@ export function detectLanguage(filePath: string): Language {
 }
 
 /**
- * Check if a language is supported
+ * Check if a language is supported by currently available parsers.
  */
 export function isLanguageSupported(language: Language): boolean {
-  // Liquid uses custom regex-based extraction, not tree-sitter
-  if (language === 'liquid') return true;
-  return language !== 'unknown' && language in GRAMMAR_MAP;
+  if (language === 'liquid') return true; // custom regex extractor
+  if (language === 'unknown') return false;
+  return loadGrammar(language) !== null;
 }
 
 /**
- * Get all supported languages
+ * Get all currently supported languages.
  */
 export function getSupportedLanguages(): Language[] {
-  const languages = Object.keys(GRAMMAR_MAP) as Language[];
-  // Add Liquid which uses custom extraction
-  languages.push('liquid');
-  return languages;
+  const available = (Object.keys(grammarLoaders) as GrammarLanguage[])
+    .filter((language) => loadGrammar(language) !== null);
+  return [...available, 'liquid'];
 }
 
 /**
- * Clear the parser cache (useful for testing)
+ * Clear parser/grammar caches (useful for testing)
  */
 export function clearParserCache(): void {
   parserCache.clear();
+  grammarCache.clear();
+  unavailableGrammarErrors.clear();
+}
+
+/**
+ * Report grammars that failed to load.
+ */
+export function getUnavailableGrammarErrors(): Partial<Record<Language, string>> {
+  const out: Partial<Record<Language, string>> = {};
+  for (const [language, message] of unavailableGrammarErrors.entries()) {
+    out[language] = message;
+  }
+  return out;
 }
 
 /**

+ 1 - 1
src/vectors/embedder.ts

@@ -296,7 +296,7 @@ export class TextEmbedder {
     if (data && typeof data === 'object' && 'length' in data) {
       // Handle TypedArray-like objects
       const arr = data as ArrayLike<number>;
-      return new Float32Array(arr.length);
+      return Float32Array.from(Array.from(arr));
     }
     throw new Error('Unsupported data format for embedding');
   }

+ 8 - 7
src/vectors/search.ts

@@ -324,13 +324,14 @@ export class VectorSearchManager {
       const rows = this.db
         .prepare(
           `
-          SELECT
-            vss_map.node_id,
-            vss_vectors.distance
-          FROM vss_vectors
-          JOIN vss_map ON vss_map.rowid = vss_vectors.rowid
-          WHERE vss_search(vss_vectors.embedding, ?)
-          LIMIT ${safeLimit}
+          SELECT m.node_id, v.distance
+          FROM (
+            SELECT rowid, distance
+            FROM vss_vectors
+            WHERE vss_search(embedding, ?)
+            LIMIT ${safeLimit}
+          ) v
+          JOIN vss_map m ON m.rowid = v.rowid
         `
         )
         .all(vectorJson) as Array<{ node_id: string; distance: number }>;