Browse Source

fix(extraction): drop duplicate export-var nodes and honour maxFileSize in bulk path (#129)

Two correctness bugs in the core extraction pipeline, surfaced by an
adversarial stress corpus (5k synthetic export-const declarations
plus a deliberate 8MB single-line file):

1) Every `export const X = ...` produced TWO nodes for the same
   symbol — one kind:'variable' from extractExportedVariables, plus
   one kind:'constant' from extractVariable (called when the walker
   descended into the export_statement child). Stress test showed
   100% duplication across 5,003 export-const declarations. The
   dedicated extractVariable dispatch is the correct one: it picks
   kind from isConst, captures the initializer signature, and walks
   type annotations. The export-statement helper
   (extractExportedVariables) was redundant because the language
   extractors' isExported predicate already walks parent chains.
   Fix: remove the export_statement branch from the dispatch (its
   children are still descended into normally) and drop the private
   helper.

2) The bulk indexAll path read each file's stats but never compared
   stats.size against config.maxFileSize. Vendored generated files
   (multi-MB headers, minified bundles, etc.) were indexed regardless
   of the user's size cap. The single-file extractFile path enforced
   it; only the bulk path was missing the check. Mirror the
   single-file behaviour: emit a 'size_exceeded' warning, count the
   file as skipped, advance progress, and continue.

On the stress workspace (5,005 synthetic files; 50,000 fns in one
3MB file; 8MB single-line file; 5,000 export-const declarations):

  before:  65,014 nodes (100% var/const duplication, every >1MB file
           indexed despite maxFileSize=1MB)
   after:  10,008 nodes (0 duplicates, large files correctly skipped
           with size_exceeded warnings)

Six tests in __tests__/extraction.test.ts were calibrated to the
duplicate behavior (they matched on the redundant kind:'variable'
nodes); they now look for kind:'constant' on `export const`, which is
the correct kind. Full suite: 380 passed (was 374 passed, 6 failed
before this fix).
andreinknv 1 month ago
parent
commit
4f6c51d381
3 changed files with 44 additions and 65 deletions
  1. 6 6
      __tests__/extraction.test.ts
  2. 20 0
      src/extraction/index.ts
  3. 18 59
      src/extraction/tree-sitter.ts

+ 6 - 6
__tests__/extraction.test.ts

@@ -376,7 +376,7 @@ export const useUIStore = create<UIState>((set) => ({
 `;
 `;
     const result = extractFromSource('store.ts', code);
     const result = extractFromSource('store.ts', code);
 
 
-    const varNode = result.nodes.find((n) => n.kind === 'variable' && n.name === 'useUIStore');
+    const varNode = result.nodes.find((n) => n.kind === 'constant' && n.name === 'useUIStore');
     expect(varNode).toBeDefined();
     expect(varNode).toBeDefined();
     expect(varNode?.isExported).toBe(true);
     expect(varNode?.isExported).toBe(true);
   });
   });
@@ -390,7 +390,7 @@ export const config = {
 `;
 `;
     const result = extractFromSource('config.ts', code);
     const result = extractFromSource('config.ts', code);
 
 
-    const varNode = result.nodes.find((n) => n.kind === 'variable' && n.name === 'config');
+    const varNode = result.nodes.find((n) => n.kind === 'constant' && n.name === 'config');
     expect(varNode).toBeDefined();
     expect(varNode).toBeDefined();
     expect(varNode?.isExported).toBe(true);
     expect(varNode?.isExported).toBe(true);
   });
   });
@@ -401,7 +401,7 @@ export const SCREEN_NAMES = ['home', 'settings', 'profile'] as const;
 `;
 `;
     const result = extractFromSource('constants.ts', code);
     const result = extractFromSource('constants.ts', code);
 
 
-    const varNode = result.nodes.find((n) => n.kind === 'variable' && n.name === 'SCREEN_NAMES');
+    const varNode = result.nodes.find((n) => n.kind === 'constant' && n.name === 'SCREEN_NAMES');
     expect(varNode).toBeDefined();
     expect(varNode).toBeDefined();
     expect(varNode?.isExported).toBe(true);
     expect(varNode?.isExported).toBe(true);
   });
   });
@@ -413,7 +413,7 @@ export const API_VERSION = "v2";
 `;
 `;
     const result = extractFromSource('constants.ts', code);
     const result = extractFromSource('constants.ts', code);
 
 
-    const variables = result.nodes.filter((n) => n.kind === 'variable');
+    const variables = result.nodes.filter((n) => n.kind === 'constant');
     expect(variables).toHaveLength(2);
     expect(variables).toHaveLength(2);
     expect(variables.map((n) => n.name).sort()).toEqual(['API_VERSION', 'MAX_RETRIES']);
     expect(variables.map((n) => n.name).sort()).toEqual(['API_VERSION', 'MAX_RETRIES']);
   });
   });
@@ -457,7 +457,7 @@ export const userSchema = z.object({
 `;
 `;
     const result = extractFromSource('schemas.ts', code);
     const result = extractFromSource('schemas.ts', code);
 
 
-    const varNode = result.nodes.find((n) => n.kind === 'variable' && n.name === 'userSchema');
+    const varNode = result.nodes.find((n) => n.kind === 'constant' && n.name === 'userSchema');
     expect(varNode).toBeDefined();
     expect(varNode).toBeDefined();
     expect(varNode?.isExported).toBe(true);
     expect(varNode?.isExported).toBe(true);
   });
   });
@@ -475,7 +475,7 @@ export const authMachine = createMachine({
 `;
 `;
     const result = extractFromSource('machine.ts', code);
     const result = extractFromSource('machine.ts', code);
 
 
-    const varNode = result.nodes.find((n) => n.kind === 'variable' && n.name === 'authMachine');
+    const varNode = result.nodes.find((n) => n.kind === 'constant' && n.name === 'authMachine');
     expect(varNode).toBeDefined();
     expect(varNode).toBeDefined();
     expect(varNode?.isExported).toBe(true);
     expect(varNode?.isExported).toBe(true);
   });
   });

+ 20 - 0
src/extraction/index.ts

@@ -689,6 +689,26 @@ export class ExtractionOrchestrator {
           continue;
           continue;
         }
         }
 
 
+        // Honour config.maxFileSize. Without this check, vendored
+        // generated headers, minified bundles, and other multi-MB
+        // files get indexed despite the user setting a size cap —
+        // wasting WASM heap and the worker recycle budget on inputs
+        // the user explicitly opted out of. The single-file extractFile
+        // path already enforces this; the bulk path used to silently
+        // skip the check.
+        if (stats.size > this.config.maxFileSize) {
+          processed++;
+          filesSkipped++;
+          errors.push({
+            message: `File exceeds max size (${stats.size} > ${this.config.maxFileSize})`,
+            filePath,
+            severity: 'warning',
+            code: 'size_exceeded',
+          });
+          onProgress?.({ phase: 'parsing', current: processed, total });
+          continue;
+        }
+
         // Parse in worker thread (main thread stays unblocked).
         // Parse in worker thread (main thread stays unblocked).
         // Wrapped in try/catch to handle worker timeouts and crashes gracefully.
         // Wrapped in try/catch to handle worker timeouts and crashes gracefully.
         let result: ExtractionResult;
         let result: ExtractionResult;

+ 18 - 59
src/extraction/tree-sitter.ts

@@ -331,12 +331,19 @@ export class TreeSitterExtractor {
       this.extractVariable(node);
       this.extractVariable(node);
       skipChildren = true; // extractVariable handles children
       skipChildren = true; // extractVariable handles children
     }
     }
-    // Check for export statements containing non-function variable declarations
-    // e.g. `export const X = create(...)`, `export const X = { ... }`
-    else if (nodeType === 'export_statement') {
-      this.extractExportedVariables(node);
-      // Don't skip children — still need to visit inner nodes (functions, calls, etc.)
-    }
+    // `export_statement` itself is not extracted — the walker descends
+    // into children, where the inner declaration (lexical_declaration,
+    // function_declaration, class_declaration, etc.) is dispatched to
+    // its own extractor. `isExported` walks the parent chain, so the
+    // exported flag is preserved automatically.
+    //
+    // Calling extractExportedVariables here AND descending caused every
+    // `export const X = ...` to produce two nodes for the same symbol —
+    // one kind:'variable' from extractExportedVariables and one
+    // kind:'constant' from extractVariable. The dedicated dispatch is
+    // the correct one (it picks kind from isConst, captures the
+    // initializer signature, and walks type annotations); the
+    // export-statement helper was redundant.
     // Check for imports
     // Check for imports
     else if (this.extractor.importTypes.includes(nodeType)) {
     else if (this.extractor.importTypes.includes(nodeType)) {
       this.extractImport(node);
       this.extractImport(node);
@@ -1213,59 +1220,11 @@ export class TreeSitterExtractor {
     return false;
     return false;
   }
   }
 
 
-  /**
-   * Extract an exported variable declaration that isn't a function.
-   * Handles patterns like:
-   *   export const X = create(...)
-   *   export const X = { ... }
-   *   export const X = [...]
-   *   export const X = "value"
-   *
-   * This is called for `export_statement` nodes that contain a
-   * `lexical_declaration` with `variable_declarator` children whose
-   * values are NOT already handled by functionTypes (arrow_function,
-   * function_expression).
-   */
-  private extractExportedVariables(exportNode: SyntaxNode): void {
-    if (!this.extractor) return;
-
-    // Find the lexical_declaration or variable_declaration child
-    for (let i = 0; i < exportNode.namedChildCount; i++) {
-      const decl = exportNode.namedChild(i);
-      if (!decl || (decl.type !== 'lexical_declaration' && decl.type !== 'variable_declaration')) {
-        continue;
-      }
-
-      // Iterate over each variable_declarator in the declaration
-      for (let j = 0; j < decl.namedChildCount; j++) {
-        const declarator = decl.namedChild(j);
-        if (!declarator || declarator.type !== 'variable_declarator') continue;
-
-        const nameNode = getChildByField(declarator, 'name');
-        if (!nameNode) continue;
-        const name = getNodeText(nameNode, this.source);
-
-        // Skip if the value is a function type — those are already handled
-        // by extractFunction via the functionTypes dispatch
-        const value = getChildByField(declarator, 'value');
-        if (value) {
-          const valueType = value.type;
-          if (
-            this.extractor.functionTypes.includes(valueType)
-          ) {
-            continue; // Already handled by extractFunction
-          }
-        }
-
-        const docstring = getPrecedingDocstring(exportNode, this.source);
-
-        this.createNode('variable', name, declarator, {
-          docstring,
-          isExported: true,
-        });
-      }
-    }
-  }
+  // extractExportedVariables removed — the walker now descends into
+  // export_statement children and the inner declaration's dedicated
+  // extractor (extractVariable, extractFunction, extractClass, etc.)
+  // handles the symbol with isExported=true via parent-walk in the
+  // language extractor's isExported predicate.
 
 
   /**
   /**
    * Extract an import
    * Extract an import