1
0
Эх сурвалжийг харах

feat: file nodes, arrow function extraction, parallel I/O

- Create file-kind nodes for each parsed source file
- Add isInsideClassLikeNode() for method vs function detection
- Extract arrow functions and function expressions from variable declarators
- Batch file I/O with FILE_IO_BATCH_SIZE=10 using Promise.all
- Rework symlink cycle detection in scanDirectory (rename visitedRealPaths to visitedDirs, log unresolvable directories)
- Add lazy grammar loading with exported getGrammar() function
- Add indexFileWithContent() for pre-read content processing
- Add tests for file nodes and arrow function extraction
Martin Oehlert 4 сар өмнө
parent
commit
0f2eda8da3

+ 123 - 22
__tests__/extraction.test.ts

@@ -127,14 +127,19 @@ export function processPayment(amount: number): Promise<Receipt> {
 `;
     const result = extractFromSource('payment.ts', code);
 
-    expect(result.nodes).toHaveLength(1);
-    expect(result.nodes[0]).toMatchObject({
+    // File node + function node
+    const fileNode = result.nodes.find((n) => n.kind === 'file');
+    expect(fileNode).toBeDefined();
+    expect(fileNode?.name).toBe('payment.ts');
+
+    const funcNode = result.nodes.find((n) => n.kind === 'function');
+    expect(funcNode).toMatchObject({
       kind: 'function',
       name: 'processPayment',
       language: 'typescript',
       isExported: true,
     });
-    expect(result.nodes[0]?.signature).toContain('amount: number');
+    expect(funcNode?.signature).toContain('amount: number');
   });
 
   it('should extract class declarations', () => {
@@ -175,8 +180,11 @@ export interface User {
 `;
     const result = extractFromSource('types.ts', code);
 
-    expect(result.nodes).toHaveLength(1);
-    expect(result.nodes[0]).toMatchObject({
+    const fileNode = result.nodes.find((n) => n.kind === 'file');
+    expect(fileNode).toBeDefined();
+
+    const ifaceNode = result.nodes.find((n) => n.kind === 'interface');
+    expect(ifaceNode).toMatchObject({
       kind: 'interface',
       name: 'User',
       isExported: true,
@@ -207,8 +215,9 @@ export const useAuth = (): AuthContextValue => {
 `;
     const result = extractFromSource('hooks.ts', code);
 
-    expect(result.nodes).toHaveLength(1);
-    expect(result.nodes[0]).toMatchObject({
+    const funcNode = result.nodes.find((n) => n.kind === 'function' && n.name === 'useAuth');
+    expect(funcNode).toBeDefined();
+    expect(funcNode).toMatchObject({
       kind: 'function',
       name: 'useAuth',
       isExported: true,
@@ -223,8 +232,9 @@ export const processData = function(input: string): string {
 `;
     const result = extractFromSource('utils.ts', code);
 
-    expect(result.nodes).toHaveLength(1);
-    expect(result.nodes[0]).toMatchObject({
+    const funcNode = result.nodes.find((n) => n.kind === 'function' && n.name === 'processData');
+    expect(funcNode).toBeDefined();
+    expect(funcNode).toMatchObject({
       kind: 'function',
       name: 'processData',
       isExported: true,
@@ -286,8 +296,9 @@ export const fetchData = async () => {
 `;
     const result = extractFromSource('api.js', code);
 
-    expect(result.nodes).toHaveLength(1);
-    expect(result.nodes[0]).toMatchObject({
+    const funcNode = result.nodes.find((n) => n.kind === 'function' && n.name === 'fetchData');
+    expect(funcNode).toBeDefined();
+    expect(funcNode).toMatchObject({
       kind: 'function',
       name: 'fetchData',
       isExported: true,
@@ -306,8 +317,8 @@ export type AuthContextValue = {
 `;
     const result = extractFromSource('types.ts', code);
 
-    expect(result.nodes).toHaveLength(1);
-    expect(result.nodes[0]).toMatchObject({
+    const typeNode = result.nodes.find((n) => n.kind === 'type_alias');
+    expect(typeNode).toMatchObject({
       kind: 'type_alias',
       name: 'AuthContextValue',
       isExported: true,
@@ -323,8 +334,8 @@ type InternalState = {
 `;
     const result = extractFromSource('internal.ts', code);
 
-    expect(result.nodes).toHaveLength(1);
-    expect(result.nodes[0]).toMatchObject({
+    const typeNode = result.nodes.find((n) => n.kind === 'type_alias');
+    expect(typeNode).toMatchObject({
       kind: 'type_alias',
       name: 'InternalState',
       isExported: false,
@@ -415,7 +426,7 @@ export const useAuth = () => {
     expect(varNodes).toHaveLength(0);
   });
 
-  it('should not extract non-exported const as exported variable', () => {
+  it('should extract non-exported const as non-exported variable', () => {
     const code = `
 const internalConfig = {
   debug: true,
@@ -423,10 +434,10 @@ const internalConfig = {
 `;
     const result = extractFromSource('internal.ts', code);
 
-    // Non-exported const should NOT create a variable node
-    // (only export_statement triggers extractExportedVariables)
-    const varNodes = result.nodes.filter((n) => n.kind === 'variable' && n.name === 'internalConfig');
-    expect(varNodes).toHaveLength(0);
+    // Non-exported const at file level should be extracted as a constant (not exported)
+    const varNodes = result.nodes.filter((n) => (n.kind === 'variable' || n.kind === 'constant') && n.name === 'internalConfig');
+    expect(varNodes).toHaveLength(1);
+    expect(varNodes[0]?.isExported).toBeFalsy();
   });
 
   it('should extract Zod schema exports', () => {
@@ -463,6 +474,93 @@ export const authMachine = createMachine({
   });
 });
 
+describe('File Node Extraction', () => {
+  it('should create a file-kind node for each parsed file', () => {
+    const code = `
+export function greet(name: string): string {
+  return "Hello " + name;
+}
+`;
+    const result = extractFromSource('greeter.ts', code);
+
+    const fileNode = result.nodes.find((n) => n.kind === 'file');
+    expect(fileNode).toBeDefined();
+    expect(fileNode?.name).toBe('greeter.ts');
+    expect(fileNode?.filePath).toBe('greeter.ts');
+    expect(fileNode?.language).toBe('typescript');
+    expect(fileNode?.startLine).toBe(1);
+  });
+
+  it('should create file nodes for Python files', () => {
+    const code = `
+def main():
+    pass
+`;
+    const result = extractFromSource('main.py', code);
+
+    const fileNode = result.nodes.find((n) => n.kind === 'file');
+    expect(fileNode).toBeDefined();
+    expect(fileNode?.name).toBe('main.py');
+    expect(fileNode?.language).toBe('python');
+  });
+
+  it('should create containment edges from file node to top-level declarations', () => {
+    const code = `
+export function foo() {}
+export function bar() {}
+`;
+    const result = extractFromSource('fns.ts', code);
+
+    const fileNode = result.nodes.find((n) => n.kind === 'file');
+    expect(fileNode).toBeDefined();
+
+    // There should be contains edges from the file node to each function
+    const containsEdges = result.edges.filter(
+      (e) => e.source === fileNode?.id && e.kind === 'contains'
+    );
+    expect(containsEdges.length).toBeGreaterThanOrEqual(2);
+  });
+});
+
+describe('Arrow Function Variable Extraction', () => {
+  it('should extract const arrow functions as function nodes', () => {
+    const code = `
+const handleClick = () => {
+  console.log('clicked');
+};
+`;
+    const result = extractFromSource('handler.ts', code);
+
+    const funcNode = result.nodes.find((n) => n.kind === 'function' && n.name === 'handleClick');
+    expect(funcNode).toBeDefined();
+    expect(funcNode?.kind).toBe('function');
+  });
+
+  it('should detect async arrow functions', () => {
+    const code = `
+export const fetchUser = async (id: string) => {
+  return await db.find(id);
+};
+`;
+    const result = extractFromSource('api.ts', code);
+
+    const funcNode = result.nodes.find((n) => n.kind === 'function' && n.name === 'fetchUser');
+    expect(funcNode).toBeDefined();
+    expect(funcNode?.isExported).toBe(true);
+  });
+
+  it('should not create duplicate nodes for arrow functions in export statements', () => {
+    const code = `
+export const compute = (x: number) => x * 2;
+`;
+    const result = extractFromSource('math.ts', code);
+
+    const funcNodes = result.nodes.filter((n) => n.kind === 'function' && n.name === 'compute');
+    // Should appear only once, not duplicated between extractFunctionVariable and extractFunction
+    expect(funcNodes).toHaveLength(1);
+  });
+});
+
 describe('Python Extraction', () => {
   it('should extract function definitions', () => {
     const code = `
@@ -473,8 +571,11 @@ def calculate_total(items: list, tax_rate: float) -> float:
 `;
     const result = extractFromSource('calc.py', code);
 
-    expect(result.nodes).toHaveLength(1);
-    expect(result.nodes[0]).toMatchObject({
+    const fileNode = result.nodes.find((n) => n.kind === 'file');
+    expect(fileNode).toBeDefined();
+
+    const funcNode = result.nodes.find((n) => n.kind === 'function');
+    expect(funcNode).toMatchObject({
       kind: 'function',
       name: 'calculate_total',
       language: 'python',

+ 8 - 0
src/extraction/grammars.ts

@@ -156,6 +156,14 @@ function loadGrammar(language: Language): unknown | null {
   }
 }
 
+/**
+ * Get a grammar by language (delegates to loadGrammar); exported for direct grammar access.
+ * The string is cast to Language unchecked — NOTE(review): confirm loadGrammar tolerates unknown names.
+ */
+export function getGrammar(language: string): unknown | null {
+  return loadGrammar(language as Language);
+}
+
 /**
  * Get a parser for the specified language
  */

+ 143 - 29
src/extraction/index.ts

@@ -18,10 +18,16 @@ import {
 import { QueryBuilder } from '../db/queries';
 import { extractFromSource } from './tree-sitter';
 import { detectLanguage, isLanguageSupported } from './grammars';
-import { logDebug } from '../errors';
+import { logDebug, logWarn } from '../errors';
 import { captureException } from '../sentry';
 import { validatePathWithinRoot } from '../utils';
 
+/**
+ * Number of files to read in parallel during indexing.
+ * File reads are I/O-bound; each batch is read concurrently, then parsed and stored sequentially.
+ */
+const FILE_IO_BATCH_SIZE = 10;
+
 /**
  * Progress callback for indexing operations
  */
@@ -129,22 +135,25 @@ export function scanDirectory(
 ): string[] {
   const files: string[] = [];
   let count = 0;
-  const visitedRealPaths = new Set<string>(); // Symlink cycle detection
+  // Track visited real paths to detect symlink cycles
+  const visitedDirs = new Set<string>();
 
   function walk(dir: string): void {
-    // Symlink cycle detection: resolve real path and skip if already visited
+    // Resolve real path to detect symlink cycles
+    let realDir: string;
     try {
-      const realDir = fs.realpathSync(dir);
-      if (visitedRealPaths.has(realDir)) {
-        logDebug('Skipping directory to prevent symlink cycle', { dir, realDir });
-        return;
-      }
-      visitedRealPaths.add(realDir);
+      realDir = fs.realpathSync(dir);
     } catch {
-      // If realpath fails, skip this directory
+      logDebug('Skipping unresolvable directory', { dir });
       return;
     }
 
+    if (visitedDirs.has(realDir)) {
+      logDebug('Skipping already-visited directory (symlink cycle)', { dir, realDir });
+      return;
+    }
+    visitedDirs.add(realDir);
+
     // Check for .codegraphignore marker file - skip entire directory tree if present
     const ignoreMarker = path.join(dir, CODEGRAPH_IGNORE_MARKER);
     if (fs.existsSync(ignoreMarker)) {
@@ -283,10 +292,11 @@ export class ExtractionOrchestrator {
       };
     }
 
-    // Phase 2: Parse files
+    // Phase 2: Parse files (read in parallel batches, parse/store sequentially)
     const total = files.length;
+    let processed = 0;
 
-    for (let i = 0; i < files.length; i++) {
+    for (let i = 0; i < files.length; i += FILE_IO_BATCH_SIZE) {
       if (signal?.aborted) {
         return {
           success: false,
@@ -299,26 +309,69 @@ export class ExtractionOrchestrator {
         };
       }
 
-      const filePath = files[i]!;
-      onProgress?.({
-        phase: 'parsing',
-        current: i + 1,
-        total,
-        currentFile: filePath,
-      });
+      const batch = files.slice(i, i + FILE_IO_BATCH_SIZE);
 
-      const result = await this.indexFile(filePath);
+      // Read files in parallel (with path validation before any I/O)
+      const fileContents = await Promise.all(
+        batch.map(async (fp) => {
+          try {
+            const fullPath = validatePathWithinRoot(this.rootDir, fp);
+            if (!fullPath) {
+              logWarn('Path traversal blocked in batch reader', { filePath: fp });
+              return { filePath: fp, content: null as string | null, stats: null as fs.Stats | null, error: new Error('Path traversal blocked') };
+            }
+            const content = await fsp.readFile(fullPath, 'utf-8');
+            const stats = await fsp.stat(fullPath);
+            return { filePath: fp, content, stats, error: null as Error | null };
+          } catch (err) {
+            return { filePath: fp, content: null as string | null, stats: null as fs.Stats | null, error: err as Error };
+          }
+        })
+      );
+
+      // Parse and store sequentially
+      for (const { filePath, content, stats, error } of fileContents) {
+        if (signal?.aborted) {
+          return {
+            success: false,
+            filesIndexed,
+            filesSkipped,
+            nodesCreated: totalNodes,
+            edgesCreated: totalEdges,
+            errors: [{ message: 'Aborted', severity: 'error' }, ...errors],
+            durationMs: Date.now() - startTime,
+          };
+        }
 
-      if (result.errors.length > 0) {
-        errors.push(...result.errors);
-      }
+        processed++;
+        onProgress?.({
+          phase: 'parsing',
+          current: processed,
+          total,
+          currentFile: filePath,
+        });
 
-      if (result.nodes.length > 0) {
-        filesIndexed++;
-        totalNodes += result.nodes.length;
-        totalEdges += result.edges.length;
-      } else if (result.errors.length === 0) {
-        filesSkipped++;
+        if (error || content === null || stats === null) {
+          errors.push({
+            message: `Failed to read file: ${error instanceof Error ? error.message : String(error)}`,
+            severity: 'error',
+          });
+          continue;
+        }
+
+        const result = await this.indexFileWithContent(filePath, content, stats);
+
+        if (result.errors.length > 0) {
+          errors.push(...result.errors);
+        }
+
+        if (result.nodes.length > 0) {
+          filesIndexed++;
+          totalNodes += result.nodes.length;
+          totalEdges += result.edges.length;
+        } else if (result.errors.length === 0) {
+          filesSkipped++;
+        }
       }
     }
 
@@ -457,6 +510,67 @@ export class ExtractionOrchestrator {
     return result;
   }
 
+  /**
+   * Index a single file using caller-supplied content and stats instead of reading the file here.
+   * Used by the parallel batch reader; returns an ExtractionResult (empty with errors/warnings on rejection).
+   */
+  async indexFileWithContent(
+    relativePath: string,
+    content: string,
+    stats: fs.Stats
+  ): Promise<ExtractionResult> {
+    // Prevent path traversal (a falsy result from validatePathWithinRoot is treated as blocked)
+    const fullPath = validatePathWithinRoot(this.rootDir, relativePath);
+    if (!fullPath) {
+      logWarn('Path traversal blocked in indexFileWithContent', { relativePath });
+      return {
+        nodes: [],
+        edges: [],
+        unresolvedReferences: [],
+        errors: [{ message: 'Path traversal blocked', severity: 'error' }],
+        durationMs: 0,
+      };
+    }
+
+    // Check file size (content was already read by the caller; this only skips extraction and storage)
+    if (stats.size > this.config.maxFileSize) {
+      return {
+        nodes: [],
+        edges: [],
+        unresolvedReferences: [],
+        errors: [
+          {
+            message: `File exceeds max size (${stats.size} > ${this.config.maxFileSize})`,
+            severity: 'warning',
+          },
+        ],
+        durationMs: 0,
+      };
+    }
+
+    // Detect language; unsupported languages yield an empty result with no errors
+    const language = detectLanguage(relativePath);
+    if (!isLanguageSupported(language)) {
+      return {
+        nodes: [],
+        edges: [],
+        unresolvedReferences: [],
+        errors: [],
+        durationMs: 0,
+      };
+    }
+
+    // Extract from source
+    const result = extractFromSource(relativePath, content, language);
+
+    // Store in database unless extraction produced errors and no nodes
+    if (result.nodes.length > 0 || result.errors.length === 0) {
+      this.storeExtractionResult(relativePath, content, language, stats, result);
+    }
+
+    return result;
+  }
+
   /**
    * Store extraction result in database
    */

+ 110 - 5
src/extraction/tree-sitter.ts

@@ -6,6 +6,7 @@
 
 import { SyntaxNode, Tree } from 'tree-sitter';
 import * as crypto from 'crypto';
+import * as path from 'path';
 import {
   Language,
   Node,
@@ -875,7 +876,28 @@ export class TreeSitterExtractor {
 
     try {
       this.tree = parser.parse(this.source);
+
+      // Create file node representing the source file
+      const fileNode: Node = {
+        id: `file:${this.filePath}`,
+        kind: 'file',
+        name: path.basename(this.filePath),
+        qualifiedName: this.filePath,
+        filePath: this.filePath,
+        language: this.language,
+        startLine: 1,
+        endLine: this.source.split('\n').length,
+        startColumn: 0,
+        endColumn: 0,
+        isExported: false,
+        updatedAt: Date.now(),
+      };
+      this.nodes.push(fileNode);
+
+      // Push file node onto stack so top-level declarations get contains edges
+      this.nodeStack.push(fileNode.id);
       this.visitNode(this.tree.rootNode);
+      this.nodeStack.pop();
     } catch (error) {
       captureException(error, { operation: 'tree-sitter-parse', filePath: this.filePath, language: this.language });
       this.errors.push({
@@ -905,7 +927,7 @@ export class TreeSitterExtractor {
     // Check for function declarations
     // For Python/Ruby, function_definition inside a class should be treated as method
     if (this.extractor.functionTypes.includes(nodeType)) {
-      if (this.nodeStack.length > 0 && this.extractor.methodTypes.includes(nodeType)) {
+      if (this.isInsideClassLikeNode() && this.extractor.methodTypes.includes(nodeType)) {
         // Inside a class - treat as method
         this.extractMethod(node);
         skipChildren = true; // extractMethod visits children via visitFunctionBody
@@ -956,9 +978,17 @@ export class TreeSitterExtractor {
     else if (this.extractor.typeAliasTypes.includes(nodeType)) {
       this.extractTypeAlias(node);
     }
+    // Check for arrow functions / function expressions assigned to variables (JS/TS)
+    else if (nodeType === 'variable_declarator') {
+      const valueNode = getChildByField(node, 'value');
+      if (valueNode && (valueNode.type === 'arrow_function' || valueNode.type === 'function_expression')) {
+        this.extractFunctionVariable(node);
+        skipChildren = true;
+      }
+    }
     // Check for variable declarations (const, let, var, etc.)
     // Only extract top-level variables (not inside functions/methods)
-    else if (this.extractor.variableTypes.includes(nodeType) && this.nodeStack.length === 0) {
+    else if (this.extractor.variableTypes.includes(nodeType) && !this.isInsideClassLikeNode()) {
       this.extractVariable(node);
       skipChildren = true; // extractVariable handles children
     }
@@ -1060,6 +1090,81 @@ export class TreeSitterExtractor {
     return false;
   }
 
+  /**
+   * Check whether the node on top of the node stack (the immediate parent only) is class-like:
+   * class, struct, interface, trait, or enum. Any other kind, including file nodes, returns false.
+   */
+  private isInsideClassLikeNode(): boolean {
+    if (this.nodeStack.length === 0) return false;
+    const parentId = this.nodeStack[this.nodeStack.length - 1];
+    if (!parentId) return false;
+    const parentNode = this.nodes.find((n) => n.id === parentId);
+    if (!parentNode) return false;
+    return (
+      parentNode.kind === 'class' ||
+      parentNode.kind === 'struct' ||
+      parentNode.kind === 'interface' ||
+      parentNode.kind === 'trait' ||
+      parentNode.kind === 'enum'
+    );
+  }
+
+  /**
+   * Extract an arrow function or function expression assigned to a variable as a 'function' node,
+   * e.g. const foo = () => {} or const bar = function() {}, then visit its body with that node on the stack.
+   */
+  private extractFunctionVariable(node: SyntaxNode): void {
+    if (!this.extractor) return;
+
+    // Only handle variable_declarator where value is arrow_function or function_expression
+    if (node.type !== 'variable_declarator') return;
+
+    const nameNode = getChildByField(node, 'name');
+    const valueNode = getChildByField(node, 'value');
+
+    if (!nameNode || !valueNode) return;
+    if (valueNode.type !== 'arrow_function' && valueNode.type !== 'function_expression') return;
+
+    const name = getNodeText(nameNode, this.source);
+    if (!name) return;
+
+    // Exported if ANY ancestor below program/module is an export_statement (NOTE(review): nested declarators too — confirm)
+    let isExported = false;
+    let current = node.parent;
+    while (current) {
+      if (current.type === 'export_statement') {
+        isExported = true;
+        break;
+      }
+      if (current.type === 'program' || current.type === 'module') break;
+      current = current.parent;
+    }
+
+    // Build signature from the 'parameters' field of the function value; left undefined when that field is absent
+    let signature: string | undefined;
+    const params = getChildByField(valueNode, 'parameters');
+    if (params) {
+      signature = `${name}${getNodeText(params, this.source)}`;
+    }
+
+    // Ask the language extractor's optional isAsync hook whether the function value is async
+    const isAsync = this.extractor.isAsync?.(valueNode);
+
+    const funcNode = this.createNode('function', name, node, {
+      isExported,
+      signature: signature || undefined,
+      isAsync,
+    });
+
+    // Push to stack and visit body for call extraction
+    this.nodeStack.push(funcNode.id);
+    const body = getChildByField(valueNode, this.extractor.bodyField);
+    if (body) {
+      this.visitFunctionBody(body, funcNode.id);
+    }
+    this.nodeStack.pop();
+  }
+
   /**
    * Extract a function
    */
@@ -1160,10 +1265,10 @@ export class TreeSitterExtractor {
   private extractMethod(node: SyntaxNode): void {
     if (!this.extractor) return;
 
-    // For most languages, only extract as method if inside a class
+    // For most languages, only extract as method if inside a class-like node
     // But Go methods are top-level with a receiver, so always treat them as methods
-    if (this.nodeStack.length === 0 && this.language !== 'go') {
-      // Top-level and not Go, treat as function
+    if (!this.isInsideClassLikeNode() && this.language !== 'go') {
+      // Not inside a class-like node and not Go, treat as function
       this.extractFunction(node);
       return;
     }