Selaa lähdekoodia

Merge pull request #27 from MO2k4/feat/extraction-quality

feat: File nodes, arrow functions, parallel I/O
Colby Mchenry 4 kuukautta sitten
vanhempi
sitoutus
e2a95ee9ab
3 muutettua tiedostoa jossa 238 lisäystä ja 57 poistoa
  1. 84 22
      __tests__/extraction.test.ts
  2. 108 30
      src/extraction/index.ts
  3. 46 5
      src/extraction/tree-sitter.ts

+ 84 - 22
__tests__/extraction.test.ts

@@ -129,14 +129,19 @@ export function processPayment(amount: number): Promise<Receipt> {
 `;
     const result = extractFromSource('payment.ts', code);
 
-    expect(result.nodes).toHaveLength(1);
-    expect(result.nodes[0]).toMatchObject({
+    // File node + function node
+    const fileNode = result.nodes.find((n) => n.kind === 'file');
+    expect(fileNode).toBeDefined();
+    expect(fileNode?.name).toBe('payment.ts');
+
+    const funcNode = result.nodes.find((n) => n.kind === 'function');
+    expect(funcNode).toMatchObject({
       kind: 'function',
       name: 'processPayment',
       language: 'typescript',
       isExported: true,
     });
-    expect(result.nodes[0]?.signature).toContain('amount: number');
+    expect(funcNode?.signature).toContain('amount: number');
   });
 
   it('should extract class declarations', () => {
@@ -177,8 +182,11 @@ export interface User {
 `;
     const result = extractFromSource('types.ts', code);
 
-    expect(result.nodes).toHaveLength(1);
-    expect(result.nodes[0]).toMatchObject({
+    const fileNode = result.nodes.find((n) => n.kind === 'file');
+    expect(fileNode).toBeDefined();
+
+    const ifaceNode = result.nodes.find((n) => n.kind === 'interface');
+    expect(ifaceNode).toMatchObject({
       kind: 'interface',
       name: 'User',
       isExported: true,
@@ -209,8 +217,9 @@ export const useAuth = (): AuthContextValue => {
 `;
     const result = extractFromSource('hooks.ts', code);
 
-    expect(result.nodes).toHaveLength(1);
-    expect(result.nodes[0]).toMatchObject({
+    const funcNode = result.nodes.find((n) => n.kind === 'function' && n.name === 'useAuth');
+    expect(funcNode).toBeDefined();
+    expect(funcNode).toMatchObject({
       kind: 'function',
       name: 'useAuth',
       isExported: true,
@@ -225,8 +234,9 @@ export const processData = function(input: string): string {
 `;
     const result = extractFromSource('utils.ts', code);
 
-    expect(result.nodes).toHaveLength(1);
-    expect(result.nodes[0]).toMatchObject({
+    const funcNode = result.nodes.find((n) => n.kind === 'function' && n.name === 'processData');
+    expect(funcNode).toBeDefined();
+    expect(funcNode).toMatchObject({
       kind: 'function',
       name: 'processData',
       isExported: true,
@@ -288,8 +298,9 @@ export const fetchData = async () => {
 `;
     const result = extractFromSource('api.js', code);
 
-    expect(result.nodes).toHaveLength(1);
-    expect(result.nodes[0]).toMatchObject({
+    const funcNode = result.nodes.find((n) => n.kind === 'function' && n.name === 'fetchData');
+    expect(funcNode).toBeDefined();
+    expect(funcNode).toMatchObject({
       kind: 'function',
       name: 'fetchData',
       isExported: true,
@@ -308,8 +319,8 @@ export type AuthContextValue = {
 `;
     const result = extractFromSource('types.ts', code);
 
-    expect(result.nodes).toHaveLength(1);
-    expect(result.nodes[0]).toMatchObject({
+    const typeNode = result.nodes.find((n) => n.kind === 'type_alias');
+    expect(typeNode).toMatchObject({
       kind: 'type_alias',
       name: 'AuthContextValue',
       isExported: true,
@@ -325,8 +336,8 @@ type InternalState = {
 `;
     const result = extractFromSource('internal.ts', code);
 
-    expect(result.nodes).toHaveLength(1);
-    expect(result.nodes[0]).toMatchObject({
+    const typeNode = result.nodes.find((n) => n.kind === 'type_alias');
+    expect(typeNode).toMatchObject({
       kind: 'type_alias',
       name: 'InternalState',
       isExported: false,
@@ -417,7 +428,7 @@ export const useAuth = () => {
     expect(varNodes).toHaveLength(0);
   });
 
-  it('should not extract non-exported const as exported variable', () => {
+  it('should extract non-exported const as non-exported variable', () => {
     const code = `
 const internalConfig = {
   debug: true,
@@ -425,10 +436,10 @@ const internalConfig = {
 `;
     const result = extractFromSource('internal.ts', code);
 
-    // Non-exported const should NOT create a variable node
-    // (only export_statement triggers extractExportedVariables)
-    const varNodes = result.nodes.filter((n) => n.kind === 'variable' && n.name === 'internalConfig');
-    expect(varNodes).toHaveLength(0);
+    // Non-exported const at file level should be extracted as a constant (not exported)
+    const varNodes = result.nodes.filter((n) => (n.kind === 'variable' || n.kind === 'constant') && n.name === 'internalConfig');
+    expect(varNodes).toHaveLength(1);
+    expect(varNodes[0]?.isExported).toBeFalsy();
   });
 
   it('should extract Zod schema exports', () => {
@@ -465,6 +476,54 @@ export const authMachine = createMachine({
   });
 });
 
+describe('File Node Extraction', () => {
+  it('should create a file-kind node for each parsed file', () => {
+    const code = `
+export function greet(name: string): string {
+  return "Hello " + name;
+}
+`;
+    const result = extractFromSource('greeter.ts', code);
+
+    const fileNode = result.nodes.find((n) => n.kind === 'file');
+    expect(fileNode).toBeDefined();
+    expect(fileNode?.name).toBe('greeter.ts');
+    expect(fileNode?.filePath).toBe('greeter.ts');
+    expect(fileNode?.language).toBe('typescript');
+    expect(fileNode?.startLine).toBe(1);
+  });
+
+  it('should create file nodes for Python files', () => {
+    const code = `
+def main():
+    pass
+`;
+    const result = extractFromSource('main.py', code);
+
+    const fileNode = result.nodes.find((n) => n.kind === 'file');
+    expect(fileNode).toBeDefined();
+    expect(fileNode?.name).toBe('main.py');
+    expect(fileNode?.language).toBe('python');
+  });
+
+  it('should create containment edges from file node to top-level declarations', () => {
+    const code = `
+export function foo() {}
+export function bar() {}
+`;
+    const result = extractFromSource('fns.ts', code);
+
+    const fileNode = result.nodes.find((n) => n.kind === 'file');
+    expect(fileNode).toBeDefined();
+
+    // There should be contains edges from the file node to each function
+    const containsEdges = result.edges.filter(
+      (e) => e.source === fileNode?.id && e.kind === 'contains'
+    );
+    expect(containsEdges.length).toBeGreaterThanOrEqual(2);
+  });
+});
+
 describe('Python Extraction', () => {
   it('should extract function definitions', () => {
     const code = `
@@ -475,8 +534,11 @@ def calculate_total(items: list, tax_rate: float) -> float:
 `;
     const result = extractFromSource('calc.py', code);
 
-    expect(result.nodes).toHaveLength(1);
-    expect(result.nodes[0]).toMatchObject({
+    const fileNode = result.nodes.find((n) => n.kind === 'file');
+    expect(fileNode).toBeDefined();
+
+    const funcNode = result.nodes.find((n) => n.kind === 'function');
+    expect(funcNode).toMatchObject({
       kind: 'function',
       name: 'calculate_total',
       language: 'python',

+ 108 - 30
src/extraction/index.ts

@@ -19,10 +19,16 @@ import {
 import { QueryBuilder } from '../db/queries';
 import { extractFromSource } from './tree-sitter';
 import { detectLanguage, isLanguageSupported } from './grammars';
-import { logDebug } from '../errors';
+import { logDebug, logWarn } from '../errors';
 import { captureException } from '../sentry';
 import { validatePathWithinRoot, normalizePath } from '../utils';
 
+/**
+ * Number of files to read in parallel during indexing.
+ * File reads are I/O-bound; batching overlaps I/O wait with CPU parse work.
+ */
+const FILE_IO_BATCH_SIZE = 10;
+
 /**
  * Progress callback for indexing operations
  */
@@ -158,22 +164,25 @@ export function scanDirectory(
 ): string[] {
   const files: string[] = [];
   let count = 0;
-  const visitedRealPaths = new Set<string>(); // Symlink cycle detection
+  // Track visited real paths to detect symlink cycles
+  const visitedDirs = new Set<string>();
   const gitIgnoredDirs = getGitIgnoredDirectories(rootDir);
 
   function walk(dir: string): void {
-    // Symlink cycle detection: resolve real path and skip if already visited
+    // Resolve real path to detect symlink cycles
+    let realDir: string;
     try {
-      const realDir = fs.realpathSync(dir);
-      if (visitedRealPaths.has(realDir)) {
-        logDebug('Skipping directory to prevent symlink cycle', { dir, realDir });
-        return;
-      }
-      visitedRealPaths.add(realDir);
+      realDir = fs.realpathSync(dir);
     } catch {
-      // If realpath fails, skip this directory
+      logDebug('Skipping unresolvable directory', { dir });
+      return;
+    }
+
+    if (visitedDirs.has(realDir)) {
+      logDebug('Skipping already-visited directory (symlink cycle)', { dir, realDir });
       return;
     }
+    visitedDirs.add(realDir);
 
     // Check for .codegraphignore marker file - skip entire directory tree if present
     const ignoreMarker = path.join(dir, CODEGRAPH_IGNORE_MARKER);
@@ -321,10 +330,11 @@ export class ExtractionOrchestrator {
       };
     }
 
-    // Phase 2: Parse files
+    // Phase 2: Parse files (read in parallel batches, parse/store sequentially)
     const total = files.length;
+    let processed = 0;
 
-    for (let i = 0; i < files.length; i++) {
+    for (let i = 0; i < files.length; i += FILE_IO_BATCH_SIZE) {
       if (signal?.aborted) {
         return {
           success: false,
@@ -337,26 +347,69 @@ export class ExtractionOrchestrator {
         };
       }
 
-      const filePath = files[i]!;
-      onProgress?.({
-        phase: 'parsing',
-        current: i + 1,
-        total,
-        currentFile: filePath,
-      });
+      const batch = files.slice(i, i + FILE_IO_BATCH_SIZE);
 
-      const result = await this.indexFile(filePath);
+      // Read files in parallel (with path validation before any I/O)
+      const fileContents = await Promise.all(
+        batch.map(async (fp) => {
+          try {
+            const fullPath = validatePathWithinRoot(this.rootDir, fp);
+            if (!fullPath) {
+              logWarn('Path traversal blocked in batch reader', { filePath: fp });
+              return { filePath: fp, content: null as string | null, stats: null as fs.Stats | null, error: new Error('Path traversal blocked') };
+            }
+            const content = await fsp.readFile(fullPath, 'utf-8');
+            const stats = await fsp.stat(fullPath);
+            return { filePath: fp, content, stats, error: null as Error | null };
+          } catch (err) {
+            return { filePath: fp, content: null as string | null, stats: null as fs.Stats | null, error: err as Error };
+          }
+        })
+      );
+
+      // Parse and store sequentially
+      for (const { filePath, content, stats, error } of fileContents) {
+        if (signal?.aborted) {
+          return {
+            success: false,
+            filesIndexed,
+            filesSkipped,
+            nodesCreated: totalNodes,
+            edgesCreated: totalEdges,
+            errors: [{ message: 'Aborted', severity: 'error' }, ...errors],
+            durationMs: Date.now() - startTime,
+          };
+        }
 
-      if (result.errors.length > 0) {
-        errors.push(...result.errors);
-      }
+        processed++;
+        onProgress?.({
+          phase: 'parsing',
+          current: processed,
+          total,
+          currentFile: filePath,
+        });
 
-      if (result.nodes.length > 0) {
-        filesIndexed++;
-        totalNodes += result.nodes.length;
-        totalEdges += result.edges.length;
-      } else if (result.errors.length === 0) {
-        filesSkipped++;
+        if (error || content === null || stats === null) {
+          errors.push({
+            message: `Failed to read file: ${error instanceof Error ? error.message : String(error)}`,
+            severity: 'error',
+          });
+          continue;
+        }
+
+        const result = await this.indexFileWithContent(filePath, content, stats);
+
+        if (result.errors.length > 0) {
+          errors.push(...result.errors);
+        }
+
+        if (result.nodes.length > 0) {
+          filesIndexed++;
+          totalNodes += result.nodes.length;
+          totalEdges += result.edges.length;
+        } else if (result.errors.length === 0) {
+          filesSkipped++;
+        }
       }
     }
 
@@ -434,7 +487,7 @@ export class ExtractionOrchestrator {
       };
     }
 
-    // Check file exists and is readable
+    // Read file content and stats
     let content: string;
     let stats: fs.Stats;
     try {
@@ -456,6 +509,31 @@ export class ExtractionOrchestrator {
       };
     }
 
+    return this.indexFileWithContent(relativePath, content, stats);
+  }
+
+  /**
+   * Index a single file with pre-read content and stats.
+   * Used by the parallel batch reader to avoid redundant file I/O.
+   */
+  async indexFileWithContent(
+    relativePath: string,
+    content: string,
+    stats: fs.Stats
+  ): Promise<ExtractionResult> {
+    // Prevent path traversal
+    const fullPath = validatePathWithinRoot(this.rootDir, relativePath);
+    if (!fullPath) {
+      logWarn('Path traversal blocked in indexFileWithContent', { relativePath });
+      return {
+        nodes: [],
+        edges: [],
+        unresolvedReferences: [],
+        errors: [{ message: 'Path traversal blocked', severity: 'error' }],
+        durationMs: 0,
+      };
+    }
+
     // Check file size
     if (stats.size > this.config.maxFileSize) {
       return {

+ 46 - 5
src/extraction/tree-sitter.ts

@@ -6,6 +6,7 @@
 
 import { SyntaxNode, Tree } from 'tree-sitter';
 import * as crypto from 'crypto';
+import * as path from 'path';
 import {
   Language,
   Node,
@@ -875,7 +876,28 @@ export class TreeSitterExtractor {
 
     try {
       this.tree = parser.parse(this.source);
+
+      // Create file node representing the source file
+      const fileNode: Node = {
+        id: `file:${this.filePath}`,
+        kind: 'file',
+        name: path.basename(this.filePath),
+        qualifiedName: this.filePath,
+        filePath: this.filePath,
+        language: this.language,
+        startLine: 1,
+        endLine: this.source.split('\n').length,
+        startColumn: 0,
+        endColumn: 0,
+        isExported: false,
+        updatedAt: Date.now(),
+      };
+      this.nodes.push(fileNode);
+
+      // Push file node onto stack so top-level declarations get contains edges
+      this.nodeStack.push(fileNode.id);
       this.visitNode(this.tree.rootNode);
+      this.nodeStack.pop();
     } catch (error) {
       captureException(error, { operation: 'tree-sitter-parse', filePath: this.filePath, language: this.language });
       this.errors.push({
@@ -905,7 +927,7 @@ export class TreeSitterExtractor {
     // Check for function declarations
     // For Python/Ruby, function_definition inside a class should be treated as method
     if (this.extractor.functionTypes.includes(nodeType)) {
-      if (this.nodeStack.length > 0 && this.extractor.methodTypes.includes(nodeType)) {
+      if (this.isInsideClassLikeNode() && this.extractor.methodTypes.includes(nodeType)) {
         // Inside a class - treat as method
         this.extractMethod(node);
         skipChildren = true; // extractMethod visits children via visitFunctionBody
@@ -958,7 +980,7 @@ export class TreeSitterExtractor {
     }
     // Check for variable declarations (const, let, var, etc.)
     // Only extract top-level variables (not inside functions/methods)
-    else if (this.extractor.variableTypes.includes(nodeType) && this.nodeStack.length === 0) {
+    else if (this.extractor.variableTypes.includes(nodeType) && !this.isInsideClassLikeNode()) {
       this.extractVariable(node);
       skipChildren = true; // extractVariable handles children
     }
@@ -1060,6 +1082,25 @@ export class TreeSitterExtractor {
     return false;
   }
 
+  /**
+   * Check if the current node stack indicates we are inside a class-like node
+   * (class, struct, interface, trait). File nodes do not count as class-like.
+   */
+  private isInsideClassLikeNode(): boolean {
+    if (this.nodeStack.length === 0) return false;
+    const parentId = this.nodeStack[this.nodeStack.length - 1];
+    if (!parentId) return false;
+    const parentNode = this.nodes.find((n) => n.id === parentId);
+    if (!parentNode) return false;
+    return (
+      parentNode.kind === 'class' ||
+      parentNode.kind === 'struct' ||
+      parentNode.kind === 'interface' ||
+      parentNode.kind === 'trait' ||
+      parentNode.kind === 'enum'
+    );
+  }
+
   /**
    * Extract a function
    */
@@ -1160,10 +1201,10 @@ export class TreeSitterExtractor {
   private extractMethod(node: SyntaxNode): void {
     if (!this.extractor) return;
 
-    // For most languages, only extract as method if inside a class
+    // For most languages, only extract as method if inside a class-like node
     // But Go methods are top-level with a receiver, so always treat them as methods
-    if (this.nodeStack.length === 0 && this.language !== 'go') {
-      // Top-level and not Go, treat as function
+    if (!this.isInsideClassLikeNode() && this.language !== 'go') {
+      // Not inside a class-like node and not Go, treat as function
       this.extractFunction(node);
       return;
     }