فهرست منبع

Fix .gitignore support and Windows path separator bug in scanner

Normalize paths to forward slashes in matchesGlob() and scanDirectory()
so glob exclude patterns work on Windows. Add getGitIgnoredDirectories()
using git ls-files to skip .gitignore'd directories during indexing.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Colby McHenry 4 ماه پیش
والد
کامیت
62a6cf38fe
4 فایلهای تغییر یافته به همراه 158 افزوده شده و 3 حذف شده
  1. 106 1
      __tests__/extraction.test.ts
  2. 4 0
      src/config.ts
  3. 40 2
      src/extraction/index.ts
  4. 8 0
      src/utils.ts

+ 106 - 1
__tests__/extraction.test.ts

@@ -9,8 +9,10 @@ import * as fs from 'fs';
 import * as path from 'path';
 import * as os from 'os';
 import { CodeGraph } from '../src';
-import { extractFromSource } from '../src/extraction';
+import { extractFromSource, scanDirectory, shouldIncludeFile } from '../src/extraction';
 import { detectLanguage, isLanguageSupported, getSupportedLanguages } from '../src/extraction/grammars';
+import { normalizePath } from '../src/utils';
+import { DEFAULT_CONFIG } from '../src/types';
 
 // Create a temporary directory for each test
 function createTempDir(): string {
@@ -1880,3 +1882,106 @@ export function multiply(a: number, b: number): number {
     cg.close();
   });
 });
+
+describe('Path Normalization', () => { // unit tests for normalizePath, imported from ../src/utils
+  it('should convert backslashes to forward slashes', () => {
+    expect(normalizePath('gui\\node_modules\\foo')).toBe('gui/node_modules/foo'); // each '\\' in the literal is one backslash at runtime
+    expect(normalizePath('src\\components\\Button.tsx')).toBe('src/components/Button.tsx');
+  });
+
+  it('should leave forward-slash paths unchanged', () => {
+    expect(normalizePath('src/components/Button.tsx')).toBe('src/components/Button.tsx'); // input without backslashes must come back identical
+  });
+
+  it('should handle empty string', () => {
+    expect(normalizePath('')).toBe(''); // boundary case: nothing to replace
+  });
+});
+
+describe('Directory Exclusion', () => { // each case builds a real tree under a temp dir and runs scanDirectory over it
+  let tempDir: string;
+
+  beforeEach(() => {
+    tempDir = createTempDir(); // fresh directory per test
+  });
+
+  afterEach(() => {
+    cleanupTempDir(tempDir);
+  });
+
+  it('should exclude node_modules directories', () => {
+    // Create structure: src/index.ts + node_modules/pkg/index.js
+    const srcDir = path.join(tempDir, 'src');
+    const nmDir = path.join(tempDir, 'node_modules', 'pkg');
+    fs.mkdirSync(srcDir, { recursive: true });
+    fs.mkdirSync(nmDir, { recursive: true });
+    fs.writeFileSync(path.join(srcDir, 'index.ts'), 'export const x = 1;');
+    fs.writeFileSync(path.join(nmDir, 'index.js'), 'module.exports = {};');
+
+    const config = { ...DEFAULT_CONFIG, rootDir: tempDir }; // defaults, with rootDir pointed at the directory being scanned
+    const files = scanDirectory(tempDir, config);
+
+    expect(files).toContain('src/index.ts'); // results are expected as root-relative, forward-slash paths
+    expect(files.every((f) => !f.includes('node_modules'))).toBe(true); // substring check over every returned path
+  });
+
+  it('should exclude nested node_modules directories', () => {
+    // Create structure: packages/app/node_modules/pkg/index.js
+    const srcDir = path.join(tempDir, 'packages', 'app', 'src');
+    const nmDir = path.join(tempDir, 'packages', 'app', 'node_modules', 'pkg');
+    fs.mkdirSync(srcDir, { recursive: true });
+    fs.mkdirSync(nmDir, { recursive: true });
+    fs.writeFileSync(path.join(srcDir, 'index.ts'), 'export const x = 1;');
+    fs.writeFileSync(path.join(nmDir, 'index.js'), 'module.exports = {};');
+
+    const config = { ...DEFAULT_CONFIG, rootDir: tempDir };
+    const files = scanDirectory(tempDir, config);
+
+    expect(files).toContain('packages/app/src/index.ts'); // the sibling src directory must still be picked up
+    expect(files.every((f) => !f.includes('node_modules'))).toBe(true);
+  });
+
+  it('should exclude .git directories', () => {
+    const srcDir = path.join(tempDir, 'src');
+    const gitDir = path.join(tempDir, '.git', 'objects');
+    fs.mkdirSync(srcDir, { recursive: true });
+    fs.mkdirSync(gitDir, { recursive: true });
+    fs.writeFileSync(path.join(srcDir, 'index.ts'), 'export const x = 1;');
+    fs.writeFileSync(path.join(gitDir, 'pack.ts'), 'export const y = 2;'); // presumably a .ts file so that only the directory exclusion can keep it out — confirm against DEFAULT_CONFIG
+
+    const config = { ...DEFAULT_CONFIG, rootDir: tempDir };
+    const files = scanDirectory(tempDir, config);
+
+    expect(files).toContain('src/index.ts');
+    expect(files.every((f) => !f.includes('.git'))).toBe(true); // NOTE: substring match, so a name like '.gitignore' would also fail this check
+  });
+
+  it('should return forward-slash paths on all platforms', () => {
+    const srcDir = path.join(tempDir, 'src', 'components');
+    fs.mkdirSync(srcDir, { recursive: true });
+    fs.writeFileSync(path.join(srcDir, 'Button.tsx'), 'export function Button() {}');
+
+    const config = { ...DEFAULT_CONFIG, rootDir: tempDir };
+    const files = scanDirectory(tempDir, config);
+
+    expect(files.length).toBe(1); // only one file was created above
+    expect(files[0]).toBe('src/components/Button.tsx');
+    expect(files[0]).not.toContain('\\'); // no backslash may appear in a returned path
+  });
+
+  it('should respect .codegraphignore marker', () => {
+    const srcDir = path.join(tempDir, 'src');
+    const vendorDir = path.join(tempDir, 'vendor');
+    fs.mkdirSync(srcDir, { recursive: true });
+    fs.mkdirSync(vendorDir, { recursive: true });
+    fs.writeFileSync(path.join(srcDir, 'index.ts'), 'export const x = 1;');
+    fs.writeFileSync(path.join(vendorDir, 'lib.ts'), 'export const y = 2;');
+    fs.writeFileSync(path.join(vendorDir, '.codegraphignore'), ''); // empty marker file: the test relies on its presence, not its content
+
+    const config = { ...DEFAULT_CONFIG, rootDir: tempDir };
+    const files = scanDirectory(tempDir, config);
+
+    expect(files).toContain('src/index.ts');
+    expect(files.every((f) => !f.includes('vendor'))).toBe(true); // nothing under the marked directory may be returned
+  });
+});

+ 4 - 0
src/config.ts

@@ -7,6 +7,7 @@
 import * as fs from 'fs';
 import * as path from 'path';
 import { CodeGraphConfig, DEFAULT_CONFIG, Language, NodeKind } from './types';
+import { normalizePath } from './utils';
 
 /**
  * Configuration filename
@@ -240,6 +241,9 @@ export function addCustomPattern(
  * Check if a file path matches the include/exclude patterns
  */
 export function shouldIncludeFile(filePath: string, config: CodeGraphConfig): boolean {
+  // Normalize to forward slashes so Windows backslash paths match glob patterns
+  filePath = normalizePath(filePath);
+
   // Simple glob matching (for now, just check if any pattern matches)
   // A full implementation would use a proper glob library
 

+ 40 - 2
src/extraction/index.ts

@@ -8,6 +8,7 @@ import * as fs from 'fs';
 import * as fsp from 'fs/promises';
 import * as path from 'path';
 import * as crypto from 'crypto';
+import { execFileSync } from 'child_process';
 import {
   Language,
   FileRecord,
@@ -20,7 +21,7 @@ import { extractFromSource } from './tree-sitter';
 import { detectLanguage, isLanguageSupported } from './grammars';
 import { logDebug } from '../errors';
 import { captureException } from '../sentry';
-import { validatePathWithinRoot } from '../utils';
+import { validatePathWithinRoot, normalizePath } from '../utils';
 
 /**
  * Progress callback for indexing operations
@@ -68,6 +69,9 @@ export function hashContent(content: string): string {
  * Check if a path matches any glob pattern (simplified)
  */
 function matchesGlob(filePath: string, pattern: string): boolean {
+  // Normalize to forward slashes so Windows backslash paths match glob patterns
+  filePath = normalizePath(filePath);
+
   // Convert glob to regex using placeholders to avoid conflicts
   let regexStr = pattern;
 
@@ -114,6 +118,31 @@ export function shouldIncludeFile(
   return false;
 }
 
+/**
+ * Collect the directories that `git ls-files` reports as ignored, with git run from rootDir.
+ * Returns a Set of normalized relative directory paths (forward slashes, no trailing slash).
+ * If execFileSync throws for any reason, an empty Set is returned instead of propagating the error.
+ */
+function getGitIgnoredDirectories(rootDir: string): Set<string> {
+  try {
+    const output = execFileSync(
+      'git',
+      ['ls-files', '-oi', '--exclude-standard', '--directory'], // untracked + ignored-only entries, standard exclude sources, whole directories shown as one entry
+      { cwd: rootDir, encoding: 'utf-8', timeout: 10000, stdio: ['pipe', 'pipe', 'pipe'] } // string output, timeout of 10000 ms, all three streams piped
+    );
+    const dirs = new Set<string>();
+    for (const line of output.split('\n')) { // NOTE(review): without -z git may quote unusual path names — confirm such directories still match
+      const trimmed = line.trim(); // strips surrounding whitespace, including any trailing '\r'
+      if (trimmed.endsWith('/')) { // keep only entries with a trailing slash (directories); other lines are skipped
+        dirs.add(normalizePath(trimmed.slice(0, -1))); // drop the trailing slash before storing
+      }
+    }
+    return dirs;
+  } catch { // best-effort: a failure is treated as "nothing ignored"
+    return new Set<string>();
+  }
+}
+
 /**
  * Marker file name that indicates a directory (and all children) should be skipped
  */
@@ -130,6 +159,7 @@ export function scanDirectory(
   const files: string[] = [];
   let count = 0;
   const visitedRealPaths = new Set<string>(); // Symlink cycle detection
+  const gitIgnoredDirs = getGitIgnoredDirectories(rootDir);
 
   function walk(dir: string): void {
     // Symlink cycle detection: resolve real path and skip if already visited
@@ -163,7 +193,7 @@ export function scanDirectory(
 
     for (const entry of entries) {
       const fullPath = path.join(dir, entry.name);
-      const relativePath = path.relative(rootDir, fullPath);
+      const relativePath = normalizePath(path.relative(rootDir, fullPath));
 
       // Follow symlinked directories, but skip symlinked files to non-project targets
       if (entry.isSymbolicLink()) {
@@ -171,6 +201,10 @@ export function scanDirectory(
           const realTarget = fs.realpathSync(fullPath);
           const stat = fs.statSync(realTarget);
           if (stat.isDirectory()) {
+            // Check gitignore first (fast O(1) lookup)
+            if (gitIgnoredDirs.has(relativePath)) {
+              continue;
+            }
             // Check exclusion, then recurse (cycle detection handles the rest)
             const dirPattern = relativePath + '/';
             let excluded = false;
@@ -199,6 +233,10 @@ export function scanDirectory(
       }
 
       if (entry.isDirectory()) {
+        // Check gitignore first (fast O(1) lookup)
+        if (gitIgnoredDirs.has(relativePath)) {
+          continue;
+        }
         // Check if directory should be excluded
         const dirPattern = relativePath + '/';
         let excluded = false;

+ 8 - 0
src/utils.ts

@@ -74,6 +74,14 @@ export function clamp(value: number, min: number, max: number): number {
   return Math.max(min, Math.min(max, value));
 }
 
+/**
+ * Normalize a file path to use forward slashes; all other characters are returned unchanged.
+ * Fixes Windows backslash paths so glob matching works consistently.
+ */
+export function normalizePath(filePath: string): string {
+  return filePath.replace(/\\/g, '/'); // global flag: every backslash in the string is replaced, not only the first
+}
+
 /**
  * Cross-process file lock using lock files.
  * Prevents concurrent database writes from CLI, MCP server, and git hooks.