Quellcode durchsuchen

fix(extraction): index nested non-submodule git repos (#193) (#217)

Running `codegraph init -i` from a git super-repo that contains independent
nested git repositories (not submodules) reported "No files found to index",
because `git ls-files` reports an embedded repo only as an opaque `subdir/`
entry and never lists its files. Detect embedded repos via that trailing-slash
signal and recurse `git ls-files` into each, indexing tracked + untracked
source and honoring each repo's own .gitignore.

Reported by @timxx.

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Colby Mchenry vor 1 Monat
Ursprung
Commit
07c093cc3f
3 geänderte Dateien mit 138 neuen und 26 gelöschten Zeilen
  1. 11 0
      CHANGELOG.md
  2. 73 0
      __tests__/extraction.test.ts
  3. 54 26
      src/extraction/index.ts

+ 11 - 0
CHANGELOG.md

@@ -63,6 +63,17 @@ and adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
   Thanks to [@essopsp](https://github.com/essopsp) for the repro.
 
 ### Fixed
+- **Indexing**: `codegraph init -i` now finds source inside nested, independent
+  git repositories — separate clones living inside the workspace that are **not**
+  git submodules (common in CMake "super-repo" layouts). When the top-level
+  workspace is itself a git repo, `git ls-files` reports an embedded repo only as
+  an opaque `subdir/` entry and never lists its files, so indexing from the
+  workspace root reported "No files found to index" even though indexing each
+  sub-repo individually worked. CodeGraph now detects these embedded repos and
+  indexes their tracked and untracked source, honoring each repo's own
+  `.gitignore`. Closes
+  [#193](https://github.com/colbymchenry/codegraph/issues/193). Thanks to
+  [@timxx](https://github.com/timxx) for the report.
 - **Native SQLite backend on Node 24**: indexing on Node 24 always dropped to
   the 5-10x-slower WASM backend, printing a `better-sqlite3 unavailable`
   warning that `npm rebuild better-sqlite3` / `xcode-select --install` could

+ 73 - 0
__tests__/extraction.test.ts

@@ -3132,6 +3132,79 @@ describe('Git Submodules', () => {
   });
 });
 
+describe('Nested non-submodule git repos', () => {
+  let tempDir: string;
+
+  beforeEach(() => {
+    tempDir = createTempDir();
+  });
+
+  afterEach(() => {
+    cleanupTempDir(tempDir);
+  });
+
+  it('should index files in embedded git repos run from a git super-repo (issue #193)', async () => {
+    const { execFileSync } = await import('child_process');
+    const git = (cwd: string, ...args: string[]) =>
+      execFileSync('git', args, { cwd, stdio: 'pipe' });
+
+    // Top-level workspace is itself a git repo, holding no source directly —
+    // the CMake "super-repo" layout from the issue.
+    const root = path.join(tempDir, 'root');
+    fs.mkdirSync(path.join(root, 'coding'), { recursive: true });
+    git(root, 'init', '-q');
+    git(root, 'config', 'user.email', 'test@test.com');
+    git(root, 'config', 'user.name', 'Test');
+    fs.writeFileSync(path.join(root, 'CMakeLists.txt'), 'cmake_minimum_required(VERSION 3.10)\n');
+
+    // Two independent clones living inside the workspace (NOT submodules):
+    // one with committed source, one with only untracked source.
+    const sub1 = path.join(root, 'sub_repo1', 'src');
+    fs.mkdirSync(sub1, { recursive: true });
+    git(path.join(root, 'sub_repo1'), 'init', '-q');
+    git(path.join(root, 'sub_repo1'), 'config', 'user.email', 'test@test.com');
+    git(path.join(root, 'sub_repo1'), 'config', 'user.name', 'Test');
+    fs.writeFileSync(path.join(sub1, 'one.ts'), 'export const one = 1;');
+    git(path.join(root, 'sub_repo1'), 'add', '-A');
+    git(path.join(root, 'sub_repo1'), 'commit', '-q', '-m', 'sub1 init');
+
+    const sub2 = path.join(root, 'sub_repo2', 'src');
+    fs.mkdirSync(sub2, { recursive: true });
+    git(path.join(root, 'sub_repo2'), 'init', '-q');
+    fs.writeFileSync(path.join(sub2, 'two.ts'), 'export const two = 2;');
+
+    const config = { ...DEFAULT_CONFIG, rootDir: root };
+    const files = scanDirectory(root, config);
+
+    // Both committed and untracked source from the nested repos must be found.
+    expect(files).toContain('sub_repo1/src/one.ts');
+    expect(files).toContain('sub_repo2/src/two.ts');
+  });
+
+  it('should respect each embedded repo\'s own .gitignore', async () => {
+    const { execFileSync } = await import('child_process');
+    const git = (cwd: string, ...args: string[]) =>
+      execFileSync('git', args, { cwd, stdio: 'pipe' });
+
+    const root = path.join(tempDir, 'root');
+    fs.mkdirSync(root, { recursive: true });
+    git(root, 'init', '-q');
+
+    const sub = path.join(root, 'sub_repo', 'src');
+    fs.mkdirSync(sub, { recursive: true });
+    git(path.join(root, 'sub_repo'), 'init', '-q');
+    fs.writeFileSync(path.join(root, 'sub_repo', '.gitignore'), 'src/generated.ts\n');
+    fs.writeFileSync(path.join(sub, 'real.ts'), 'export const real = 1;');
+    fs.writeFileSync(path.join(sub, 'generated.ts'), 'export const generated = 1;');
+
+    const config = { ...DEFAULT_CONFIG, rootDir: root };
+    const files = scanDirectory(root, config);
+
+    expect(files).toContain('sub_repo/src/real.ts');
+    expect(files).not.toContain('sub_repo/src/generated.ts');
+  });
+});
+
 // =============================================================================
 // Scala
 // =============================================================================

+ 54 - 26
src/extraction/index.ts

@@ -125,10 +125,61 @@ export function shouldIncludeFile(
   return false;
 }
 
+/**
+ * Collect git-visible files (tracked + untracked, .gitignore-respected) from the
+ * git repository rooted at `repoDir`, adding each to `files` with `prefix`
+ * prepended so paths stay relative to the original scan root.
+ *
+ * Recurses into embedded git repositories — nested repos that are NOT submodules
+ * (independent clones living inside the workspace, common in CMake "super-repo"
+ * layouts). The parent repo's `git ls-files` cannot see into them: tracked output
+ * skips them entirely, and untracked output reports them only as an opaque
+ * "subdir/" entry (trailing slash) rather than expanding their files. Each
+ * embedded repo is its own git boundary, so we re-run `git ls-files` inside it.
+ * (See issue #193.)
+ */
+function collectGitFiles(repoDir: string, prefix: string, files: Set<string>): void {
+  const gitOpts = { cwd: repoDir, encoding: 'utf-8' as const, timeout: 30000, maxBuffer: 50 * 1024 * 1024, stdio: ['pipe', 'pipe', 'pipe'] as ['pipe', 'pipe', 'pipe'] };
+
+  // Tracked files. --recurse-submodules pulls in files from active submodules,
+  // which the index would otherwise represent only as a commit pointer.
+  // Without this, monorepos using submodules index 0 files. (See issue #147.)
+  // Note: --recurse-submodules only supports -c/--cached and --stage modes — it
+  // can't be combined with -o, so untracked files are gathered separately below.
+  const tracked = execFileSync('git', ['ls-files', '-c', '--recurse-submodules'], gitOpts);
+  for (const line of tracked.split('\n')) {
+    const trimmed = line.trim();
+    if (trimmed) {
+      files.add(normalizePath(prefix + trimmed));
+    }
+  }
+
+  // Untracked files (submodules manage their own untracked state). Embedded git
+  // repos surface here as a single "subdir/" entry that git refuses to descend
+  // into — recurse into those as their own repos so their source gets indexed.
+  const untracked = execFileSync('git', ['ls-files', '-o', '--exclude-standard'], gitOpts);
+  for (const line of untracked.split('\n')) {
+    const trimmed = line.trim();
+    if (!trimmed) continue;
+    if (trimmed.endsWith('/')) {
+      // git only emits a trailing-slash directory entry for an embedded repo.
+      // Guard with a .git check anyway, and skip anything else exactly as git
+      // itself skips it (we never descend into a non-repo opaque dir).
+      const childDir = path.join(repoDir, trimmed);
+      if (fs.existsSync(path.join(childDir, '.git'))) {
+        collectGitFiles(childDir, prefix + trimmed, files);
+      }
+      continue;
+    }
+    files.add(normalizePath(prefix + trimmed));
+  }
+}
+
 /**
  * Get all files visible to git (tracked + untracked but not ignored).
- * Respects .gitignore at all levels (root, subdirectories).
- * Returns null on failure (non-git project) so callers can fall back.
+ * Respects .gitignore at all levels (root, subdirectories) and descends into
+ * embedded (nested, non-submodule) git repos. Returns null on failure
+ * (non-git project) so callers can fall back to a filesystem walk.
  */
 function getGitVisibleFiles(rootDir: string): Set<string> | null {
   try {
@@ -157,30 +208,7 @@ function getGitVisibleFiles(rootDir: string): Set<string> | null {
     }
 
     const files = new Set<string>();
-    const gitOpts = { cwd: rootDir, encoding: 'utf-8' as const, timeout: 30000, maxBuffer: 50 * 1024 * 1024, stdio: ['pipe', 'pipe', 'pipe'] as ['pipe', 'pipe', 'pipe'] };
-
-    // Tracked files. --recurse-submodules pulls in files from active submodules,
-    // which the main repo's index would otherwise represent only as a commit pointer.
-    // Without this, monorepos using submodules index 0 files. (See issue #147.)
-    // Note: --recurse-submodules only supports -c/--cached and --stage modes — it
-    // can't be combined with -o, so untracked files are gathered separately below.
-    const tracked = execFileSync('git', ['ls-files', '-c', '--recurse-submodules'], gitOpts);
-    for (const line of tracked.split('\n')) {
-      const trimmed = line.trim();
-      if (trimmed) {
-        files.add(normalizePath(trimmed));
-      }
-    }
-
-    // Untracked files in the main repo (submodules manage their own untracked state).
-    const untracked = execFileSync('git', ['ls-files', '-o', '--exclude-standard'], gitOpts);
-    for (const line of untracked.split('\n')) {
-      const trimmed = line.trim();
-      if (trimmed) {
-        files.add(normalizePath(trimmed));
-      }
-    }
-
+    collectGitFiles(rootDir, '', files);
     return files;
   } catch {
     return null;