Selaa lähdekoodia

fix(extraction): index nested repos recorded as gitlinks (#1031, #1033) (#1038)

A nested git repo tracked as a gitlink (mode 160000) — a clone `git add`ed
into the super-repo without a `.gitmodules` entry, or a submodule that
isn't active/initialized in this checkout — fell through both file-collection
passes: it's tracked, so the untracked `-o` listing skips it, but it's not
an active submodule, so `--recurse-submodules` won't expand it. Indexing the
top level therefore pulled in only the outer repo's own files and stopped at
the nested repo's boundary (one report: ~10 files at the root).

Switch the tracked scan to `ls-files -s` to expose file modes, collect the
unexpanded 160000 entries, and recurse into each that has a real working tree
on disk as its own embedded repo. Mirror the same discovery in
discoverEmbeddedRepoRoots so the watcher's scope stays equal to the indexer's.

Active submodules (#147) and untracked nested clones (#193) are unchanged;
gitlinks under default-ignored dirs (vendor/, node_modules/) stay excluded
(#407); an uninitialized submodule with no checkout on disk is left alone.
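Adds tests for four shapes in extraction.test.ts: a bare gitlink (indexed recursively), a gitlink alongside an active submodule, a gitlink under a default-ignored directory, and an uninitialized submodule with no checkout on disk.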

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Colby Mchenry 15 tuntia sitten
vanhempi
sitoutus
a4dfc3438f
3 muutettua tiedostoa jossa 196 lisäystä ja 4 poistoa
  1. 4 0
      CHANGELOG.md
  2. 127 0
      __tests__/extraction.test.ts
  3. 65 4
      src/extraction/index.ts

+ 4 - 0
CHANGELOG.md

@@ -9,6 +9,10 @@ and adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ## [Unreleased]
 
+### Fixes
+
+- CodeGraph now indexes nested repositories that git records as gitlinks, so a workspace built by stacking several repos inside one another indexes completely from a single `codegraph init` at the top. When a repo contains another git repo that was `git add`ed into it — so git tracks it as a `160000` "commit" pointer rather than a folder of files — or a submodule that isn't an active, initialized submodule in your checkout, that nested repo's source used to be skipped entirely: indexing the top level stopped at the nested repo's boundary and pulled in only the outer repo's own files, so a stacked-repo project came up nearly empty (one report saw ~10 files indexed at the root). CodeGraph now descends into each such nested repo that has a real working tree on disk and indexes it as its own embedded repository, recursively, so every layer of a stacked workspace is covered. Active submodules (already handled) and plain untracked nested clones are unchanged; a nested repo under a dependency directory such as `vendor/` or `node_modules/` stays excluded; and a submodule with nothing checked out on disk is correctly left alone rather than reported as empty. Thanks @ofergr and @kun-yx for the reports. (#1031, #1033)
+
 
 ## [1.1.2] - 2026-06-28
 

+ 127 - 0
__tests__/extraction.test.ts

@@ -5456,6 +5456,133 @@ describe('Git Submodules', () => {
   });
 });
 
+describe('Nested gitlink repos (#1031, #1033)', () => {
+  let tempDir: string;
+  // Helper: make a self-contained git repo at `dir` with one committed TS file.
+  const makeRepo = async (dir: string, base: string) => {
+    const { execFileSync } = await import('child_process');
+    const git = (...args: string[]) => execFileSync('git', args, { cwd: dir, stdio: 'pipe' });
+    fs.mkdirSync(dir, { recursive: true });
+    git('init', '-q');
+    git('config', 'user.email', 'test@test.com');
+    git('config', 'user.name', 'Test');
+    fs.writeFileSync(path.join(dir, `${base}.ts`), `export const ${base} = 1;`);
+    git('add', '-A');
+    git('commit', '-q', '-m', `${base} init`);
+  };
+
+  beforeEach(() => {
+    tempDir = createTempDir();
+  });
+
+  afterEach(() => {
+    cleanupTempDir(tempDir);
+  });
+
+  // The #1031 case: a nested repo `git add`ed inside the super-repo becomes a
+  // gitlink (mode 160000) with NO `.gitmodules`. It is tracked (so it never shows
+  // in the untracked `-o` listing) yet not an active submodule (so
+  // `--recurse-submodules` won't expand it) — it used to fall through both passes
+  // and only the super-repo's own files got indexed.
+  it('indexes a bare gitlink (git add\'ed embedded repo, no .gitmodules), recursively', async () => {
+    const { execFileSync } = await import('child_process');
+    const git = (cwd: string, ...args: string[]) => execFileSync('git', args, { cwd, stdio: 'pipe' });
+
+    const root = path.join(tempDir, 'root');
+    await makeRepo(root, 'app');
+
+    // An embedded clone, itself holding a further nested clone (untracked inside it).
+    await makeRepo(path.join(root, 'embedded'), 'inner');
+    await makeRepo(path.join(root, 'embedded', 'deep'), 'deep');
+
+    // `git add embedded` records it as a 160000 gitlink (no fetch, no .gitmodules).
+    git(root, 'add', 'embedded');
+    git(root, 'commit', '-q', '-m', 'add embedded as gitlink');
+    expect(fs.existsSync(path.join(root, '.gitmodules'))).toBe(false);
+
+    const files = scanDirectory(root);
+
+    expect(files).toContain('app.ts');
+    expect(files).toContain('embedded/inner.ts'); // the gitlink's own source
+    expect(files).toContain('embedded/deep/deep.ts'); // recursion continues into its nested repo
+  });
+
+  // The -c → -s switch must not regress active submodules (#147): a repo can hold
+  // BOTH an active submodule (expanded by --recurse-submodules) and a bare gitlink
+  // (handled by the new pass), and the mixed 160000/100644 modes must parse right.
+  it('indexes a gitlink alongside an active submodule', async () => {
+    const { execFileSync } = await import('child_process');
+    const git = (cwd: string, ...args: string[]) => execFileSync('git', args, { cwd, stdio: 'pipe' });
+
+    const lib = path.join(tempDir, '_lib');
+    await makeRepo(lib, 'lib');
+
+    const root = path.join(tempDir, 'root');
+    await makeRepo(root, 'app');
+
+    // A proper, active submodule.
+    execFileSync('git', ['-c', 'protocol.file.allow=always', 'submodule', 'add', '-q', lib, 'libs/lib'], { cwd: root, stdio: 'pipe' });
+    git(root, 'commit', '-q', '-m', 'add submodule');
+
+    // A bare gitlink in the same repo (under a non-ignored dir name).
+    await makeRepo(path.join(root, 'external', 'tool'), 'tool');
+    git(root, 'add', 'external/tool');
+    git(root, 'commit', '-q', '-m', 'add gitlink');
+
+    const files = scanDirectory(root);
+
+    expect(files).toContain('app.ts');
+    expect(files).toContain('libs/lib/lib.ts'); // active submodule still expands (#147)
+    expect(files).toContain('external/tool/tool.ts'); // bare gitlink now indexed
+  });
+
+  // A gitlink under a built-in default-ignored directory (vendor/, node_modules/,
+  // …) stays excluded — a committed dependency doesn't become project code just
+  // because it's a nested repo. Mirrors how the untracked-embedded path treats
+  // the same dirs (#407), so the two passes agree.
+  it('does not index a gitlink under a default-ignored directory (e.g. vendor/)', async () => {
+    const { execFileSync } = await import('child_process');
+    const git = (cwd: string, ...args: string[]) => execFileSync('git', args, { cwd, stdio: 'pipe' });
+
+    const root = path.join(tempDir, 'root');
+    await makeRepo(root, 'app');
+    await makeRepo(path.join(root, 'vendor', 'pkg'), 'dep');
+    git(root, 'add', 'vendor/pkg');
+    git(root, 'commit', '-q', '-m', 'add vendored gitlink');
+
+    const files = scanDirectory(root);
+
+    expect(files).toContain('app.ts');
+    expect(files).not.toContain('vendor/pkg/dep.ts');
+  });
+
+  // A gitlink with NO working tree on disk (the common "cloned without
+  // --recurse-submodules" state) has nothing to index — we must leave it alone,
+  // not fabricate entries, and must not break the rest of the scan.
+  it('leaves an uninitialized submodule (no checkout on disk) alone', async () => {
+    const { execFileSync } = await import('child_process');
+
+    const lib = path.join(tempDir, '_lib');
+    await makeRepo(lib, 'lib');
+
+    const sup = path.join(tempDir, 'super');
+    await makeRepo(sup, 'app');
+    execFileSync('git', ['-c', 'protocol.file.allow=always', 'submodule', 'add', '-q', lib, 'libs/lib'], { cwd: sup, stdio: 'pipe' });
+    execFileSync('git', ['commit', '-q', '-m', 'add submodule'], { cwd: sup, stdio: 'pipe' });
+
+    // Clone the super-repo WITHOUT --recurse-submodules → libs/lib is an empty
+    // gitlink dir (mode 160000, no `.git` inside, no files).
+    const clone = path.join(tempDir, 'clone');
+    execFileSync('git', ['clone', '-q', sup, clone], { stdio: 'pipe' });
+    expect(fs.readdirSync(path.join(clone, 'libs', 'lib'))).toHaveLength(0);
+
+    const files = scanDirectory(clone);
+
+    expect(files).toContain('app.ts');
+    expect(files).not.toContain('libs/lib/lib.ts'); // not on disk → correctly absent
+  });
+});
+
 describe('Nested non-submodule git repos', () => {
   let tempDir: string;
 

+ 65 - 4
src/extraction/index.ts

@@ -520,6 +520,27 @@ export function discoverEmbeddedRepoRoots(rootDir: string): string[] {
         }
       }
     } catch { /* untracked listing failed — ignored-side discovery still runs */ }
+    // Unexpanded gitlinks (mode 160000) with a real checkout on disk — embedded
+    // repos `git add`ed without `.gitmodules`, or submodules not active here. The
+    // untracked listing above can't see them (they're tracked), so find them the
+    // same way collectGitFiles does, keeping watcher scope == indexer scope.
+    // (#1031, #1033)
+    try {
+      const staged = execFileSync(
+        'git',
+        ['ls-files', '-z', '-s', '--recurse-submodules'],
+        { cwd: repoAbs, encoding: 'utf-8', timeout: 30000, maxBuffer: 50 * 1024 * 1024, stdio: ['pipe', 'pipe', 'pipe'], windowsHide: true }
+      );
+      for (const entry of staged.split('\0')) {
+        if (!entry || entry.slice(0, 6) !== '160000') continue;
+        const tab = entry.indexOf('\t');
+        if (tab === -1) continue;
+        const rel = entry.slice(tab + 1);
+        const relDir = rel.endsWith('/') ? rel : rel + '/';
+        if (defaults.ignores(relDir)) continue;
+        if (classifyGitDir(path.join(repoAbs, rel)) === 'embedded') candidates.push(relDir);
+      }
+    } catch { /* staged listing failed — other discovery still runs */ }
     candidates.push(...findIgnoredEmbeddedRepos(repoAbs, includeIgnored, prefix));
     for (const rel of candidates) {
       const full = normalizePath(prefix + rel);
@@ -585,13 +606,35 @@ function collectGitFiles(repoDir: string, prefix: string, files: Set<string>, em
   // Without this, monorepos using submodules index 0 files. (See issue #147.)
   // Note: --recurse-submodules only supports -c/--cached and --stage modes — it
   // can't be combined with -o, so untracked files are gathered separately below.
+  //
+  // We use --stage (-s) rather than -c so each entry carries its file mode. That
+  // lets us spot gitlink entries (mode 160000) that --recurse-submodules did NOT
+  // expand: a nested repo `git add`ed without a `.gitmodules` entry, or a
+  // submodule that isn't active/initialized in this checkout. Such a gitlink
+  // falls through every pass — it's tracked, so the untracked `-o` listing below
+  // never reports it, and --recurse-submodules only expands ACTIVE submodules —
+  // so its source would be silently skipped, leaving only the super-repo's own
+  // files indexed. We collect those gitlinks here and recurse into them below.
+  // (An active submodule is expanded inline by --recurse-submodules and so never
+  // surfaces as a 160000 entry — only the unhandled gitlinks do.) (#1031, #1033)
+  //
   // -z gives NUL-separated, unquoted output so non-ASCII (e.g. CJK) paths
   // survive verbatim. Without it git octal-escapes and double-quotes such paths
   // (the core.quotepath default), and the quoted form never matches a real file
-  // on disk → those files are silently dropped from the index. (#541)
-  const tracked = execFileSync('git', ['ls-files', '-z', '-c', '--recurse-submodules'], gitOpts);
-  for (const rel of tracked.split('\0')) {
-    if (rel) files.add(normalizePath(prefix + rel));
+  // on disk → those files are silently dropped from the index. (#541) With -s the
+  // path follows a TAB after the `<mode> <object> <stage>` prefix.
+  const gitlinkRels: string[] = [];
+  const tracked = execFileSync('git', ['ls-files', '-z', '-s', '--recurse-submodules'], gitOpts);
+  for (const entry of tracked.split('\0')) {
+    if (!entry) continue;
+    const tab = entry.indexOf('\t');
+    if (tab === -1) continue; // --stage always emits "<mode> <object> <stage>\t<path>"
+    const rel = entry.slice(tab + 1);
+    if (entry.slice(0, 6) === '160000') {
+      gitlinkRels.push(rel); // an unexpanded gitlink — recursed into below, not a source file itself
+      continue;
+    }
+    files.add(normalizePath(prefix + rel));
   }
 
   // Untracked files (submodules manage their own untracked state). Embedded git
@@ -618,6 +661,24 @@ function collectGitFiles(repoDir: string, prefix: string, files: Set<string>, em
     files.add(normalizePath(prefix + rel));
   }
 
+  // Gitlink entries (mode 160000) that --recurse-submodules left unexpanded —
+  // an embedded repo `git add`ed without `.gitmodules`, or a submodule not
+  // active/initialized in this checkout. When such a gitlink has a real working
+  // tree on disk it is distinct first-party code we must index as its own
+  // embedded repo: the tracked pass skipped its contents and the untracked pass
+  // never sees it (it's tracked, not "other"). A gitlink with no checkout on disk
+  // (an uninitialized submodule — empty dir, no `.git`) has nothing to index and
+  // is left alone, as is a submodule worktree (a duplicate view, #945). (#1031, #1033)
+  for (const rel of gitlinkRels) {
+    const relDir = rel.endsWith('/') ? rel : rel + '/';
+    if (defaultsOnlyIgnore().ignores(relDir)) continue;
+    const childDir = path.join(repoDir, rel);
+    // 'embedded' = a real .git checkout on disk; 'worktree' and 'none' are skipped.
+    if (classifyGitDir(childDir) !== 'embedded') continue;
+    embeddedRoots?.add(normalizePath(prefix + relDir));
+    collectGitFiles(childDir, prefix + relDir, files, embeddedRoots, includeIgnored);
+  }
+
   // Embedded repos hidden by THIS repo's ignore rules (`/packages/` in a
   // super-repo .gitignore) never appear in any listing above. By default they
   // stay hidden — `.gitignore` is respected (#970, #976). They are recursed into