Parcourir la source

fix(index): respect .gitignore for tracked gitlink embedded repos (#1065) (#1066)

The gitlink discovery added in #1031/#1033 indexed a tracked 160000 gitlink even when the parent repo's .gitignore excludes its directory, pulling a gitignored reference/benchmark corpus of `git add`ed clones into the index (one report: ~138k files, a 4.8 GiB graph, and a full index that wedged in "Resolving refs" until the watchdog killed it).

Gate both gitlink-discovery sites on the same rule the untracked-embedded path already uses: skip a gitignored gitlink unless codegraph.json includeIgnored opts it in; index non-ignored gitlinks as before. Validated real-world on macOS, Linux, and Windows.

Closes #1065

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Colby Mchenry il y a 19 heures
Parent
commit
4159539fb8
3 fichiers modifiés avec 106 ajouts et 10 suppressions
  1. 3 0
      CHANGELOG.md
  2. 50 1
      __tests__/extraction.test.ts
  3. 53 9
      src/extraction/index.ts

+ 3 - 0
CHANGELOG.md

@@ -9,6 +9,9 @@ and adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ## [Unreleased]
 
+### Fixes
+
+- CodeGraph again respects `.gitignore` for nested repositories that git tracks as gitlinks. The recent change that taught CodeGraph to descend into nested repos recorded as `160000` "commit" pointers (#1031, #1033) did so even when your `.gitignore` excludes the directory those repos live in — so a gitignored reference or benchmark corpus full of cloned repositories got pulled into the index anyway. One project with a gitignored `benchmark/repos/` of 19 cloned repos saw over 138,000 files swept in and a 4.8 GiB graph, and a full index then stalled in the "Resolving refs" phase until the watchdog killed it. CodeGraph now treats a gitignored gitlink the same as any other gitignored embedded repo: excluded by default, and re-included only when you opt the directory in with `codegraph.json` `includeIgnored`. Nested repos in non-ignored locations — the case #1031/#1033 fixed — are unchanged. Thanks @AriaShishegaran for the detailed report. (#1065)
 
 ## [1.1.3] - 2026-06-29
 

+ 50 - 1
__tests__/extraction.test.ts

@@ -9,7 +9,7 @@ import * as fs from 'fs';
 import * as path from 'path';
 import * as os from 'os';
 import { CodeGraph } from '../src';
-import { extractFromSource, scanDirectory, buildDefaultIgnore } from '../src/extraction';
+import { extractFromSource, scanDirectory, buildDefaultIgnore, discoverEmbeddedRepoRoots, buildScopeIgnore } from '../src/extraction';
 import { detectLanguage, isLanguageSupported, getSupportedLanguages, initGrammars, loadAllGrammars, isSourceFile } from '../src/extraction/grammars';
 import { stripCppTemplateArgs } from '../src/extraction/languages/c-cpp';
 import { normalizePath } from '../src/utils';
@@ -5659,6 +5659,55 @@ describe('Nested gitlink repos (#1031, #1033)', () => {
     expect(files).toContain('app.ts');
     expect(files).not.toContain('libs/lib/lib.ts'); // not on disk → correctly absent
   });
+
+  // #1065: a gitlink under a path the super-repo's OWN `.gitignore` covers is the
+  // tracked-gitlink twin of the untracked-ignored embedded repo (#514, #970). The
+  // gitlink-discovery pass must honor that `.gitignore` the same way — otherwise a
+  // gitignored reference/benchmark corpus full of `git add`ed clones gets pulled
+  // into the index (the 138k-file blow-up the reporter hit). Respect it by default;
+  // re-include only via `codegraph.json` `includeIgnored`.
+  it('does not index a gitlink under a gitignored directory by default (#1065)', async () => {
+    const { execFileSync } = await import('child_process');
+    const git = (cwd: string, ...args: string[]) => execFileSync('git', args, { cwd, stdio: 'pipe' });
+
+    const root = path.join(tempDir, 'root');
+    await makeRepo(root, 'app');
+    // An embedded clone under a path the super-repo gitignores (a benchmark corpus).
+    await makeRepo(path.join(root, 'benchmark', 'repos', 'ref'), 'ref');
+    git(root, 'add', 'benchmark/repos/ref'); // tracked as a 160000 gitlink
+    fs.writeFileSync(path.join(root, '.gitignore'), 'benchmark/repos/\n');
+    git(root, 'add', '.gitignore');
+    git(root, 'commit', '-q', '-m', 'add gitignored gitlink + ignore rule');
+
+    const files = scanDirectory(root);
+    expect(files).toContain('app.ts');
+    expect(files).not.toContain('benchmark/repos/ref/ref.ts'); // gitignored → excluded
+
+    // The watcher path agrees: the ignored root is never discovered, and the dir is
+    // pruned (the reporter's exact clue — `ignores('benchmark/repos/')` was false).
+    expect(discoverEmbeddedRepoRoots(root)).toEqual([]);
+    expect(buildScopeIgnore(root).ignores('benchmark/repos/')).toBe(true);
+    expect(buildScopeIgnore(root).ignores('benchmark/repos/ref/ref.ts')).toBe(true);
+  });
+
+  it('re-includes a gitignored gitlink when codegraph.json includeIgnored opts in (#1065)', async () => {
+    const { execFileSync } = await import('child_process');
+    const git = (cwd: string, ...args: string[]) => execFileSync('git', args, { cwd, stdio: 'pipe' });
+
+    const root = path.join(tempDir, 'root');
+    await makeRepo(root, 'app');
+    await makeRepo(path.join(root, 'benchmark', 'repos', 'ref'), 'ref');
+    git(root, 'add', 'benchmark/repos/ref');
+    fs.writeFileSync(path.join(root, '.gitignore'), 'benchmark/repos/\n');
+    fs.writeFileSync(path.join(root, 'codegraph.json'), JSON.stringify({ includeIgnored: ['benchmark/repos/'] }));
+    git(root, 'add', '.gitignore', 'codegraph.json');
+    git(root, 'commit', '-q', '-m', 'opt the gitignored gitlink back in');
+
+    const files = scanDirectory(root);
+    expect(files).toContain('app.ts');
+    expect(files).toContain('benchmark/repos/ref/ref.ts'); // opted in → indexed
+    expect(discoverEmbeddedRepoRoots(root)).toContain('benchmark/repos/ref/');
+  });
 });
 
 describe('Nested non-submodule git repos', () => {

+ 53 - 9
src/extraction/index.ts

@@ -511,6 +511,40 @@ export function buildScopeIgnore(rootDir: string, embeddedRoots?: Iterable<strin
   );
 }
 
+/**
+ * Whether an embedded repo found as a tracked gitlink (mode 160000, #1031/#1033)
+ * must be SKIPPED rather than indexed. A gitlink is tracked, so `.gitignore`
+ * can't untrack it — but the discovery passes for it must still honor the same
+ * scope rules as every other path, or a gitignored reference/data dir full of
+ * `git add`ed clones gets pulled into the index against the user's stated intent
+ * (#1065). Two reasons to skip:
+ *   1. It sits in a built-in default-ignored location — an npm git-dependency
+ *      under `node_modules` is never project code; not even an explicit opt-in
+ *      revives it (matches `findIgnoredEmbeddedRepos`).
+ *   2. The parent repo's own `.gitignore` covers its path and the project did
+ *      NOT opt that path in via `codegraph.json` `includeIgnored`. The gitignore
+ *      rule is the user's stated intent to keep that path out of scope, exactly
+ *      as for an UNtracked embedded repo — respect it by default, opt back in
+ *      with `includeIgnored` (#514, #970, #976).
+ * `relDir` is repoDir-relative (trailing-slashed); `prefix` is repoDir's
+ * scan-root-relative path so the `includeIgnored` pattern is matched on the full
+ * scan-root-relative path. `defaults` is `defaultsOnlyIgnore()` and `repoIgnore`
+ * is `buildDefaultIgnore(repoDir)` (defaults + the repo's own `.gitignore`),
+ * both passed in so they're built once per repo level rather than per gitlink.
+ *
+ * The checks run in a fixed order: the default-ignore test wins first, then the
+ * "not ignored at all" fast path, and only then is the opt-in consulted.
+ *
+ * @param relDir - Gitlink directory, relative to the repo being scanned, with a
+ *   trailing slash.
+ * @param prefix - Scan-root-relative path of the repo being scanned; it is
+ *   prepended to `relDir` before matching against `includeIgnored`.
+ * @param defaults - Matcher for the built-in default ignores only.
+ * @param repoIgnore - Matcher for the defaults plus the repo's own `.gitignore`.
+ * @param includeIgnored - Matcher for the `codegraph.json` `includeIgnored`
+ *   opt-in patterns, or `null` when there is none; `null` can never opt a path
+ *   in, so a gitignored gitlink is then always skipped.
+ * @returns `true` when the gitlink must be skipped, `false` when it should be
+ *   indexed.
+ */
+function gitlinkEmbeddedRepoSkipped(
+  relDir: string,
+  prefix: string,
+  defaults: Ignore,
+  repoIgnore: Ignore,
+  includeIgnored: Ignore | null,
+): boolean {
+  if (defaults.ignores(relDir)) return true;        // default-ignored — never index, opt-in can't revive
+  if (!repoIgnore.ignores(relDir)) return false;    // not ignored at all — index as before (#1031/#1033)
+  // Gitignored by the repo's own rules — skip unless the project opted it in (a null `includeIgnored` yields `!undefined`, i.e. skip).
+  return !includeIgnored?.ignores(normalizePath(prefix + relDir));
+}
+
 /**
  * Standalone discovery of every embedded repo root under `rootDir` (relative,
  * trailing-slashed) — the untracked kind (#193) always, and the gitignored kind
@@ -554,13 +588,16 @@ export function discoverEmbeddedRepoRoots(rootDir: string): string[] {
         ['ls-files', '-z', '-s', '--recurse-submodules'],
         { cwd: repoAbs, encoding: 'utf-8', timeout: 30000, maxBuffer: 50 * 1024 * 1024, stdio: ['pipe', 'pipe', 'pipe'], windowsHide: true }
       );
+      const repoIgnore = buildDefaultIgnore(repoAbs);
       for (const entry of staged.split('\0')) {
         if (!entry || entry.slice(0, 6) !== '160000') continue;
         const tab = entry.indexOf('\t');
         if (tab === -1) continue;
         const rel = entry.slice(tab + 1);
         const relDir = rel.endsWith('/') ? rel : rel + '/';
-        if (defaults.ignores(relDir)) continue;
+        // A gitlink under a gitignored path is respected (not indexed) unless the
+        // project opted it in — same rule as the untracked-ignored kind (#1065).
+        if (gitlinkEmbeddedRepoSkipped(relDir, prefix, defaults, repoIgnore, includeIgnored)) continue;
         if (classifyGitDir(path.join(repoAbs, rel)) === 'embedded') candidates.push(relDir);
       }
     } catch { /* staged listing failed — other discovery still runs */ }
@@ -692,14 +729,21 @@ function collectGitFiles(repoDir: string, prefix: string, files: Set<string>, em
   // never sees it (it's tracked, not "other"). A gitlink with no checkout on disk
   // (an uninitialized submodule — empty dir, no `.git`) has nothing to index and
   // is left alone, as is a submodule worktree (a duplicate view, #945). (#1031, #1033)
-  for (const rel of gitlinkRels) {
-    const relDir = rel.endsWith('/') ? rel : rel + '/';
-    if (defaultsOnlyIgnore().ignores(relDir)) continue;
-    const childDir = path.join(repoDir, rel);
-    // 'embedded' = a real .git checkout on disk; 'worktree' and 'none' are skipped.
-    if (classifyGitDir(childDir) !== 'embedded') continue;
-    embeddedRoots?.add(normalizePath(prefix + relDir));
-    collectGitFiles(childDir, prefix + relDir, files, embeddedRoots, includeIgnored);
+  if (gitlinkRels.length > 0) {
+    const defaults = defaultsOnlyIgnore();
+    const repoIgnore = buildDefaultIgnore(repoDir);
+    for (const rel of gitlinkRels) {
+      const relDir = rel.endsWith('/') ? rel : rel + '/';
+      // A gitlink under a gitignored path is respected (not indexed) unless the
+      // project opted it in via `includeIgnored` — keep tracked gitlinks under
+      // the same scope rule as the untracked-ignored kind below (#1065).
+      if (gitlinkEmbeddedRepoSkipped(relDir, prefix, defaults, repoIgnore, includeIgnored)) continue;
+      const childDir = path.join(repoDir, rel);
+      // 'embedded' = a real .git checkout on disk; 'worktree' and 'none' are skipped.
+      if (classifyGitDir(childDir) !== 'embedded') continue;
+      embeddedRoots?.add(normalizePath(prefix + relDir));
+      collectGitFiles(childDir, prefix + relDir, files, embeddedRoots, includeIgnored);
+    }
   }
 
   // Embedded repos hidden by THIS repo's ignore rules (`/packages/` in a