Explorar el Código

feat(index): default-ignore dependency/build/cache dirs (#407) (#417)

node_modules and other dependency/build/cache dirs were indexed whenever a project lacked a .gitignore excluding them (common in non-git projects), flooding context/search with third-party symbols. Added a built-in default-ignore set spanning every supported language/framework (node_modules, vendor, dist, build, target, .venv, __pycache__, Pods, .next, etc.), applied UNIFORMLY in both the git and non-git enumeration paths — including to tracked files, since committing/vendoring a dependency dir doesn't make it project code. The only opt-in is an explicit .gitignore negation (e.g. !vendor/). First-party-prone names (packages, lib, app, bin, src, deps, env) are deliberately excluded from the list.

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Colby Mchenry hace 4 semanas
padre
commit
832f9063b1
Se han modificado 2 ficheros con 98 adiciones y 3 borrados
  1. 9 0
      CHANGELOG.md
  2. 89 3
      src/extraction/index.ts

+ 9 - 0
CHANGELOG.md

@@ -22,6 +22,15 @@ and adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
   reconciles anything that changed while it wasn't running (e.g. a `git pull` from
   the terminal), so the first query reflects the current code instead of a stale
   snapshot — rather than waiting for the next live edit.
+- **Dependency, build, and cache directories are now excluded by default** —
+  `node_modules`, `vendor`, `dist`, `build`, `target`, `.venv`, `__pycache__`,
+  `Pods`, `.next`, and the like across every supported language/framework — so
+  `context` and `search` reflect your code instead of third-party noise, even in a
+  project with no `.gitignore` (#407). The defaults apply uniformly, including to
+  committed files (vendoring a dependency into the repo doesn't make it project
+  code). To index one anyway, add a `.gitignore` negation (e.g. `!vendor/`).
+  First-party-prone names like `packages/`, `lib/`, `app/`, and `src/` are never on
+  the default list.
 
 ## [0.9.4] - 2026-05-24
 

+ 89 - 3
src/extraction/index.ts

@@ -99,6 +99,84 @@ export function hashContent(content: string): string {
  */
 const MAX_FILE_SIZE = 1024 * 1024;
 
+/**
+ * Directory names that are dependency, build, cache, or tooling output across the
+ * languages/frameworks CodeGraph supports — curated from the canonical
+ * github/gitignore templates. Excluded by default so the graph reflects your code,
+ * not third-party noise, without requiring a `.gitignore` (issue #407). The
+ * exclusion applies uniformly (git or not, tracked or not); the only opt-in is an
+ * explicit `.gitignore` negation (e.g. `!vendor/`). First-party-prone or generic
+ * names (`packages`, `lib`, `app`, `bin`, `src`, `deps`, `env`, `tmp`, `storage`,
+ * `Library`) are deliberately NOT listed, to avoid ever hiding real source.
+ *
+ * Only dirs that actually contain *indexable source* (or are enormous) earn a slot
+ * — IDE/state dirs like `.idea`/`.vs` are omitted because CodeGraph indexes only
+ * recognized source extensions, so they produce no symbols regardless.
+ */
+const DEFAULT_IGNORE_DIRS: ReadonlySet<string> = new Set([
+  // JS / TS — dependency directories
+  'node_modules', 'bower_components', 'jspm_packages', 'web_modules',
+  '.yarn', '.pnpm-store',
+  // JS / TS — framework & bundler build / cache / deploy output
+  '.next', '.nuxt', '.svelte-kit', '.turbo', '.vite', '.parcel-cache', '.angular',
+  '.docusaurus', 'storybook-static', '.vinxi', '.nitro', 'out-tsc',
+  '.vercel', '.netlify', '.wrangler',
+  // Build output (common across ecosystems)
+  'dist', 'build', 'out', '.output',
+  // Test / coverage
+  'coverage', '.nyc_output',
+  // Python
+  '__pycache__', '__pypackages__', '.venv', 'venv', '.pixi', '.pdm-build',
+  '.mypy_cache', '.pytest_cache', '.ruff_cache', '.tox', '.nox', '.hypothesis',
+  '.ipynb_checkpoints', '.eggs',
+  // Rust / JVM (Maven, Gradle, Scala)
+  'target', '.gradle',
+  // .NET
+  'obj',
+  // Vendored deps (Go, PHP/Composer, Ruby/Bundler)
+  'vendor',
+  // Swift / iOS
+  '.build', 'Pods', 'Carthage', 'DerivedData', '.swiftpm',
+  // Dart / Flutter
+  '.dart_tool', '.pub-cache',
+  // Native (Android NDK, C/C++ deps)
+  '.cxx', '.externalNativeBuild', 'vcpkg_installed',
+  // Scala tooling
+  '.bloop', '.metals',
+  // Lua / Luau (LuaRocks)
+  'lua_modules', '.luarocks',
+  // Delphi / RAD Studio IDE backups (duplicate .pas source — would double-count)
+  '__history', '__recovery',
+  // Generic cache
+  '.cache',
+]);
+
+/** Gitignore-style patterns for the `ignore` matcher: the dirs above plus a few globs. */
+const DEFAULT_IGNORE_PATTERNS: string[] = [
+  ...Array.from(DEFAULT_IGNORE_DIRS, (d) => `${d}/`),
+  '*.egg-info/',     // Python packaging metadata
+  'cmake-build-*/',  // CLion / CMake build trees
+  'bazel-*/',        // Bazel output symlink trees
+];
+
+/**
+ * An `ignore` matcher seeded with the built-in defaults, merged with the project's
+ * root .gitignore so a negation there (e.g. `!vendor/`) overrides a default. Shared
+ * by both enumeration paths so behavior is identical with or without git — and so
+ * the defaults apply to tracked files too (committing a dependency dir doesn't make
+ * it project code; the explicit `.gitignore` negation is the only opt-in).
+ */
+function buildDefaultIgnore(rootDir: string): Ignore {
+  const ig = ignore().add(DEFAULT_IGNORE_PATTERNS);
+  try {
+    const rootGitignore = path.join(rootDir, '.gitignore');
+    if (fs.existsSync(rootGitignore)) ig.add(fs.readFileSync(rootGitignore, 'utf-8'));
+  } catch {
+    // Unreadable root .gitignore — the built-in defaults still apply.
+  }
+  return ig;
+}
+
 /**
  * Collect git-visible files (tracked + untracked, .gitignore-respected) from the
  * git repository rooted at `repoDir`, adding each to `files` with `prefix`
@@ -183,7 +261,11 @@ function getGitVisibleFiles(rootDir: string): Set<string> | null {
 
     const files = new Set<string>();
     collectGitFiles(rootDir, '', files);
-    return files;
+    // Apply built-in default ignores uniformly — to tracked files too, since
+    // committing a dependency/build dir doesn't make it project code. A
+    // `.gitignore` negation (e.g. `!vendor/`) is the explicit opt-in. (issue #407)
+    const ig = buildDefaultIgnore(rootDir);
+    return new Set([...files].filter((f) => !ig.ignores(f)));
   } catch {
     return null;
   }
@@ -358,7 +440,9 @@ function scanDirectoryWalk(
     visitedDirs.add(realDir);
 
     // This directory's own .gitignore (if present) applies to everything below it.
-    const own = loadIgnore(dir);
+    // The root's .gitignore is already merged into the seeded base matcher (so a
+    // negation there can override a built-in default), so skip it here.
+    const own = dir === rootDir ? null : loadIgnore(dir);
     const active = own ? [...matchers, own] : matchers;
 
     let entries: fs.Dirent[];
@@ -411,7 +495,9 @@ function scanDirectoryWalk(
     }
   }
 
-  walk(rootDir, []);
+  // Seed a base matcher with the built-in default ignores (merged with the root
+  // .gitignore so a negation can override). Nested .gitignores still layer per-dir.
+  walk(rootDir, [{ dir: rootDir, ig: buildDefaultIgnore(rootDir) }]);
   return files;
 }