index.ts 81 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044
  1. /**
  2. * Extraction Orchestrator
  3. *
  4. * Coordinates file scanning, parsing, and database storage.
  5. */
  6. import * as fs from 'fs';
  7. import * as fsp from 'fs/promises';
  8. import * as path from 'path';
  9. import * as os from 'os';
  10. import * as crypto from 'crypto';
  11. import { execFileSync } from 'child_process';
  12. import {
  13. Language,
  14. FileRecord,
  15. ExtractionResult,
  16. ExtractionError,
  17. Edge,
  18. } from '../types';
  19. import { QueryBuilder } from '../db/queries';
  20. import { extractFromSource } from './tree-sitter';
  21. import { ParseWorkerPool, resolveParsePoolSize } from './parse-pool';
  22. import { detectLanguage, isSourceFile, isLanguageSupported, isFileLevelOnlyLanguage, initGrammars, loadGrammarsForLanguages } from './grammars';
  23. import { loadExtensionOverrides, loadIncludeIgnoredPatterns, loadExcludePatterns } from '../project-config';
  24. import { isCodeGraphDataDir } from '../directory';
  25. import { logDebug, logWarn } from '../errors';
  26. import { validatePathWithinRoot, normalizePath } from '../utils';
  27. import ignore, { Ignore } from 'ignore';
  28. import { detectFrameworks } from '../resolution/frameworks';
  29. import type { ResolutionContext } from '../resolution/types';
  /**
   * Size of each parallel file-read batch while indexing. Reads are I/O-bound,
   * so batching lets the I/O wait overlap with CPU-bound parse work.
   */
  const FILE_IO_BATCH_SIZE = 10;

  /**
   * Number of files the `sync()` reconcile handles before it cooperatively
   * yields to the event loop.
   *
   * The reconcile runs two O(files) loops of synchronous `fs` calls (existsSync
   * for removals, statSync for adds/mods). On a very large repo (~100k files) a
   * run that never yields wedges the main thread for minutes, which trips the
   * liveness watchdog (it SIGKILLs a process whose loop stops turning) and
   * blocks the first MCP tool call behind the catch-up gate (issue #905).
   * Yielding every N files keeps the socket, the watchdog heartbeat, and any
   * concurrent read query responsive for the duration of the reconcile.
   */
  const SYNC_RECONCILE_YIELD_INTERVAL = 1_000;

  // PARSER_RESET_INTERVAL moved to parse-worker.ts (runs in worker thread)

  /**
   * Upper bound, in milliseconds, on parsing one file in the worker thread.
   * Protects the indexing run from freezing when tree-sitter hangs or WASM runs
   * out of memory; the worker is restarted after a timeout.
   */
  const PARSE_TIMEOUT_MS = 10 * 1000;

  /**
   * How many files are parsed before the worker thread is recycled.
   *
   * WASM linear memory can grow but NEVER shrink (WebAssembly spec limitation),
   * so the only way to reclaim tree-sitter's WASM heap is to destroy the whole
   * V8 isolate: terminate the worker thread and spawn a fresh one. The interval
   * trades memory usage against the cost of reloading grammars.
   */
  const WORKER_RECYCLE_INTERVAL = 250;
  /**
   * Payload passed to the progress callback of an indexing operation.
   */
  export interface IndexProgress {
    /** Stage the operation is in when the update is emitted. */
    phase: 'scanning' | 'parsing' | 'storing' | 'resolving';
    /** Progress counter for the phase (unit presumably files — confirm against the emitter). */
    current: number;
    /** Value that `current` is reported against. */
    total: number;
    /** Optional file associated with this update. */
    currentFile?: string;
  }
  /**
   * Summary returned by an indexing operation.
   */
  export interface IndexResult {
    /** Overall outcome flag for the run. */
    success: boolean;
    /** File counters, split by how each file ended up. */
    filesIndexed: number;
    filesSkipped: number;
    filesErrored: number;
    /** Graph counters for the run. */
    nodesCreated: number;
    edgesCreated: number;
    /** Extraction errors collected during the run. */
    errors: ExtractionError[];
    /** Wall-clock duration of the run, in milliseconds. */
    durationMs: number;
  }
  /**
   * Summary returned by a sync operation.
   */
  export interface SyncResult {
    /** Number of files the sync looked at. */
    filesChecked: number;
    /** Per-category change counters. */
    filesAdded: number;
    filesModified: number;
    filesRemoved: number;
    /** Count of graph nodes updated by the sync. */
    nodesUpdated: number;
    /** Wall-clock duration of the sync, in milliseconds. */
    durationMs: number;
    /** Optional list of the paths that changed. */
    changedFilePaths?: string[];
  }
  /**
   * Compute the SHA-256 digest of `content`.
   *
   * @param content - Text to hash.
   * @returns The digest as a lowercase hex string.
   */
  export function hashContent(content: string): string {
    const hasher = crypto.createHash('sha256');
    hasher.update(content);
    return hasher.digest('hex');
  }
  /**
   * Size ceiling, in bytes, above which a file is skipped. Generated bundles,
   * minified JS, and vendored blobs blow the WASM heap and the worker-recycle
   * budget without yielding useful symbols. 1 MB covers essentially all
   * hand-written source.
   */
  const MAX_FILE_SIZE = 2 ** 20;
  /**
   * Names of directories that hold dependency, build, cache, or tooling output
   * across the languages/frameworks CodeGraph supports, curated from the
   * canonical github/gitignore templates.
   *
   * They are excluded by default so the graph reflects your code rather than
   * third-party noise, with no `.gitignore` required (issue #407). The exclusion
   * is uniform (git or not, tracked or not); the only opt-in is an explicit
   * `.gitignore` negation (e.g. `!vendor/`).
   *
   * First-party-prone or generic names (`packages`, `lib`, `app`, `bin`, `src`,
   * `deps`, `env`, `tmp`, `storage`, `Library`) are deliberately NOT listed, so
   * real source is never hidden. A directory only earns a slot when it actually
   * contains *indexable source* (or is enormous): IDE/state dirs such as
   * `.idea`/`.vs` are left out because CodeGraph indexes only recognized source
   * extensions, so they produce no symbols regardless.
   */
  const DEFAULT_IGNORE_DIRS: ReadonlySet<string> = new Set([
    // JS / TS: dependency directories
    'node_modules',
    'bower_components',
    'jspm_packages',
    'web_modules',
    '.yarn',
    '.pnpm-store',
    // JS / TS: framework & bundler build / cache / deploy output
    '.next',
    '.nuxt',
    '.svelte-kit',
    '.turbo',
    '.vite',
    '.parcel-cache',
    '.angular',
    '.docusaurus',
    'storybook-static',
    '.vinxi',
    '.nitro',
    'out-tsc',
    '.vercel',
    '.netlify',
    '.wrangler',
    // Build output shared by many ecosystems
    'dist',
    'build',
    'out',
    '.output',
    // Test / coverage
    'coverage',
    '.nyc_output',
    // Python
    '__pycache__',
    '__pypackages__',
    '.venv',
    'venv',
    '.pixi',
    '.pdm-build',
    '.mypy_cache',
    '.pytest_cache',
    '.ruff_cache',
    '.tox',
    '.nox',
    '.hypothesis',
    '.ipynb_checkpoints',
    '.eggs',
    // Rust / JVM (Maven, Gradle, Scala)
    'target',
    '.gradle',
    // .NET
    'obj',
    // Vendored deps (Go, PHP/Composer, Ruby/Bundler)
    'vendor',
    // Swift / iOS
    '.build',
    'Pods',
    'Carthage',
    'DerivedData',
    '.swiftpm',
    // Dart / Flutter
    '.dart_tool',
    '.pub-cache',
    // Native (Android NDK, C/C++ deps)
    '.cxx',
    '.externalNativeBuild',
    'vcpkg_installed',
    // Scala tooling
    '.bloop',
    '.metals',
    // Lua / Luau (LuaRocks)
    'lua_modules',
    '.luarocks',
    // Delphi / RAD Studio IDE backups (duplicate .pas source, would double-count)
    '__history',
    '__recovery',
    // Generic cache
    '.cache',
  ]);

  /**
   * The same exclusions in gitignore syntax for the `ignore` matcher: every
   * directory name above with a trailing slash, followed by a few globs.
   */
  const DEFAULT_IGNORE_PATTERNS: string[] = [
    ...[...DEFAULT_IGNORE_DIRS].map((dirName) => dirName + '/'),
    // Python packaging metadata
    '*.egg-info/',
    // CLion / CMake build trees
    'cmake-build-*/',
    // Bazel output symlink trees
    'bazel-*/',
  ];
  /**
   * Report whether `buf` is well-formed UTF-8.
   *
   * Uses a fatal `TextDecoder`, which throws on the first invalid byte sequence
   * instead of substituting a replacement character.
   *
   * @param buf - Raw bytes to validate.
   * @returns `true` when the strict decode succeeds, `false` when it throws.
   */
  function isValidUtf8(buf: Buffer): boolean {
    let decodable = true;
    try {
      const strictDecoder = new TextDecoder('utf-8', { fatal: true });
      strictDecoder.decode(buf);
    } catch {
      decodable = false;
    }
    return decodable;
  }
  /**
   * Read a `.gitignore` and return patterns that are safe to hand to the
   * `ignore` matcher. Never throws, even when the file is not real gitignore
   * text. Two failure modes, both seen in the wild (issue #682):
   *
   * - The file is not valid UTF-8, e.g. transparently encrypted in place by
   *   corporate DLP / endpoint-security software, leaving a UTF-16 header plus
   *   ciphertext. None of it is meaningful patterns, so the whole file is skipped.
   * - The file is text but a single line cannot be compiled to a regex by the
   *   `ignore` library (`\\[` and friends throw "Unterminated character class").
   *   Crucially the throw is LAZY (at match time, not `.add()`), so it would
   *   otherwise escape mid-scan. That one pattern is dropped; the rest are kept.
   *
   * Either way a warning that NAMES the file is logged (the reporter could not
   * tell which `.gitignore` was at fault) and indexing continues.
   *
   * A leading UTF-8 byte-order mark is removed before the patterns are used.
   * Git skips the BOM when it reads `.gitignore`; left in place it would be
   * glued onto the first pattern, which would then silently never match.
   *
   * @param giPath - Path of the `.gitignore` file to read.
   * @returns The usable pattern text, or `''` when there is nothing usable.
   */
  function readGitignorePatterns(giPath: string): string {
    let buf: Buffer;
    try {
      buf = fs.readFileSync(giPath);
    } catch {
      return ''; // unreadable (permissions / race): treat as absent
    }

    // A NUL byte never appears in real gitignore text, and a fatal UTF-8 decode
    // catches the rest. Such a file is not ignore patterns at all.
    if (buf.includes(0) || !isValidUtf8(buf)) {
      logWarn(
        'Ignoring a .gitignore that is not valid UTF-8 text — it may have been encrypted ' +
          'in place by endpoint-security software. Indexing continues without it.',
        { file: giPath },
      );
      return '';
    }

    const decoded = buf.toString('utf-8');
    // Drop a leading BOM (U+FEFF) so the first pattern is matched as written.
    const content = decoded.charCodeAt(0) === 0xfeff ? decoded.slice(1) : decoded;

    // Fast path: one `.ignores()` call forces the library to compile EVERY rule,
    // so if it does not throw, the whole file is safe to use verbatim.
    try {
      ignore().add(content).ignores('.codegraph-probe');
      return content;
    } catch {
      // Fall through: some line is uncompilable. Keep the good ones, drop the bad.
    }

    const kept: string[] = [];
    let dropped = 0;
    for (const line of content.split(/\r?\n/)) {
      try {
        // Probe each line in isolation so one bad pattern cannot poison the rest.
        ignore().add(line).ignores('.codegraph-probe');
        kept.push(line);
      } catch {
        dropped++;
      }
    }
    if (dropped > 0) {
      logWarn(
        `Skipped ${dropped} unparseable pattern(s) in a .gitignore; the rest are applied.`,
        { file: giPath },
      );
    }
    return kept.join('\n');
  }
  /**
   * Build an `ignore` matcher from the built-in defaults plus the project's root
   * `.gitignore`, so that a negation there (e.g. `!vendor/`) overrides a default.
   *
   * Both enumeration paths share it, which keeps behavior identical with or
   * without git and makes the defaults apply to tracked files too: committing a
   * dependency dir does not make it project code, and the explicit `.gitignore`
   * negation is the only opt-in.
   *
   * @param rootDir - Directory whose `.gitignore` (if present) is merged in.
   * @returns The combined matcher.
   */
  export function buildDefaultIgnore(rootDir: string): Ignore {
    const matcher = ignore().add(DEFAULT_IGNORE_PATTERNS);
    const gitignorePath = path.join(rootDir, '.gitignore');
    if (!fs.existsSync(gitignorePath)) {
      return matcher;
    }
    matcher.add(readGitignorePatterns(gitignorePath));
    return matcher;
  }
  /**
   * Build a matcher that holds ONLY the built-in defaults; no root `.gitignore`
   * is merged. For places where the parent repo's own ignore rules must NOT
   * apply: inside embedded child repos, whose gitignore semantics were already
   * enforced by their own `git ls-files` (#514).
   *
   * @returns A fresh defaults-only matcher.
   */
  function defaultsOnlyIgnore(): Ignore {
    const matcher = ignore();
    return matcher.add(DEFAULT_IGNORE_PATTERNS);
  }
  /**
   * Build the matcher for the project's `codegraph.json` `includeIgnored`
   * patterns: the explicit opt-in to index embedded git repos that live inside
   * gitignored directories (#622, #699).
   *
   * `null` means the project opted in nothing, which is the zero-config DEFAULT:
   * `.gitignore` is then fully respected and a gitignored directory (even one
   * holding nested repos) is never walked or indexed (#970, #976). The matcher
   * is built once per scan/sync/scope operation from the scan root and threaded
   * down. It is never global, so multi-project daemons stay isolated.
   *
   * @param rootDir - Scan root whose project config is consulted.
   * @returns The opt-in matcher, or `null` when no patterns are configured.
   */
  function loadIncludeIgnoredMatcher(rootDir: string): Ignore | null {
    const optInPatterns = loadIncludeIgnoredPatterns(rootDir);
    if (optInPatterns.length > 0) {
      return ignore().add(optInPatterns);
    }
    return null;
  }
  /**
   * Build the matcher for the project's `codegraph.json` `exclude` patterns:
   * paths to keep OUT of the index even when git-tracked, which `.gitignore`
   * cannot do (#999). It is the escape hatch for a committed vendor/theme/SDK
   * directory.
   *
   * Patterns are matched against project-root-relative paths, so they apply
   * uniformly across the whole workspace, including inside embedded repos
   * (excluding `static/` means gone everywhere). Built once per scan/sync/scope
   * operation from the scan root.
   *
   * @param rootDir - Scan root whose project config is consulted.
   * @returns The exclude matcher, or `null` when nothing is excluded (the
   *   zero-config default, so no overhead).
   */
  function loadExcludeMatcher(rootDir: string): Ignore | null {
    const excludePatterns = loadExcludePatterns(rootDir);
    if (excludePatterns.length > 0) {
      return ignore().add(excludePatterns);
    }
    return null;
  }
  /**
   * Recognize the "this entire directory" sentinel in `git ls-files --directory`
   * output.
   *
   * `--directory` collapses a wholly-untracked/ignored directory into one entry.
   * When the command's own cwd is such a directory (the indexed root is itself a
   * git-ignored subdir of an enclosing repo), git emits the literal `./`. That
   * is not a real nested path: the `ignore` matcher throws on it ("path should
   * be a `path.relative()`d string, but got "./""), which used to abort
   * `buildScopeIgnore` and so break the MCP daemon's watcher/auto-sync on
   * connect, and joining it back onto `repoDir` would only re-point at the cwd.
   * Every consumer of `--directory` output should drop such entries. (#936)
   *
   * @param entry - One entry from the listing.
   * @returns `true` for `'./'`, `'.'` and the empty string.
   */
  function isWholeCwdEntry(entry: string): boolean {
    switch (entry) {
      case '':
      case '.':
      case './':
        return true;
      default:
        return false;
    }
  }
  /**
   * List the gitignored DIRECTORIES of a repo in collapsed, trailing-slash form,
   * relative to `repoDir`.
   *
   * These are invisible to every other `git ls-files` / `git status` mode, and
   * in a multi-repo workspace they are exactly where the nested project repos
   * live: a super-repo `.gitignore`s its child repos to keep `git status` quiet,
   * which does not make them third-party code. (#514)
   *
   * @param repoDir - Working directory the git command runs in.
   * @returns The ignored directory entries; `[]` when the git command fails.
   */
  function listIgnoredDirs(repoDir: string): string[] {
    try {
      const listing = execFileSync(
        'git',
        ['ls-files', '-z', '-o', '-i', '--exclude-standard', '--directory'],
        {
          cwd: repoDir,
          encoding: 'utf-8',
          timeout: 30_000,
          maxBuffer: 50 * 1024 * 1024,
          stdio: ['pipe', 'pipe', 'pipe'],
          windowsHide: true,
        },
      );
      const ignoredDirs: string[] = [];
      for (const entry of listing.split('\0')) {
        // Keep directory entries only, minus the "whole cwd" sentinel.
        if (entry.endsWith('/') && !isWholeCwdEntry(entry)) {
          ignoredDirs.push(entry);
        }
      }
      return ignoredDirs;
    } catch {
      return [];
    }
  }
  /** Deepest directory level, below an ignored dir, that is searched for nested `.git` roots. */
  const EMBEDDED_REPO_SEARCH_DEPTH = 4;

  /** Cap on directories examined per search, so a huge ignored data dir can never stall a scan/sync. */
  const EMBEDDED_REPO_SEARCH_ENTRIES = 2_000;
  /**
   * Classify a directory's `.git` entry for embedded-repo discovery.
   *
   * - A `.git` **directory** is an embedded clone: distinct first-party code a
   *   super-repo merely hides from git; index it (#193, #514).
   * - A `.git` **file** is a pointer (`gitdir: …`). A git **worktree** points
   *   into the host repo's own `.git/worktrees/<name>`, so it is a second
   *   working view of a repo CodeGraph already indexes. Indexing it just
   *   duplicates the whole graph N times; skip it (#848). A **submodule
   *   worktree** points into `.git/modules/<module>/worktrees/<name>`, which is
   *   the same duplication, so skip it too (#945). A **submodule** checkout
   *   points into `.git/modules/<module>` (no `worktrees/` segment) and is
   *   distinct code, so index it as before.
   *
   * `<module>` may span several path segments: a submodule's name defaults to
   * its path (e.g. `libs/foo`), and nested submodules add further `modules/`
   * levels. The worktree check therefore accepts any number of segments between
   * `modules/` and `worktrees/`.
   *
   * @param absDir - Absolute path of the directory to inspect.
   * @returns `'embedded'` for a clone, a submodule, or an unreadable pointer;
   *   `'worktree'` for a worktree pointer; `'none'` when there is no `.git`
   *   entry here (or it is neither a file nor a directory).
   */
  function classifyGitDir(absDir: string): 'embedded' | 'worktree' | 'none' {
    const dotGit = path.join(absDir, '.git');
    let st: fs.Stats;
    try {
      st = fs.statSync(dotGit);
    } catch {
      return 'none';
    }
    if (st.isDirectory()) return 'embedded';
    if (!st.isFile()) return 'none';
    try {
      const gitdir = fs.readFileSync(dotGit, 'utf8').match(/^gitdir:\s*(.+)$/m)?.[1]?.trim();
      // A worktree's gitdir lives under some repo's `.git/worktrees/<name>`:
      // either the top-level repo's (`.git/worktrees/`) or, for a worktree of a
      // submodule, that submodule's gitdir (`.git/modules/<module>/worktrees/`).
      // `<module>` is matched with `.+` so multi-segment names and nested
      // submodules are covered. Both separators are accepted so a Windows-style
      // pointer is recognized too.
      if (gitdir && /(^|[\\/])\.git[\\/](modules[\\/].+[\\/])?worktrees[\\/]/.test(gitdir)) {
        return 'worktree';
      }
    } catch {
      // Unreadable `.git` pointer: fall back to the prior "index it" behavior.
    }
    return 'embedded';
  }
  /**
   * Locate git repositories at or below `absDir` with a shallow, bounded
   * breadth-first search.
   *
   * The search does not descend past a repo root it finds, because that repo's
   * contents belong to its own enumeration. Worktrees are skipped outright.
   * Default-ignored dirs are not entered (`node_modules` can contain `.git`
   * from npm git-dependencies, which never makes it project code), nor are
   * `.git` itself and CodeGraph data dirs. Depth and entry caps keep a huge
   * ignored tree from stalling the scan.
   *
   * @param absDir - Absolute directory where the search starts (inclusive).
   * @param relPrefix - Relative path reported for `absDir`. Child paths are
   *   formed by appending `<name>/` to it.
   * @returns Relative paths of the repo roots found, in discovery order.
   */
  function findNestedGitRepos(absDir: string, relPrefix: string): string[] {
    const repoRoots: string[] = [];
    const builtinIgnores = defaultsOnlyIgnore();
    // FIFO work list walked with a cursor; entries are appended, never removed.
    const pending: Array<{ abs: string; rel: string; depth: number }> = [
      { abs: absDir, rel: relPrefix, depth: 0 },
    ];
    let visited = 0;

    for (let cursor = 0; cursor < pending.length; cursor++) {
      const current = pending[cursor]!;
      visited += 1;
      if (visited > EMBEDDED_REPO_SEARCH_ENTRIES) {
        logDebug('Embedded-repo search entry cap hit — deeper repos (if any) not discovered', { under: relPrefix });
        break;
      }

      const kind = classifyGitDir(current.abs);
      if (kind === 'worktree') {
        // A git worktree duplicates an already-indexed repo (#848).
        continue;
      }
      if (kind === 'embedded') {
        // Record the root and stop here: its own git handles everything below.
        repoRoots.push(current.rel);
        continue;
      }
      if (current.depth >= EMBEDDED_REPO_SEARCH_DEPTH) continue;

      let children: fs.Dirent[];
      try {
        children = fs.readdirSync(current.abs, { withFileTypes: true });
      } catch {
        continue;
      }

      for (const child of children) {
        if (!child.isDirectory()) continue;
        const isInternalDir = child.name === '.git' || isCodeGraphDataDir(child.name);
        if (isInternalDir) continue;
        const childRel = `${current.rel}${child.name}/`;
        if (builtinIgnores.ignores(childRel)) continue;
        pending.push({
          abs: path.join(current.abs, child.name),
          rel: childRel,
          depth: current.depth + 1,
        });
      }
    }
    return repoRoots;
  }
  /**
   * Ignore matcher for a whole workspace.
   *
   * Ordinary paths are judged by the root's matcher (built-in defaults + root
   * `.gitignore`). Paths inside an EMBEDDED repo are judged by that repo's own
   * matcher (defaults + its root `.gitignore`): the parent's `.gitignore` hides
   * a child repo from git, not from the index (#514). A directory path (trailing
   * slash) that is an ANCESTOR of an embedded root is never ignored, so
   * directory-pruning callers (the Linux per-directory watcher) still descend
   * far enough to reach the embedded repos.
   *
   * This is the single source of truth for indexer and watcher scope; the two
   * must not diverge.
   */
  export class ScopeIgnore {
    /** Embedded repos, longest root first. */
    private embedded: Array<{ root: string; matcher: Ignore }>;
    /** Built-in default excludes, applied to full paths inside embedded repos. */
    private defaults: Ignore = defaultsOnlyIgnore();

    /**
     * @param rootMatcher - Matcher for paths outside every embedded repo.
     * @param embedded - Embedded repo roots with their own matchers. The array
     *   is copied, not mutated.
     * @param exclude - Project `codegraph.json` `exclude` patterns (#999),
     *   matched against the full root-relative path. Wins over everything else:
     *   an explicit user exclude applies even to tracked files and even inside
     *   embedded repos.
     */
    constructor(
      private rootMatcher: Ignore,
      embedded: Array<{ root: string; matcher: Ignore }>,
      private exclude: Ignore | null = null,
    ) {
      // Sort by descending root length so a path inside nested embedded repos
      // reaches the innermost matcher first.
      this.embedded = embedded.slice().sort((lhs, rhs) => rhs.root.length - lhs.root.length);
    }

    /**
     * Decide whether `rel` is out of scope.
     *
     * @param rel - Root-relative path; directories carry a trailing slash.
     * @returns `true` when the path should be ignored.
     */
    ignores(rel: string): boolean {
      // User `exclude` (#999) comes first and sees the full root-relative path:
      // it must drop git-TRACKED paths (which `.gitignore` cannot) and apply
      // everywhere, including ancestors of embedded repos.
      if (this.exclude?.ignores(rel)) return true;

      const owner = this.embedded.find(({ root }) => rel.startsWith(root));
      if (owner !== undefined) {
        const inner = rel.slice(owner.root.length);
        // An empty remainder means `rel` IS the embedded root: never ignored.
        // Otherwise built-in defaults apply to the FULL path uniformly (#407):
        // an embedded repo inside node_modules (an npm git-dependency) must stay
        // excluded even though its own rules would not ignore its files.
        return inner !== '' && (this.defaults.ignores(rel) || owner.matcher.ignores(inner));
      }

      // Never prune a directory that leads to an embedded repo.
      const isDirectory = rel.endsWith('/');
      if (isDirectory && this.embedded.some(({ root }) => root.startsWith(rel))) {
        return false;
      }
      return this.rootMatcher.ignores(rel);
    }
  }
  /**
   * Assemble the workspace-scope matcher for `rootDir`.
   *
   * @param rootDir - Workspace root.
   * @param embeddedRoots - Embedded repo roots, relative to `rootDir`, when the
   *   caller already knows them (the scanner discovers them during collection).
   *   Passing them skips rediscovery; when omitted they are discovered here
   *   (the watcher path).
   * @returns A `ScopeIgnore` combining the root matcher, one matcher per
   *   embedded root, and the project's `exclude` matcher.
   */
  export function buildScopeIgnore(rootDir: string, embeddedRoots?: Iterable<string>): ScopeIgnore {
    let roots: string[];
    if (embeddedRoots) {
      roots = Array.from(embeddedRoots);
    } else {
      roots = discoverEmbeddedRepoRoots(rootDir);
    }

    const rootMatcher = buildDefaultIgnore(rootDir);
    const embeddedMatchers = roots.map((root) => {
      const matcher = buildDefaultIgnore(path.join(rootDir, root));
      return { root, matcher };
    });
    const excludeMatcher = loadExcludeMatcher(rootDir);
    return new ScopeIgnore(rootMatcher, embeddedMatchers, excludeMatcher);
  }
  /**
   * Standalone discovery of every embedded repo root under `rootDir`.
   *
   * The untracked kind (#193) is always discovered. The gitignored kind (#514)
   * is discovered only for directories the project opted in via
   * `codegraph.json` `includeIgnored` (#622, #699); otherwise `.gitignore` is
   * respected and they are not discovered (#970, #976). Discovery recurses,
   * because an embedded repo can embed further repos.
   *
   * @param rootDir - Workspace root to search from.
   * @returns Paths relative to `rootDir`, trailing-slashed, each repo listed
   *   before the repos found inside it. `[]` for non-git roots: the filesystem
   *   walk handles nested repos there already.
   */
  export function discoverEmbeddedRepoRoots(rootDir: string): string[] {
    // Probe: a failing `rev-parse` means `rootDir` is not inside a git repo.
    try {
      execFileSync('git', ['rev-parse', '--git-dir'], {
        cwd: rootDir,
        encoding: 'utf-8',
        timeout: 5000,
        stdio: ['pipe', 'pipe', 'pipe'],
        windowsHide: true,
      });
    } catch {
      return [];
    }

    const roots: string[] = [];
    const builtinIgnores = defaultsOnlyIgnore();
    const optedIn = loadIncludeIgnoredMatcher(rootDir);

    // Repos nested in the UNTRACKED directories of `repoAbs`, relative to it.
    const untrackedRepos = (repoAbs: string): string[] => {
      const repos: string[] = [];
      try {
        const listing = execFileSync(
          'git',
          ['ls-files', '-z', '-o', '--exclude-standard', '--directory'],
          {
            cwd: repoAbs,
            encoding: 'utf-8',
            timeout: 30_000,
            maxBuffer: 50 * 1024 * 1024,
            stdio: ['pipe', 'pipe', 'pipe'],
            windowsHide: true,
          },
        );
        for (const entry of listing.split('\0')) {
          const isCandidateDir =
            entry.endsWith('/') && !isWholeCwdEntry(entry) && !builtinIgnores.ignores(entry);
          if (!isCandidateDir) continue;
          repos.push(...findNestedGitRepos(path.join(repoAbs, entry), entry));
        }
      } catch {
        /* untracked listing failed — ignored-side discovery still runs */
      }
      return repos;
    };

    // Record each repo found in `repoAbs`, then recurse into it.
    const walk = (repoAbs: string, prefix: string): void => {
      const children = untrackedRepos(repoAbs);
      children.push(...findIgnoredEmbeddedRepos(repoAbs, optedIn, prefix));
      for (const child of children) {
        const rootRel = normalizePath(prefix + child);
        roots.push(rootRel);
        walk(path.join(repoAbs, child), rootRel);
      }
    };

    walk(rootDir, '');
    return roots;
  }
  /**
   * Discover embedded repos hidden by `repoDir`'s OWN gitignore rules by
   * searching each gitignored directory for nested `.git` roots.
   *
   * OPT-IN ONLY. Walking into a gitignored directory contradicts what every
   * other tool (and CodeGraph's own `git ls-files` foundation) does:
   * `.gitignore` excludes. Without an opt-in via `codegraph.json`
   * `includeIgnored`, a gitignored dir, including a huge reference/data dir full
   * of nested clones, is left untouched (#970, #976). With the opt-in, the
   * super-repo-of-clones behavior is restored (#622, #699). Built-in default
   * excludes (`node_modules`, …) are always skipped.
   *
   * @param repoDir - Repository whose ignored directories are searched.
   * @param includeIgnored - Opt-in matcher, or `null` when nothing is opted in.
   * @param prefix - Scan-root-relative path of `repoDir`, so a pattern such as
   *   `services/` opts that whole subtree in at any recursion depth.
   * @returns Repo paths relative to `repoDir`, trailing-slashed; `[]` when
   *   `includeIgnored` is `null`.
   */
  function findIgnoredEmbeddedRepos(repoDir: string, includeIgnored: Ignore | null, prefix: string): string[] {
    if (!includeIgnored) return [];

    const builtinIgnores = defaultsOnlyIgnore();
    const found: string[] = [];
    for (const ignoredDir of listIgnoredDirs(repoDir)) {
      // Built-in excludes are checked first; the opt-in pattern is matched
      // against the scan-root-relative path.
      const optedIn =
        !builtinIgnores.ignores(ignoredDir) &&
        includeIgnored.ignores(normalizePath(prefix + ignoredDir));
      if (optedIn) {
        found.push(...findNestedGitRepos(path.join(repoDir, ignoredDir), ignoredDir));
      }
    }
    return found;
  }
  /**
   * Add every git-visible file (tracked + untracked, `.gitignore` respected) of
   * the repository rooted at `repoDir` to `files`. Each path gets `prefix`
   * prepended so it stays relative to the original scan root.
   *
   * Embedded git repositories — nested repos that are NOT submodules, e.g. the
   * independent clones of a CMake "super-repo" — are recursed into as their own
   * git boundary. The parent's `git ls-files` cannot look inside them: tracked
   * output omits them and untracked output shows only an opaque `subdir/` entry
   * (#193). GITIGNORED embedded repos do not even get that entry; they come from
   * `findIgnoredEmbeddedRepos` (#514) and only for directories the project opted
   * in through `codegraph.json` `includeIgnored` — by default `.gitignore` is
   * respected and they stay out (#970, #976).
   *
   * @param repoDir        Absolute path of the repo to list.
   * @param prefix         Scan-root-relative path of `repoDir` ('' for the root).
   * @param files          Output set; receives normalized scan-root-relative paths.
   * @param embeddedRoots  Optional output set; receives the root of every embedded
   *                       repo entered (however it was found) so callers can
   *                       exempt its files from the parent's gitignore rules.
   * @param includeIgnored Opt-in matcher threaded down from the scan root.
   * @throws Whatever `execFileSync` throws when a git command fails or times out.
   */
  function collectGitFiles(repoDir: string, prefix: string, files: Set<string>, embeddedRoots?: Set<string>, includeIgnored: Ignore | null = null): void {
    const gitOpts = {
      cwd: repoDir,
      encoding: 'utf-8' as const,
      timeout: 30000,
      maxBuffer: 50 * 1024 * 1024,
      stdio: ['pipe', 'pipe', 'pipe'] as ['pipe', 'pipe', 'pipe'],
      windowsHide: true,
    };

    // `-z` yields NUL-separated, unquoted paths so non-ASCII (e.g. CJK) names
    // survive verbatim; the default quoted/octal-escaped form never matches a
    // file on disk and would silently drop those files (#541).
    const listFiles = (...modeFlags: string[]): string[] =>
      execFileSync('git', ['ls-files', '-z', ...modeFlags], gitOpts)
        .split('\0')
        .filter((entry) => entry !== '');

    // Record an embedded repo root, then list that repo as its own boundary.
    const enterEmbeddedRepo = (rel: string): void => {
      embeddedRoots?.add(normalizePath(prefix + rel));
      collectGitFiles(path.join(repoDir, rel), prefix + rel, files, embeddedRoots, includeIgnored);
    };

    // 1. Tracked files. `--recurse-submodules` expands active submodules, which
    //    the index otherwise stores as a bare commit pointer (#147). That flag
    //    only works with -c/--cached and --stage, never with -o, hence the
    //    separate untracked pass below.
    for (const rel of listFiles('-c', '--recurse-submodules')) {
      files.add(normalizePath(prefix + rel));
    }

    // 2. Untracked files (submodules manage their own untracked state).
    for (const rel of listFiles('-o', '--exclude-standard')) {
      if (!rel.endsWith('/')) {
        files.add(normalizePath(prefix + rel));
        continue;
      }
      // A trailing-slash entry is a directory git refused to descend into —
      // normally an embedded repo. Enter it only when it really is one (a git
      // worktree surfaces the same way but is a duplicate view of an
      // already-indexed repo, #848) and it is not in a default-ignored location
      // (an embedded repo under node_modules is an npm git-dependency, not
      // project code). Anything else is skipped, exactly as git skips it.
      const isEmbeddedRepo = classifyGitDir(path.join(repoDir, rel)) === 'embedded';
      if (isEmbeddedRepo && !defaultsOnlyIgnore().ignores(rel)) {
        enterEmbeddedRepo(rel);
      }
    }

    // 3. Embedded repos hidden by THIS repo's ignore rules (e.g. `/packages/` in
    //    a super-repo .gitignore) appear in neither listing. They are entered
    //    only for directories opted in via `includeIgnored` (#622, #699), which
    //    `findIgnoredEmbeddedRepos` enforces; otherwise they stay hidden.
    for (const rel of findIgnoredEmbeddedRepos(repoDir, includeIgnored, prefix)) {
      enterEmbeddedRepo(rel);
    }
  }
  /**
   * List every file git can see under `rootDir` (tracked, plus untracked but not
   * ignored), honoring `.gitignore` at every level and descending into embedded
   * (nested, non-submodule) git repos.
   *
   * @param rootDir Directory to scan; returned paths are relative to it.
   * @returns The visible files, or `null` when git cannot be used (non-git
   *          project, failed command, or `rootDir` itself ignored by a parent
   *          repo) so the caller can fall back to a filesystem walk.
   */
  function getGitVisibleFiles(rootDir: string): Set<string> | null {
    const probeOpts = {
      cwd: rootDir,
      encoding: 'utf-8' as const,
      timeout: 5000,
      stdio: ['pipe', 'pipe', 'pipe'] as ['pipe', 'pipe', 'pipe'],
      windowsHide: true,
    };

    try {
      const topLevel = execFileSync('git', ['rev-parse', '--show-toplevel'], probeOpts).trim();
      const absoluteRoot = path.resolve(rootDir);

      // `rootDir` sits inside a larger repo: if that parent ignores it,
      // `git ls-files` would report nothing, so hand over to the filesystem walk.
      if (path.resolve(topLevel) !== absoluteRoot) {
        let ignoredByParent: boolean;
        try {
          // `git check-ignore -q` exits 0 when the path IS ignored, 1 when not.
          execFileSync('git', ['check-ignore', '-q', absoluteRoot], probeOpts);
          ignoredByParent = true;
        } catch {
          ignoredByParent = false;
        }
        if (ignoredByParent) return null;
      }

      const collected = new Set<string>();
      const embeddedRoots = new Set<string>();
      collectGitFiles(rootDir, '', collected, embeddedRoots, loadIncludeIgnoredMatcher(rootDir));

      // Built-in default ignores apply uniformly, tracked files included:
      // committing a dependency/build dir does not make it project code, and a
      // `.gitignore` negation (e.g. `!vendor/`) is the explicit opt-in (#407).
      // Files inside an EMBEDDED repo are judged by that repo's own rules — the
      // parent's .gitignore hides the child from git, not from the index (#514).
      const scopeIgnore = buildScopeIgnore(rootDir, embeddedRoots);
      const visible = new Set<string>();
      for (const file of collected) {
        if (!scopeIgnore.ignores(file)) visible.add(file);
      }
      return visible;
    } catch {
      return null;
    }
  }
  /**
   * Changed paths found by git-based change detection, grouped by the action the
   * indexer has to take for them.
   *
   * The detection entry point hands back `null` instead of a `GitChanges` when
   * git is unavailable (non-git project or command failure); that signals the
   * caller to fall back to a full filesystem scan.
   */
  interface GitChanges {
    /** Status `M`, `MM`, `AM` — files to re-hash and re-index. */
    modified: string[];
    /** Status `??` — new untracked files to index. */
    added: string[];
    /** Status `D` — files to remove from the DB. */
    deleted: string[];
  }
  /**
   * Detect changed files through `git status` rather than re-scanning every file.
   *
   * Embedded repos are followed so a multi-repo workspace syncs without a full
   * rescan: `git status` is run inside each one. The untracked kind (#193 — the
   * parent collapses them to an opaque `?? subdir/` entry) is always followed;
   * the gitignored kind (#514 — invisible to the parent's status) only for
   * directories opted in via `codegraph.json` `includeIgnored` (#622, #699). A
   * gitignored dir is otherwise left alone, matching the full-index scan (#970,
   * #976).
   *
   * Known blind spot: deleting an ENTIRE embedded repo directory. The child
   * status that would list those deletions disappears with the directory; a full
   * `codegraph index` reconciles it.
   *
   * @param rootDir Project root to inspect.
   * @returns The detected changes, or `null` on any failure so the caller falls
   *          back to a full scan.
   */
  function getGitChangedFiles(rootDir: string): GitChanges | null {
    const changes: GitChanges = { modified: [], added: [], deleted: [] };
    try {
      // Custom extension → language overrides from the project's codegraph.json,
      // so change detection recognizes the same custom-extension files as the
      // full index.
      const overrides = loadExtensionOverrides(rootDir);
      const includeIgnored = loadIncludeIgnoredMatcher(rootDir);
      const exclude = loadExcludeMatcher(rootDir);
      collectGitStatus(rootDir, '', changes, overrides, includeIgnored, exclude);
    } catch {
      return null;
    }
    return changes;
  }
  /**
   * Run `git status` in the repo rooted at `repoDir` and append its changed
   * source files to `out`, then recurse into embedded repos.
   *
   * @param repoDir        Absolute path of the repo to inspect.
   * @param prefix         Scan-root-relative path of `repoDir` ('' for the root);
   *                       prepended to every reported path.
   * @param out            Accumulator receiving modified / added / deleted paths.
   * @param overrides      Custom extension → language map used by `isSourceFile`.
   * @param includeIgnored Opt-in matcher for gitignored directories, or `null`.
   * @param exclude        Project-root-relative `exclude` matcher, or `null`.
   * @throws Whatever `execFileSync` throws when `git status` fails or times out.
   */
  function collectGitStatus(repoDir: string, prefix: string, out: GitChanges, overrides?: Record<string, Language>, includeIgnored: Ignore | null = null, exclude: Ignore | null = null): void {
    // `-z` gives NUL-terminated, UNQUOTED records. Without it git C-quotes any
    // path containing spaces or "unusual" characters (non-ASCII under the
    // core.quotePath default), and the quoted form never matches a real file —
    // the same failure mode `collectGitFiles` avoids with `ls-files -z`.
    // `--no-renames` guarantees every record is a single `XY path` entry (the
    // two-path rename form never appears), so splitting on NUL is sufficient.
    const output = execFileSync(
      'git',
      ['status', '--porcelain', '-z', '--no-renames'],
      { cwd: repoDir, encoding: 'utf-8', timeout: 10000, maxBuffer: 50 * 1024 * 1024, stdio: ['pipe', 'pipe', 'pipe'], windowsHide: true }
    );
    // This repo's own ignore rules — built-in defaults (#407) plus its .gitignore.
    // Change detection must exclude the SAME files the full index does, but git
    // status hides neither: it ignores nothing for *tracked* paths, and the
    // built-in defaults aren't gitignore at all. Without this filter a committed
    // vendor/ dir, or a tracked file under a .gitignored dir, surfaces here as a
    // change — so `codegraph status` (which reads getChangedFiles) reports a
    // pending edit the full index never tracks and `sync` never clears. Matching
    // repo-relative `rel` at each recursion level means every embedded repo is
    // judged by ITS OWN rules, never the parent's. (#766)
    const ig = buildDefaultIgnore(repoDir);
    const defaults = defaultsOnlyIgnore();
    const untrackedDirs: string[] = [];
    for (const entry of output.split('\0')) {
      if (entry.length < 4) continue; // Minimum: "XY file"
      const statusCode = entry.substring(0, 2);
      const rel = normalizePath(entry.substring(3));
      // Untracked directory entries (trailing slash) may hide an embedded repo —
      // collect for the recursion below instead of treating as a file.
      if (statusCode === '??' && rel.endsWith('/')) {
        untrackedDirs.push(rel);
        continue;
      }
      const filePath = normalizePath(prefix + rel);
      if (!isSourceFile(filePath, overrides)) continue;
      if (statusCode.includes('D')) {
        // Deletions stay unfiltered: getChangedFiles acts on one only when the
        // path is already tracked in the DB, where removal is always correct — and
        // that lets a newly-excluded dir's stale rows clean themselves up. (#766)
        out.deleted.push(filePath);
        continue;
      }
      // Added (`??`) / modified files inside an excluded dir must not enter the
      // index — match against the repo-relative path, same as the full scan. (#766)
      if (ig.ignores(rel)) continue;
      // User `codegraph.json` `exclude` (#999) is project-root-relative, so it's
      // matched against the full path — sync must not re-add a tracked file the
      // full index now keeps out. Deletions above stay unfiltered so a file that
      // WAS indexed before an exclude was added still cleans itself out.
      if (exclude && exclude.ignores(filePath)) continue;
      if (statusCode === '??') {
        out.added.push(filePath);
      } else {
        // M, MM, AM, A (staged), etc. — treat as modified
        out.modified.push(filePath);
      }
    }
    // Recurse embedded repos found under untracked dirs (at the dir itself or
    // nested deeper). Default-ignored locations are never entered — the full
    // scan (`collectGitFiles`) refuses them too, since an embedded repo inside
    // e.g. node_modules is an npm git-dependency, not project code — so sync
    // cannot report files the full index would never contain.
    for (const rel of untrackedDirs) {
      if (defaults.ignores(rel)) continue;
      for (const repoRel of findNestedGitRepos(path.join(repoDir, rel), rel)) {
        if (defaults.ignores(normalizePath(repoRel))) continue;
        collectGitStatus(path.join(repoDir, repoRel), prefix + repoRel, out, overrides, includeIgnored, exclude);
      }
    }
    // Gitignored dirs are walked only for the directories the project opted in
    // via `includeIgnored`; by default `.gitignore` is respected and they are
    // left alone (#970, #976), mirroring the full-index scan.
    for (const rel of findIgnoredEmbeddedRepos(repoDir, includeIgnored, prefix)) {
      collectGitStatus(path.join(repoDir, rel), prefix + rel, out, overrides, includeIgnored, exclude);
    }
  }
  /**
   * Collect the source files under `rootDir`.
   *
   * Inside a git repo the candidate list comes from git (which honors
   * `.gitignore` at every level) and is then narrowed to files with a supported
   * source extension. Outside git, a filesystem walk that parses `.gitignore`
   * itself is used instead.
   *
   * @param rootDir    Directory to scan.
   * @param onProgress Optional callback invoked once per accepted file with the
   *                   running count and the file's path.
   * @returns Paths of the source files found.
   */
  export function scanDirectory(
    rootDir: string,
    onProgress?: (current: number, file: string) => void
  ): string[] {
    // Custom extension → language overrides from the project's codegraph.json.
    const overrides = loadExtensionOverrides(rootDir);

    // Fast path: ask git for everything it can see.
    const visible = getGitVisibleFiles(rootDir);
    if (!visible) {
      // Not usable as a git project — walk the filesystem instead.
      return scanDirectoryWalk(rootDir, onProgress);
    }

    const sources: string[] = [];
    for (const candidate of visible) {
      if (!isSourceFile(candidate, overrides)) continue;
      sources.push(candidate);
      onProgress?.(sources.length, candidate);
    }
    return sources;
  }
  /**
   * Promise-returning counterpart of `scanDirectory`. While filtering the
   * git-visible files it periodically yields to the event loop so worker threads
   * get a chance to receive and render progress messages.
   *
   * @param rootDir    Directory to scan.
   * @param onProgress Optional callback invoked once per accepted file with the
   *                   running count and the file's path.
   * @returns Paths of the source files found.
   */
  export async function scanDirectoryAsync(
    rootDir: string,
    onProgress?: (current: number, file: string) => void
  ): Promise<string[]> {
    // Custom extension → language overrides from the project's codegraph.json.
    const overrides = loadExtensionOverrides(rootDir);

    const visible = getGitVisibleFiles(rootDir);
    if (!visible) {
      return scanDirectoryWalk(rootDir, onProgress);
    }

    const sources: string[] = [];
    for (const candidate of visible) {
      if (!isSourceFile(candidate, overrides)) continue;
      sources.push(candidate);
      onProgress?.(sources.length, candidate);
      // Hand control back every 100 accepted files so progress can be rendered.
      if (sources.length % 100 === 0) {
        await new Promise<void>((resume) => setImmediate(resume));
      }
    }
    return sources;
  }
  /**
   * Filesystem walk fallback for non-git projects.
   *
   * Walks `rootDir` recursively, following symlinks (with cycle protection via
   * resolved real paths), and collects files that pass `isSourceFile` and are
   * not excluded by the built-in default ignores, any `.gitignore` on the way
   * down, or the project's `codegraph.json` `exclude` patterns.
   *
   * @param rootDir    Directory to walk.
   * @param onProgress Optional callback invoked once per accepted file with the
   *                   running count and the file's root-relative path.
   * @returns Root-relative, normalized paths of the source files found.
   */
  function scanDirectoryWalk(
    rootDir: string,
    onProgress?: (current: number, file: string) => void
  ): string[] {
    const files: string[] = [];
    const visitedDirs = new Set<string>();
    // Custom extension → language overrides from the project's codegraph.json.
    const overrides = loadExtensionOverrides(rootDir);

    // A .gitignore matcher scoped to the directory that declared it. Patterns in
    // a nested .gitignore are relative to that directory, so we keep the dir
    // alongside the matcher and test paths relative to it — mirroring how git
    // applies .gitignore files at every level.
    interface ScopedIgnore {
      dir: string;
      ig: Ignore;
    }

    const loadIgnore = (dir: string): ScopedIgnore | null => {
      const giPath = path.join(dir, '.gitignore');
      if (!fs.existsSync(giPath)) return null;
      // readGitignorePatterns is defensive: a non-UTF-8 (DLP-encrypted) or
      // uncompilable .gitignore is skipped/filtered with a warning, never thrown
      // (issue #682) — so the per-file `.ignores()` calls below can't crash.
      const patterns = readGitignorePatterns(giPath);
      return patterns ? { dir, ig: ignore().add(patterns) } : null;
    };

    // Matches a relative path whose FIRST SEGMENT consists only of dots (`..`,
    // `../x`, `...`). `..` means the path lies outside the matcher's directory.
    // A plain prefix test (`startsWith('..')`) would also catch ordinary names
    // such as `..cache/`, making every ignore rule silently skip them.
    // NOTE(review): longer all-dot names (`...`) stay skipped as before; the
    // `ignore` matcher is believed to reject such paths — confirm before relaxing.
    const leadingDotsSegment = /^\.{2,}(?:\/|$)/;

    const isIgnored = (fullPath: string, isDir: boolean, matchers: ScopedIgnore[]): boolean => {
      for (const { dir, ig } of matchers) {
        let rel = normalizePath(path.relative(dir, fullPath));
        if (!rel || leadingDotsSegment.test(rel)) continue; // not under this matcher's dir
        if (isDir) rel += '/'; // dir-only rules (e.g. `build/`) only match with the slash
        if (ig.ignores(rel)) return true;
      }
      return false;
    };

    const walk = (dir: string, matchers: ScopedIgnore[]): void => {
      let realDir: string;
      try {
        realDir = fs.realpathSync(dir);
      } catch {
        logDebug('Skipping unresolvable directory', { dir });
        return;
      }
      if (visitedDirs.has(realDir)) {
        logDebug('Skipping already-visited directory (symlink cycle)', { dir, realDir });
        return;
      }
      visitedDirs.add(realDir);

      // This directory's own .gitignore (if present) applies to everything below it.
      // The root's .gitignore is already merged into the seeded base matcher (so a
      // negation there can override a built-in default), so skip it here.
      const own = dir === rootDir ? null : loadIgnore(dir);
      const active = own ? [...matchers, own] : matchers;

      let entries: fs.Dirent[];
      try {
        entries = fs.readdirSync(dir, { withFileTypes: true });
      } catch (error) {
        logDebug('Skipping unreadable directory', { dir, error: String(error) });
        return;
      }

      for (const entry of entries) {
        // Never descend into git internals or any CodeGraph data directory
        // (the active one or a sibling another environment created — #636).
        if (entry.name === '.git' || isCodeGraphDataDir(entry.name)) continue;
        const fullPath = path.join(dir, entry.name);

        // Classify the entry. A symlink is classified by its TARGET; only that
        // stat is guarded, so an error raised deeper in the walk or by
        // `onProgress` is no longer mislabelled as a broken symlink and dropped.
        let isDir = entry.isDirectory();
        let isFile = entry.isFile();
        if (entry.isSymbolicLink()) {
          try {
            const stat = fs.statSync(fs.realpathSync(fullPath));
            isDir = stat.isDirectory();
            isFile = stat.isFile();
          } catch {
            logDebug('Skipping broken symlink', { path: fullPath });
            continue;
          }
        }

        if (isDir) {
          if (!isIgnored(fullPath, true, active)) {
            walk(fullPath, active);
          }
        } else if (isFile) {
          const relativePath = normalizePath(path.relative(rootDir, fullPath));
          if (!isIgnored(fullPath, false, active) && isSourceFile(relativePath, overrides)) {
            files.push(relativePath);
            onProgress?.(files.length, relativePath);
          }
        }
      }
    };

    // Seed a base matcher with the built-in default ignores (merged with the root
    // .gitignore so a negation can override). Nested .gitignores still layer per-dir.
    const baseMatchers: ScopedIgnore[] = [{ dir: rootDir, ig: buildDefaultIgnore(rootDir) }];
    // Project `codegraph.json` `exclude` patterns (#999), rooted at the project so
    // `isIgnored` matches them against root-relative paths — same coverage the
    // git path gets via ScopeIgnore, for non-git projects.
    const exclude = loadExcludeMatcher(rootDir);
    if (exclude) baseMatchers.push({ dir: rootDir, ig: exclude });

    walk(rootDir, baseMatchers);
    return files;
  }
  916. /**
  917. * Extraction orchestrator
  918. */
  919. export class ExtractionOrchestrator {
  920. private rootDir: string;
  921. private queries: QueryBuilder;
  922. /**
  923. * Names of frameworks detected for this project, populated by indexAll().
  924. * Passed to extractFromSource so framework-specific extractors (route nodes,
  925. * middleware, etc.) run after the tree-sitter pass. Cleared if detection
  926. * hasn't run yet so single-file re-index paths can detect on the spot.
  927. */
  928. private detectedFrameworkNames: string[] | null = null;
  /**
   * @param rootDir Project root; detection reads files relative to it and
   *                reports it as the project root.
   * @param queries Query builder kept on the instance for later use.
   */
  constructor(rootDir: string, queries: QueryBuilder) {
    this.queries = queries;
    this.rootDir = rootDir;
  }
  /**
   * Create the filesystem-backed ResolutionContext used for framework detection.
   *
   * The database is still empty at this point, so every graph-query method
   * (getNodesByName and friends) answers with an empty list. That is sufficient:
   * detect() relies only on readFile, fileExists and getAllFiles.
   *
   * @param files The scanned file list exposed through `getAllFiles`.
   */
  private buildDetectionContext(files: string[]): ResolutionContext {
    const projectRoot = this.rootDir;

    // Graph lookups have nothing to return yet; each call yields a fresh empty list.
    const nothing = (): never[] => [];

    const fileExists = (relativePath: string): boolean => {
      const absolute = validatePathWithinRoot(projectRoot, relativePath);
      if (!absolute) return false;
      try {
        return fs.existsSync(absolute);
      } catch {
        return false;
      }
    };

    const readFile = (relativePath: string): string | null => {
      const absolute = validatePathWithinRoot(projectRoot, relativePath);
      if (!absolute) return null;
      try {
        return fs.readFileSync(absolute, 'utf-8');
      } catch {
        return null;
      }
    };

    // Monorepo support: some framework detect()s probe subpackage manifests
    // (e.g. fabric-view inspecting packages/<sub>/package.json when the root
    // manifest only declares a workspace). Same shape as the resolver context.
    const listDirectories = (relativePath: string): string[] => {
      const isRoot = relativePath === '.' || relativePath === '';
      const target = isRoot ? projectRoot : path.join(projectRoot, relativePath);
      try {
        const names: string[] = [];
        for (const entry of fs.readdirSync(target, { withFileTypes: true })) {
          if (entry.isDirectory()) names.push(entry.name);
        }
        return names;
      } catch {
        return [];
      }
    };

    return {
      getNodesInFile: nothing,
      getNodesByName: nothing,
      getNodesByQualifiedName: nothing,
      getNodesByKind: nothing,
      getNodesByLowerName: nothing,
      getImportMappings: nothing,
      getAllFiles: () => files,
      getProjectRoot: () => projectRoot,
      fileExists,
      readFile,
      listDirectories,
    };
  }
  /**
   * Return the names of the frameworks detected for this project, running
   * detection on first use. The result is cached on the orchestrator, so later
   * calls in the same run neither re-scan nor re-detect.
   *
   * @param files Already-scanned file list; when omitted, a fresh
   *              `scanDirectory` of the project root supplies it.
   * @returns Names of the detected frameworks.
   */
  private ensureDetectedFrameworks(files?: string[]): string[] {
    const cached = this.detectedFrameworkNames;
    if (cached !== null) return cached;

    const context = this.buildDetectionContext(files ?? scanDirectory(this.rootDir));
    const names = detectFrameworks(context).map((detected) => detected.name);
    this.detectedFrameworkNames = names;
    return names;
  }
  1000. /**
  1001. * Index all files in the project
  1002. */
  1003. async indexAll(
  1004. onProgress?: (progress: IndexProgress) => void,
  1005. signal?: AbortSignal,
  1006. verbose?: boolean
  1007. ): Promise<IndexResult> {
  1008. await initGrammars();
  1009. const startTime = Date.now();
  1010. const errors: ExtractionError[] = [];
  1011. let filesIndexed = 0;
  1012. let filesSkipped = 0;
  1013. let filesErrored = 0;
  1014. let totalNodes = 0;
  1015. let totalEdges = 0;
  1016. // Custom extension → language overrides from the project's codegraph.json.
  1017. // Threaded into language detection so custom-extension files load the right
  1018. // grammar and store under the mapped language.
  1019. const overrides = loadExtensionOverrides(this.rootDir);
  1020. const log = verbose
  1021. ? (msg: string) => { console.log(`[worker] ${msg}`); }
  1022. : (_msg: string) => {};
  1023. // Phase 1: Scan for files
  1024. onProgress?.({
  1025. phase: 'scanning',
  1026. current: 0,
  1027. total: 0,
  1028. });
  1029. const files = await scanDirectoryAsync(this.rootDir, (current, file) => {
  1030. onProgress?.({
  1031. phase: 'scanning',
  1032. current,
  1033. total: 0,
  1034. currentFile: file,
  1035. });
  1036. });
  1037. // Detect frameworks once per indexAll run using the scanned file list.
  1038. // Names are passed to each parse call so framework-specific extractors
  1039. // (route nodes, middleware, etc.) run after the tree-sitter pass.
  1040. // Framework detection is reset each run so adding e.g. requirements.txt
  1041. // between runs is picked up without restarting the process.
  1042. this.detectedFrameworkNames = null;
  1043. const frameworkNames = this.ensureDetectedFrameworks(files);
  1044. if (signal?.aborted) {
  1045. return {
  1046. success: false,
  1047. filesIndexed: 0,
  1048. filesSkipped: 0,
  1049. filesErrored: 0,
  1050. nodesCreated: 0,
  1051. edgesCreated: 0,
  1052. errors: [{ message: 'Aborted', severity: 'error' }],
  1053. durationMs: Date.now() - startTime,
  1054. };
  1055. }
  1056. // Phase 2: Parse files in a worker thread (keeps main thread unblocked for UI)
  1057. const total = files.length;
  1058. let processed = 0;
  1059. // Emit parsing phase immediately so the progress bar appears during worker setup.
  1060. // The yield lets the shimmer worker flush the phase transition to stdout before
  1061. // the main thread starts synchronous grammar detection work.
  1062. onProgress?.({
  1063. phase: 'parsing',
  1064. current: 0,
  1065. total,
  1066. });
  1067. await new Promise(resolve => setImmediate(resolve));
  1068. // Detect needed languages and load grammars in the parse worker
  1069. const neededLanguages = [...new Set(files.map((f) => detectLanguage(f, undefined, overrides)))];
  1070. // .h files default to 'c' but may be C++ — ensure cpp grammar is loaded when c is needed
  1071. if (neededLanguages.includes('c') && !neededLanguages.includes('cpp')) {
  1072. neededLanguages.push('cpp');
  1073. }
  1074. // Parse files on a pool of worker threads (keeps the main thread free for UI
  1075. // and uses every core). Falls back to in-process parsing when the compiled
  1076. // worker is unavailable (e.g. running from source in tests).
  1077. const parseWorkerPath = path.join(__dirname, 'parse-worker.js');
  1078. const useWorker = fs.existsSync(parseWorkerPath);
  1079. let pool: ParseWorkerPool | null = null;
  1080. if (useWorker) {
  1081. // CODEGRAPH_PARSE_WORKERS: explicit worker count; 1 = the old single-worker
  1082. // behaviour (the conservative rollback). Unset → clamp(cores-1, 1, 8).
  1083. const poolSize = resolveParsePoolSize(process.env.CODEGRAPH_PARSE_WORKERS, os.cpus().length);
  1084. pool = new ParseWorkerPool({
  1085. languages: neededLanguages,
  1086. size: poolSize,
  1087. workerScriptPath: parseWorkerPath,
  1088. recycleInterval: WORKER_RECYCLE_INTERVAL,
  1089. parseTimeoutMs: PARSE_TIMEOUT_MS,
  1090. log,
  1091. });
  1092. log(`Parse worker pool: ${poolSize} worker(s)`);
  1093. } else {
  1094. // In-process fallback: load grammars locally and parse on the main thread.
  1095. await loadGrammarsForLanguages(neededLanguages);
  1096. }
  1097. /**
  1098. * Parse one file: on the pool when available (the promise REJECTS on a worker
  1099. * crash/timeout — the caller records it and the retry pass re-attempts), or
  1100. * in-process synchronously as the no-worker fallback. The language is resolved
  1101. * here on the main thread, where the codegraph.json overrides are loaded.
  1102. */
  1103. const parseFile = (filePath: string, content: string): Promise<ExtractionResult> => {
  1104. const language = detectLanguage(filePath, content, overrides);
  1105. if (!pool) return Promise.resolve(extractFromSource(filePath, content, language, frameworkNames));
  1106. return pool.requestParse({ filePath, content, language, frameworkNames });
  1107. };
  1108. // --- Bounded rolling-window dispatch, ordered commit ---
  1109. // Reads stay batched/parallel; parses run concurrently across the pool; the
  1110. // SQLite store stays on the main thread (it isn't thread-safe). Crucially we
  1111. // COMMIT results in original file order, not parse-completion order: the
  1112. // resolution phase (run after indexing) resolves an ambiguous reference to one
  1113. // of several same-named candidates by the nodes' DB insertion order, so a
  1114. // stable commit order keeps the resulting graph deterministic — byte-identical
  1115. // to the single-worker path — instead of drifting with parse timing. The
  1116. // `completed` buffer holds at most ~windowSize out-of-order results, so memory
  1117. // stays bounded.
  1118. const windowSize = pool ? Math.max(4, pool.size * 2) : 1;
  1119. const inFlight = new Set<Promise<void>>();
  1120. const completed = new Map<number,
  1121. | { ok: true; filePath: string; content: string; stats: fs.Stats; result: ExtractionResult }
  1122. | { ok: false; filePath: string; err: unknown }>();
  1123. let nextSeq = 0; // file-order sequence assigned at dispatch
  1124. let nextToStore = 0; // cursor: next sequence to commit
  1125. let aborted = false;
  1126. const storeResult = (filePath: string, content: string, stats: fs.Stats, result: ExtractionResult): void => {
  1127. processed++;
  1128. // Store in database on main thread (SQLite is not thread-safe)
  1129. if (result.nodes.length > 0 || result.errors.length === 0) {
  1130. const language = detectLanguage(filePath, content, overrides);
  1131. this.storeExtractionResult(filePath, content, language, stats, result);
  1132. }
  1133. if (result.errors.length > 0) {
  1134. for (const err of result.errors) {
  1135. if (!err.filePath) err.filePath = filePath;
  1136. }
  1137. errors.push(...result.errors);
  1138. }
  1139. if (result.nodes.length > 0) {
  1140. filesIndexed++;
  1141. totalNodes += result.nodes.length;
  1142. totalEdges += result.edges.length;
  1143. } else if (result.errors.some((e) => e.severity === 'error')) {
  1144. filesErrored++;
  1145. } else {
  1146. // Files with no symbols but no errors (yaml, twig, properties) are
  1147. // tracked at the file level — count them as indexed so the CLI doesn't
  1148. // misleadingly report "No files found to index".
  1149. const lang = detectLanguage(filePath, content, overrides);
  1150. if (isFileLevelOnlyLanguage(lang)) {
  1151. filesIndexed++;
  1152. } else {
  1153. filesSkipped++;
  1154. }
  1155. }
  1156. onProgress?.({ phase: 'parsing', current: processed, total, currentFile: filePath });
  1157. };
  1158. const recordParseFailure = (filePath: string, err: unknown): void => {
  1159. processed++;
  1160. filesErrored++;
  1161. errors.push({
  1162. message: err instanceof Error ? err.message : String(err),
  1163. filePath,
  1164. severity: 'error',
  1165. code: 'parse_error',
  1166. });
  1167. onProgress?.({ phase: 'parsing', current: processed, total });
  1168. };
  1169. // Commit buffered parses to the DB in file order, advancing the cursor over
  1170. // contiguous completed results. Runs after each parse settles (and once more
  1171. // after the drain). storeResult / recordParseFailure run here single-threaded,
  1172. // so shared counters and SQLite writes never race despite parallel parsing.
  1173. const flushOrdered = (): void => {
  1174. if (aborted) return;
  1175. while (completed.has(nextToStore)) {
  1176. const item = completed.get(nextToStore)!;
  1177. completed.delete(nextToStore);
  1178. nextToStore++;
  1179. if (item.ok) storeResult(item.filePath, item.content, item.stats, item.result);
  1180. else recordParseFailure(item.filePath, item.err);
  1181. }
  1182. };
  1183. // Dispatch one file's parse (parses run concurrently across the pool), tagged
  1184. // with its file-order sequence so flushOrdered commits results in order. The
  1185. // backpressure below bounds how far parsing runs ahead of the in-order commit.
  1186. const feed = async (filePath: string, content: string, stats: fs.Stats): Promise<void> => {
  1187. const seq = nextSeq++;
  1188. const p = (async () => {
  1189. try {
  1190. const result = await parseFile(filePath, content);
  1191. completed.set(seq, { ok: true, filePath, content, stats, result });
  1192. } catch (parseErr) {
  1193. completed.set(seq, { ok: false, filePath, err: parseErr });
  1194. }
  1195. flushOrdered();
  1196. })();
  1197. const tracked = p.finally(() => { inFlight.delete(tracked); });
  1198. inFlight.add(tracked);
  1199. // Backpressure on the dispatched-but-not-yet-committed count (in-flight +
  1200. // buffered), not just in-flight: a slow file sitting at the commit cursor
  1201. // lets later parses finish and buffer, which would otherwise grow without
  1202. // bound. Wait for parses to settle (each may advance the cursor) until the
  1203. // window has room. `inFlight.size > 0` guards against an empty race — the
  1204. // cursor file is always still in flight when the window is full.
  1205. while (nextSeq - nextToStore >= windowSize && inFlight.size > 0) {
  1206. await Promise.race(inFlight);
  1207. }
  1208. };
  1209. for (let i = 0; i < files.length; i += FILE_IO_BATCH_SIZE) {
  1210. if (signal?.aborted) { aborted = true; break; }
  1211. const batch = files.slice(i, i + FILE_IO_BATCH_SIZE);
  1212. // Read files in parallel (with path validation before any I/O)
  1213. const fileContents = await Promise.all(
  1214. batch.map(async (fp) => {
  1215. try {
  1216. // Indexing read: follow in-root symlinks the directory walk already
  1217. // descended into (the `../` guard still applies) so files reached
  1218. // via an in-root symlink-to-outside still index (#935).
  1219. const fullPath = validatePathWithinRoot(this.rootDir, fp, { allowSymlinkEscape: true });
  1220. if (!fullPath) {
  1221. logWarn('Path traversal blocked in batch reader', { filePath: fp });
  1222. return { filePath: fp, content: null as string | null, stats: null as fs.Stats | null, error: new Error('Path traversal blocked') };
  1223. }
  1224. const content = await fsp.readFile(fullPath, 'utf-8');
  1225. const stats = await fsp.stat(fullPath);
  1226. return { filePath: fp, content, stats, error: null as Error | null };
  1227. } catch (err) {
  1228. return { filePath: fp, content: null as string | null, stats: null as fs.Stats | null, error: err as Error };
  1229. }
  1230. })
  1231. );
  1232. // Dispatch each readable file into the bounded parse window; the window
  1233. // stores results on the main thread as they arrive.
  1234. for (const { filePath, content, stats, error } of fileContents) {
  1235. if (signal?.aborted) { aborted = true; break; }
  1236. if (error || content === null || stats === null) {
  1237. processed++;
  1238. filesErrored++;
  1239. errors.push({
  1240. message: `Failed to read file: ${error instanceof Error ? error.message : String(error)}`,
  1241. filePath,
  1242. severity: 'error',
  1243. code: 'read_error',
  1244. });
  1245. onProgress?.({ phase: 'parsing', current: processed, total });
  1246. continue;
  1247. }
  1248. // Honour MAX_FILE_SIZE. Without this check, vendored generated
  1249. // headers, minified bundles, and other multi-MB files get indexed,
  1250. // wasting WASM heap and the worker recycle budget on inputs with no
  1251. // useful symbols. The single-file extractFile path already enforces
  1252. // this; the bulk path used to silently skip the check.
  1253. if (stats.size > MAX_FILE_SIZE) {
  1254. processed++;
  1255. filesSkipped++;
  1256. errors.push({
  1257. message: `File exceeds max size (${stats.size} > ${MAX_FILE_SIZE})`,
  1258. filePath,
  1259. severity: 'warning',
  1260. code: 'size_exceeded',
  1261. });
  1262. onProgress?.({ phase: 'parsing', current: processed, total });
  1263. continue;
  1264. }
  1265. // Parse on the pool (main thread stays unblocked). Errors/timeouts are
  1266. // handled inside feed() → recordParseFailure, feeding the retry pass.
  1267. await feed(filePath, content, stats);
  1268. }
  1269. if (aborted) break;
  1270. }
  1271. // Drain parses still in flight (skip on abort — we tear down below instead),
  1272. // then commit any results the cursor hasn't reached yet.
  1273. if (!aborted) {
  1274. await Promise.all(inFlight);
  1275. flushOrdered();
  1276. }
  1277. if (signal?.aborted || aborted) {
  1278. if (pool) await pool.destroy();
  1279. return {
  1280. success: false,
  1281. filesIndexed,
  1282. filesSkipped,
  1283. filesErrored,
  1284. nodesCreated: totalNodes,
  1285. edgesCreated: totalEdges,
  1286. errors: [{ message: 'Aborted', severity: 'error' }, ...errors],
  1287. durationMs: Date.now() - startTime,
  1288. };
  1289. }
  1290. // Report 100% so the progress bar doesn't hang at 99%
  1291. onProgress?.({
  1292. phase: 'parsing',
  1293. current: total,
  1294. total,
  1295. });
  1296. // Yield so the shimmer worker's buffered stdout writes can flush.
  1297. // Worker thread stdout is proxied through the main thread's event loop,
  1298. // so synchronous work here blocks the animation from rendering.
  1299. await new Promise(resolve => setImmediate(resolve));
  1300. // Retry pass: files that failed due to WASM memory corruption may succeed
  1301. // on a fresh worker with a clean heap. Recycle before each attempt so
  1302. // every file gets the absolute cleanest WASM state possible.
  1303. const retryableErrors = errors.filter(
  1304. (e) => e.code === 'parse_error' && e.filePath &&
  1305. (e.message.includes('Worker exited') || e.message.includes('memory access out of bounds'))
  1306. );
  1307. if (retryableErrors.length > 0 && pool) {
  1308. log(`Retrying ${retryableErrors.length} files that failed due to WASM memory errors...`);
  1309. // Fresh WASM heaps for the retry phase. A retry that still crashes its
  1310. // worker makes the pool respawn it, so later retries keep landing on clean
  1311. // workers too.
  1312. pool.recycleAll();
  1313. const stillFailing: typeof retryableErrors = [];
  1314. for (const errEntry of retryableErrors) {
  1315. const filePath = errEntry.filePath!;
  1316. if (signal?.aborted) break;
  1317. let content: string;
  1318. try {
  1319. const fullPath = validatePathWithinRoot(this.rootDir, filePath);
  1320. if (!fullPath) continue;
  1321. content = await fsp.readFile(fullPath, 'utf-8');
  1322. } catch {
  1323. continue;
  1324. }
  1325. let result: ExtractionResult;
  1326. try {
  1327. result = await parseFile(filePath, content);
  1328. } catch {
  1329. stillFailing.push(errEntry);
  1330. continue;
  1331. }
  1332. if (result.nodes.length > 0 || result.errors.length === 0) {
  1333. const language = detectLanguage(filePath, content, overrides);
  1334. const stats = await fsp.stat(path.join(this.rootDir, filePath));
  1335. this.storeExtractionResult(filePath, content, language, stats, result);
  1336. const idx = errors.indexOf(errEntry);
  1337. if (idx >= 0) errors.splice(idx, 1);
  1338. filesErrored--;
  1339. filesIndexed++;
  1340. totalNodes += result.nodes.length;
  1341. totalEdges += result.edges.length;
  1342. log(`Retry OK: ${filePath} (${result.nodes.length} nodes)`);
  1343. }
  1344. }
  1345. // Last resort: for files that still crash on a clean worker, strip
  1346. // comment-only lines to reduce WASM memory pressure. Many compiler
  1347. // test files are 90%+ comments (CHECK directives) that don't contribute
  1348. // code nodes but consume parser memory.
  1349. if (stillFailing.length > 0) {
  1350. log(`${stillFailing.length} files still failing — retrying with comments stripped...`);
  1351. pool.recycleAll();
  1352. for (const errEntry of stillFailing) {
  1353. const filePath = errEntry.filePath!;
  1354. if (signal?.aborted) break;
  1355. let fullContent: string;
  1356. try {
  1357. const fullPath = validatePathWithinRoot(this.rootDir, filePath);
  1358. if (!fullPath) continue;
  1359. fullContent = await fsp.readFile(fullPath, 'utf-8');
  1360. } catch {
  1361. continue;
  1362. }
  1363. // Strip lines that are entirely comments (preserving line numbers
  1364. // by replacing with empty lines so node positions stay correct)
  1365. const stripped = fullContent
  1366. .split('\n')
  1367. .map(line => /^\s*\/\//.test(line) ? '' : line)
  1368. .join('\n');
  1369. let result: ExtractionResult;
  1370. try {
  1371. result = await parseFile(filePath, stripped);
  1372. } catch {
  1373. continue;
  1374. }
  1375. if (result.nodes.length > 0 || result.errors.length === 0) {
  1376. const language = detectLanguage(filePath, fullContent, overrides);
  1377. const stats = await fsp.stat(path.join(this.rootDir, filePath));
  1378. this.storeExtractionResult(filePath, fullContent, language, stats, result);
  1379. const idx = errors.indexOf(errEntry);
  1380. if (idx >= 0) errors.splice(idx, 1);
  1381. filesErrored--;
  1382. filesIndexed++;
  1383. totalNodes += result.nodes.length;
  1384. totalEdges += result.edges.length;
  1385. log(`Retry (stripped) OK: ${filePath} (${result.nodes.length} nodes)`);
  1386. }
  1387. }
  1388. }
  1389. }
  1390. // Shut down the parse worker pool.
  1391. if (pool) await pool.destroy();
  1392. return {
  1393. success: filesIndexed > 0 || errors.filter((e) => e.severity === 'error').length === 0,
  1394. filesIndexed,
  1395. filesSkipped,
  1396. filesErrored,
  1397. nodesCreated: totalNodes,
  1398. edgesCreated: totalEdges,
  1399. errors,
  1400. durationMs: Date.now() - startTime,
  1401. };
  1402. }
  /**
   * Index an explicit list of files, one after another, and fold the per-file
   * outcomes into a single summary.
   *
   * Each path is handed to `indexFile`. A file is counted as:
   *  - indexed  — it produced at least one node;
   *  - errored  — it produced no nodes and reported an error of severity 'error';
   *  - indexed  — it produced neither, but its stored file record has a
   *               file-level-only language;
   *  - skipped  — everything else.
   *
   * @param filePaths Paths to index, forwarded unchanged to `indexFile`.
   * @returns Aggregate counts, every error collected along the way, and the
   *          elapsed wall-clock time. `success` is true when at least one file
   *          was indexed or no error-severity entry was collected.
   */
  async indexFiles(filePaths: string[]): Promise<IndexResult> {
    const startTime = Date.now();
    const errors: ExtractionError[] = [];
    const tally = { indexed: 0, skipped: 0, errored: 0, nodes: 0, edges: 0 };
    const isHardError = (e: ExtractionError): boolean => e.severity === 'error';

    for (const filePath of filePaths) {
      const { nodes, edges, errors: fileErrors } = await this.indexFile(filePath);
      errors.push(...fileErrors);

      if (nodes.length > 0) {
        tally.indexed += 1;
        tally.nodes += nodes.length;
        tally.edges += edges.length;
        continue;
      }

      if (fileErrors.some(isHardError)) {
        tally.errored += 1;
        continue;
      }

      // No symbols and no hard error: consult the stored file record to tell a
      // file-level-only language (counted as indexed) from a genuine skip.
      const record = this.queries.getFileByPath(filePath);
      if (record && isFileLevelOnlyLanguage(record.language)) {
        tally.indexed += 1;
      } else {
        tally.skipped += 1;
      }
    }

    return {
      success: tally.indexed > 0 || !errors.some(isHardError),
      filesIndexed: tally.indexed,
      filesSkipped: tally.skipped,
      filesErrored: tally.errored,
      nodesCreated: tally.nodes,
      edgesCreated: tally.edges,
      errors,
      durationMs: Date.now() - startTime,
    };
  }
  /**
   * Index a single file: validate its path, read its stats and content from
   * disk, then delegate to `indexFileWithContent`.
   *
   * Failures are reported in the returned result rather than thrown: a rejected
   * path yields a 'path_traversal' error and a failed stat/read yields a
   * 'read_error', both with empty nodes/edges.
   *
   * @param relativePath Path of the file, resolved against `this.rootDir`.
   * @returns The extraction result for the file, or an empty result carrying
   *          the error that prevented indexing.
   */
  async indexFile(relativePath: string): Promise<ExtractionResult> {
    // Indexing read: follow in-root symlinks (the `../` guard still applies), #935.
    const resolved = validatePathWithinRoot(this.rootDir, relativePath, { allowSymlinkEscape: true });
    if (!resolved) {
      const blocked: ExtractionError = {
        message: `Path traversal blocked: ${relativePath}`,
        filePath: relativePath,
        severity: 'error',
        code: 'path_traversal',
      };
      return { nodes: [], edges: [], unresolvedReferences: [], errors: [blocked], durationMs: 0 };
    }

    // Stat first, then read; whichever fails first supplies the error message.
    let stats: fs.Stats;
    let content: string;
    try {
      stats = await fsp.stat(resolved);
      content = await fsp.readFile(resolved, 'utf-8');
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      const unreadable: ExtractionError = {
        message: `Failed to read file: ${reason}`,
        filePath: relativePath,
        severity: 'error',
        code: 'read_error',
      };
      return { nodes: [], edges: [], unresolvedReferences: [], errors: [unreadable], durationMs: 0 };
    }

    return this.indexFileWithContent(relativePath, content, stats);
  }
  /**
   * Index a single file from content and stats the caller has already read,
   * so no further file I/O is needed here.
   *
   * Returns early with an empty result when the path is rejected
   * ('path_traversal' error), when `stats.size` exceeds MAX_FILE_SIZE
   * ('size_exceeded' warning), or when the detected language is unsupported
   * (no errors). Otherwise the source is extracted and, unless extraction
   * produced errors and no nodes, stored in the database.
   *
   * @param relativePath Path of the file, resolved against `this.rootDir`.
   * @param content      Text of the file.
   * @param stats        Stats of the file; `size` drives the max-size check.
   * @returns The extraction result, or an empty result for the early exits above.
   */
  async indexFileWithContent(
    relativePath: string,
    content: string,
    stats: fs.Stats
  ): Promise<ExtractionResult> {
    // Shape shared by every early exit: nothing extracted, zero duration.
    const emptyResult = (errors: ExtractionError[] = []): ExtractionResult => ({
      nodes: [],
      edges: [],
      unresolvedReferences: [],
      errors,
      durationMs: 0,
    });

    // Prevent `../` traversal; follow in-root symlinks like the directory walk (#935).
    if (!validatePathWithinRoot(this.rootDir, relativePath, { allowSymlinkEscape: true })) {
      logWarn('Path traversal blocked in indexFileWithContent', { relativePath });
      return emptyResult([
        { message: 'Path traversal blocked', filePath: relativePath, severity: 'error', code: 'path_traversal' },
      ]);
    }

    // Oversized files are reported as a warning and never parsed.
    if (stats.size > MAX_FILE_SIZE) {
      return emptyResult([
        {
          message: `File exceeds max size (${stats.size} > ${MAX_FILE_SIZE})`,
          filePath: relativePath,
          severity: 'warning',
          code: 'size_exceeded',
        },
      ]);
    }

    // Detect language (honoring the project's codegraph.json extension overrides).
    const overrides = loadExtensionOverrides(this.rootDir);
    const language = detectLanguage(relativePath, content, overrides);
    if (!isLanguageSupported(language)) {
      return emptyResult();
    }

    // Use cached framework names if indexAll has run, otherwise detect on the
    // spot so single-file re-index paths still emit route nodes / middleware / etc.
    const frameworkNames = this.ensureDetectedFrameworks();
    const result = extractFromSource(relativePath, content, language, frameworkNames);

    // Store everything except an extraction that yielded errors and no nodes.
    const failedWithoutNodes = result.nodes.length === 0 && result.errors.length > 0;
    if (!failedWithoutNodes) {
      this.storeExtractionResult(relativePath, content, language, stats, result);
    }
    return result;
  }
  /**
   * Persist one file's extraction result, replacing whatever the index
   * previously held for that file.
   *
   * Does nothing when a record for `filePath` already exists with the same
   * content hash. Otherwise the steps run in this order: snapshot incoming
   * cross-file edges, delete the old file data, insert valid nodes, insert the
   * result's edges, re-insert the snapshotted edges against the new node ids,
   * insert unresolved references, and finally upsert the file record.
   *
   * @param filePath Path the file is tracked under in the index.
   * @param content  File text; used here only to compute the content hash.
   * @param language Language stored on the file record and used as the fallback
   *                 for unresolved references that carry none.
   * @param stats    Source of the recorded size and modification time.
   * @param result   Nodes, edges, unresolved references and errors to store.
   */
  private storeExtractionResult(
    filePath: string,
    content: string,
    language: Language,
    stats: fs.Stats,
    result: ExtractionResult
  ): void {
    const contentHash = hashContent(content);

    // Check if file already exists and hasn't changed
    const existingFile = this.queries.getFileByPath(filePath);
    if (existingFile && existingFile.contentHash === contentHash) {
      return; // No changes
    }

    // Snapshot incoming cross-file edges BEFORE deleting this file's nodes.
    // `deleteFile` cascades to delete every edge whose source OR target is a
    // node in this file (edges.FK ... ON DELETE CASCADE). Edges whose SOURCE is
    // in this file are re-emitted by the extractor, but edges whose SOURCE is in
    // a *different* (unchanged) file are not — they would be silently dropped,
    // which is issue #899: re-indexing a callee file severs `calls`/`references`
    // edges from callers that import it via module-attribute access
    // (`pkg.mod.fn(...)`).
    //
    // The snapshot carries the target node's (name, kind) so each edge can be
    // re-pointed at the re-indexed target's NEW id. Node ids are
    // `sha256(filePath:kind:name:line)`, so any line shift in the callee file
    // (e.g. a docstring-only edit above the symbol) changes every target id and
    // a naive re-insert by old id would drop every edge.
    const crossFileIncomingEdges = existingFile
      ? this.queries.getCrossFileIncomingEdgesWithTarget(filePath)
      : [];

    // Delete existing data for this file
    if (existingFile) {
      this.queries.deleteFile(filePath);
    }

    // Filter out nodes with missing required fields before insertion.
    // This prevents FK violations when edges reference nodes that would
    // be silently skipped by insertNode() (see issue #42).
    const validNodes = result.nodes.filter((n) => n.id && n.kind && n.name && n.filePath && n.language);

    // Ids of the nodes that are actually inserted. Built once and shared by the
    // edge filter and the unresolved-reference filter below.
    const insertedIds = new Set(validNodes.map((n) => n.id));

    // Insert nodes
    if (validNodes.length > 0) {
      this.queries.insertNodes(validNodes);
    }

    // Insert only the edges whose two endpoints were both inserted above.
    if (result.edges.length > 0) {
      const validEdges = result.edges.filter(
        (e) => insertedIds.has(e.source) && insertedIds.has(e.target)
      );
      if (validEdges.length > 0) {
        this.queries.insertEdges(validEdges);
      }
    }

    // Re-insert the snapshotted cross-file incoming edges, re-resolving each
    // target to the re-indexed node's new id by (kind, name) within this file.
    // An edge whose target symbol was renamed or removed finds no match in
    // `newNodesByKindName` and stays dropped, which is the intended outcome.
    // This closes the #899 edge-drop on `sync`.
    if (crossFileIncomingEdges.length > 0) {
      // When several new nodes share a (kind, name) pair, the last one wins.
      const newNodesByKindName = new Map<string, string>();
      for (const n of validNodes) {
        newNodesByKindName.set(`${n.kind}\0${n.name}`, n.id);
      }
      const reinserted: Edge[] = [];
      for (const e of crossFileIncomingEdges) {
        const newTargetId = newNodesByKindName.get(`${e.targetKind}\0${e.targetName}`);
        if (newTargetId) {
          reinserted.push({ source: e.source, target: newTargetId, kind: e.kind, metadata: e.metadata, line: e.line, column: e.column, provenance: e.provenance });
        }
      }
      if (reinserted.length > 0) {
        this.queries.insertEdges(reinserted);
      }
    }

    // Insert unresolved references in batch with denormalized filePath/language,
    // keeping only those that originate from a node that was inserted.
    if (result.unresolvedReferences.length > 0) {
      const refsWithContext = result.unresolvedReferences
        .filter((ref) => insertedIds.has(ref.fromNodeId))
        .map((ref) => ({
          ...ref,
          filePath: ref.filePath ?? filePath,
          language: ref.language ?? language,
        }));
      if (refsWithContext.length > 0) {
        this.queries.insertUnresolvedRefsBatch(refsWithContext);
      }
    }

    // Insert file record.
    // NOTE(review): nodeCount is the number of extracted nodes, which includes
    // any nodes filtered out of `validNodes` above — confirm that is intended.
    const fileRecord: FileRecord = {
      path: filePath,
      contentHash,
      language,
      size: stats.size,
      modifiedAt: stats.mtimeMs,
      indexedAt: Date.now(),
      nodeCount: result.nodes.length,
      errors: result.errors.length > 0 ? result.errors : undefined,
    };
    this.queries.upsertFile(fileRecord);
  }
  /**
   * Sync the index with the current file state.
   *
   * Change detection is filesystem-based, never git: a (size, mtime) stat
   * pre-filter skips unchanged files, then a content-hash compare confirms real
   * changes. This works in non-git projects and catches committed changes from
   * `git pull`/`checkout`/`merge`/`rebase` that `git status` cannot see.
   *
   * Files larger than MAX_FILE_SIZE are skipped before they are read. The
   * indexing path rejects them without storing a file record, so without this
   * skip every sync would re-read and re-hash them and report them as changed.
   *
   * @param onProgress Optional callback; receives one 'scanning' event up front
   *                   and one 'parsing' event before each changed file is indexed.
   * @returns Counts of files checked/added/modified/removed, the number of nodes
   *          produced by re-indexing, the elapsed time, and the changed paths
   *          (`undefined` when nothing changed).
   */
  async sync(onProgress?: (progress: IndexProgress) => void): Promise<SyncResult> {
    await initGrammars(); // Initialize WASM runtime (grammars loaded lazily below)

    const startTime = Date.now();
    let filesChecked = 0;
    let filesAdded = 0;
    let filesModified = 0;
    let filesRemoved = 0;
    let nodesUpdated = 0;
    const changedFilePaths: string[] = [];

    onProgress?.({
      phase: 'scanning',
      current: 0,
      total: 0,
    });

    const filesToIndex: string[] = [];

    // === Filesystem reconcile (git-independent) ===
    // The source of truth for "what changed" is the filesystem vs the indexed
    // state — never git. We enumerate the current source files and reconcile
    // each against the DB, so the expensive read+hash+parse only runs for files
    // that actually changed.
    const currentFiles = await scanDirectoryAsync(this.rootDir);
    filesChecked = currentFiles.length;
    const currentSet = new Set(currentFiles);

    const trackedFiles = this.queries.getAllFiles();
    const trackedMap = new Map<string, FileRecord>();
    for (const f of trackedFiles) {
      trackedMap.set(f.path, f);
    }

    // Removals: tracked in the DB but no longer a present source file. Check the
    // filesystem directly — `scanDirectory` (via `git ls-files`) still lists a
    // file deleted from disk but not yet staged, so set membership alone misses it.
    // `reconcileChecks` drives the cooperative yield shared with the adds/mods loop
    // below (see SYNC_RECONCILE_YIELD_INTERVAL / issue #905).
    let reconcileChecks = 0;
    for (const tracked of trackedFiles) {
      if (!currentSet.has(tracked.path) || !fs.existsSync(path.join(this.rootDir, tracked.path))) {
        this.queries.deleteFile(tracked.path);
        filesRemoved++;
      }
      if (++reconcileChecks % SYNC_RECONCILE_YIELD_INTERVAL === 0) {
        await new Promise<void>((resolve) => setImmediate(resolve));
      }
    }

    // Adds / modifications.
    for (const filePath of currentFiles) {
      // Same cooperative yield as the removals loop — this is the other O(files)
      // synchronous-stat loop that wedges the main thread on a large repo (#905).
      // Yield at the top of the body so the `continue` fast-paths below still hit it.
      if (++reconcileChecks % SYNC_RECONCILE_YIELD_INTERVAL === 0) {
        await new Promise<void>((resolve) => setImmediate(resolve));
      }

      const fullPath = path.join(this.rootDir, filePath);
      const tracked = trackedMap.get(filePath);

      // Stat every candidate, tracked or not: the result feeds both the
      // unchanged pre-filter and the max-size guard below.
      let stat: fs.Stats;
      try {
        stat = fs.statSync(fullPath);
      } catch (error) {
        logDebug('Skipping unstattable file during sync', { filePath, error: String(error) });
        continue;
      }

      // Cheap pre-filter: an already-indexed file whose size AND mtime both match
      // the DB is unchanged — skip it without reading or hashing. (A content
      // change that preserves both exactly is the blind spot every mtime-based
      // incremental tool accepts; `index --force` is the escape hatch. Git bumps
      // mtime on every file it writes during checkout/merge, so pulls are caught.)
      if (
        tracked &&
        stat.size === tracked.size &&
        Math.floor(stat.mtimeMs) === Math.floor(tracked.modifiedAt)
      ) {
        continue;
      }

      // Honour MAX_FILE_SIZE before reading. `indexFile` refuses oversized files
      // without recording them, so reading one here could only produce a
      // spurious "added"/"modified" report on every single sync. Any data
      // already indexed for a file that has since grew past the limit is left
      // untouched, exactly as it was before this guard existed.
      if (stat.size > MAX_FILE_SIZE) {
        logDebug('Skipping oversized file during sync', { filePath, size: stat.size });
        continue;
      }

      // New, or size/mtime changed — read + hash to confirm a real content change.
      let content: string;
      try {
        content = fs.readFileSync(fullPath, 'utf-8');
      } catch (error) {
        logDebug('Skipping unreadable file during sync', { filePath, error: String(error) });
        continue;
      }
      const contentHash = hashContent(content);

      if (!tracked) {
        filesToIndex.push(filePath);
        changedFilePaths.push(filePath);
        filesAdded++;
      } else if (tracked.contentHash !== contentHash) {
        filesToIndex.push(filePath);
        changedFilePaths.push(filePath);
        filesModified++;
      }
    }

    // Load only grammars needed for changed files
    if (filesToIndex.length > 0) {
      const overrides = loadExtensionOverrides(this.rootDir);
      const neededLanguages = [...new Set(filesToIndex.map((f) => detectLanguage(f, undefined, overrides)))];
      // .h files default to 'c' but may be C++ — ensure cpp grammar is loaded
      if (neededLanguages.includes('c') && !neededLanguages.includes('cpp')) {
        neededLanguages.push('cpp');
      }
      await loadGrammarsForLanguages(neededLanguages);
    }

    // Index changed files
    const total = filesToIndex.length;
    for (let i = 0; i < filesToIndex.length; i++) {
      const filePath = filesToIndex[i]!;
      onProgress?.({
        phase: 'parsing',
        current: i + 1,
        total,
        currentFile: filePath,
      });
      const result = await this.indexFile(filePath);
      nodesUpdated += result.nodes.length;
    }

    return {
      filesChecked,
      filesAdded,
      filesModified,
      filesRemoved,
      nodesUpdated,
      durationMs: Date.now() - startTime,
      changedFilePaths: changedFilePaths.length > 0 ? changedFilePaths : undefined,
    };
  }
  /**
   * Get files that have changed since last index.
   * Uses git status as a fast path when available, falling back to full scan.
   *
   * In both paths a candidate file is read and hashed: it is reported as
   * `added` when the index has no record for it and as `modified` when the
   * recorded content hash differs. Unreadable files are logged and skipped.
   *
   * @returns Paths grouped into `added`, `modified` and `removed`. Nothing is
   *          written to the index.
   */
  getChangedFiles(): { added: string[]; modified: string[]; removed: string[] } {
    const added: string[] = [];
    const modified: string[] = [];
    const removed: string[] = [];

    // Read + hash one candidate and file it under `added` or `modified`.
    // `lookup` is only invoked once the file has been read successfully.
    const classify = (
      filePath: string,
      lookup: (p: string) => FileRecord | null | undefined
    ): void => {
      let content: string;
      try {
        content = fs.readFileSync(path.join(this.rootDir, filePath), 'utf-8');
      } catch (error) {
        logDebug('Skipping unreadable file while detecting changes', { filePath, error: String(error) });
        return;
      }
      const contentHash = hashContent(content);
      const tracked = lookup(filePath);
      if (!tracked) {
        added.push(filePath);
      } else if (tracked.contentHash !== contentHash) {
        modified.push(filePath);
      }
    };

    const gitChanges = getGitChangedFiles(this.rootDir);

    if (gitChanges) {
      // === Git fast path ===
      // Deleted files — only report if tracked in DB
      for (const filePath of gitChanges.deleted) {
        if (this.queries.getFileByPath(filePath)) {
          removed.push(filePath);
        }
      }

      // Modified + added files — read + hash, compare with DB. Untracked (`??`)
      // files stay untracked in git even after indexing, so they must be
      // hash-compared like modified files instead of always counting as added —
      // otherwise status reports them as pending forever. (See issue #206.)
      for (const filePath of [...gitChanges.modified, ...gitChanges.added]) {
        classify(filePath, (p) => this.queries.getFileByPath(p));
      }

      return { added, modified, removed };
    }

    // === Fallback: full scan (non-git project or git failure) ===
    const currentFiles = new Set(scanDirectory(this.rootDir));
    const trackedFiles = this.queries.getAllFiles();

    // Build Map for O(1) lookups
    const trackedMap = new Map<string, FileRecord>();
    for (const f of trackedFiles) {
      trackedMap.set(f.path, f);
    }

    // Find removed files. Use the same predicate as `sync`: a tracked file is
    // gone when the scan no longer lists it OR it is missing on disk. Without
    // the disk check, a file the scan still lists but that has been deleted
    // would be reported as nothing at all, although `sync` would remove it.
    for (const tracked of trackedFiles) {
      if (!currentFiles.has(tracked.path) || !fs.existsSync(path.join(this.rootDir, tracked.path))) {
        removed.push(tracked.path);
      }
    }

    // Find added and modified files
    for (const filePath of currentFiles) {
      classify(filePath, (p) => trackedMap.get(p));
    }

    return { added, modified, removed };
  }
  1864. }
  1865. // Re-export useful types and functions
  1866. export { extractFromSource } from './tree-sitter';
  1867. export { detectLanguage, isSourceFile, isLanguageSupported, isGrammarLoaded, getSupportedLanguages, initGrammars, loadGrammarsForLanguages, loadAllGrammars } from './grammars';