1
0

index.ts 76 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019
  1. /**
  2. * Extraction Orchestrator
  3. *
  4. * Coordinates file scanning, parsing, and database storage.
  5. */
  6. import * as fs from 'fs';
  7. import * as fsp from 'fs/promises';
  8. import * as path from 'path';
  9. import * as crypto from 'crypto';
  10. import { execFileSync } from 'child_process';
  11. import {
  12. Language,
  13. FileRecord,
  14. ExtractionResult,
  15. ExtractionError,
  16. Edge,
  17. } from '../types';
  18. import { QueryBuilder } from '../db/queries';
  19. import { extractFromSource } from './tree-sitter';
  20. import { detectLanguage, isSourceFile, isLanguageSupported, isFileLevelOnlyLanguage, initGrammars, loadGrammarsForLanguages } from './grammars';
  21. import { loadExtensionOverrides } from '../project-config';
  22. import { isCodeGraphDataDir } from '../directory';
  23. import { logDebug, logWarn } from '../errors';
  24. import { validatePathWithinRoot, normalizePath } from '../utils';
  25. import ignore, { Ignore } from 'ignore';
  26. import { detectFrameworks } from '../resolution/frameworks';
  27. import type { ResolutionContext } from '../resolution/types';
  /**
   * Batch width for file reads during indexing. Reads are I/O-bound, so issuing
   * them in parallel batches lets I/O wait overlap with CPU-bound parse work.
   */
  const FILE_IO_BATCH_SIZE = 10;

  /**
   * Number of files the `sync()` reconcile handles between cooperative yields to
   * the event loop.
   *
   * The reconcile makes two O(files) passes of synchronous `fs` calls
   * (`existsSync` for removals, `statSync` for adds/mods). On a very large repo
   * (~100k files) running them without yielding wedges the main thread for
   * minutes, which trips the liveness watchdog (it SIGKILLs a process whose loop
   * stops turning) and stalls the first MCP tool call behind the catch-up gate
   * (issue #905). Yielding every N files keeps the socket, the watchdog
   * heartbeat, and any concurrent read query responsive in the meantime.
   */
  const SYNC_RECONCILE_YIELD_INTERVAL = 1_000;

  // PARSER_RESET_INTERVAL moved to parse-worker.ts (runs in worker thread)

  /**
   * Ceiling, in milliseconds, on how long a single file may take to parse in the
   * worker thread. A hung tree-sitter parse or a WASM out-of-memory condition
   * would otherwise freeze the whole indexing run; after a timeout the worker is
   * restarted.
   */
  const PARSE_TIMEOUT_MS = 10 * 1_000;

  /**
   * How many files are parsed before the worker thread is recycled.
   *
   * WASM linear memory can grow but NEVER shrink (WebAssembly spec limitation),
   * so the only way to reclaim tree-sitter's WASM heap is to tear down the whole
   * V8 isolate: terminate the worker and spawn a fresh one. The interval trades
   * memory usage against the cost of reloading grammars.
   */
  const WORKER_RECYCLE_INTERVAL = 250;
  /**
   * Payload handed to progress callbacks while an indexing operation runs.
   */
  export interface IndexProgress {
    /** Which stage of the run this report belongs to. */
    phase:
      | 'scanning'
      | 'parsing'
      | 'storing'
      | 'resolving';
    /** Progress counter for the phase (presumably items done so far — confirm at call sites). */
    current: number;
    /** Denominator for `current` (presumably the phase's item count — confirm at call sites). */
    total: number;
    /** Optional path of the file being worked on. */
    currentFile?: string;
  }
  /**
   * Summary returned when an indexing operation finishes.
   *
   * Field meanings follow their names; the exact counting rules are decided by
   * the producer, which is not visible in this part of the file.
   */
  export interface IndexResult {
    /** Overall outcome flag. */
    success: boolean;

    // Per-file tallies.
    filesIndexed: number;
    filesSkipped: number;
    filesErrored: number;

    // Graph tallies.
    nodesCreated: number;
    edgesCreated: number;

    /** Extraction errors collected during the run. */
    errors: Array<ExtractionError>;
    /** Wall-clock duration of the run, in milliseconds. */
    durationMs: number;
  }
  /**
   * Summary returned when a sync operation finishes.
   *
   * Field meanings follow their names; the exact counting rules are decided by
   * the producer, which is not visible in this part of the file.
   */
  export interface SyncResult {
    // File tallies.
    filesChecked: number;
    filesAdded: number;
    filesModified: number;
    filesRemoved: number;

    /** Count of graph nodes updated by the sync. */
    nodesUpdated: number;
    /** Wall-clock duration of the sync, in milliseconds. */
    durationMs: number;
    /** Optional list of changed file paths (which change kinds are included — confirm with the producer). */
    changedFilePaths?: string[];
  }
  /**
   * Compute the SHA-256 digest of a file's text content.
   *
   * @param content - Text to hash; encoded as UTF-8 before hashing.
   * @returns The digest as a lowercase hexadecimal string.
   */
  export function hashContent(content: string): string {
    const hasher = crypto.createHash('sha256');
    hasher.update(content, 'utf8');
    return hasher.digest('hex');
  }
  /**
   * Size cutoff in bytes: files larger than this are skipped. Generated bundles,
   * minified JS, and vendored blobs blow the WASM heap and the worker-recycle
   * budget without yielding useful symbols. 1 MB covers essentially all
   * hand-written source.
   */
  const MAX_FILE_SIZE = 2 ** 20;
  /**
   * Built-in directory excludes: names that are dependency, build, cache, or
   * tooling output across the languages/frameworks CodeGraph supports, curated
   * from the canonical github/gitignore templates.
   *
   * They are excluded by default so the graph reflects your code rather than
   * third-party noise, even with no `.gitignore` present (issue #407). The
   * exclusion is uniform (git or not, tracked or not); the only opt-in is an
   * explicit `.gitignore` negation such as `!vendor/`.
   *
   * Deliberately NOT listed:
   * - first-party-prone or generic names (`packages`, `lib`, `app`, `bin`, `src`,
   *   `deps`, `env`, `tmp`, `storage`, `Library`), to avoid ever hiding real source;
   * - IDE/state dirs like `.idea`/`.vs`: only dirs that actually contain
   *   *indexable source* (or are enormous) earn a slot, and CodeGraph indexes only
   *   recognized source extensions, so those produce no symbols regardless.
   */
  const DEFAULT_IGNORE_DIRS: ReadonlySet<string> = new Set<string>([
    // JS / TS: dependency directories
    'node_modules',
    'bower_components',
    'jspm_packages',
    'web_modules',
    '.yarn',
    '.pnpm-store',

    // JS / TS: framework & bundler build / cache / deploy output
    '.next',
    '.nuxt',
    '.svelte-kit',
    '.turbo',
    '.vite',
    '.parcel-cache',
    '.angular',
    '.docusaurus',
    'storybook-static',
    '.vinxi',
    '.nitro',
    'out-tsc',
    '.vercel',
    '.netlify',
    '.wrangler',

    // Build output shared by many ecosystems
    'dist',
    'build',
    'out',
    '.output',

    // Test / coverage
    'coverage',
    '.nyc_output',

    // Python
    '__pycache__',
    '__pypackages__',
    '.venv',
    'venv',
    '.pixi',
    '.pdm-build',
    '.mypy_cache',
    '.pytest_cache',
    '.ruff_cache',
    '.tox',
    '.nox',
    '.hypothesis',
    '.ipynb_checkpoints',
    '.eggs',

    // Rust / JVM (Maven, Gradle, Scala)
    'target',
    '.gradle',

    // .NET
    'obj',

    // Vendored deps (Go, PHP/Composer, Ruby/Bundler)
    'vendor',

    // Swift / iOS
    '.build',
    'Pods',
    'Carthage',
    'DerivedData',
    '.swiftpm',

    // Dart / Flutter
    '.dart_tool',
    '.pub-cache',

    // Native (Android NDK, C/C++ deps)
    '.cxx',
    '.externalNativeBuild',
    'vcpkg_installed',

    // Scala tooling
    '.bloop',
    '.metals',

    // Lua / Luau (LuaRocks)
    'lua_modules',
    '.luarocks',

    // Delphi / RAD Studio IDE backups (duplicate .pas source, would double-count)
    '__history',
    '__recovery',

    // Generic cache
    '.cache',
  ]);
  /**
   * The built-in excludes in gitignore syntax, ready for the `ignore` matcher:
   * every default directory name with a trailing slash, followed by a few globs.
   */
  const DEFAULT_IGNORE_PATTERNS: string[] = [...DEFAULT_IGNORE_DIRS]
    .map((dirName) => `${dirName}/`)
    .concat([
      '*.egg-info/', // Python packaging metadata
      'cmake-build-*/', // CLion / CMake build trees
      'bazel-*/', // Bazel output symlink trees
    ]);
  /**
   * Report whether `buf` is well-formed UTF-8.
   *
   * @param buf - Raw bytes to check.
   * @returns `true` when a fatal-mode UTF-8 decode succeeds, `false` when the
   *   decode (or constructing the decoder) throws.
   */
  function isValidUtf8(buf: Buffer): boolean {
    let decodable = true;
    try {
      const strictDecoder = new TextDecoder('utf-8', { fatal: true });
      strictDecoder.decode(buf);
    } catch {
      decodable = false;
    }
    return decodable;
  }
  /**
   * Load a `.gitignore` and return only the patterns that are safe to give to the
   * `ignore` matcher. Never throws, even when the file is not real gitignore
   * text. Two failure modes have been seen in the wild (issue #682):
   *
   * - The file is not valid UTF-8, e.g. transparently encrypted in place by
   *   corporate DLP / endpoint-security software (a UTF-16 header plus
   *   ciphertext). Nothing in it is a meaningful pattern, so the whole file is
   *   skipped.
   * - The file is text, but one line cannot be compiled to a regex by the
   *   `ignore` library (`\\[` and friends throw "Unterminated character class").
   *   The throw is LAZY (at match time, not at `.add()`), so it would otherwise
   *   escape mid-scan. That pattern is dropped; the rest are kept.
   *
   * In both cases a warning that NAMES the file is logged, and indexing
   * continues instead of aborting.
   *
   * @param giPath - Path of the `.gitignore` to read.
   * @returns Usable gitignore text; `''` when the file is unreadable, is not
   *   text, or has no usable line.
   */
  function readGitignorePatterns(giPath: string): string {
    let raw: Buffer;
    try {
      raw = fs.readFileSync(giPath);
    } catch {
      // Unreadable (permissions / race): treat the file as absent.
      return '';
    }

    // Real gitignore text never contains a NUL byte; the fatal UTF-8 decode
    // catches every other non-text case.
    const looksLikeText = !raw.includes(0) && isValidUtf8(raw);
    if (!looksLikeText) {
      logWarn(
        'Ignoring a .gitignore that is not valid UTF-8 text — it may have been encrypted ' +
          'in place by endpoint-security software. Indexing continues without it.',
        { file: giPath },
      );
      return '';
    }

    const text = raw.toString('utf-8');

    // Probe: add the patterns to a throwaway matcher and run one match so lazily
    // compiled rules get a chance to throw here instead of mid-scan.
    // NOTE(review): this assumes a single `.ignores()` call compiles every rule
    // that was added; confirm against the pinned `ignore` version.
    const compiles = (patterns: string): boolean => {
      try {
        ignore().add(patterns).ignores('.codegraph-probe');
        return true;
      } catch {
        return false;
      }
    };

    // Fast path: the whole file compiles, so use it verbatim.
    if (compiles(text)) return text;

    // Slow path: keep the lines that compile on their own, drop the others.
    const lines = text.split(/\r?\n/);
    const usable = lines.filter((line) => compiles(line));
    const skipped = lines.length - usable.length;
    if (skipped > 0) {
      logWarn(
        `Skipped ${skipped} unparseable pattern(s) in a .gitignore; the rest are applied.`,
        { file: giPath },
      );
    }
    return usable.join('\n');
  }
  /**
   * Build an `ignore` matcher from the built-in defaults plus the project's root
   * `.gitignore`, added in that order so a negation in the file (e.g. `!vendor/`)
   * overrides a default.
   *
   * Both enumeration paths share it, so behavior is identical with or without
   * git, and the defaults apply to tracked files too: committing a dependency
   * dir does not make it project code, and the explicit `.gitignore` negation is
   * the only opt-in.
   *
   * @param rootDir - Directory whose `.gitignore`, if present, is merged in.
   * @returns The combined matcher.
   */
  export function buildDefaultIgnore(rootDir: string): Ignore {
    const matcher = ignore().add(DEFAULT_IGNORE_PATTERNS);
    const gitignorePath = path.join(rootDir, '.gitignore');
    if (!fs.existsSync(gitignorePath)) {
      return matcher;
    }
    matcher.add(readGitignorePatterns(gitignorePath));
    return matcher;
  }
  /**
   * Build a matcher holding ONLY the built-in defaults, with no root `.gitignore`
   * merged in. Used wherever the parent repo's own ignore rules must NOT apply:
   * inside embedded child repos, whose gitignore semantics were already enforced
   * by their own `git ls-files` (#514).
   *
   * @returns A fresh matcher on every call.
   */
  function defaultsOnlyIgnore(): Ignore {
    const matcher = ignore();
    return matcher.add(DEFAULT_IGNORE_PATTERNS);
  }
  /**
   * Recognize the "this entire directory" sentinel in `git ls-files --directory`
   * output.
   *
   * `--directory` collapses a wholly-untracked/ignored directory into one entry.
   * When the command's own cwd is such a directory (the indexed root is itself a
   * git-ignored subdir of an enclosing repo), git emits the literal `./`. That is
   * not a real nested path:
   * - feeding it to the `ignore` matcher throws ("path should be a
   *   `path.relative()`d string, but got "./""), which used to abort
   *   `buildScopeIgnore` and so break the MCP daemon's watcher/auto-sync on
   *   connect;
   * - joining it back onto `repoDir` would just re-point at the cwd.
   *
   * Consumers of `--directory` output drop such entries. (#936)
   *
   * @param entry - One entry from the git listing.
   * @returns `true` for `'./'`, `'.'`, or the empty string.
   */
  function isWholeCwdEntry(entry: string): boolean {
    switch (entry) {
      case '':
      case '.':
      case './':
        return true;
      default:
        return false;
    }
  }
  /**
   * Ask git for the gitignored DIRECTORIES of a repo, in collapsed
   * trailing-slash form, relative to `repoDir`.
   *
   * No other `git ls-files` / `git status` mode shows these, and in a multi-repo
   * workspace they are exactly where nested project repos live: a super-repo
   * `.gitignore`s its child repos to keep `git status` quiet, which does not
   * make them third-party code. (#514)
   *
   * @param repoDir - Working directory for the git command.
   * @returns Ignored directory entries; `[]` when the git command fails.
   */
  function listIgnoredDirs(repoDir: string): string[] {
    const args = ['ls-files', '-z', '-o', '-i', '--exclude-standard', '--directory'];
    let listing: string;
    try {
      listing = execFileSync('git', args, {
        cwd: repoDir,
        encoding: 'utf-8' as const,
        timeout: 30000,
        maxBuffer: 50 * 1024 * 1024,
        stdio: ['pipe', 'pipe', 'pipe'] as ['pipe', 'pipe', 'pipe'],
        windowsHide: true,
      });
    } catch {
      return [];
    }

    const ignoredDirs: string[] = [];
    for (const entry of listing.split('\0')) {
      // Keep directory entries only, minus the "whole cwd" sentinel.
      if (!entry.endsWith('/')) continue;
      if (isWholeCwdEntry(entry)) continue;
      ignoredDirs.push(entry);
    }
    return ignoredDirs;
  }
  /** Depth limit for the search below an ignored dir for nested `.git` roots. */
  const EMBEDDED_REPO_SEARCH_DEPTH = 4;

  /**
   * Cap on directories examined per search, so a huge ignored data dir can
   * never stall a scan/sync.
   */
  const EMBEDDED_REPO_SEARCH_ENTRIES = 2_000;
  /**
   * Classify a directory's `.git` entry for embedded-repo discovery.
   *
   * - A `.git` **directory** is an embedded clone: distinct first-party code a
   *   super-repo merely hides from git, so index it (#193, #514).
   * - A `.git` **file** is a pointer (`gitdir: …`):
   *   - A git **worktree** points into the host repo's own
   *     `.git/worktrees/<name>`. It is a second working view of a repo CodeGraph
   *     already indexes; indexing it would duplicate the whole graph, so skip it
   *     (#848).
   *   - A **submodule worktree** points into
   *     `.git/modules/<module>/worktrees/<name>`: same duplication, so skip it
   *     too (#945). `<module>` may span several path segments, because a
   *     submodule's name defaults to its path (`libs/foo`) and submodules nest
   *     (`modules/a/modules/b`).
   *   - A **submodule** checkout points into `.git/modules/<module>` (no
   *     `worktrees/` segment) and is distinct code, so index it.
   *
   * @param absDir - Absolute path of the directory to inspect.
   * @returns `'embedded'` for a clone, a submodule checkout, or an unreadable or
   *   unrecognized pointer; `'worktree'` for a worktree pointer; `'none'` when
   *   `.git` is missing or is neither a directory nor a regular file.
   */
  function classifyGitDir(absDir: string): 'embedded' | 'worktree' | 'none' {
    // A worktree's gitdir lives under some repo's `.git/worktrees/<name>`: the
    // top-level repo's, or (for a worktree of a submodule) that submodule's
    // gitdir under `.git/modules/<module>/`. The lazy `.+?` lets `<module>`
    // contain separators. Both separators are matched so a Windows-style
    // pointer is recognized too.
    const worktreeGitdir = /(^|[\\/])\.git[\\/](modules[\\/].+?[\\/])?worktrees[\\/]/;

    const dotGit = path.join(absDir, '.git');
    let st: fs.Stats;
    try {
      st = fs.statSync(dotGit);
    } catch {
      return 'none';
    }
    if (st.isDirectory()) return 'embedded';
    if (!st.isFile()) return 'none';

    try {
      const gitdir = fs.readFileSync(dotGit, 'utf8').match(/^gitdir:\s*(.+)$/m)?.[1]?.trim();
      if (gitdir && worktreeGitdir.test(gitdir)) return 'worktree';
    } catch {
      // Unreadable `.git` pointer: fall back to the prior "index it" behavior.
    }
    return 'embedded';
  }
  /**
   * Find git repositories nested under `absDir` (inclusive) with a shallow,
   * bounded breadth-first search.
   *
   * - Descent stops at each repo root found: its contents belong to that repo's
   *   own enumeration.
   * - Git worktrees are skipped entirely (#848).
   * - Default-ignored dirs are skipped (`node_modules` can contain `.git` from
   *   npm git-dependencies, which never makes it project code), as are `.git`
   *   itself and CodeGraph data dirs.
   * - The search is capped by `EMBEDDED_REPO_SEARCH_DEPTH` and
   *   `EMBEDDED_REPO_SEARCH_ENTRIES` so a huge ignored tree cannot stall the scan.
   *
   * @param absDir - Absolute path of the directory to start from.
   * @param relPrefix - Relative, trailing-slashed path of `absDir`; every
   *   returned path is built by appending child names to it.
   * @returns Relative, trailing-slashed paths of the repo roots found, in
   *   breadth-first order.
   */
  function findNestedGitRepos(absDir: string, relPrefix: string): string[] {
    const found: string[] = [];
    const defaults = defaultsOnlyIgnore();
    const queue: Array<{ abs: string; rel: string; depth: number }> = [
      { abs: absDir, rel: relPrefix, depth: 0 },
    ];
    let examined = 0;

    // Walk the queue with a read cursor instead of `Array.shift()`. One wide
    // directory can enqueue far more entries than the examine cap allows, and
    // `shift()` re-indexes the whole remaining array on each dequeue.
    for (let head = 0; head < queue.length; head++) {
      const { abs, rel, depth } = queue[head]!;
      if (++examined > EMBEDDED_REPO_SEARCH_ENTRIES) {
        logDebug('Embedded-repo search entry cap hit — deeper repos (if any) not discovered', { under: relPrefix });
        break;
      }

      const cls = classifyGitDir(abs);
      if (cls === 'worktree') {
        continue; // a git worktree duplicates an already-indexed repo (#848)
      }
      if (cls === 'embedded') {
        found.push(rel);
        continue; // its own git handles everything below
      }
      if (depth >= EMBEDDED_REPO_SEARCH_DEPTH) continue;

      let entries: fs.Dirent[];
      try {
        entries = fs.readdirSync(abs, { withFileTypes: true });
      } catch {
        continue; // unreadable directory: nothing to descend into
      }
      for (const entry of entries) {
        if (!entry.isDirectory()) continue;
        if (entry.name === '.git' || isCodeGraphDataDir(entry.name)) continue;
        const childRel = rel + entry.name + '/';
        if (defaults.ignores(childRel)) continue;
        queue.push({ abs: path.join(abs, entry.name), rel: childRel, depth: depth + 1 });
      }
    }
    return found;
  }
  /**
   * Workspace-scope ignore matcher.
   *
   * - Ordinary paths are judged by the root matcher (built-in defaults + root
   *   `.gitignore`).
   * - Paths inside an EMBEDDED repo are judged by that repo's own matcher
   *   (defaults + its root `.gitignore`): the parent's `.gitignore` hides a
   *   child repo from git, not from the index (#514).
   * - A directory path (trailing slash) that is an ANCESTOR of an embedded root
   *   is never ignored, so directory-pruning callers (the Linux per-directory
   *   watcher) still descend far enough to reach the embedded repos.
   *
   * Single source of truth for indexer and watcher scope; they must not diverge.
   */
  export class ScopeIgnore {
    private embedded: Array<{ root: string; matcher: Ignore }>;
    private defaults: Ignore = defaultsOnlyIgnore();

    /**
     * @param rootMatcher - Matcher applied to paths outside every embedded repo.
     * @param embedded - Embedded repo roots (relative, trailing-slashed) with
     *   their own matchers. The array is copied, not mutated.
     */
    constructor(private rootMatcher: Ignore, embedded: Array<{ root: string; matcher: Ignore }>) {
      // Longest root first, so a path inside nested embedded repos reaches the
      // innermost matcher.
      this.embedded = embedded.slice().sort((x, y) => y.root.length - x.root.length);
    }

    /**
     * Decide whether a path is out of scope.
     *
     * @param rel - Path relative to the workspace root; directories carry a
     *   trailing slash.
     * @returns `true` when the path should be ignored.
     */
    ignores(rel: string): boolean {
      const owner = this.embedded.find(({ root }) => rel.startsWith(root));
      if (owner !== undefined) {
        const inner = rel.slice(owner.root.length);
        // The embedded root itself is always in scope.
        if (inner === '') return false;
        // Built-in defaults apply to the FULL path uniformly (#407): an embedded
        // repo inside node_modules (an npm git-dependency) must stay excluded
        // even though its own rules would not ignore its files.
        if (this.defaults.ignores(rel)) return true;
        return owner.matcher.ignores(inner);
      }

      // Never prune a directory that leads to an embedded repo.
      if (rel.endsWith('/')) {
        for (const { root } of this.embedded) {
          if (root.startsWith(rel)) return false;
        }
      }
      return this.rootMatcher.ignores(rel);
    }
  }
  /**
   * Assemble the workspace-scope matcher.
   *
   * @param rootDir - Workspace root directory.
   * @param embeddedRoots - Embedded repo roots the caller already knows (the
   *   scanner discovers them during collection); passing them skips rediscovery.
   *   When omitted they are discovered here (the watcher path).
   * @returns A `ScopeIgnore` with one matcher for the root and one per embedded
   *   repo.
   */
  export function buildScopeIgnore(rootDir: string, embeddedRoots?: Iterable<string>): ScopeIgnore {
    let roots: string[];
    if (embeddedRoots) {
      roots = Array.from(embeddedRoots);
    } else {
      roots = discoverEmbeddedRepoRoots(rootDir);
    }

    const rootMatcher = buildDefaultIgnore(rootDir);
    const embedded = roots.map((root) => {
      const matcher = buildDefaultIgnore(path.join(rootDir, root));
      return { root, matcher };
    });
    return new ScopeIgnore(rootMatcher, embedded);
  }
  /**
   * Standalone discovery of every embedded repo root under `rootDir`: both the
   * untracked kind (#193) and the gitignored kind (#514), recursively, since an
   * embedded repo can embed further repos.
   *
   * @param rootDir - Workspace root directory.
   * @returns Embedded repo roots relative to `rootDir`, trailing-slashed, each
   *   parent listed before the repos nested inside it. Returns `[]` when
   *   `rootDir` is not inside a git repository; the filesystem walk handles
   *   nested repos there already.
   */
  export function discoverEmbeddedRepoRoots(rootDir: string): string[] {
    // Not a git checkout (or git unavailable): nothing to discover here.
    try {
      execFileSync('git', ['rev-parse', '--git-dir'], {
        cwd: rootDir,
        encoding: 'utf-8',
        timeout: 5000,
        stdio: ['pipe', 'pipe', 'pipe'],
        windowsHide: true,
      });
    } catch {
      return [];
    }

    const discovered: string[] = [];
    const builtinExcludes = defaultsOnlyIgnore();

    // Collect the repos embedded directly in `repoAbs`, record each under
    // `prefix`, then recurse into it.
    function walk(repoAbs: string, prefix: string): void {
      const nested: string[] = [];

      // Untracked side: every collapsed untracked directory may hold repos.
      try {
        const listing = execFileSync(
          'git',
          ['ls-files', '-z', '-o', '--exclude-standard', '--directory'],
          {
            cwd: repoAbs,
            encoding: 'utf-8',
            timeout: 30000,
            maxBuffer: 50 * 1024 * 1024,
            stdio: ['pipe', 'pipe', 'pipe'],
            windowsHide: true,
          },
        );
        for (const dir of listing.split('\0')) {
          if (!dir.endsWith('/')) continue;
          if (isWholeCwdEntry(dir)) continue;
          if (builtinExcludes.ignores(dir)) continue;
          nested.push(...findNestedGitRepos(path.join(repoAbs, dir), dir));
        }
      } catch {
        // Untracked listing failed; ignored-side discovery still runs.
      }

      // Ignored side: repos hidden by this repo's own ignore rules.
      nested.push(...findIgnoredEmbeddedRepos(repoAbs));

      for (const rel of nested) {
        const full = normalizePath(prefix + rel);
        discovered.push(full);
        walk(path.join(repoAbs, rel), full);
      }
    }

    walk(rootDir, '');
    return discovered;
  }
  /**
   * Find embedded repos hidden by `repoDir`'s OWN ignore rules: each gitignored
   * directory that is not a built-in default exclude is searched for nested
   * `.git` roots.
   *
   * @param repoDir - Root of the repo whose ignored directories are searched.
   * @returns Repo paths relative to `repoDir`, trailing-slashed.
   */
  function findIgnoredEmbeddedRepos(repoDir: string): string[] {
    const builtinExcludes = defaultsOnlyIgnore();
    let hiddenRepos: string[] = [];
    for (const ignoredDir of listIgnoredDirs(repoDir)) {
      if (!builtinExcludes.ignores(ignoredDir)) {
        const nested = findNestedGitRepos(path.join(repoDir, ignoredDir), ignoredDir);
        hiddenRepos = hiddenRepos.concat(nested);
      }
    }
    return hiddenRepos;
  }
  /**
   * Collect git-visible files (tracked + untracked, .gitignore-respected) from
   * the git repository rooted at `repoDir`, adding each to `files` with `prefix`
   * prepended so paths stay relative to the original scan root.
   *
   * Recurses into embedded git repositories: nested repos that are NOT
   * submodules (independent clones living inside the workspace, common in CMake
   * "super-repo" layouts). The parent repo's `git ls-files` cannot see into
   * them: tracked output skips them entirely, and untracked output reports them
   * only as an opaque "subdir/" entry (trailing slash). Each embedded repo is
   * its own git boundary, so `git ls-files` is re-run inside it (issue #193).
   * GITIGNORED embedded repos are invisible even to that; they are discovered
   * separately via `findIgnoredEmbeddedRepos` (#514).
   *
   * @param repoDir - Absolute path of the repository to enumerate.
   * @param prefix - Path of `repoDir` relative to the scan root (`''` for the
   *   root itself, otherwise trailing-slashed).
   * @param files - Output set; receives normalized paths relative to the scan
   *   root.
   * @param embeddedRoots - Optional output set; receives every embedded repo
   *   root found (however found) so callers can exempt its files from the
   *   parent's own gitignore rules.
   * @throws Whatever `execFileSync` throws when a git listing fails; the caller
   *   is expected to handle it.
   */
  function collectGitFiles(repoDir: string, prefix: string, files: Set<string>, embeddedRoots?: Set<string>): void {
    const gitOpts = { cwd: repoDir, encoding: 'utf-8' as const, timeout: 30000, maxBuffer: 50 * 1024 * 1024, stdio: ['pipe', 'pipe', 'pipe'] as ['pipe', 'pipe', 'pipe'], windowsHide: true };

    // Tracked files. --recurse-submodules pulls in files from active submodules,
    // which the index would otherwise represent only as a commit pointer.
    // Without this, monorepos using submodules index 0 files. (See issue #147.)
    // Note: --recurse-submodules only supports -c/--cached and --stage modes, so
    // it cannot be combined with -o; untracked files are gathered separately.
    // -z gives NUL-separated, unquoted output so non-ASCII (e.g. CJK) paths
    // survive verbatim. Without it git octal-escapes and double-quotes such
    // paths (the core.quotepath default), and the quoted form never matches a
    // real file on disk, so those files are silently dropped. (#541)
    const tracked = execFileSync('git', ['ls-files', '-z', '-c', '--recurse-submodules'], gitOpts);
    for (const rel of tracked.split('\0')) {
      if (rel) files.add(normalizePath(prefix + rel));
    }

    // Built once per repo rather than once per directory entry: the matcher is
    // loop-invariant and compiling the default patterns is not free.
    const builtinExcludes = defaultsOnlyIgnore();

    // Untracked files (submodules manage their own untracked state). Embedded
    // git repos surface here as a single "subdir/" entry that git refuses to
    // descend into; recurse into those as their own repos.
    const untracked = execFileSync('git', ['ls-files', '-z', '-o', '--exclude-standard'], gitOpts);
    for (const rel of untracked.split('\0')) {
      if (!rel) continue;
      if (rel.endsWith('/')) {
        // git only emits a trailing-slash directory entry for an embedded repo.
        // Guard with a .git check anyway, and skip anything else exactly as git
        // itself skips it (never descend into a non-repo opaque dir). Never
        // descend into default-ignored locations: an embedded repo inside
        // node_modules is an npm git-dependency, not project code. A git
        // worktree surfaces here as an opaque untracked dir too and is skipped
        // as a duplicate working view of an already-indexed repo (#848).
        const childDir = path.join(repoDir, rel);
        if (classifyGitDir(childDir) === 'embedded' && !builtinExcludes.ignores(rel)) {
          embeddedRoots?.add(normalizePath(prefix + rel));
          collectGitFiles(childDir, prefix + rel, files, embeddedRoots);
        }
        continue;
      }
      files.add(normalizePath(prefix + rel));
    }

    // Embedded repos hidden by THIS repo's ignore rules (`/packages/` in a
    // super-repo .gitignore) never appear in any listing above; discover and
    // recurse into them too. (#514)
    for (const rel of findIgnoredEmbeddedRepos(repoDir)) {
      embeddedRoots?.add(normalizePath(prefix + rel));
      collectGitFiles(path.join(repoDir, rel), prefix + rel, files, embeddedRoots);
    }
  }
  535. /**
  536. * Get all files visible to git (tracked + untracked but not ignored).
  537. * Respects .gitignore at all levels (root, subdirectories) and descends into
  538. * embedded (nested, non-submodule) git repos. Returns null on failure
  539. * (non-git project) so callers can fall back to a filesystem walk.
  540. */
  541. function getGitVisibleFiles(rootDir: string): Set<string> | null {
  542. try {
  543. // Check if the project directory is gitignored by a parent repo.
  544. // When rootDir lives inside a parent git repo that ignores it,
  545. // `git ls-files` returns nothing — fall back to filesystem walk.
  546. const gitRoot = execFileSync(
  547. 'git',
  548. ['rev-parse', '--show-toplevel'],
  549. { cwd: rootDir, encoding: 'utf-8', timeout: 5000, stdio: ['pipe', 'pipe', 'pipe'], windowsHide: true }
  550. ).trim();
  551. if (path.resolve(gitRoot) !== path.resolve(rootDir)) {
  552. try {
  553. // git check-ignore exits 0 if the path IS ignored, 1 if not
  554. execFileSync(
  555. 'git',
  556. ['check-ignore', '-q', path.resolve(rootDir)],
  557. { cwd: rootDir, encoding: 'utf-8', timeout: 5000, stdio: ['pipe', 'pipe', 'pipe'], windowsHide: true }
  558. );
  559. // Directory is gitignored by parent repo — fall back to filesystem walk
  560. return null;
  561. } catch {
  562. // Not ignored — safe to use git ls-files
  563. }
  564. }
  565. const files = new Set<string>();
  566. const embeddedRoots = new Set<string>();
  567. collectGitFiles(rootDir, '', files, embeddedRoots);
  568. // Apply built-in default ignores uniformly — to tracked files too, since
  569. // committing a dependency/build dir doesn't make it project code. A
  570. // `.gitignore` negation (e.g. `!vendor/`) is the explicit opt-in. (issue #407)
  571. // Files inside an EMBEDDED repo are matched against that repo's own rules,
  572. // not the parent's: the parent's .gitignore hides the child repo from git,
  573. // not from the index. (#514)
  574. const ig = buildScopeIgnore(rootDir, embeddedRoots);
  575. return new Set([...files].filter((f) => !ig.ignores(f)));
  576. } catch {
  577. return null;
  578. }
  579. }
  580. /**
  581. * Result of git-based change detection.
  582. * Returns null when git is unavailable (non-git project or command failure),
  583. * signaling the caller to fall back to full filesystem scan.
  584. */
  585. interface GitChanges {
  586. modified: string[]; // M, MM, AM — files to re-hash + re-index
  587. added: string[]; // ?? — new untracked files to index
  588. deleted: string[]; // D — files to remove from DB
  589. }
  590. /**
  591. * Use `git status` to detect changed files instead of scanning every file.
  592. * Returns null on failure so callers fall back to full scan.
  593. *
  594. * Recurses into embedded repos — both the untracked kind (#193: the parent's
  595. * status collapses them to an opaque `?? subdir/` entry) and the gitignored
  596. * kind (#514: they never appear in the parent's status at all) — running
  597. * `git status` inside each, so changes in a multi-repo workspace sync without
  598. * a full rescan. Deleting an ENTIRE embedded repo dir is the one case this
  599. * cannot see (the child status that would report the deletions is gone with
  600. * it); a full `codegraph index` reconciles that.
  601. */
  602. function getGitChangedFiles(rootDir: string): GitChanges | null {
  603. try {
  604. const changes: GitChanges = { modified: [], added: [], deleted: [] };
  605. // Custom extension → language overrides from the project's codegraph.json,
  606. // so change detection sees the same custom-extension files the full index does.
  607. const overrides = loadExtensionOverrides(rootDir);
  608. collectGitStatus(rootDir, '', changes, overrides);
  609. return changes;
  610. } catch {
  611. return null;
  612. }
  613. }
  614. function collectGitStatus(repoDir: string, prefix: string, out: GitChanges, overrides?: Record<string, Language>): void {
  615. const output = execFileSync(
  616. 'git',
  617. ['status', '--porcelain', '--no-renames'],
  618. { cwd: repoDir, encoding: 'utf-8', timeout: 10000, maxBuffer: 50 * 1024 * 1024, stdio: ['pipe', 'pipe', 'pipe'], windowsHide: true }
  619. );
  620. // This repo's own ignore rules — built-in defaults (#407) plus its .gitignore.
  621. // Change detection must exclude the SAME files the full index does, but git
  622. // status hides neither: it ignores nothing for *tracked* paths, and the
  623. // built-in defaults aren't gitignore at all. Without this filter a committed
  624. // vendor/ dir, or a tracked file under a .gitignored dir, surfaces here as a
  625. // change — so `codegraph status` (which reads getChangedFiles) reports a
  626. // pending edit the full index never tracks and `sync` never clears. Matching
  627. // repo-relative `rel` at each recursion level mirrors getGitVisibleFiles'
  628. // ScopeIgnore: every embedded repo is judged by ITS OWN rules, never the
  629. // parent's. (#766)
  630. const ig = buildDefaultIgnore(repoDir);
  631. const untrackedDirs: string[] = [];
  632. for (const line of output.split('\n')) {
  633. if (line.length < 4) continue; // Minimum: "XY file"
  634. const statusCode = line.substring(0, 2);
  635. const rel = normalizePath(line.substring(3));
  636. // Untracked directory entries (trailing slash) may hide an embedded repo —
  637. // collect for the recursion below instead of treating as a file.
  638. if (statusCode === '??' && rel.endsWith('/')) {
  639. untrackedDirs.push(rel);
  640. continue;
  641. }
  642. const filePath = normalizePath(prefix + rel);
  643. if (!isSourceFile(filePath, overrides)) continue;
  644. if (statusCode.includes('D')) {
  645. // Deletions stay unfiltered: getChangedFiles acts on one only when the
  646. // path is already tracked in the DB, where removal is always correct — and
  647. // that lets a newly-excluded dir's stale rows clean themselves up. (#766)
  648. out.deleted.push(filePath);
  649. continue;
  650. }
  651. // Added (`??`) / modified files inside an excluded dir must not enter the
  652. // index — match against the repo-relative path, same as the full scan. (#766)
  653. if (ig.ignores(rel)) continue;
  654. if (statusCode === '??') {
  655. out.added.push(filePath);
  656. } else {
  657. // M, MM, AM, A (staged), etc. — treat as modified
  658. out.modified.push(filePath);
  659. }
  660. }
  661. // Recurse embedded repos found under untracked dirs (at the dir itself or
  662. // nested deeper) and under this repo's gitignored dirs.
  663. for (const rel of untrackedDirs) {
  664. for (const repoRel of findNestedGitRepos(path.join(repoDir, rel), rel)) {
  665. collectGitStatus(path.join(repoDir, repoRel), prefix + repoRel, out, overrides);
  666. }
  667. }
  668. for (const rel of findIgnoredEmbeddedRepos(repoDir)) {
  669. collectGitStatus(path.join(repoDir, rel), prefix + rel, out, overrides);
  670. }
  671. }
  672. /**
  673. * Recursively scan a directory for source files.
  674. *
  675. * In git repos, uses `git ls-files` (inherently respects .gitignore at all
  676. * levels), then keeps files with a supported source extension. For non-git
  677. * projects, falls back to a filesystem walk that parses .gitignore itself.
  678. */
  679. export function scanDirectory(
  680. rootDir: string,
  681. onProgress?: (current: number, file: string) => void
  682. ): string[] {
  683. // Custom extension → language overrides from the project's codegraph.json.
  684. const overrides = loadExtensionOverrides(rootDir);
  685. // Fast path: use git to get all visible files (respects .gitignore everywhere)
  686. const gitFiles = getGitVisibleFiles(rootDir);
  687. if (gitFiles) {
  688. const files: string[] = [];
  689. let count = 0;
  690. for (const filePath of gitFiles) {
  691. if (isSourceFile(filePath, overrides)) {
  692. files.push(filePath);
  693. count++;
  694. onProgress?.(count, filePath);
  695. }
  696. }
  697. return files;
  698. }
  699. // Fallback: walk filesystem for non-git projects
  700. return scanDirectoryWalk(rootDir, onProgress);
  701. }
  702. /**
  703. * Async variant of scanDirectory that yields to the event loop periodically,
  704. * allowing worker threads to receive and render progress messages.
  705. */
  706. export async function scanDirectoryAsync(
  707. rootDir: string,
  708. onProgress?: (current: number, file: string) => void
  709. ): Promise<string[]> {
  710. // Custom extension → language overrides from the project's codegraph.json.
  711. const overrides = loadExtensionOverrides(rootDir);
  712. const gitFiles = getGitVisibleFiles(rootDir);
  713. if (gitFiles) {
  714. const files: string[] = [];
  715. let count = 0;
  716. for (const filePath of gitFiles) {
  717. if (isSourceFile(filePath, overrides)) {
  718. files.push(filePath);
  719. count++;
  720. onProgress?.(count, filePath);
  721. // Yield every 100 files so worker threads can render progress
  722. if (count % 100 === 0) {
  723. await new Promise<void>(r => setImmediate(r));
  724. }
  725. }
  726. }
  727. return files;
  728. }
  729. return scanDirectoryWalk(rootDir, onProgress);
  730. }
  731. /**
  732. * Filesystem walk fallback for non-git projects.
  733. */
  734. function scanDirectoryWalk(
  735. rootDir: string,
  736. onProgress?: (current: number, file: string) => void
  737. ): string[] {
  738. const files: string[] = [];
  739. let count = 0;
  740. const visitedDirs = new Set<string>();
  741. // Custom extension → language overrides from the project's codegraph.json.
  742. const overrides = loadExtensionOverrides(rootDir);
  743. // A .gitignore matcher scoped to the directory that declared it. Patterns in
  744. // a nested .gitignore are relative to that directory, so we keep the dir
  745. // alongside the matcher and test paths relative to it — mirroring how git
  746. // applies .gitignore files at every level.
  747. interface ScopedIgnore {
  748. dir: string;
  749. ig: Ignore;
  750. }
  751. const loadIgnore = (dir: string): ScopedIgnore | null => {
  752. const giPath = path.join(dir, '.gitignore');
  753. if (!fs.existsSync(giPath)) return null;
  754. // readGitignorePatterns is defensive: a non-UTF-8 (DLP-encrypted) or
  755. // uncompilable .gitignore is skipped/filtered with a warning, never thrown
  756. // (issue #682) — so the per-file `.ignores()` calls below can't crash.
  757. const patterns = readGitignorePatterns(giPath);
  758. return patterns ? { dir, ig: ignore().add(patterns) } : null;
  759. };
  760. const isIgnored = (fullPath: string, isDir: boolean, matchers: ScopedIgnore[]): boolean => {
  761. for (const { dir, ig } of matchers) {
  762. let rel = normalizePath(path.relative(dir, fullPath));
  763. if (!rel || rel.startsWith('..')) continue; // not under this matcher's dir
  764. if (isDir) rel += '/'; // dir-only rules (e.g. `build/`) only match with the slash
  765. if (ig.ignores(rel)) return true;
  766. }
  767. return false;
  768. };
  769. function walk(dir: string, matchers: ScopedIgnore[]): void {
  770. let realDir: string;
  771. try {
  772. realDir = fs.realpathSync(dir);
  773. } catch {
  774. logDebug('Skipping unresolvable directory', { dir });
  775. return;
  776. }
  777. if (visitedDirs.has(realDir)) {
  778. logDebug('Skipping already-visited directory (symlink cycle)', { dir, realDir });
  779. return;
  780. }
  781. visitedDirs.add(realDir);
  782. // This directory's own .gitignore (if present) applies to everything below it.
  783. // The root's .gitignore is already merged into the seeded base matcher (so a
  784. // negation there can override a built-in default), so skip it here.
  785. const own = dir === rootDir ? null : loadIgnore(dir);
  786. const active = own ? [...matchers, own] : matchers;
  787. let entries: fs.Dirent[];
  788. try {
  789. entries = fs.readdirSync(dir, { withFileTypes: true });
  790. } catch (error) {
  791. logDebug('Skipping unreadable directory', { dir, error: String(error) });
  792. return;
  793. }
  794. for (const entry of entries) {
  795. // Never descend into git internals or any CodeGraph data directory
  796. // (the active one or a sibling another environment created — #636).
  797. if (entry.name === '.git' || isCodeGraphDataDir(entry.name)) continue;
  798. const fullPath = path.join(dir, entry.name);
  799. const relativePath = normalizePath(path.relative(rootDir, fullPath));
  800. if (entry.isSymbolicLink()) {
  801. try {
  802. const realTarget = fs.realpathSync(fullPath);
  803. const stat = fs.statSync(realTarget);
  804. if (stat.isDirectory()) {
  805. if (!isIgnored(fullPath, true, active)) {
  806. walk(fullPath, active);
  807. }
  808. } else if (stat.isFile()) {
  809. if (!isIgnored(fullPath, false, active) && isSourceFile(relativePath, overrides)) {
  810. files.push(relativePath);
  811. count++;
  812. onProgress?.(count, relativePath);
  813. }
  814. }
  815. } catch {
  816. logDebug('Skipping broken symlink', { path: fullPath });
  817. }
  818. continue;
  819. }
  820. if (entry.isDirectory()) {
  821. if (!isIgnored(fullPath, true, active)) {
  822. walk(fullPath, active);
  823. }
  824. } else if (entry.isFile()) {
  825. if (!isIgnored(fullPath, false, active) && isSourceFile(relativePath, overrides)) {
  826. files.push(relativePath);
  827. count++;
  828. onProgress?.(count, relativePath);
  829. }
  830. }
  831. }
  832. }
  833. // Seed a base matcher with the built-in default ignores (merged with the root
  834. // .gitignore so a negation can override). Nested .gitignores still layer per-dir.
  835. walk(rootDir, [{ dir: rootDir, ig: buildDefaultIgnore(rootDir) }]);
  836. return files;
  837. }
  838. /**
  839. * Extraction orchestrator
  840. */
  841. export class ExtractionOrchestrator {
  842. private rootDir: string;
  843. private queries: QueryBuilder;
  844. /**
  845. * Names of frameworks detected for this project, populated by indexAll().
  846. * Passed to extractFromSource so framework-specific extractors (route nodes,
  847. * middleware, etc.) run after the tree-sitter pass. Cleared if detection
  848. * hasn't run yet so single-file re-index paths can detect on the spot.
  849. */
  850. private detectedFrameworkNames: string[] | null = null;
  /**
   * @param rootDir Project root; scans and path validation in this class are
   *                performed relative to it.
   * @param queries Query builder kept on the instance for later use.
   */
  constructor(rootDir: string, queries: QueryBuilder) {
    this.queries = queries;
    this.rootDir = rootDir;
  }
  /**
   * Create the ResolutionContext handed to framework detection, backed only by
   * the filesystem. Every graph-query method (getNodesByName etc.) answers
   * with an empty list because the DB hasn't been populated yet; detect()
   * only uses readFile, fileExists, and getAllFiles, so that's fine.
   *
   * @param files File list that getAllFiles() will return.
   */
  private buildDetectionContext(files: string[]): ResolutionContext {
    const projectRoot = this.rootDir;
    // Resolve a project-relative path, yielding a falsy value when
    // validatePathWithinRoot rejects it.
    const withinRoot = (relativePath: string) => validatePathWithinRoot(projectRoot, relativePath);

    return {
      getNodesInFile: () => [],
      getNodesByName: () => [],
      getNodesByQualifiedName: () => [],
      getNodesByKind: () => [],
      getNodesByLowerName: () => [],
      getImportMappings: () => [],
      getAllFiles: () => files,
      getProjectRoot: () => projectRoot,
      fileExists: (relativePath: string) => {
        const absolute = withinRoot(relativePath);
        if (!absolute) {
          return false;
        }
        try {
          return fs.existsSync(absolute);
        } catch {
          return false;
        }
      },
      readFile: (relativePath: string) => {
        const absolute = withinRoot(relativePath);
        if (!absolute) {
          return null;
        }
        try {
          return fs.readFileSync(absolute, 'utf-8');
        } catch {
          return null;
        }
      },
      // Monorepo support — needed by framework detect()s that probe
      // subpackage manifests (e.g. fabric-view looking at
      // packages/<sub>/package.json when the root manifest is just a
      // workspace declaration). Matches the resolver-context shape.
      // NOTE(review): unlike fileExists/readFile this joins the path without
      // validatePathWithinRoot — confirm that is intentional.
      listDirectories: (relativePath: string) => {
        const isRoot = relativePath === '.' || relativePath === '';
        const target = isRoot ? projectRoot : path.join(projectRoot, relativePath);
        try {
          const names: string[] = [];
          for (const entry of fs.readdirSync(target, { withFileTypes: true })) {
            if (entry.isDirectory()) names.push(entry.name);
          }
          return names;
        } catch {
          return [];
        }
      },
    };
  }
  /**
   * Return the names of the frameworks detected for this project, running
   * detection on first use. The result is cached on the orchestrator, so
   * repeat calls inside a single run don't re-scan.
   *
   * @param files Already-scanned file list to detect against; when omitted a
   *              fresh scan of the project root is performed.
   * @returns The detected framework names.
   */
  private ensureDetectedFrameworks(files?: string[]): string[] {
    const cached = this.detectedFrameworkNames;
    if (cached !== null) {
      return cached;
    }
    const context = this.buildDetectionContext(files ?? scanDirectory(this.rootDir));
    const names = detectFrameworks(context).map((detected) => detected.name);
    this.detectedFrameworkNames = names;
    return names;
  }
  922. /**
  923. * Index all files in the project
  924. */
  925. async indexAll(
  926. onProgress?: (progress: IndexProgress) => void,
  927. signal?: AbortSignal,
  928. verbose?: boolean
  929. ): Promise<IndexResult> {
  930. await initGrammars();
  931. const startTime = Date.now();
  932. const errors: ExtractionError[] = [];
  933. let filesIndexed = 0;
  934. let filesSkipped = 0;
  935. let filesErrored = 0;
  936. let totalNodes = 0;
  937. let totalEdges = 0;
  938. // Custom extension → language overrides from the project's codegraph.json.
  939. // Threaded into language detection so custom-extension files load the right
  940. // grammar and store under the mapped language.
  941. const overrides = loadExtensionOverrides(this.rootDir);
  942. const log = verbose
  943. ? (msg: string) => { console.log(`[worker] ${msg}`); }
  944. : (_msg: string) => {};
  945. // Phase 1: Scan for files
  946. onProgress?.({
  947. phase: 'scanning',
  948. current: 0,
  949. total: 0,
  950. });
  951. const files = await scanDirectoryAsync(this.rootDir, (current, file) => {
  952. onProgress?.({
  953. phase: 'scanning',
  954. current,
  955. total: 0,
  956. currentFile: file,
  957. });
  958. });
  959. // Detect frameworks once per indexAll run using the scanned file list.
  960. // Names are passed to each parse call so framework-specific extractors
  961. // (route nodes, middleware, etc.) run after the tree-sitter pass.
  962. // Framework detection is reset each run so adding e.g. requirements.txt
  963. // between runs is picked up without restarting the process.
  964. this.detectedFrameworkNames = null;
  965. const frameworkNames = this.ensureDetectedFrameworks(files);
  966. if (signal?.aborted) {
  967. return {
  968. success: false,
  969. filesIndexed: 0,
  970. filesSkipped: 0,
  971. filesErrored: 0,
  972. nodesCreated: 0,
  973. edgesCreated: 0,
  974. errors: [{ message: 'Aborted', severity: 'error' }],
  975. durationMs: Date.now() - startTime,
  976. };
  977. }
  978. // Phase 2: Parse files in a worker thread (keeps main thread unblocked for UI)
  979. const total = files.length;
  980. let processed = 0;
  981. // Emit parsing phase immediately so the progress bar appears during worker setup.
  982. // The yield lets the shimmer worker flush the phase transition to stdout before
  983. // the main thread starts synchronous grammar detection work.
  984. onProgress?.({
  985. phase: 'parsing',
  986. current: 0,
  987. total,
  988. });
  989. await new Promise(resolve => setImmediate(resolve));
  990. // Detect needed languages and load grammars in the parse worker
  991. const neededLanguages = [...new Set(files.map((f) => detectLanguage(f, undefined, overrides)))];
  992. // .h files default to 'c' but may be C++ — ensure cpp grammar is loaded when c is needed
  993. if (neededLanguages.includes('c') && !neededLanguages.includes('cpp')) {
  994. neededLanguages.push('cpp');
  995. }
  996. // Try to use a worker thread for parsing (keeps main thread unblocked for UI).
  997. // Falls back to in-process parsing if the compiled worker is unavailable (e.g. tests).
  998. const parseWorkerPath = path.join(__dirname, 'parse-worker.js');
  999. const useWorker = fs.existsSync(parseWorkerPath);
  1000. let WorkerClass: typeof import('worker_threads').Worker | null = null;
  1001. if (useWorker) {
  1002. const { Worker } = await import('worker_threads');
  1003. WorkerClass = Worker;
  1004. } else {
  1005. // In-process fallback: load grammars locally
  1006. await loadGrammarsForLanguages(neededLanguages);
  1007. }
  1008. // --- Worker lifecycle management ---
  1009. // The worker can crash (OOM in WASM) or hang on pathological files.
  1010. // We track pending parse promises and handle both cases:
  1011. // - Timeout: terminate + restart the worker, reject the timed-out request
  1012. // - Crash: reject all pending promises, restart for remaining files
  1013. let parseWorker: import('worker_threads').Worker | null = null;
  1014. let nextId = 0;
  1015. let workerParseCount = 0;
  1016. const pendingParses = new Map<number, {
  1017. resolve: (result: ExtractionResult) => void;
  1018. reject: (err: Error) => void;
  1019. timer: ReturnType<typeof setTimeout>;
  1020. }>();
  1021. function rejectAllPending(reason: string): void {
  1022. for (const [id, pending] of pendingParses) {
  1023. clearTimeout(pending.timer);
  1024. pendingParses.delete(id);
  1025. pending.reject(new Error(reason));
  1026. }
  1027. }
  1028. function attachWorkerHandlers(w: import('worker_threads').Worker): void {
  1029. w.on('message', (msg: { type: string; id?: number; result?: ExtractionResult }) => {
  1030. if (msg.type === 'parse-result' && msg.id !== undefined) {
  1031. const pending = pendingParses.get(msg.id);
  1032. if (pending) {
  1033. clearTimeout(pending.timer);
  1034. pendingParses.delete(msg.id);
  1035. pending.resolve(msg.result!);
  1036. }
  1037. }
  1038. });
  1039. w.on('error', (err) => {
  1040. logWarn('Parse worker error', { error: err.message });
  1041. rejectAllPending(`Worker error: ${err.message}`);
  1042. });
  1043. w.on('exit', (code) => {
  1044. if (code !== 0 && pendingParses.size > 0) {
  1045. logWarn('Parse worker exited unexpectedly', { code });
  1046. rejectAllPending(`Worker exited with code ${code}`);
  1047. }
  1048. // Clear reference so we know to respawn, reset count so
  1049. // the fresh worker gets a full cycle before recycling.
  1050. if (parseWorker === w) {
  1051. parseWorker = null;
  1052. workerParseCount = 0;
  1053. }
  1054. });
  1055. }
  1056. async function ensureWorker(): Promise<import('worker_threads').Worker> {
  1057. if (parseWorker) return parseWorker;
  1058. log('Spawning new parse worker...');
  1059. parseWorker = new WorkerClass!(parseWorkerPath);
  1060. attachWorkerHandlers(parseWorker);
  1061. // Load grammars in the new worker
  1062. await new Promise<void>((resolve, reject) => {
  1063. parseWorker!.once('message', (msg: { type: string }) => {
  1064. if (msg.type === 'grammars-loaded') resolve();
  1065. else reject(new Error(`Unexpected message: ${msg.type}`));
  1066. });
  1067. parseWorker!.postMessage({ type: 'load-grammars', languages: neededLanguages });
  1068. });
  1069. return parseWorker;
  1070. }
  1071. if (WorkerClass) {
  1072. await ensureWorker();
  1073. }
  1074. /**
  1075. * Recycle the worker thread to reclaim WASM memory.
  1076. * Terminates the current worker and clears the reference so
  1077. * ensureWorker() will spawn a fresh one on the next call.
  1078. */
  1079. function recycleWorker(): void {
  1080. if (!parseWorker) return;
  1081. log(`Recycling worker after ${workerParseCount} parses (heap: ${Math.round(process.memoryUsage().rss / 1024 / 1024)}MB RSS)`);
  1082. const w = parseWorker;
  1083. parseWorker = null;
  1084. workerParseCount = 0;
  1085. // Fire-and-forget: worker.terminate() can hang if WASM is stuck
  1086. w.terminate().catch(() => {});
  1087. }
  1088. async function requestParse(filePath: string, content: string): Promise<ExtractionResult> {
  1089. // Resolve the language on the main thread (where the project's
  1090. // codegraph.json overrides are loaded) and hand it to the worker, so the
  1091. // worker never needs the override map itself.
  1092. const language = detectLanguage(filePath, content, overrides);
  1093. if (!WorkerClass) {
  1094. // In-process fallback
  1095. return extractFromSource(
  1096. filePath,
  1097. content,
  1098. language,
  1099. frameworkNames
  1100. );
  1101. }
  1102. // Recycle the worker before the next parse if we've hit the threshold.
  1103. // This destroys the WASM linear memory (which can grow but never shrink)
  1104. // and starts a fresh worker with a clean heap.
  1105. if (workerParseCount >= WORKER_RECYCLE_INTERVAL) {
  1106. await recycleWorker();
  1107. }
  1108. const worker = await ensureWorker();
  1109. const id = nextId++;
  1110. workerParseCount++;
  1111. // Scale timeout for large files: base 10s + 10s per 100KB
  1112. const timeoutMs = PARSE_TIMEOUT_MS + Math.floor(content.length / 100_000) * 10_000;
  1113. return new Promise<ExtractionResult>((resolve, reject) => {
  1114. const timer = setTimeout(() => {
  1115. pendingParses.delete(id);
  1116. log(`TIMEOUT: ${filePath} exceeded ${timeoutMs}ms — killing worker`);
  1117. // Reject FIRST — worker.terminate() can hang if WASM is stuck
  1118. parseWorker = null;
  1119. workerParseCount = 0;
  1120. reject(new Error(`Parse timed out after ${timeoutMs}ms`));
  1121. // Fire-and-forget: kill the stuck worker in the background
  1122. worker.terminate().catch(() => {});
  1123. }, timeoutMs);
  1124. pendingParses.set(id, { resolve, reject, timer });
  1125. worker.postMessage({ type: 'parse', id, filePath, content, frameworkNames, language });
  1126. });
  1127. }
  1128. for (let i = 0; i < files.length; i += FILE_IO_BATCH_SIZE) {
  1129. if (signal?.aborted) {
  1130. if (parseWorker) (parseWorker as import('worker_threads').Worker).terminate().catch(() => {});
  1131. return {
  1132. success: false,
  1133. filesIndexed,
  1134. filesSkipped,
  1135. filesErrored,
  1136. nodesCreated: totalNodes,
  1137. edgesCreated: totalEdges,
  1138. errors: [{ message: 'Aborted', severity: 'error' }, ...errors],
  1139. durationMs: Date.now() - startTime,
  1140. };
  1141. }
  1142. const batch = files.slice(i, i + FILE_IO_BATCH_SIZE);
  1143. // Read files in parallel (with path validation before any I/O)
  1144. const fileContents = await Promise.all(
  1145. batch.map(async (fp) => {
  1146. try {
  1147. const fullPath = validatePathWithinRoot(this.rootDir, fp);
  1148. if (!fullPath) {
  1149. logWarn('Path traversal blocked in batch reader', { filePath: fp });
  1150. return { filePath: fp, content: null as string | null, stats: null as fs.Stats | null, error: new Error('Path traversal blocked') };
  1151. }
  1152. const content = await fsp.readFile(fullPath, 'utf-8');
  1153. const stats = await fsp.stat(fullPath);
  1154. return { filePath: fp, content, stats, error: null as Error | null };
  1155. } catch (err) {
  1156. return { filePath: fp, content: null as string | null, stats: null as fs.Stats | null, error: err as Error };
  1157. }
  1158. })
  1159. );
  1160. // Send to worker for parsing, store results on main thread
  1161. for (const { filePath, content, stats, error } of fileContents) {
  1162. if (signal?.aborted) {
  1163. if (parseWorker) (parseWorker as import('worker_threads').Worker).terminate().catch(() => {});
  1164. return {
  1165. success: false,
  1166. filesIndexed,
  1167. filesSkipped,
  1168. filesErrored,
  1169. nodesCreated: totalNodes,
  1170. edgesCreated: totalEdges,
  1171. errors: [{ message: 'Aborted', severity: 'error' }, ...errors],
  1172. durationMs: Date.now() - startTime,
  1173. };
  1174. }
  1175. // Report progress before parsing (show current file being worked on)
  1176. onProgress?.({
  1177. phase: 'parsing',
  1178. current: processed,
  1179. total,
  1180. currentFile: filePath,
  1181. });
  1182. if (error || content === null || stats === null) {
  1183. processed++;
  1184. filesErrored++;
  1185. errors.push({
  1186. message: `Failed to read file: ${error instanceof Error ? error.message : String(error)}`,
  1187. filePath,
  1188. severity: 'error',
  1189. code: 'read_error',
  1190. });
  1191. continue;
  1192. }
  1193. // Honour MAX_FILE_SIZE. Without this check, vendored generated
  1194. // headers, minified bundles, and other multi-MB files get indexed,
  1195. // wasting WASM heap and the worker recycle budget on inputs with no
  1196. // useful symbols. The single-file extractFile path already enforces
  1197. // this; the bulk path used to silently skip the check.
  1198. if (stats.size > MAX_FILE_SIZE) {
  1199. processed++;
  1200. filesSkipped++;
  1201. errors.push({
  1202. message: `File exceeds max size (${stats.size} > ${MAX_FILE_SIZE})`,
  1203. filePath,
  1204. severity: 'warning',
  1205. code: 'size_exceeded',
  1206. });
  1207. onProgress?.({ phase: 'parsing', current: processed, total });
  1208. continue;
  1209. }
  1210. // Parse in worker thread (main thread stays unblocked).
  1211. // Wrapped in try/catch to handle worker timeouts and crashes gracefully.
  1212. let result: ExtractionResult;
  1213. try {
  1214. result = await requestParse(filePath, content);
  1215. } catch (parseErr) {
  1216. processed++;
  1217. filesErrored++;
  1218. errors.push({
  1219. message: parseErr instanceof Error ? parseErr.message : String(parseErr),
  1220. filePath,
  1221. severity: 'error',
  1222. code: 'parse_error',
  1223. });
  1224. continue;
  1225. }
  1226. processed++;
  1227. // Store in database on main thread (SQLite is not thread-safe)
  1228. if (result.nodes.length > 0 || result.errors.length === 0) {
  1229. const language = detectLanguage(filePath, content, overrides);
  1230. this.storeExtractionResult(filePath, content, language, stats, result);
  1231. }
  1232. if (result.errors.length > 0) {
  1233. for (const err of result.errors) {
  1234. if (!err.filePath) err.filePath = filePath;
  1235. }
  1236. errors.push(...result.errors);
  1237. }
  1238. if (result.nodes.length > 0) {
  1239. filesIndexed++;
  1240. totalNodes += result.nodes.length;
  1241. totalEdges += result.edges.length;
  1242. } else if (result.errors.some((e) => e.severity === 'error')) {
  1243. filesErrored++;
  1244. } else {
  1245. // Files with no symbols but no errors (yaml, twig, properties) are
  1246. // tracked at the file level — count them as indexed so the CLI
  1247. // doesn't misleadingly report "No files found to index".
  1248. const lang = detectLanguage(filePath, content, overrides);
  1249. if (isFileLevelOnlyLanguage(lang)) {
  1250. filesIndexed++;
  1251. } else {
  1252. filesSkipped++;
  1253. }
  1254. }
  1255. }
  1256. }
  1257. // Report 100% so the progress bar doesn't hang at 99%
  1258. onProgress?.({
  1259. phase: 'parsing',
  1260. current: total,
  1261. total,
  1262. });
  1263. // Yield so the shimmer worker's buffered stdout writes can flush.
  1264. // Worker thread stdout is proxied through the main thread's event loop,
  1265. // so synchronous work here blocks the animation from rendering.
  1266. await new Promise(resolve => setImmediate(resolve));
  1267. // Retry pass: files that failed due to WASM memory corruption may succeed
  1268. // on a fresh worker with a clean heap. Recycle before each attempt so
  1269. // every file gets the absolute cleanest WASM state possible.
  1270. const retryableErrors = errors.filter(
  1271. (e) => e.code === 'parse_error' && e.filePath &&
  1272. (e.message.includes('Worker exited') || e.message.includes('memory access out of bounds'))
  1273. );
  1274. if (retryableErrors.length > 0 && WorkerClass) {
  1275. log(`Retrying ${retryableErrors.length} files that failed due to WASM memory errors...`);
  1276. const stillFailing: typeof retryableErrors = [];
  1277. for (const errEntry of retryableErrors) {
  1278. const filePath = errEntry.filePath!;
  1279. if (signal?.aborted) break;
  1280. // Fresh worker for every retry — maximum WASM headroom
  1281. recycleWorker();
  1282. let content: string;
  1283. try {
  1284. const fullPath = validatePathWithinRoot(this.rootDir, filePath);
  1285. if (!fullPath) continue;
  1286. content = await fsp.readFile(fullPath, 'utf-8');
  1287. } catch {
  1288. continue;
  1289. }
  1290. let result: ExtractionResult;
  1291. try {
  1292. result = await requestParse(filePath, content);
  1293. } catch {
  1294. stillFailing.push(errEntry);
  1295. continue;
  1296. }
  1297. if (result.nodes.length > 0 || result.errors.length === 0) {
  1298. const language = detectLanguage(filePath, content, overrides);
  1299. const stats = await fsp.stat(path.join(this.rootDir, filePath));
  1300. this.storeExtractionResult(filePath, content, language, stats, result);
  1301. const idx = errors.indexOf(errEntry);
  1302. if (idx >= 0) errors.splice(idx, 1);
  1303. filesErrored--;
  1304. filesIndexed++;
  1305. totalNodes += result.nodes.length;
  1306. totalEdges += result.edges.length;
  1307. log(`Retry OK: ${filePath} (${result.nodes.length} nodes)`);
  1308. }
  1309. }
  1310. // Last resort: for files that still crash on a clean worker, strip
  1311. // comment-only lines to reduce WASM memory pressure. Many compiler
  1312. // test files are 90%+ comments (CHECK directives) that don't contribute
  1313. // code nodes but consume parser memory.
  1314. if (stillFailing.length > 0) {
  1315. log(`${stillFailing.length} files still failing — retrying with comments stripped...`);
  1316. for (const errEntry of stillFailing) {
  1317. const filePath = errEntry.filePath!;
  1318. if (signal?.aborted) break;
  1319. recycleWorker();
  1320. let fullContent: string;
  1321. try {
  1322. const fullPath = validatePathWithinRoot(this.rootDir, filePath);
  1323. if (!fullPath) continue;
  1324. fullContent = await fsp.readFile(fullPath, 'utf-8');
  1325. } catch {
  1326. continue;
  1327. }
  1328. // Strip lines that are entirely comments (preserving line numbers
  1329. // by replacing with empty lines so node positions stay correct)
  1330. const stripped = fullContent
  1331. .split('\n')
  1332. .map(line => /^\s*\/\//.test(line) ? '' : line)
  1333. .join('\n');
  1334. let result: ExtractionResult;
  1335. try {
  1336. result = await requestParse(filePath, stripped);
  1337. } catch {
  1338. continue;
  1339. }
  1340. if (result.nodes.length > 0 || result.errors.length === 0) {
  1341. const language = detectLanguage(filePath, fullContent, overrides);
  1342. const stats = await fsp.stat(path.join(this.rootDir, filePath));
  1343. this.storeExtractionResult(filePath, fullContent, language, stats, result);
  1344. const idx = errors.indexOf(errEntry);
  1345. if (idx >= 0) errors.splice(idx, 1);
  1346. filesErrored--;
  1347. filesIndexed++;
  1348. totalNodes += result.nodes.length;
  1349. totalEdges += result.edges.length;
  1350. log(`Retry (stripped) OK: ${filePath} (${result.nodes.length} nodes)`);
  1351. }
  1352. }
  1353. }
  1354. }
  1355. // Shut down parse worker and clear any pending timers
  1356. rejectAllPending('Indexing complete');
  1357. if (parseWorker) {
  1358. (parseWorker as import('worker_threads').Worker).terminate().catch(() => {});
  1359. }
  1360. return {
  1361. success: filesIndexed > 0 || errors.filter((e) => e.severity === 'error').length === 0,
  1362. filesIndexed,
  1363. filesSkipped,
  1364. filesErrored,
  1365. nodesCreated: totalNodes,
  1366. edgesCreated: totalEdges,
  1367. errors,
  1368. durationMs: Date.now() - startTime,
  1369. };
  1370. }
  /**
   * Index an explicit list of files, one at a time, and summarize the outcome.
   *
   * Each path is handed to `indexFile`. A file counts as indexed when it
   * produced at least one node, as errored when it produced none and reported
   * an error of severity `'error'`, and otherwise as indexed only if its stored
   * file record has a file-level-only language (per `isFileLevelOnlyLanguage`);
   * anything else is counted as skipped.
   *
   * @param filePaths - Paths passed unchanged to `indexFile`.
   * @returns Aggregate counters, every error collected along the way, and the
   *   elapsed wall-clock time in milliseconds.
   */
  async indexFiles(filePaths: string[]): Promise<IndexResult> {
    const begunAt = Date.now();
    const collected: ExtractionError[] = [];
    const tally = { indexed: 0, skipped: 0, errored: 0, nodes: 0, edges: 0 };

    for (const target of filePaths) {
      const outcome = await this.indexFile(target);
      collected.push(...outcome.errors);

      // Symbols were extracted: the file is indexed regardless of warnings.
      if (outcome.nodes.length > 0) {
        tally.indexed += 1;
        tally.nodes += outcome.nodes.length;
        tally.edges += outcome.edges.length;
        continue;
      }

      // No symbols and at least one hard error.
      if (outcome.errors.some((e) => e.severity === 'error')) {
        tally.errored += 1;
        continue;
      }

      // No symbols, no hard errors: indexed only when the stored record says
      // the language is tracked at file level.
      const record = this.queries.getFileByPath(target);
      if (record && isFileLevelOnlyLanguage(record.language)) {
        tally.indexed += 1;
      } else {
        tally.skipped += 1;
      }
    }

    const sawHardError = collected.some((e) => e.severity === 'error');
    return {
      success: tally.indexed > 0 || !sawHardError,
      filesIndexed: tally.indexed,
      filesSkipped: tally.skipped,
      filesErrored: tally.errored,
      nodesCreated: tally.nodes,
      edgesCreated: tally.edges,
      errors: collected,
      durationMs: Date.now() - begunAt,
    };
  }
  /**
   * Index a single file, reading its stats and content from disk.
   *
   * Problems detected here are reported through the returned `errors` array
   * instead of being thrown:
   * - `path_traversal` (error) when `validatePathWithinRoot` rejects the path;
   * - `size_exceeded` (warning) when the file is larger than `MAX_FILE_SIZE`;
   * - `read_error` (error) when `stat` or `readFile` fails.
   *
   * @param relativePath - Path to index; validated against `this.rootDir`.
   * @returns The extraction result from `indexFileWithContent`, or an empty
   *   result carrying one of the errors listed above.
   */
  async indexFile(relativePath: string): Promise<ExtractionResult> {
    const fullPath = validatePathWithinRoot(this.rootDir, relativePath);
    if (!fullPath) {
      return {
        nodes: [],
        edges: [],
        unresolvedReferences: [],
        errors: [{ message: `Path traversal blocked: ${relativePath}`, filePath: relativePath, severity: 'error', code: 'path_traversal' }],
        durationMs: 0,
      };
    }

    // Read file content and stats
    let content: string;
    let stats: fs.Stats;
    try {
      stats = await fsp.stat(fullPath);

      // Enforce the size limit BEFORE reading. indexFileWithContent performs
      // the same check, but only after the whole file has already been loaded
      // into memory; bailing out here returns the identical warning without
      // paying for the read.
      if (stats.size > MAX_FILE_SIZE) {
        return {
          nodes: [],
          edges: [],
          unresolvedReferences: [],
          errors: [
            {
              message: `File exceeds max size (${stats.size} > ${MAX_FILE_SIZE})`,
              filePath: relativePath,
              severity: 'warning',
              code: 'size_exceeded',
            },
          ],
          durationMs: 0,
        };
      }

      content = await fsp.readFile(fullPath, 'utf-8');
    } catch (error) {
      return {
        nodes: [],
        edges: [],
        unresolvedReferences: [],
        errors: [
          {
            message: `Failed to read file: ${error instanceof Error ? error.message : String(error)}`,
            filePath: relativePath,
            severity: 'error',
            code: 'read_error',
          },
        ],
        durationMs: 0,
      };
    }

    return this.indexFileWithContent(relativePath, content, stats);
  }
  /**
   * Index one file from content and stats the caller has already read.
   * Used by the parallel batch reader to avoid redundant file I/O.
   *
   * Checks run in this order: path validation, size limit, language support.
   * The first failing check short-circuits with an empty result (carrying an
   * error for the first two, and no error for an unsupported language).
   *
   * @param relativePath - Path of the file; validated against `this.rootDir`.
   * @param content - File text handed to language detection and extraction.
   * @param stats - File stats; `size` is compared with `MAX_FILE_SIZE`.
   * @returns The extraction result, or an empty result when a check fails.
   */
  async indexFileWithContent(
    relativePath: string,
    content: string,
    stats: fs.Stats
  ): Promise<ExtractionResult> {
    // All early exits share one empty shell; only the error list varies.
    const emptyResult = (errors: ExtractionResult['errors']): ExtractionResult => ({
      nodes: [],
      edges: [],
      unresolvedReferences: [],
      errors,
      durationMs: 0,
    });

    // Prevent path traversal
    if (!validatePathWithinRoot(this.rootDir, relativePath)) {
      logWarn('Path traversal blocked in indexFileWithContent', { relativePath });
      return emptyResult([
        { message: 'Path traversal blocked', filePath: relativePath, severity: 'error', code: 'path_traversal' },
      ]);
    }

    // Oversized files are reported as a warning and not parsed.
    const tooLarge = stats.size > MAX_FILE_SIZE;
    if (tooLarge) {
      return emptyResult([
        {
          message: `File exceeds max size (${stats.size} > ${MAX_FILE_SIZE})`,
          filePath: relativePath,
          severity: 'warning',
          code: 'size_exceeded',
        },
      ]);
    }

    // Language detection honors the project's codegraph.json extension overrides.
    const extensionOverrides = loadExtensionOverrides(this.rootDir);
    const language = detectLanguage(relativePath, content, extensionOverrides);
    if (!isLanguageSupported(language)) {
      return emptyResult([]);
    }

    // Use cached framework names if indexAll has run, otherwise detect on the
    // spot so single-file re-index paths still emit route nodes / middleware / etc.
    const extraction = extractFromSource(
      relativePath,
      content,
      language,
      this.ensureDetectedFrameworks()
    );

    // Persist unless extraction yielded nothing but errors.
    const nothingButErrors = extraction.nodes.length === 0 && extraction.errors.length > 0;
    if (!nothingButErrors) {
      this.storeExtractionResult(relativePath, content, language, stats, extraction);
    }

    return extraction;
  }
  /**
   * Store an extraction result in the database.
   *
   * Returns without writing anything when a file record for `filePath` already
   * exists with the same content hash. Otherwise any existing data for the
   * file is deleted and replaced with the nodes, edges, unresolved references
   * and file record derived from `result`.
   *
   * @param filePath - Path used as the key of the file record.
   * @param content - File text; hashed with `hashContent` for change detection.
   * @param language - Stored on the file record and used as the fallback
   *   language for unresolved references that do not carry one.
   * @param stats - Supplies `size` and `mtimeMs` for the file record.
   * @param result - Extraction output to persist.
   */
  private storeExtractionResult(
    filePath: string,
    content: string,
    language: Language,
    stats: fs.Stats,
    result: ExtractionResult
  ): void {
    const contentHash = hashContent(content);

    // Check if file already exists and hasn't changed
    const existingFile = this.queries.getFileByPath(filePath);
    if (existingFile && existingFile.contentHash === contentHash) {
      return; // No changes
    }

    // Snapshot incoming cross-file edges BEFORE deleting this file's nodes.
    // `deleteFile` cascades to delete every edge whose source OR target is a
    // node in this file (edges.FK ... ON DELETE CASCADE). Edges whose SOURCE is
    // in this file are re-emitted by the extractor below, but edges whose SOURCE
    // is in a *different* (unchanged) file are not — they would be silently
    // dropped, which is issue #899: re-indexing a callee file severs `calls`/
    // `references` edges from callers that import it via module-attribute
    // access (`pkg.mod.fn(...)`).
    //
    // We snapshot the edge plus the target node's (name, kind) so we can
    // re-resolve to the re-indexed target's NEW id. Node ids are
    // `sha256(filePath:kind:name:line)`, so any line shift in the callee file
    // (e.g. a docstring-only edit above the symbol) changes every target id and
    // a naive re-insert by old id would silently drop every edge. Matching by
    // (filePath, kind, name) is stable across line shifts; if the symbol was
    // renamed/removed, no match is found and the edge stays dropped (correct).
    const crossFileIncomingEdges = existingFile
      ? this.queries.getCrossFileIncomingEdgesWithTarget(filePath)
      : [];

    // Delete existing data for this file
    if (existingFile) {
      this.queries.deleteFile(filePath);
    }

    // Filter out nodes with missing required fields before insertion.
    // This prevents FK violations when edges reference nodes that would
    // be silently skipped by insertNode() (see issue #42).
    const validNodes = result.nodes.filter((n) => n.id && n.kind && n.name && n.filePath && n.language);

    // Ids of the nodes that are actually inserted; built once and shared by the
    // edge filter and the unresolved-reference filter below.
    const insertedIds = new Set(validNodes.map((n) => n.id));

    // Insert nodes
    if (validNodes.length > 0) {
      this.queries.insertNodes(validNodes);
    }

    // Filter edges to only reference nodes that were actually inserted
    if (result.edges.length > 0) {
      const validEdges = result.edges.filter(
        (e) => insertedIds.has(e.source) && insertedIds.has(e.target)
      );
      if (validEdges.length > 0) {
        this.queries.insertEdges(validEdges);
      }
    }

    // Re-insert cross-file incoming edges snapshotted before the delete,
    // re-resolving each edge's target to the re-indexed node's new id by
    // (kind, name) within this file. Edges whose callee (target) was
    // renamed/removed during the re-index have no entry in
    // `newNodesByKindName` and are dropped. This closes the #899 edge-drop
    // on `sync`.
    // NOTE(review): when several new nodes share the same (kind, name), the
    // last one in `validNodes` wins the map slot — confirm that is acceptable.
    if (crossFileIncomingEdges.length > 0) {
      const newNodesByKindName = new Map<string, string>();
      for (const n of validNodes) {
        newNodesByKindName.set(`${n.kind}\0${n.name}`, n.id);
      }
      const reinserted: Edge[] = [];
      for (const e of crossFileIncomingEdges) {
        const newTargetId = newNodesByKindName.get(`${e.targetKind}\0${e.targetName}`);
        if (newTargetId) {
          reinserted.push({ source: e.source, target: newTargetId, kind: e.kind, metadata: e.metadata, line: e.line, column: e.column, provenance: e.provenance });
        }
      }
      if (reinserted.length > 0) {
        this.queries.insertEdges(reinserted);
      }
    }

    // Insert unresolved references in batch with denormalized filePath/language
    if (result.unresolvedReferences.length > 0) {
      const refsWithContext = result.unresolvedReferences
        .filter((ref) => insertedIds.has(ref.fromNodeId))
        .map((ref) => ({
          ...ref,
          filePath: ref.filePath ?? filePath,
          language: ref.language ?? language,
        }));
      if (refsWithContext.length > 0) {
        this.queries.insertUnresolvedRefsBatch(refsWithContext);
      }
    }

    // Insert file record. `nodeCount` counts only the nodes that passed
    // validation above — the ones handed to `insertNodes` — so the record
    // matches what was actually stored.
    const fileRecord: FileRecord = {
      path: filePath,
      contentHash,
      language,
      size: stats.size,
      modifiedAt: stats.mtimeMs,
      indexedAt: Date.now(),
      nodeCount: validNodes.length,
      errors: result.errors.length > 0 ? result.errors : undefined,
    };
    this.queries.upsertFile(fileRecord);
  }
  /**
   * Bring the index in line with the files currently on disk.
   *
   * Change detection is filesystem-based, never git. Tracked files that are no
   * longer present are deleted from the index; for the rest, a (size, mtime)
   * stat comparison skips unchanged files and a content-hash comparison
   * confirms real changes. This works in non-git projects and also catches
   * committed changes from `git pull`/`checkout`/`merge`/`rebase`, which
   * `git status` cannot see.
   *
   * @param onProgress - Optional callback; receives one `'scanning'` event up
   *   front and one `'parsing'` event per file that is re-indexed.
   * @returns Counts of checked/added/modified/removed files, the number of
   *   nodes produced by re-indexing, the elapsed time, and the changed paths
   *   (`undefined` when nothing was added or modified).
   */
  async sync(onProgress?: (progress: IndexProgress) => void): Promise<SyncResult> {
    await initGrammars(); // Initialize WASM runtime (grammars loaded lazily below)

    const startedAt = Date.now();
    let addedCount = 0;
    let modifiedCount = 0;
    let removedCount = 0;
    let nodesUpdated = 0;
    const changedFilePaths: string[] = [];
    const pending: string[] = [];

    onProgress?.({ phase: 'scanning', current: 0, total: 0 });

    // === Filesystem reconcile (git-independent) ===
    // Enumerate the current source files and compare each with the indexed
    // state. Only files whose stat differs from the DB are read and hashed, and
    // only files whose hash differs are parsed again.
    const currentFiles = await scanDirectoryAsync(this.rootDir);
    const seenOnScan = new Set(currentFiles);
    const trackedFiles = this.queries.getAllFiles();
    const trackedByPath = new Map<string, FileRecord>(
      trackedFiles.map((record): [string, FileRecord] => [record.path, record])
    );

    // Both reconcile loops below are O(files) of synchronous filesystem calls.
    // They share one counter and hand control back to the event loop every
    // SYNC_RECONCILE_YIELD_INTERVAL checks (issue #905).
    let reconcileChecks = 0;
    const handBackToEventLoop = (): Promise<void> =>
      new Promise<void>((resolve) => setImmediate(resolve));

    // Removals: tracked in the DB but no longer a present source file. The
    // filesystem is checked directly because `scanDirectory` (via
    // `git ls-files`) still lists a file deleted from disk but not yet staged,
    // so set membership alone misses it.
    for (const record of trackedFiles) {
      const stillPresent =
        seenOnScan.has(record.path) && fs.existsSync(path.join(this.rootDir, record.path));
      if (!stillPresent) {
        this.queries.deleteFile(record.path);
        removedCount++;
      }
      if (++reconcileChecks % SYNC_RECONCILE_YIELD_INTERVAL === 0) {
        await handBackToEventLoop();
      }
    }

    // Adds / modifications.
    for (const filePath of currentFiles) {
      // Yield at the top of the body so the `continue` fast-paths below still hit it.
      if (++reconcileChecks % SYNC_RECONCILE_YIELD_INTERVAL === 0) {
        await handBackToEventLoop();
      }

      const absolutePath = path.join(this.rootDir, filePath);
      const known = trackedByPath.get(filePath);

      // Cheap pre-filter: an already-indexed file whose size AND mtime both
      // match the DB is unchanged — skip it without reading or hashing. (A
      // content change that preserves both exactly is the blind spot every
      // mtime-based incremental tool accepts; `index --force` is the escape
      // hatch. Git bumps mtime on every file it writes during checkout/merge,
      // so pulls are caught.)
      if (known) {
        try {
          const onDisk = fs.statSync(absolutePath);
          const sameSize = onDisk.size === known.size;
          const sameMtime = Math.floor(onDisk.mtimeMs) === Math.floor(known.modifiedAt);
          if (sameSize && sameMtime) {
            continue;
          }
        } catch (error) {
          logDebug('Skipping unstattable file during sync', { filePath, error: String(error) });
          continue;
        }
      }

      // New, or size/mtime changed — read + hash to confirm a real content change.
      let content: string;
      try {
        content = fs.readFileSync(absolutePath, 'utf-8');
      } catch (error) {
        logDebug('Skipping unreadable file during sync', { filePath, error: String(error) });
        continue;
      }

      const freshHash = hashContent(content);
      if (known && known.contentHash === freshHash) {
        // Stat differed but the bytes did not: nothing to re-index.
        continue;
      }

      pending.push(filePath);
      changedFilePaths.push(filePath);
      if (known) {
        modifiedCount++;
      } else {
        addedCount++;
      }
    }

    // Load only grammars needed for changed files
    if (pending.length > 0) {
      const overrides = loadExtensionOverrides(this.rootDir);
      const languages = new Set(pending.map((f) => detectLanguage(f, undefined, overrides)));
      // .h files default to 'c' but may be C++ — ensure cpp grammar is loaded
      if (languages.has('c') && !languages.has('cpp')) {
        languages.add('cpp');
      }
      await loadGrammarsForLanguages([...languages]);
    }

    // Index changed files
    const total = pending.length;
    for (const [position, filePath] of pending.entries()) {
      onProgress?.({
        phase: 'parsing',
        current: position + 1,
        total,
        currentFile: filePath,
      });
      const { nodes } = await this.indexFile(filePath);
      nodesUpdated += nodes.length;
    }

    return {
      filesChecked: currentFiles.length,
      filesAdded: addedCount,
      filesModified: modifiedCount,
      filesRemoved: removedCount,
      nodesUpdated,
      durationMs: Date.now() - startedAt,
      changedFilePaths: changedFilePaths.length > 0 ? changedFilePaths : undefined,
    };
  }
  /**
   * Report which files differ from the indexed state, without modifying the index.
   *
   * When `getGitChangedFiles` returns a result, only the paths it lists are
   * examined (fast path). Otherwise every file from `scanDirectory` is read and
   * hash-compared against the tracked records (full scan).
   *
   * @returns Paths that are new to the index (`added`), tracked with a
   *   different content hash (`modified`), or tracked but gone (`removed`).
   */
  getChangedFiles(): { added: string[]; modified: string[]; removed: string[] } {
    const added: string[] = [];
    const modified: string[] = [];
    const removed: string[] = [];

    // Shared by both paths: read + hash one file, then file it under `added`
    // (no tracked record) or `modified` (tracked, hash differs). Unreadable
    // files are logged at debug level and ignored.
    const classify = (
      filePath: string,
      lookup: (p: string) => { contentHash: string } | null | undefined
    ): void => {
      let content: string;
      try {
        content = fs.readFileSync(path.join(this.rootDir, filePath), 'utf-8');
      } catch (error) {
        logDebug('Skipping unreadable file while detecting changes', { filePath, error: String(error) });
        return;
      }
      const contentHash = hashContent(content);
      const tracked = lookup(filePath);
      if (!tracked) {
        added.push(filePath);
        return;
      }
      if (tracked.contentHash !== contentHash) {
        modified.push(filePath);
      }
    };

    const gitChanges = getGitChangedFiles(this.rootDir);
    if (gitChanges) {
      // === Git fast path ===
      // Deleted files — only report if tracked in DB
      for (const filePath of gitChanges.deleted) {
        if (this.queries.getFileByPath(filePath)) {
          removed.push(filePath);
        }
      }

      // Modified + added files. Untracked (`??`) files stay untracked in git
      // even after indexing, so they must be hash-compared like modified files
      // instead of always counting as added — otherwise status reports them as
      // pending forever. (See issue #206.)
      const fromDb = (p: string) => this.queries.getFileByPath(p);
      for (const filePath of [...gitChanges.modified, ...gitChanges.added]) {
        classify(filePath, fromDb);
      }

      return { added, modified, removed };
    }

    // === Fallback: full scan (non-git project or git failure) ===
    const currentFiles = new Set(scanDirectory(this.rootDir));
    const trackedFiles = this.queries.getAllFiles();

    // Path -> record map for O(1) lookups during classification.
    const trackedMap = new Map<string, FileRecord>();
    for (const record of trackedFiles) {
      trackedMap.set(record.path, record);
    }

    // Removed: tracked, but absent from the scan.
    for (const record of trackedFiles) {
      if (currentFiles.has(record.path)) continue;
      removed.push(record.path);
    }

    // Added / modified: everything the scan found.
    const fromMap = (p: string) => trackedMap.get(p);
    for (const filePath of currentFiles) {
      classify(filePath, fromMap);
    }

    return { added, modified, removed };
  }
  1831. }
  1832. // Re-export useful types and functions
  1833. export { extractFromSource } from './tree-sitter';
  1834. export { detectLanguage, isSourceFile, isLanguageSupported, isGrammarLoaded, getSupportedLanguages, initGrammars, loadGrammarsForLanguages, loadAllGrammars } from './grammars';