index.ts 71 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946
  1. /**
  2. * Extraction Orchestrator
  3. *
  4. * Coordinates file scanning, parsing, and database storage.
  5. */
  6. import * as fs from 'fs';
  7. import * as fsp from 'fs/promises';
  8. import * as path from 'path';
  9. import * as crypto from 'crypto';
  10. import { execFileSync } from 'child_process';
  11. import {
  12. Language,
  13. FileRecord,
  14. ExtractionResult,
  15. ExtractionError,
  16. Edge,
  17. } from '../types';
  18. import { QueryBuilder } from '../db/queries';
  19. import { extractFromSource } from './tree-sitter';
  20. import { detectLanguage, isSourceFile, isLanguageSupported, isFileLevelOnlyLanguage, initGrammars, loadGrammarsForLanguages } from './grammars';
  21. import { isCodeGraphDataDir } from '../directory';
  22. import { logDebug, logWarn } from '../errors';
  23. import { validatePathWithinRoot, normalizePath } from '../utils';
  24. import ignore, { Ignore } from 'ignore';
  25. import { detectFrameworks } from '../resolution/frameworks';
  26. import type { ResolutionContext } from '../resolution/types';
  /**
   * Number of files to read in parallel during indexing.
   * File reads are I/O-bound; batching overlaps I/O wait with CPU parse work.
   */
  const FILE_IO_BATCH_SIZE = 10;
  // PARSER_RESET_INTERVAL moved to parse-worker.ts (runs in worker thread)
  /**
   * Maximum time, in milliseconds, to wait for a single file to parse in the
   * worker thread (10 seconds).
   * If tree-sitter hangs or WASM runs out of memory, this prevents the entire
   * indexing run from freezing. The worker is restarted after a timeout.
   */
  const PARSE_TIMEOUT_MS = 10_000;
  /**
   * Number of files to parse before recycling the worker thread.
   * WASM linear memory can grow but NEVER shrink (WebAssembly spec limitation).
   * The only way to reclaim tree-sitter's WASM heap is to destroy the entire
   * V8 isolate by terminating the worker thread and spawning a fresh one.
   * This interval balances memory usage against the cost of reloading grammars.
   */
  const WORKER_RECYCLE_INTERVAL = 250;
  /**
   * Progress snapshot handed to progress callbacks during indexing operations.
   *
   * NOTE(review): the code that emits these objects is outside this section —
   * presumably `current` counts items finished out of `total` within `phase`;
   * confirm against the emitter before relying on that.
   */
  export interface IndexProgress {
    /** Pipeline stage this snapshot refers to. */
    phase: 'scanning' | 'parsing' | 'storing' | 'resolving';
    /** Progress counter for the stage (see the note above on its exact meaning). */
    current: number;
    /** Upper bound paired with `current`. */
    total: number;
    /** Optional; presumably the file being processed when the snapshot was taken — confirm. */
    currentFile?: string;
  }
  /**
   * Result of an indexing operation.
   *
   * NOTE(review): the producer is outside this section; the per-field notes
   * below restate the names and types and should be confirmed against it.
   */
  export interface IndexResult {
    /** Overall outcome flag. */
    success: boolean;
    /** Count of files indexed. */
    filesIndexed: number;
    /** Count of files skipped. */
    filesSkipped: number;
    /** Count of files that errored. */
    filesErrored: number;
    /** Count of graph nodes created. */
    nodesCreated: number;
    /** Count of graph edges created. */
    edgesCreated: number;
    /** Extraction errors collected during the run. */
    errors: ExtractionError[];
    /** Duration of the operation in milliseconds. */
    durationMs: number;
  }
  /**
   * Result of a sync operation.
   *
   * NOTE(review): the producer is outside this section; the per-field notes
   * below restate the names and types and should be confirmed against it.
   */
  export interface SyncResult {
    /** Count of files checked. */
    filesChecked: number;
    /** Count of files added. */
    filesAdded: number;
    /** Count of files modified. */
    filesModified: number;
    /** Count of files removed. */
    filesRemoved: number;
    /** Count of nodes updated. */
    nodesUpdated: number;
    /** Duration of the operation in milliseconds. */
    durationMs: number;
    /** Optional list of changed file paths; may be absent. */
    changedFilePaths?: string[];
  }
  /**
   * Compute the SHA-256 digest of a file's text content.
   *
   * @param content - Text to hash.
   * @returns The digest as a lowercase hexadecimal string (64 characters).
   */
  export function hashContent(content: string): string {
    const hasher = crypto.createHash('sha256');
    hasher.update(content);
    return hasher.digest('hex');
  }
  /**
   * Skip files larger than this many bytes (1 MiB = 1024 * 1024). Generated
   * bundles, minified JS, and vendored blobs blow the WASM heap and the
   * worker-recycle budget for no useful symbols. 1 MB covers essentially all
   * hand-written source.
   */
  const MAX_FILE_SIZE = 1024 * 1024;
  /**
   * Directory names that are dependency, build, cache, or tooling output across the
   * languages/frameworks CodeGraph supports — curated from the canonical
   * github/gitignore templates. Excluded by default so the graph reflects your code,
   * not third-party noise, without requiring a `.gitignore` (issue #407). The
   * exclusion applies uniformly (git or not, tracked or not); the only opt-in is an
   * explicit `.gitignore` negation (e.g. `!vendor/`). First-party-prone or generic
   * names (`packages`, `lib`, `app`, `bin`, `src`, `deps`, `env`, `tmp`, `storage`,
   * `Library`) are deliberately NOT listed, to avoid ever hiding real source.
   *
   * Only dirs that actually contain *indexable source* (or are enormous) earn a slot
   * — IDE/state dirs like `.idea`/`.vs` are omitted because CodeGraph indexes only
   * recognized source extensions, so they produce no symbols regardless.
   *
   * Entries are bare directory names (no slashes).
   */
  const DEFAULT_IGNORE_DIRS: ReadonlySet<string> = new Set([
    // JS / TS — dependency directories
    'node_modules', 'bower_components', 'jspm_packages', 'web_modules',
    '.yarn', '.pnpm-store',
    // JS / TS — framework & bundler build / cache / deploy output
    '.next', '.nuxt', '.svelte-kit', '.turbo', '.vite', '.parcel-cache', '.angular',
    '.docusaurus', 'storybook-static', '.vinxi', '.nitro', 'out-tsc',
    '.vercel', '.netlify', '.wrangler',
    // Build output (common across ecosystems)
    'dist', 'build', 'out', '.output',
    // Test / coverage
    'coverage', '.nyc_output',
    // Python
    '__pycache__', '__pypackages__', '.venv', 'venv', '.pixi', '.pdm-build',
    '.mypy_cache', '.pytest_cache', '.ruff_cache', '.tox', '.nox', '.hypothesis',
    '.ipynb_checkpoints', '.eggs',
    // Rust / JVM (Maven, Gradle, Scala)
    'target', '.gradle',
    // .NET
    'obj',
    // Vendored deps (Go, PHP/Composer, Ruby/Bundler)
    'vendor',
    // Swift / iOS
    '.build', 'Pods', 'Carthage', 'DerivedData', '.swiftpm',
    // Dart / Flutter
    '.dart_tool', '.pub-cache',
    // Native (Android NDK, C/C++ deps)
    '.cxx', '.externalNativeBuild', 'vcpkg_installed',
    // Scala tooling
    '.bloop', '.metals',
    // Lua / Luau (LuaRocks)
    'lua_modules', '.luarocks',
    // Delphi / RAD Studio IDE backups (duplicate .pas source — would double-count)
    '__history', '__recovery',
    // Generic cache
    '.cache',
  ]);
  /**
   * Gitignore-style patterns fed to the `ignore` matcher: every entry of
   * `DEFAULT_IGNORE_DIRS` with a trailing slash (directory-only match), followed
   * by a few glob patterns that a plain name cannot express.
   */
  const DEFAULT_IGNORE_PATTERNS: string[] = [...DEFAULT_IGNORE_DIRS]
    .map((dirName) => `${dirName}/`)
    .concat([
      '*.egg-info/', // Python packaging metadata
      'cmake-build-*/', // CLion / CMake build trees
      'bazel-*/', // Bazel output symlink trees
    ]);
  /**
   * Report whether `buf` is well-formed UTF-8.
   *
   * Decodes with a strict (`fatal: true`) `TextDecoder`, which throws on the
   * first invalid byte sequence; any throw is reported as `false`.
   *
   * @param buf - Raw bytes to validate.
   * @returns `true` when the bytes decode cleanly, `false` otherwise.
   */
  function isValidUtf8(buf: Buffer): boolean {
    let decodedCleanly = false;
    try {
      const strictDecoder = new TextDecoder('utf-8', { fatal: true });
      strictDecoder.decode(buf);
      decodedCleanly = true;
    } catch {
      // Invalid byte sequence (or decoder unavailable) — leave the flag false.
    }
    return decodedCleanly;
  }
  /**
   * Read a `.gitignore` and return patterns safe to hand to the `ignore` matcher —
   * never throwing, even when the file isn't real gitignore text. Two failure
   * modes, both seen in the wild (issue #682):
   *
   * - The file isn't valid UTF-8 — e.g. transparently encrypted in place by
   *   corporate DLP / endpoint-security software, leaving a UTF-16 header plus
   *   ciphertext. None of it is meaningful patterns, so the whole file is skipped.
   * - The file is text but a single line can't be compiled to a regex by the
   *   `ignore` library — `\\[` and friends throw "Unterminated character class".
   *   Crucially the throw is LAZY (at match time, not `.add()`), so it would
   *   otherwise escape mid-scan. That one pattern is dropped; the rest are kept.
   *
   * Either way a warning that NAMES the file is logged (the reporter couldn't tell
   * which `.gitignore` was at fault) and indexing continues instead of aborting.
   *
   * A leading UTF-8 byte-order mark is stripped before the patterns are used:
   * git itself skips a BOM at the start of `.gitignore`, whereas leaving it in
   * place would glue U+FEFF onto the first pattern so that rule never matches.
   *
   * @param giPath - Path of the `.gitignore` file to read.
   * @returns The usable pattern text (newline-separated), or '' when the file is
   *          unreadable, not valid UTF-8 text, or has nothing usable.
   */
  function readGitignorePatterns(giPath: string): string {
    let buf: Buffer;
    try {
      buf = fs.readFileSync(giPath);
    } catch {
      return ''; // unreadable (permissions / race) — treat as absent
    }
    // A NUL byte never appears in real gitignore text, and a fatal UTF-8 decode
    // catches the rest. Such a file isn't ignore patterns at all.
    if (buf.includes(0) || !isValidUtf8(buf)) {
      logWarn(
        'Ignoring a .gitignore that is not valid UTF-8 text — it may have been encrypted ' +
          'in place by endpoint-security software. Indexing continues without it.',
        { file: giPath },
      );
      return '';
    }
    const decoded = buf.toString('utf-8');
    // Drop a leading BOM (U+FEFF) so it does not become part of the first pattern.
    const content = decoded.charCodeAt(0) === 0xfeff ? decoded.slice(1) : decoded;

    // True when the `ignore` library can compile `patterns` without throwing.
    // NOTE(review): this assumes one `.ignores()` call compiles every rule.
    // Confirm the installed `ignore` version does not skip negated rules (leaving
    // them uncompiled) when the probe path has not been ignored by an earlier rule.
    const compiles = (patterns: string): boolean => {
      try {
        ignore().add(patterns).ignores('.codegraph-probe');
        return true;
      } catch {
        return false;
      }
    };

    // Fast path: the whole file compiles, so it is safe to use verbatim.
    if (compiles(content)) return content;

    // Slow path: a line is uncompilable — keep the good ones, drop the bad.
    const kept: string[] = [];
    let dropped = 0;
    for (const line of content.split(/\r?\n/)) {
      if (compiles(line)) {
        kept.push(line);
      } else {
        dropped++;
      }
    }
    if (dropped > 0) {
      logWarn(
        `Skipped ${dropped} unparseable pattern(s) in a .gitignore; the rest are applied.`,
        { file: giPath },
      );
    }
    return kept.join('\n');
  }
  /**
   * Build an `ignore` matcher from the built-in default patterns plus the
   * project's root `.gitignore` (when one exists).
   *
   * The root `.gitignore` is added AFTER the defaults so a negation there
   * (e.g. `!vendor/`) overrides a default. Both enumeration paths share this so
   * behavior is the same with or without git, and the defaults apply to tracked
   * files too — committing a dependency dir doesn't make it project code; the
   * explicit `.gitignore` negation is the only opt-in.
   *
   * @param rootDir - Directory whose `.gitignore` is merged in.
   * @returns The combined matcher.
   */
  export function buildDefaultIgnore(rootDir: string): Ignore {
    const matcher = ignore();
    matcher.add(DEFAULT_IGNORE_PATTERNS);
    const gitignorePath = path.join(rootDir, '.gitignore');
    if (!fs.existsSync(gitignorePath)) {
      return matcher;
    }
    matcher.add(readGitignorePatterns(gitignorePath));
    return matcher;
  }
  /**
   * Build a matcher holding ONLY the built-in default patterns — no root
   * `.gitignore` is merged. For use where the parent repo's own ignore rules
   * must not apply, i.e. inside embedded child repos whose gitignore semantics
   * were already enforced by their own `git ls-files` (#514).
   *
   * @returns A fresh matcher on every call.
   */
  function defaultsOnlyIgnore(): Ignore {
    const matcher = ignore();
    matcher.add(DEFAULT_IGNORE_PATTERNS);
    return matcher;
  }
  /**
   * Recognize the "this entire directory" sentinel in `git ls-files --directory`
   * output.
   *
   * `--directory` collapses a wholly-untracked/ignored directory into a single
   * entry. When git's own cwd is such a directory (the indexed root is itself a
   * git-ignored subdir of an enclosing repo), git emits the literal `./`. That
   * is not a real nested path: the `ignore` matcher throws on it ("path should
   * be a `path.relative()`d string, but got "./""), which used to abort
   * `buildScopeIgnore` and break the MCP daemon's watcher/auto-sync on connect,
   * and joining it onto `repoDir` would only point back at the cwd. Callers
   * drop such entries wherever `--directory` output is consumed. (#936)
   *
   * @param entry - One entry from the git listing.
   * @returns `true` for `'./'`, `'.'` or the empty string.
   */
  function isWholeCwdEntry(entry: string): boolean {
    switch (entry) {
      case '':
      case '.':
      case './':
        return true;
      default:
        return false;
    }
  }
  /**
   * List the gitignored DIRECTORIES of a repo, in collapsed trailing-slash form,
   * relative to `repoDir`.
   *
   * No other `git ls-files` / `git status` mode shows these, and in a multi-repo
   * workspace they are exactly where nested project repos live: a super-repo
   * `.gitignore`s its child repos to keep `git status` quiet, which does not
   * make them third-party code. (#514)
   *
   * @param repoDir - Repository directory to run git in.
   * @returns Ignored directory entries (each ending in `/`), with the
   *          whole-cwd sentinel removed; `[]` if the git command fails.
   */
  function listIgnoredDirs(repoDir: string): string[] {
    let listing: string;
    try {
      listing = execFileSync(
        'git',
        ['ls-files', '-z', '-o', '-i', '--exclude-standard', '--directory'],
        {
          cwd: repoDir,
          encoding: 'utf-8' as const,
          timeout: 30000,
          maxBuffer: 50 * 1024 * 1024,
          stdio: ['pipe', 'pipe', 'pipe'] as ['pipe', 'pipe', 'pipe'],
          windowsHide: true,
        },
      );
    } catch {
      return [];
    }
    const ignoredDirs: string[] = [];
    // -z output is NUL-separated; only trailing-slash entries are directories.
    for (const entry of listing.split('\0')) {
      if (entry.endsWith('/') && !isWholeCwdEntry(entry)) {
        ignoredDirs.push(entry);
      }
    }
    return ignoredDirs;
  }
  /**
   * Max directory depth searched below an ignored dir for nested `.git` roots.
   * The starting directory counts as depth 0.
   */
  const EMBEDDED_REPO_SEARCH_DEPTH = 4;
  /**
   * Max directories examined per search — a huge ignored data dir must never
   * stall a scan/sync.
   */
  const EMBEDDED_REPO_SEARCH_ENTRIES = 2000;
  /**
   * Classify the `.git` entry of `absDir` for embedded-repo discovery.
   *
   * - `.git` is a **directory**: an embedded clone — distinct first-party code
   *   that a super-repo merely hides from git, so it is indexed (#193, #514).
   * - `.git` is a **file**: a pointer of the form `gitdir: …`. A git
   *   **worktree** points into the host repo's `.git/worktrees/<name>`; it is a
   *   second working view of a repo CodeGraph already indexes, and indexing it
   *   would duplicate the whole graph, so it is skipped (#848). A **submodule**
   *   points into `.git/modules/` and is distinct code, so it is indexed.
   *
   * @param absDir - Directory to inspect.
   * @returns `'embedded'`, `'worktree'`, or `'none'` when there is no usable
   *          `.git` entry (missing, or neither a file nor a directory).
   */
  function classifyGitDir(absDir: string): 'embedded' | 'worktree' | 'none' {
    const dotGitPath = path.join(absDir, '.git');
    let info: fs.Stats;
    try {
      info = fs.statSync(dotGitPath);
    } catch {
      return 'none';
    }
    if (info.isDirectory()) {
      return 'embedded';
    }
    if (!info.isFile()) {
      return 'none';
    }

    let pointerTarget: string | undefined;
    try {
      const pointerLine = /^gitdir:\s*(.+)$/m.exec(fs.readFileSync(dotGitPath, 'utf8'));
      pointerTarget = pointerLine?.[1]?.trim();
    } catch {
      // Unreadable `.git` pointer — keep the prior "index it" behavior.
      return 'embedded';
    }
    // A linked worktree's gitdir lives under some repo's `.git/worktrees/`.
    // Both separators are accepted so a Windows-style pointer is recognized too.
    const pointsIntoWorktrees = pointerTarget
      ? /(^|[\\/])\.git[\\/]worktrees[\\/]/.test(pointerTarget)
      : false;
    return pointsIntoWorktrees ? 'worktree' : 'embedded';
  }
  /**
   * Breadth-first, bounded search for git repositories at or below `absDir`.
   *
   * The walk does not descend past a repo root it finds — that repo's contents
   * belong to its own enumeration — and skips worktrees entirely (#848). It
   * also skips `.git`, CodeGraph data dirs, and default-ignored dirs
   * (`node_modules` can contain `.git` from npm git-dependencies, which never
   * makes it project code). Depth and total entries are capped so a huge
   * ignored tree cannot stall the scan.
   *
   * @param absDir - Absolute directory to start from (itself a candidate).
   * @param relPrefix - Relative, trailing-slashed path of `absDir`; results are
   *                    built by appending child names to it.
   * @returns Relative, trailing-slashed paths of the repo roots found.
   */
  function findNestedGitRepos(absDir: string, relPrefix: string): string[] {
    const repoRoots: string[] = [];
    const builtinIgnore = defaultsOnlyIgnore();
    const pending: Array<{ abs: string; rel: string; depth: number }> = [
      { abs: absDir, rel: relPrefix, depth: 0 },
    ];
    let visited = 0;

    for (let node = pending.shift(); node !== undefined; node = pending.shift()) {
      visited += 1;
      if (visited > EMBEDDED_REPO_SEARCH_ENTRIES) {
        logDebug('Embedded-repo search entry cap hit — deeper repos (if any) not discovered', { under: relPrefix });
        break;
      }

      const kind = classifyGitDir(node.abs);
      if (kind === 'embedded') {
        repoRoots.push(node.rel); // its own git handles everything below
      }
      // Worktrees and embedded repos are never descended into; neither is
      // anything already at the depth limit.
      if (kind !== 'none' || node.depth >= EMBEDDED_REPO_SEARCH_DEPTH) {
        continue;
      }

      let children: fs.Dirent[];
      try {
        children = fs.readdirSync(node.abs, { withFileTypes: true });
      } catch {
        continue;
      }

      for (const child of children) {
        const notCandidate =
          !child.isDirectory() || child.name === '.git' || isCodeGraphDataDir(child.name);
        if (notCandidate) continue;
        const childRel = node.rel + child.name + '/';
        if (builtinIgnore.ignores(childRel)) continue;
        pending.push({
          abs: path.join(node.abs, child.name),
          rel: childRel,
          depth: node.depth + 1,
        });
      }
    }
    return repoRoots;
  }
  /**
   * Workspace-scope ignore matcher — the single source of truth for what the
   * indexer and the watcher consider in scope (the two must not diverge).
   *
   * - Ordinary paths are tested against the root matcher (built-in defaults +
   *   root `.gitignore`).
   * - Paths inside an EMBEDDED repo are tested against that repo's own matcher
   *   (defaults + its root `.gitignore`): the parent's `.gitignore` hides a
   *   child repo from git, not from the index (#514).
   * - A directory path (trailing slash) that is an ANCESTOR of an embedded root
   *   is never ignored, so directory-pruning callers (the Linux per-directory
   *   watcher) still descend far enough to reach the embedded repos.
   */
  export class ScopeIgnore {
    private embedded: Array<{ root: string; matcher: Ignore }>;
    private defaults: Ignore = defaultsOnlyIgnore();

    /**
     * @param rootMatcher - Matcher applied to paths outside every embedded repo.
     * @param embedded - Embedded repo roots (relative, trailing-slashed) with
     *                   their own matchers; the array is copied, not mutated.
     */
    constructor(private rootMatcher: Ignore, embedded: Array<{ root: string; matcher: Ignore }>) {
      // Longest root first, so a path in a nested embedded repo resolves to the
      // innermost repo's matcher.
      this.embedded = embedded.slice().sort((x, y) => y.root.length - x.root.length);
    }

    /**
     * Decide whether `rel` is out of scope.
     *
     * @param rel - Path relative to the workspace root; directories carry a
     *              trailing slash.
     * @returns `true` when the path should be ignored.
     */
    ignores(rel: string): boolean {
      const owner = this.embedded.find(({ root }) => rel.startsWith(root));
      if (owner !== undefined) {
        const inner = rel.slice(owner.root.length);
        // The embedded root itself is always in scope.
        if (inner === '') {
          return false;
        }
        // Built-in defaults apply to the FULL path uniformly (#407) — an
        // embedded repo inside node_modules (an npm git-dependency) must stay
        // excluded even though its own rules wouldn't ignore its files.
        return this.defaults.ignores(rel) || owner.matcher.ignores(inner);
      }

      // Never prune a directory that leads to an embedded repo.
      const leadsToEmbedded =
        rel.endsWith('/') && this.embedded.some(({ root }) => root.startsWith(rel));
      return leadsToEmbedded ? false : this.rootMatcher.ignores(rel);
    }
  }
  /**
   * Construct the workspace-scope matcher for `rootDir`.
   *
   * @param rootDir - Workspace root.
   * @param embeddedRoots - Embedded repo roots already known to the caller (the
   *                        scanner finds them during collection). When omitted
   *                        they are discovered here, which is the watcher path.
   * @returns A `ScopeIgnore` with one matcher for the root and one per embedded
   *          repo, each built from the defaults plus that directory's
   *          `.gitignore`.
   */
  export function buildScopeIgnore(rootDir: string, embeddedRoots?: Iterable<string>): ScopeIgnore {
    const roots = embeddedRoots ? Array.from(embeddedRoots) : discoverEmbeddedRepoRoots(rootDir);
    const rootMatcher = buildDefaultIgnore(rootDir);
    const perRepoMatchers = roots.map((root) => {
      const matcher = buildDefaultIgnore(path.join(rootDir, root));
      return { root, matcher };
    });
    return new ScopeIgnore(rootMatcher, perRepoMatchers);
  }
  /**
   * Discover every embedded repo root under `rootDir` without running a scan.
   *
   * Covers both the untracked kind (#193) and the gitignored kind (#514), and
   * recurses, since an embedded repo can embed further repos. Each root is
   * reported before the roots nested inside it.
   *
   * @param rootDir - Workspace root to search.
   * @returns Root paths relative to `rootDir`, trailing-slashed. `[]` when
   *          `rootDir` is not inside a git repository — the filesystem walk
   *          already handles nested repos there.
   */
  export function discoverEmbeddedRepoRoots(rootDir: string): string[] {
    // Bail out early for non-git roots.
    try {
      execFileSync('git', ['rev-parse', '--git-dir'], {
        cwd: rootDir,
        encoding: 'utf-8',
        timeout: 5000,
        stdio: ['pipe', 'pipe', 'pipe'],
        windowsHide: true,
      });
    } catch {
      return [];
    }

    const discovered: string[] = [];
    const builtinIgnore = defaultsOnlyIgnore();

    function walkRepo(repoAbs: string, prefix: string): void {
      const nestedRoots: string[] = [];
      try {
        const listing = execFileSync(
          'git',
          ['ls-files', '-z', '-o', '--exclude-standard', '--directory'],
          {
            cwd: repoAbs,
            encoding: 'utf-8',
            timeout: 30000,
            maxBuffer: 50 * 1024 * 1024,
            stdio: ['pipe', 'pipe', 'pipe'],
            windowsHide: true,
          },
        );
        for (const entry of listing.split('\0')) {
          const isCandidateDir =
            entry.endsWith('/') && !isWholeCwdEntry(entry) && !builtinIgnore.ignores(entry);
          if (!isCandidateDir) continue;
          nestedRoots.push(...findNestedGitRepos(path.join(repoAbs, entry), entry));
        }
      } catch {
        /* untracked listing failed — ignored-side discovery still runs */
      }
      nestedRoots.push(...findIgnoredEmbeddedRepos(repoAbs));

      for (const rel of nestedRoots) {
        const fullRel = normalizePath(prefix + rel);
        discovered.push(fullRel);
        walkRepo(path.join(repoAbs, rel), fullRel);
      }
    }

    walkRepo(rootDir, '');
    return discovered;
  }
  /**
   * Find embedded repos that `repoDir`'s OWN ignore rules hide from git.
   *
   * Every gitignored directory that is not also a built-in default exclude is
   * searched for nested `.git` roots.
   *
   * @param repoDir - Repository whose ignored directories are searched.
   * @returns Repo root paths relative to `repoDir`, trailing-slashed.
   */
  function findIgnoredEmbeddedRepos(repoDir: string): string[] {
    const builtinIgnore = defaultsOnlyIgnore();
    return listIgnoredDirs(repoDir)
      .filter((ignoredDir) => !builtinIgnore.ignores(ignoredDir))
      .flatMap((ignoredDir) => findNestedGitRepos(path.join(repoDir, ignoredDir), ignoredDir));
  }
  /**
   * Gather the git-visible files (tracked + untracked, `.gitignore` respected)
   * of the repository rooted at `repoDir` into `files`, each with `prefix`
   * prepended so paths stay relative to the original scan root.
   *
   * Embedded git repositories — nested repos that are NOT submodules, such as
   * independent clones in a CMake "super-repo" layout — are recursed into. The
   * parent's `git ls-files` cannot see inside them: tracked output omits them
   * and untracked output shows only an opaque trailing-slash `subdir/` entry.
   * Each is its own git boundary, so `git ls-files` is re-run inside it (#193).
   * GITIGNORED embedded repos do not appear even there and are found through
   * `findIgnoredEmbeddedRepos` (#514).
   *
   * @param repoDir - Repository directory to enumerate.
   * @param prefix - Path prefix (relative to the scan root) for every result.
   * @param files - Output set; paths are added in discovery order.
   * @param embeddedRoots - Optional output set receiving every embedded repo
   *                        root found, so callers can exempt its files from the
   *                        parent's gitignore rules.
   * @throws Whatever `execFileSync` throws when a git command fails; files
   *         added before the failure remain in `files`.
   */
  function collectGitFiles(repoDir: string, prefix: string, files: Set<string>, embeddedRoots?: Set<string>): void {
    const gitOpts = {
      cwd: repoDir,
      encoding: 'utf-8' as const,
      timeout: 30000,
      maxBuffer: 50 * 1024 * 1024,
      stdio: ['pipe', 'pipe', 'pipe'] as ['pipe', 'pipe', 'pipe'],
      windowsHide: true,
    };
    // -z gives NUL-separated, unquoted output so non-ASCII (e.g. CJK) paths
    // survive verbatim. Without it git octal-escapes and double-quotes them
    // (the core.quotepath default) and the quoted form never matches a real
    // file on disk, silently dropping those files from the index. (#541)
    const lsFiles = (...modeArgs: string[]): string[] =>
      execFileSync('git', ['ls-files', '-z', ...modeArgs], gitOpts).split('\0');

    // Tracked files. --recurse-submodules pulls in files from active
    // submodules, which the index would otherwise represent only as a commit
    // pointer — without it, monorepos using submodules index 0 files (#147).
    // It only works with -c/--cached and --stage, not -o, so untracked files
    // are listed separately below.
    for (const trackedPath of lsFiles('-c', '--recurse-submodules')) {
      if (trackedPath !== '') {
        files.add(normalizePath(prefix + trackedPath));
      }
    }

    // Untracked files (submodules manage their own untracked state).
    for (const entry of lsFiles('-o', '--exclude-standard')) {
      if (entry === '') continue;
      if (!entry.endsWith('/')) {
        files.add(normalizePath(prefix + entry));
        continue;
      }
      // A trailing-slash entry is an opaque directory git refused to descend
      // into: normally an embedded repo, but a git worktree surfaces the same
      // way and is skipped as a duplicate view of an indexed repo (#848).
      // Default-ignored locations are never entered — an embedded repo inside
      // node_modules is an npm git-dependency, not project code.
      const childDir = path.join(repoDir, entry);
      const isProjectRepo =
        classifyGitDir(childDir) === 'embedded' && !defaultsOnlyIgnore().ignores(entry);
      if (!isProjectRepo) continue;
      embeddedRoots?.add(normalizePath(prefix + entry));
      collectGitFiles(childDir, prefix + entry, files, embeddedRoots);
    }

    // Embedded repos hidden by THIS repo's ignore rules (`/packages/` in a
    // super-repo .gitignore) appear in neither listing above. (#514)
    for (const hiddenRepo of findIgnoredEmbeddedRepos(repoDir)) {
      embeddedRoots?.add(normalizePath(prefix + hiddenRepo));
      collectGitFiles(path.join(repoDir, hiddenRepo), prefix + hiddenRepo, files, embeddedRoots);
    }
  }
  /**
   * List every file git can see under `rootDir` (tracked + untracked but not
   * ignored), honoring `.gitignore` at all levels and descending into embedded
   * (nested, non-submodule) git repos.
   *
   * @param rootDir - Directory being indexed.
   * @returns The visible file paths, or `null` when git cannot be used — a
   *          non-git project, any git failure, or a `rootDir` that a parent
   *          repo gitignores — so callers can fall back to a filesystem walk.
   */
  function getGitVisibleFiles(rootDir: string): Set<string> | null {
    const probeOpts = {
      cwd: rootDir,
      encoding: 'utf-8' as const,
      timeout: 5000,
      stdio: ['pipe', 'pipe', 'pipe'] as ['pipe', 'pipe', 'pipe'],
      windowsHide: true,
    };
    try {
      // When rootDir lives inside a parent git repo that ignores it,
      // `git ls-files` returns nothing — detect that and bail out.
      const topLevel = execFileSync('git', ['rev-parse', '--show-toplevel'], probeOpts).trim();
      const absRoot = path.resolve(rootDir);
      if (path.resolve(topLevel) !== absRoot) {
        let ignoredByParent = true;
        try {
          // git check-ignore exits 0 if the path IS ignored, 1 if not.
          execFileSync('git', ['check-ignore', '-q', absRoot], probeOpts);
        } catch {
          ignoredByParent = false; // not ignored — safe to use git ls-files
        }
        if (ignoredByParent) {
          return null;
        }
      }

      const collected = new Set<string>();
      const embeddedRoots = new Set<string>();
      collectGitFiles(rootDir, '', collected, embeddedRoots);

      // Built-in default ignores apply uniformly — to tracked files too, since
      // committing a dependency/build dir doesn't make it project code; a
      // `.gitignore` negation (e.g. `!vendor/`) is the explicit opt-in (#407).
      // Files inside an EMBEDDED repo are matched against that repo's own
      // rules, not the parent's (#514).
      const scope = buildScopeIgnore(rootDir, embeddedRoots);
      const visible = new Set<string>();
      for (const file of collected) {
        if (!scope.ignores(file)) {
          visible.add(file);
        }
      }
      return visible;
    } catch {
      return null;
    }
  }
  /**
   * Changed paths reported by `git status`, grouped by how the index must react.
   * Filled in by `collectGitStatus`; `getGitChangedFiles` hands it to callers
   * (or returns null instead when git fails, signaling a full filesystem scan).
   */
  interface GitChanges {
    modified: string[]; // any status other than `??` or one containing `D` (M, MM, AM, A, …) — files to re-hash + re-index
    added: string[]; // `??` — new untracked files to index
    deleted: string[]; // status contains `D` — files to remove from DB
  }
  /**
   * Detect changed files through `git status` rather than scanning every file.
   *
   * The work is delegated to `collectGitStatus`, which also recurses into
   * embedded repos — the untracked kind (#193: the parent's status collapses
   * them to an opaque `?? subdir/` entry) and the gitignored kind (#514: they
   * never show up in the parent's status) — so a multi-repo workspace syncs
   * without a full rescan. The one blind spot is deleting an ENTIRE embedded
   * repo directory: the child status that would report those deletions is gone
   * with it, and only a full `codegraph index` reconciles that.
   *
   * @param rootDir Root of the project (and of the outermost repo to query).
   * @returns The grouped changes, or `null` when collection throws so the
   *   caller falls back to a full scan.
   */
  function getGitChangedFiles(rootDir: string): GitChanges | null {
    const collected: GitChanges = { modified: [], added: [], deleted: [] };
    try {
      collectGitStatus(rootDir, '', collected);
    } catch {
      return null;
    }
    return collected;
  }
  /**
   * Run `git status` in `repoDir` and append its changed source files to `out`,
   * then recurse into embedded repos reachable from it.
   *
   * Status is requested with `-z`: records are NUL-terminated and paths are
   * emitted verbatim. Without it git C-quotes any path containing spaces or
   * non-ASCII characters (`?? "my file.ts"`), which would corrupt the recorded
   * path and make the source-file check reject it. `--no-renames` keeps every
   * record to a single path.
   *
   * @param repoDir Directory of the repo to query (used as git's cwd).
   * @param prefix Path of `repoDir` relative to the project root, prepended to
   *   every reported path ('' for the root repo).
   * @param out Accumulator that receives the changes; mutated in place.
   * @throws Whatever `execFileSync` throws when git fails or times out.
   */
  function collectGitStatus(repoDir: string, prefix: string, out: GitChanges): void {
    const output = execFileSync(
      'git',
      ['status', '--porcelain', '-z', '--no-renames'],
      { cwd: repoDir, encoding: 'utf-8', timeout: 10000, maxBuffer: 50 * 1024 * 1024, stdio: ['pipe', 'pipe', 'pipe'], windowsHide: true }
    );
    const untrackedDirs: string[] = [];
    for (const record of output.split('\0')) {
      if (record.length < 4) continue; // Minimum: "XY file"
      const statusCode = record.substring(0, 2);
      const rel = normalizePath(record.substring(3));
      // Untracked directory entries (trailing slash) may hide an embedded repo —
      // collect for the recursion below instead of treating as a file.
      if (statusCode === '??' && rel.endsWith('/')) {
        untrackedDirs.push(rel);
        continue;
      }
      const filePath = normalizePath(prefix + rel);
      // Skip non-source files (git status already omits .gitignored paths).
      if (!isSourceFile(filePath)) continue;
      if (statusCode === '??') {
        out.added.push(filePath);
      } else if (statusCode.includes('D')) {
        out.deleted.push(filePath);
      } else {
        // M, MM, AM, A (staged), etc. — treat as modified
        out.modified.push(filePath);
      }
    }
    // Recurse embedded repos found under untracked dirs (at the dir itself or
    // nested deeper) and under this repo's gitignored dirs.
    for (const rel of untrackedDirs) {
      for (const repoRel of findNestedGitRepos(path.join(repoDir, rel), rel)) {
        collectGitStatus(path.join(repoDir, repoRel), prefix + repoRel, out);
      }
    }
    for (const rel of findIgnoredEmbeddedRepos(repoDir)) {
      collectGitStatus(path.join(repoDir, rel), prefix + rel, out);
    }
  }
  /**
   * Find the source files of a project.
   *
   * When git can list the project (`getGitVisibleFiles` returns a set), that
   * listing is filtered down to paths `isSourceFile` accepts. Otherwise the
   * scan falls back to `scanDirectoryWalk`, a filesystem walk for non-git
   * projects.
   *
   * @param rootDir Project root to scan.
   * @param onProgress Optional callback invoked after each accepted file with
   *   the running count and the file's path.
   * @returns Paths of the accepted source files.
   */
  export function scanDirectory(
    rootDir: string,
    onProgress?: (current: number, file: string) => void
  ): string[] {
    const visible = getGitVisibleFiles(rootDir);
    if (!visible) {
      // Not usable as a git project — walk the filesystem instead.
      return scanDirectoryWalk(rootDir, onProgress);
    }

    // Fast path: git already applied .gitignore, only the source filter remains.
    const sources: string[] = [];
    for (const candidate of visible) {
      if (!isSourceFile(candidate)) continue;
      sources.push(candidate);
      onProgress?.(sources.length, candidate);
    }
    return sources;
  }
  /**
   * Async counterpart of `scanDirectory`. On the git path it yields to the
   * event loop after every 100 accepted files so worker threads get a chance to
   * receive and render progress messages. The filesystem-walk fallback runs
   * synchronously.
   *
   * @param rootDir Project root to scan.
   * @param onProgress Optional callback invoked after each accepted file with
   *   the running count and the file's path.
   * @returns Paths of the accepted source files.
   */
  export async function scanDirectoryAsync(
    rootDir: string,
    onProgress?: (current: number, file: string) => void
  ): Promise<string[]> {
    const visible = getGitVisibleFiles(rootDir);
    if (!visible) {
      return scanDirectoryWalk(rootDir, onProgress);
    }

    const sources: string[] = [];
    for (const candidate of visible) {
      if (!isSourceFile(candidate)) continue;
      sources.push(candidate);
      onProgress?.(sources.length, candidate);
      // Yield every 100 files so worker threads can render progress
      if (sources.length % 100 === 0) {
        await new Promise<void>(r => setImmediate(r));
      }
    }
    return sources;
  }
  /**
   * Filesystem walk fallback for non-git projects.
   *
   * Walks `rootDir` depth-first, applying the built-in default ignores (merged
   * with the root .gitignore by `buildDefaultIgnore`) plus every nested
   * .gitignore found on the way down. Symlinks are followed; each directory's
   * real path is remembered so a symlink cycle is visited only once. `.git` and
   * CodeGraph data directories are never entered.
   *
   * @param rootDir Project root to walk.
   * @param onProgress Optional callback invoked after each accepted file with
   *   the running count and the file's root-relative path.
   * @returns Root-relative paths of the accepted source files.
   */
  function scanDirectoryWalk(
    rootDir: string,
    onProgress?: (current: number, file: string) => void
  ): string[] {
    const files: string[] = [];
    let count = 0;
    const visitedDirs = new Set<string>();

    // A .gitignore matcher scoped to the directory that declared it. Patterns in
    // a nested .gitignore are relative to that directory, so we keep the dir
    // alongside the matcher and test paths relative to it — mirroring how git
    // applies .gitignore files at every level.
    interface ScopedIgnore {
      dir: string;
      ig: Ignore;
    }

    const loadIgnore = (dir: string): ScopedIgnore | null => {
      const giPath = path.join(dir, '.gitignore');
      if (!fs.existsSync(giPath)) return null;
      // readGitignorePatterns is defensive: a non-UTF-8 (DLP-encrypted) or
      // uncompilable .gitignore is skipped/filtered with a warning, never thrown
      // (issue #682) — so the per-file `.ignores()` calls below can't crash.
      const patterns = readGitignorePatterns(giPath);
      return patterns ? { dir, ig: ignore().add(patterns) } : null;
    };

    const isIgnored = (fullPath: string, isDir: boolean, matchers: ScopedIgnore[]): boolean => {
      for (const { dir, ig } of matchers) {
        let rel = normalizePath(path.relative(dir, fullPath));
        // Not under this matcher's dir. Compare whole segments: a plain
        // `startsWith('..')` would also skip entries whose NAME begins with two
        // dots (`..cache/`, `..foo.ts`) and let them bypass the ignore rules.
        // (Presumes normalizePath yields forward slashes — the `rel += '/'`
        // below relies on the same convention.)
        if (!rel || rel === '..' || rel.startsWith('../')) continue;
        if (isDir) rel += '/'; // dir-only rules (e.g. `build/`) only match with the slash
        if (ig.ignores(rel)) return true;
      }
      return false;
    };

    // Record one accepted source file and report progress.
    const recordFile = (relativePath: string): void => {
      files.push(relativePath);
      count++;
      onProgress?.(count, relativePath);
    };

    function walk(dir: string, matchers: ScopedIgnore[]): void {
      let realDir: string;
      try {
        realDir = fs.realpathSync(dir);
      } catch {
        logDebug('Skipping unresolvable directory', { dir });
        return;
      }
      if (visitedDirs.has(realDir)) {
        logDebug('Skipping already-visited directory (symlink cycle)', { dir, realDir });
        return;
      }
      visitedDirs.add(realDir);

      // This directory's own .gitignore (if present) applies to everything below it.
      // The root's .gitignore is already merged into the seeded base matcher (so a
      // negation there can override a built-in default), so skip it here.
      const own = dir === rootDir ? null : loadIgnore(dir);
      const active = own ? [...matchers, own] : matchers;

      let entries: fs.Dirent[];
      try {
        entries = fs.readdirSync(dir, { withFileTypes: true });
      } catch (error) {
        logDebug('Skipping unreadable directory', { dir, error: String(error) });
        return;
      }

      // Descend into a directory or record a file, honouring the active ignores.
      const visit = (fullPath: string, relativePath: string, isDir: boolean, isFile: boolean): void => {
        if (isDir) {
          if (!isIgnored(fullPath, true, active)) walk(fullPath, active);
        } else if (isFile) {
          if (!isIgnored(fullPath, false, active) && isSourceFile(relativePath)) {
            recordFile(relativePath);
          }
        }
      };

      for (const entry of entries) {
        // Never descend into git internals or any CodeGraph data directory
        // (the active one or a sibling another environment created — #636).
        if (entry.name === '.git' || isCodeGraphDataDir(entry.name)) continue;

        const fullPath = path.join(dir, entry.name);
        const relativePath = normalizePath(path.relative(rootDir, fullPath));

        if (entry.isSymbolicLink()) {
          // Classify a symlink by its resolved target; a dangling link makes
          // realpath/stat throw and is skipped.
          try {
            const stat = fs.statSync(fs.realpathSync(fullPath));
            visit(fullPath, relativePath, stat.isDirectory(), stat.isFile());
          } catch {
            logDebug('Skipping broken symlink', { path: fullPath });
          }
          continue;
        }

        visit(fullPath, relativePath, entry.isDirectory(), entry.isFile());
      }
    }

    // Seed a base matcher with the built-in default ignores (merged with the root
    // .gitignore so a negation can override). Nested .gitignores still layer per-dir.
    walk(rootDir, [{ dir: rootDir, ig: buildDefaultIgnore(rootDir) }]);
    return files;
  }
  794. /**
  795. * Extraction orchestrator
  796. */
  797. export class ExtractionOrchestrator {
  // Project root: scans start here and file paths are validated against it.
  private rootDir: string;
  // Database access used to look up and store indexed files.
  private queries: QueryBuilder;
  /**
   * Names of frameworks detected for this project. Passed to the extractors so
   * framework-specific ones (route nodes, middleware, etc.) run after the
   * tree-sitter pass. `null` means detection has not run yet: indexAll()
   * resets it at the start of every run, and ensureDetectedFrameworks()
   * fills it on first use so single-file re-index paths can detect on the spot.
   */
  private detectedFrameworkNames: string[] | null = null;
  /**
   * @param rootDir Root directory of the project to index.
   * @param queries Query builder backing the index database.
   */
  constructor(rootDir: string, queries: QueryBuilder) {
    this.rootDir = rootDir;
    this.queries = queries;
  }
  /**
   * Build a filesystem-backed ResolutionContext that is just enough for
   * framework detection. The graph-query methods (getNodesByName etc.) all
   * return empty arrays because the DB hasn't been populated yet; detection
   * works from the file list and the filesystem accessors instead.
   *
   * @param files Scanned file list exposed through `getAllFiles`.
   * @returns A context rooted at this orchestrator's project directory.
   */
  private buildDetectionContext(files: string[]): ResolutionContext {
    const projectRoot = this.rootDir;

    // True when the path stays within the root and exists on disk.
    const fileExists = (relativePath: string): boolean => {
      const resolved = validatePathWithinRoot(projectRoot, relativePath);
      if (!resolved) return false;
      try {
        return fs.existsSync(resolved);
      } catch {
        return false;
      }
    };

    // UTF-8 contents of a file within the root, or null when it can't be read.
    const readFile = (relativePath: string): string | null => {
      const resolved = validatePathWithinRoot(projectRoot, relativePath);
      if (!resolved) return null;
      try {
        return fs.readFileSync(resolved, 'utf-8');
      } catch {
        return null;
      }
    };

    // Monorepo support — needed by framework detect()s that probe
    // subpackage manifests (e.g. fabric-view looking at
    // packages/<sub>/package.json when the root manifest is just a
    // workspace declaration). Matches the resolver-context shape.
    const listDirectories = (relativePath: string): string[] => {
      const isRoot = relativePath === '.' || relativePath === '';
      const target = isRoot ? projectRoot : path.join(projectRoot, relativePath);
      try {
        const names: string[] = [];
        for (const entry of fs.readdirSync(target, { withFileTypes: true })) {
          if (entry.isDirectory()) names.push(entry.name);
        }
        return names;
      } catch {
        return [];
      }
    };

    return {
      getNodesInFile: () => [],
      getNodesByName: () => [],
      getNodesByQualifiedName: () => [],
      getNodesByKind: () => [],
      getNodesByLowerName: () => [],
      getImportMappings: () => [],
      getAllFiles: () => files,
      getProjectRoot: () => projectRoot,
      fileExists,
      readFile,
      listDirectories,
    };
  }
  /**
   * Return the detected framework names, running detection on first use.
   * The result is cached on the orchestrator, so repeat calls are free until
   * the cache is reset to `null`.
   *
   * @param files Already-scanned file list; when omitted the project is
   *   scanned with `scanDirectory`.
   * @returns Names of the detected frameworks.
   */
  private ensureDetectedFrameworks(files?: string[]): string[] {
    const cached = this.detectedFrameworkNames;
    if (cached !== null) return cached;

    const context = this.buildDetectionContext(files ?? scanDirectory(this.rootDir));
    const names = detectFrameworks(context).map((r) => r.name);
    this.detectedFrameworkNames = names;
    return names;
  }
  /**
   * Index all files in the project.
   *
   * Phases:
   *  1. Scan the project for source files and detect frameworks.
   *  2. Read files in batches and parse each one — in a worker thread when the
   *     compiled `parse-worker.js` sits next to this module, in-process
   *     otherwise — storing results on the calling thread.
   *  3. Retry files whose parse died with a WASM memory error on a fresh
   *     worker, then once more with comment-only lines blanked out.
   *
   * The parse worker is always shut down before this method settles, whether
   * it returns normally, returns an aborted result, or throws.
   *
   * @param onProgress Optional callback receiving scanning/parsing progress.
   * @param signal Optional abort signal; checked before every batch and file.
   *   An aborted run resolves with `success: false` and an 'Aborted' error.
   * @param verbose When true, worker lifecycle messages are printed to stdout.
   * @returns Counters, collected errors and duration of the run.
   */
  async indexAll(
    onProgress?: (progress: IndexProgress) => void,
    signal?: AbortSignal,
    verbose?: boolean
  ): Promise<IndexResult> {
    type ParseWorker = import('worker_threads').Worker;

    await initGrammars();
    const startTime = Date.now();
    const errors: ExtractionError[] = [];
    let filesIndexed = 0;
    let filesSkipped = 0;
    let filesErrored = 0;
    let totalNodes = 0;
    let totalEdges = 0;

    const log = verbose
      ? (msg: string) => { console.log(`[worker] ${msg}`); }
      : (_msg: string) => {};

    // Result for an aborted run: whatever has been counted so far, with an
    // 'Aborted' error in front of the errors collected up to this point.
    const abortedResult = (): IndexResult => ({
      success: false,
      filesIndexed,
      filesSkipped,
      filesErrored,
      nodesCreated: totalNodes,
      edgesCreated: totalEdges,
      errors: [{ message: 'Aborted', severity: 'error' }, ...errors],
      durationMs: Date.now() - startTime,
    });

    // Phase 1: Scan for files
    onProgress?.({
      phase: 'scanning',
      current: 0,
      total: 0,
    });
    const files = await scanDirectoryAsync(this.rootDir, (current, file) => {
      onProgress?.({
        phase: 'scanning',
        current,
        total: 0,
        currentFile: file,
      });
    });

    // Detect frameworks once per indexAll run using the scanned file list.
    // Names are passed to each parse call so framework-specific extractors
    // (route nodes, middleware, etc.) run after the tree-sitter pass.
    // Framework detection is reset each run so adding e.g. requirements.txt
    // between runs is picked up without restarting the process.
    this.detectedFrameworkNames = null;
    const frameworkNames = this.ensureDetectedFrameworks(files);

    if (signal?.aborted) {
      return abortedResult();
    }

    // Phase 2: Parse files in a worker thread (keeps main thread unblocked for UI)
    const total = files.length;
    let processed = 0;

    // Emit parsing phase immediately so the progress bar appears during worker setup.
    // The yield lets the shimmer worker flush the phase transition to stdout before
    // the main thread starts synchronous grammar detection work.
    onProgress?.({
      phase: 'parsing',
      current: 0,
      total,
    });
    await new Promise(resolve => setImmediate(resolve));

    // Detect needed languages and load grammars in the parse worker
    const neededLanguages = [...new Set(files.map((f) => detectLanguage(f)))];
    // .h files default to 'c' but may be C++ — ensure cpp grammar is loaded when c is needed
    if (neededLanguages.includes('c') && !neededLanguages.includes('cpp')) {
      neededLanguages.push('cpp');
    }

    // Try to use a worker thread for parsing (keeps main thread unblocked for UI).
    // Falls back to in-process parsing if the compiled worker is unavailable (e.g. tests).
    const parseWorkerPath = path.join(__dirname, 'parse-worker.js');
    const useWorker = fs.existsSync(parseWorkerPath);
    let WorkerClass: typeof import('worker_threads').Worker | null = null;
    if (useWorker) {
      const { Worker } = await import('worker_threads');
      WorkerClass = Worker;
    } else {
      // In-process fallback: load grammars locally
      await loadGrammarsForLanguages(neededLanguages);
    }

    // --- Worker lifecycle management ---
    // The worker can crash (OOM in WASM) or hang on pathological files.
    // We track pending parse promises and handle both cases:
    // - Timeout: terminate + restart the worker, reject the timed-out request
    // - Crash: reject all pending promises, restart for remaining files
    let parseWorker: ParseWorker | null = null;
    let nextId = 0;
    let workerParseCount = 0;
    const pendingParses = new Map<number, {
      resolve: (result: ExtractionResult) => void;
      reject: (err: Error) => void;
      timer: ReturnType<typeof setTimeout>;
    }>();

    function rejectAllPending(reason: string): void {
      for (const [id, pending] of pendingParses) {
        clearTimeout(pending.timer);
        pendingParses.delete(id);
        pending.reject(new Error(reason));
      }
    }

    function attachWorkerHandlers(w: ParseWorker): void {
      w.on('message', (msg: { type: string; id?: number; result?: ExtractionResult }) => {
        if (msg.type === 'parse-result' && msg.id !== undefined) {
          const pending = pendingParses.get(msg.id);
          if (pending) {
            clearTimeout(pending.timer);
            pendingParses.delete(msg.id);
            pending.resolve(msg.result!);
          }
        }
      });
      // A worker retired by recycleWorker() or by a timeout is terminated in
      // the background and exits with a non-zero code AFTER its replacement
      // may already have a request in flight. Only the ACTIVE worker is
      // allowed to fail pending requests; a retired one is ignored so it
      // cannot reject a parse that belongs to its successor.
      w.on('error', (err) => {
        logWarn('Parse worker error', { error: err.message });
        if (parseWorker === w) {
          rejectAllPending(`Worker error: ${err.message}`);
        }
      });
      w.on('exit', (code) => {
        if (parseWorker !== w) return;
        if (code !== 0 && pendingParses.size > 0) {
          logWarn('Parse worker exited unexpectedly', { code });
          rejectAllPending(`Worker exited with code ${code}`);
        }
        // Clear reference so we know to respawn, reset count so
        // the fresh worker gets a full cycle before recycling.
        parseWorker = null;
        workerParseCount = 0;
      });
    }

    async function ensureWorker(): Promise<ParseWorker> {
      if (parseWorker) return parseWorker;
      log('Spawning new parse worker...');
      const w = new WorkerClass!(parseWorkerPath);
      parseWorker = w;
      attachWorkerHandlers(w);
      // Load grammars in the new worker. The first message decides the
      // outcome; an error or exit before it arrives rejects instead of
      // leaving this promise (and the whole index run) pending forever.
      await new Promise<void>((resolve, reject) => {
        function cleanup(): void {
          w.off('message', onMessage);
          w.off('error', onError);
          w.off('exit', onExit);
        }
        const onMessage = (msg: { type: string }): void => {
          cleanup();
          if (msg.type === 'grammars-loaded') resolve();
          else reject(new Error(`Unexpected message: ${msg.type}`));
        };
        const onError = (err: Error): void => {
          cleanup();
          reject(new Error(`Worker error while loading grammars: ${err.message}`));
        };
        const onExit = (code: number): void => {
          cleanup();
          reject(new Error(`Worker exited with code ${code} while loading grammars`));
        };
        w.on('message', onMessage);
        w.on('error', onError);
        w.on('exit', onExit);
        w.postMessage({ type: 'load-grammars', languages: neededLanguages });
      });
      return w;
    }

    /**
     * Recycle the worker thread to reclaim WASM memory.
     * Terminates the current worker and clears the reference so
     * ensureWorker() will spawn a fresh one on the next call.
     */
    function recycleWorker(): void {
      if (!parseWorker) return;
      log(`Recycling worker after ${workerParseCount} parses (heap: ${Math.round(process.memoryUsage().rss / 1024 / 1024)}MB RSS)`);
      const w = parseWorker;
      parseWorker = null;
      workerParseCount = 0;
      // Fire-and-forget: worker.terminate() can hang if WASM is stuck
      w.terminate().catch(() => {});
    }

    /** Reject anything still pending and terminate the worker, if one is alive. */
    function shutdownWorker(): void {
      rejectAllPending('Indexing complete');
      const w = parseWorker;
      parseWorker = null;
      if (w) w.terminate().catch(() => {});
    }

    async function requestParse(filePath: string, content: string): Promise<ExtractionResult> {
      if (!WorkerClass) {
        // In-process fallback
        return extractFromSource(
          filePath,
          content,
          detectLanguage(filePath, content),
          frameworkNames
        );
      }

      // Recycle the worker before the next parse if we've hit the threshold.
      // This destroys the WASM linear memory (which can grow but never shrink)
      // and starts a fresh worker with a clean heap.
      if (workerParseCount >= WORKER_RECYCLE_INTERVAL) {
        recycleWorker();
      }

      const worker = await ensureWorker();
      const id = nextId++;
      workerParseCount++;

      // Scale the timeout for large files: PARSE_TIMEOUT_MS plus 10s per 100KB
      const timeoutMs = PARSE_TIMEOUT_MS + Math.floor(content.length / 100_000) * 10_000;

      return new Promise<ExtractionResult>((resolve, reject) => {
        const timer = setTimeout(() => {
          pendingParses.delete(id);
          log(`TIMEOUT: ${filePath} exceeded ${timeoutMs}ms — killing worker`);
          // Reject FIRST — worker.terminate() can hang if WASM is stuck
          parseWorker = null;
          workerParseCount = 0;
          reject(new Error(`Parse timed out after ${timeoutMs}ms`));
          // Fire-and-forget: kill the stuck worker in the background
          worker.terminate().catch(() => {});
        }, timeoutMs);

        pendingParses.set(id, { resolve, reject, timer });
        worker.postMessage({ type: 'parse', id, filePath, content, frameworkNames });
      });
    }

    // Matches lines that consist solely of a `//` comment.
    const commentOnlyLine = /^\s*\/\//;

    // Re-parse one previously failed file on a fresh worker and, on success,
    // store it and move it from the error tally to the indexed tally.
    //  - 'recovered': parsed and stored
    //  - 'crashed':   the parse itself threw again
    //  - 'skipped':   file unreadable / outside root, or parsed into nothing
    //                 but errors
    // `toParseInput` maps the file content to what is handed to the parser;
    // the ORIGINAL content is what gets stored.
    const retryParse = async (
      errEntry: ExtractionError,
      toParseInput: (source: string) => string,
      successLabel: string
    ): Promise<'recovered' | 'crashed' | 'skipped'> => {
      const filePath = errEntry.filePath!;
      // Fresh worker for every retry — maximum WASM headroom
      recycleWorker();

      let content: string;
      let stats: fs.Stats;
      try {
        const fullPath = validatePathWithinRoot(this.rootDir, filePath);
        if (!fullPath) return 'skipped';
        content = await fsp.readFile(fullPath, 'utf-8');
        // Stat through the validated path, inside the guard, so a file that
        // vanished since the first pass is skipped instead of failing the run.
        stats = await fsp.stat(fullPath);
      } catch {
        return 'skipped';
      }

      let result: ExtractionResult;
      try {
        result = await requestParse(filePath, toParseInput(content));
      } catch {
        return 'crashed';
      }

      if (result.nodes.length === 0 && result.errors.length > 0) return 'skipped';

      const language = detectLanguage(filePath, content);
      this.storeExtractionResult(filePath, content, language, stats, result);
      const idx = errors.indexOf(errEntry);
      if (idx >= 0) errors.splice(idx, 1);
      filesErrored--;
      filesIndexed++;
      totalNodes += result.nodes.length;
      totalEdges += result.edges.length;
      log(`${successLabel}: ${filePath} (${result.nodes.length} nodes)`);
      return 'recovered';
    };

    try {
      if (WorkerClass) {
        await ensureWorker();
      }

      for (let i = 0; i < files.length; i += FILE_IO_BATCH_SIZE) {
        if (signal?.aborted) {
          return abortedResult();
        }

        const batch = files.slice(i, i + FILE_IO_BATCH_SIZE);

        // Read files in parallel (with path validation before any I/O)
        const fileContents = await Promise.all(
          batch.map(async (fp) => {
            try {
              const fullPath = validatePathWithinRoot(this.rootDir, fp);
              if (!fullPath) {
                logWarn('Path traversal blocked in batch reader', { filePath: fp });
                return { filePath: fp, content: null as string | null, stats: null as fs.Stats | null, error: new Error('Path traversal blocked') };
              }
              const content = await fsp.readFile(fullPath, 'utf-8');
              const stats = await fsp.stat(fullPath);
              return { filePath: fp, content, stats, error: null as Error | null };
            } catch (err) {
              return { filePath: fp, content: null as string | null, stats: null as fs.Stats | null, error: err as Error };
            }
          })
        );

        // Send to worker for parsing, store results on main thread
        for (const { filePath, content, stats, error } of fileContents) {
          if (signal?.aborted) {
            return abortedResult();
          }

          // Report progress before parsing (show current file being worked on)
          onProgress?.({
            phase: 'parsing',
            current: processed,
            total,
            currentFile: filePath,
          });

          if (error || content === null || stats === null) {
            processed++;
            filesErrored++;
            errors.push({
              message: `Failed to read file: ${error instanceof Error ? error.message : String(error)}`,
              filePath,
              severity: 'error',
              code: 'read_error',
            });
            continue;
          }

          // Honour MAX_FILE_SIZE. Without this check, vendored generated
          // headers, minified bundles, and other multi-MB files get indexed,
          // wasting WASM heap and the worker recycle budget on inputs with no
          // useful symbols. The single-file extractFile path already enforces
          // this; the bulk path used to silently skip the check.
          if (stats.size > MAX_FILE_SIZE) {
            processed++;
            filesSkipped++;
            errors.push({
              message: `File exceeds max size (${stats.size} > ${MAX_FILE_SIZE})`,
              filePath,
              severity: 'warning',
              code: 'size_exceeded',
            });
            onProgress?.({ phase: 'parsing', current: processed, total });
            continue;
          }

          // Parse in worker thread (main thread stays unblocked).
          // Wrapped in try/catch to handle worker timeouts and crashes gracefully.
          let result: ExtractionResult;
          try {
            result = await requestParse(filePath, content);
          } catch (parseErr) {
            processed++;
            filesErrored++;
            errors.push({
              message: parseErr instanceof Error ? parseErr.message : String(parseErr),
              filePath,
              severity: 'error',
              code: 'parse_error',
            });
            continue;
          }

          processed++;

          // Store in database on main thread (SQLite is not thread-safe)
          if (result.nodes.length > 0 || result.errors.length === 0) {
            const language = detectLanguage(filePath, content);
            this.storeExtractionResult(filePath, content, language, stats, result);
          }

          if (result.errors.length > 0) {
            for (const err of result.errors) {
              if (!err.filePath) err.filePath = filePath;
            }
            errors.push(...result.errors);
          }

          if (result.nodes.length > 0) {
            filesIndexed++;
            totalNodes += result.nodes.length;
            totalEdges += result.edges.length;
          } else if (result.errors.some((e) => e.severity === 'error')) {
            filesErrored++;
          } else {
            // Files with no symbols but no errors (yaml, twig, properties) are
            // tracked at the file level — count them as indexed so the CLI
            // doesn't misleadingly report "No files found to index".
            const lang = detectLanguage(filePath, content);
            if (isFileLevelOnlyLanguage(lang)) {
              filesIndexed++;
            } else {
              filesSkipped++;
            }
          }
        }
      }

      // Report 100% so the progress bar doesn't hang at 99%
      onProgress?.({
        phase: 'parsing',
        current: total,
        total,
      });

      // Yield so the shimmer worker's buffered stdout writes can flush.
      // Worker thread stdout is proxied through the main thread's event loop,
      // so synchronous work here blocks the animation from rendering.
      await new Promise(resolve => setImmediate(resolve));

      // Retry pass: files that failed due to WASM memory corruption may succeed
      // on a fresh worker with a clean heap. Recycle before each attempt so
      // every file gets the absolute cleanest WASM state possible.
      const retryableErrors = errors.filter(
        (e) => e.code === 'parse_error' && e.filePath &&
          (e.message.includes('Worker exited') || e.message.includes('memory access out of bounds'))
      );

      if (retryableErrors.length > 0 && WorkerClass) {
        log(`Retrying ${retryableErrors.length} files that failed due to WASM memory errors...`);

        const stillFailing: typeof retryableErrors = [];
        for (const errEntry of retryableErrors) {
          if (signal?.aborted) break;
          const outcome = await retryParse(errEntry, (source) => source, 'Retry OK');
          if (outcome === 'crashed') stillFailing.push(errEntry);
        }

        // Last resort: for files that still crash on a clean worker, strip
        // comment-only lines to reduce WASM memory pressure. Many compiler
        // test files are 90%+ comments (CHECK directives) that don't contribute
        // code nodes but consume parser memory.
        if (stillFailing.length > 0) {
          log(`${stillFailing.length} files still failing — retrying with comments stripped...`);

          // Blank the lines that are entirely comments (preserving line numbers
          // by replacing with empty lines so node positions stay correct)
          const stripComments = (source: string): string =>
            source
              .split('\n')
              .map(line => commentOnlyLine.test(line) ? '' : line)
              .join('\n');

          for (const errEntry of stillFailing) {
            if (signal?.aborted) break;
            await retryParse(errEntry, stripComments, 'Retry (stripped) OK');
          }
        }
      }

      return {
        success: filesIndexed > 0 || errors.filter((e) => e.severity === 'error').length === 0,
        filesIndexed,
        filesSkipped,
        filesErrored,
        nodesCreated: totalNodes,
        edgesCreated: totalEdges,
        errors,
        durationMs: Date.now() - startTime,
      };
    } finally {
      // Shut down the parse worker and clear any pending timers on EVERY exit
      // path — a live worker thread would otherwise outlive a failed run.
      shutdownWorker();
    }
  }
  1319. /**
  1320. * Index specific files
  1321. */
  async indexFiles(filePaths: string[]): Promise<IndexResult> {
    const startedAt = Date.now();
    const collectedErrors: ExtractionError[] = [];
    // Only entries with severity 'error' count as failures; warnings do not.
    const isHardError = (e: ExtractionError): boolean => e.severity === 'error';

    let indexedCount = 0;
    let skippedCount = 0;
    let erroredCount = 0;
    let nodeTotal = 0;
    let edgeTotal = 0;

    // Files are processed one at a time, in the order given.
    for (const filePath of filePaths) {
      const outcome = await this.indexFile(filePath);

      // Every reported error/warning is surfaced to the caller, whatever the outcome.
      for (const err of outcome.errors) {
        collectedErrors.push(err);
      }

      // Produced nodes: counts as indexed (even if warnings/errors were also reported).
      if (outcome.nodes.length > 0) {
        indexedCount += 1;
        nodeTotal += outcome.nodes.length;
        edgeTotal += outcome.edges.length;
        continue;
      }

      // No nodes and at least one hard error: counts as errored.
      if (outcome.errors.some(isHardError)) {
        erroredCount += 1;
        continue;
      }

      // No nodes and no hard error: a file whose tracked language is
      // file-level-only still counts as indexed; everything else is skipped.
      const record = this.queries.getFileByPath(filePath);
      if (record && isFileLevelOnlyLanguage(record.language)) {
        indexedCount += 1;
      } else {
        skippedCount += 1;
      }
    }

    // Success means: something was indexed, or nothing failed hard.
    const hasHardError = collectedErrors.some(isHardError);
    return {
      success: indexedCount > 0 || !hasHardError,
      filesIndexed: indexedCount,
      filesSkipped: skippedCount,
      filesErrored: erroredCount,
      nodesCreated: nodeTotal,
      edgesCreated: edgeTotal,
      errors: collectedErrors,
      durationMs: Date.now() - startedAt,
    };
  }
  1361. /**
  1362. * Index a single file
  1363. */
  async indexFile(relativePath: string): Promise<ExtractionResult> {
    // Resolve the path against the project root. A falsy result is reported
    // as a blocked path traversal and nothing is read from disk.
    const fullPath = validatePathWithinRoot(this.rootDir, relativePath);
    if (!fullPath) {
      return {
        nodes: [],
        edges: [],
        unresolvedReferences: [],
        errors: [{ message: `Path traversal blocked: ${relativePath}`, filePath: relativePath, severity: 'error', code: 'path_traversal' }],
        durationMs: 0,
      };
    }

    // Read file stats first, then content.
    let content: string;
    let stats: fs.Stats;
    try {
      stats = await fsp.stat(fullPath);

      // Reject oversized files BEFORE reading them. `indexFileWithContent`
      // performs the same check, but only after the whole file has already
      // been loaded into memory; checking here avoids that wasted read and
      // returns the identical `size_exceeded` warning.
      if (stats.size > MAX_FILE_SIZE) {
        return {
          nodes: [],
          edges: [],
          unresolvedReferences: [],
          errors: [
            {
              message: `File exceeds max size (${stats.size} > ${MAX_FILE_SIZE})`,
              filePath: relativePath,
              severity: 'warning',
              code: 'size_exceeded',
            },
          ],
          durationMs: 0,
        };
      }

      content = await fsp.readFile(fullPath, 'utf-8');
    } catch (error) {
      // stat/read failures are returned as a result, not thrown.
      return {
        nodes: [],
        edges: [],
        unresolvedReferences: [],
        errors: [
          {
            message: `Failed to read file: ${error instanceof Error ? error.message : String(error)}`,
            filePath: relativePath,
            severity: 'error',
            code: 'read_error',
          },
        ],
        durationMs: 0,
      };
    }

    return this.indexFileWithContent(relativePath, content, stats);
  }
  1399. /**
  1400. * Index a single file with pre-read content and stats.
  1401. * Used by the parallel batch reader to avoid redundant file I/O.
  1402. */
  async indexFileWithContent(
    relativePath: string,
    content: string,
    stats: fs.Stats
  ): Promise<ExtractionResult> {
    // Shape shared by every early exit: nothing extracted, zero duration.
    const nothingExtracted = (errors: ExtractionError[]): ExtractionResult => ({
      nodes: [],
      edges: [],
      unresolvedReferences: [],
      errors,
      durationMs: 0,
    });

    // Guard 1: the path must resolve inside the project root.
    if (!validatePathWithinRoot(this.rootDir, relativePath)) {
      logWarn('Path traversal blocked in indexFileWithContent', { relativePath });
      return nothingExtracted([
        { message: 'Path traversal blocked', filePath: relativePath, severity: 'error', code: 'path_traversal' },
      ]);
    }

    // Guard 2: oversized files are reported as a warning, not an error.
    if (stats.size > MAX_FILE_SIZE) {
      return nothingExtracted([
        {
          message: `File exceeds max size (${stats.size} > ${MAX_FILE_SIZE})`,
          filePath: relativePath,
          severity: 'warning',
          code: 'size_exceeded',
        },
      ]);
    }

    // Guard 3: unsupported languages yield an empty result with no errors.
    const language = detectLanguage(relativePath, content);
    if (!isLanguageSupported(language)) {
      return nothingExtracted([]);
    }

    // Extract from source. Framework names come from `ensureDetectedFrameworks`
    // (cached if indexAll has run, otherwise detected on the spot) so that
    // single-file re-index paths still emit route nodes / middleware / etc.
    const extraction = extractFromSource(
      relativePath,
      content,
      language,
      this.ensureDetectedFrameworks()
    );

    // Persist unless extraction reported errors AND produced no nodes.
    const shouldStore = extraction.nodes.length > 0 || extraction.errors.length === 0;
    if (shouldStore) {
      this.storeExtractionResult(relativePath, content, language, stats, extraction);
    }

    return extraction;
  }
  1459. /**
  1460. * Store extraction result in database
  1461. */
  private storeExtractionResult(
    filePath: string,
    content: string,
    language: Language,
    stats: fs.Stats,
    result: ExtractionResult
  ): void {
    const contentHash = hashContent(content);

    // Nothing to do when the file is already stored with identical content.
    const previous = this.queries.getFileByPath(filePath);
    if (previous && previous.contentHash === contentHash) {
      return;
    }

    // Snapshot incoming cross-file edges BEFORE deleting this file's nodes.
    // `deleteFile` cascades to every edge whose source OR target is a node in
    // this file. Edges whose SOURCE is in this file are re-emitted by the
    // extractor, but edges whose SOURCE lives in a different (unchanged) file
    // are not, and would be silently dropped (issue #899).
    //
    // The snapshot carries the target node's (kind, name) so each edge can be
    // re-pointed at the re-indexed target's NEW id: node ids include the
    // source line, so any line shift in this file changes the target ids and
    // re-inserting by the old id would lose the edge.
    const incomingFromOtherFiles = previous
      ? this.queries.getCrossFileIncomingEdgesWithTarget(filePath)
      : [];

    // Drop the previous data for this file.
    if (previous) {
      this.queries.deleteFile(filePath);
    }

    // Keep only nodes with every required field present. This prevents FK
    // violations from edges that reference nodes insertNode() would silently
    // skip (see issue #42).
    const validNodes = result.nodes.filter(
      (node) => node.id && node.kind && node.name && node.filePath && node.language
    );
    // Ids of the nodes that are actually stored; used to filter edges and refs.
    const insertedIds = new Set(validNodes.map((node) => node.id));

    if (validNodes.length > 0) {
      this.queries.insertNodes(validNodes);
    }

    // Store only edges whose two endpoints were both inserted above.
    if (result.edges.length > 0) {
      const storableEdges = result.edges.filter(
        (edge) => insertedIds.has(edge.source) && insertedIds.has(edge.target)
      );
      if (storableEdges.length > 0) {
        this.queries.insertEdges(storableEdges);
      }
    }

    // Re-insert the snapshotted cross-file incoming edges, re-resolving each
    // target to the re-indexed node's new id by (kind, name) within this file.
    // If the target symbol was renamed or removed there is no entry in the
    // lookup map and the edge stays dropped. This closes the #899 edge-drop
    // on `sync`.
    if (incomingFromOtherFiles.length > 0) {
      const idByKindAndName = new Map<string, string>();
      for (const node of validNodes) {
        // When several nodes share (kind, name), the last one wins.
        idByKindAndName.set(`${node.kind}\0${node.name}`, node.id);
      }

      const restored: Edge[] = [];
      for (const edge of incomingFromOtherFiles) {
        const retargetedId = idByKindAndName.get(`${edge.targetKind}\0${edge.targetName}`);
        if (!retargetedId) {
          continue;
        }
        restored.push({
          source: edge.source,
          target: retargetedId,
          kind: edge.kind,
          metadata: edge.metadata,
          line: edge.line,
          column: edge.column,
          provenance: edge.provenance,
        });
      }

      if (restored.length > 0) {
        this.queries.insertEdges(restored);
      }
    }

    // Insert unresolved references in one batch, filling in filePath/language
    // from this file when a reference does not carry its own.
    if (result.unresolvedReferences.length > 0) {
      const refsWithContext = result.unresolvedReferences
        .filter((ref) => insertedIds.has(ref.fromNodeId))
        .map((ref) => ({
          ...ref,
          filePath: ref.filePath ?? filePath,
          language: ref.language ?? language,
        }));
      if (refsWithContext.length > 0) {
        this.queries.insertUnresolvedRefsBatch(refsWithContext);
      }
    }

    // Finally record the file itself. `nodeCount` reflects the extractor's
    // node count (before the required-field filter above).
    const fileRecord: FileRecord = {
      path: filePath,
      contentHash,
      language,
      size: stats.size,
      modifiedAt: stats.mtimeMs,
      indexedAt: Date.now(),
      nodeCount: result.nodes.length,
      errors: result.errors.length > 0 ? result.errors : undefined,
    };
    this.queries.upsertFile(fileRecord);
  }
  1568. /**
  1569. * Sync the index with the current file state.
  1570. *
  1571. * Change detection is filesystem-based, never git: a (size, mtime) stat
  1572. * pre-filter skips unchanged files, then a content-hash compare confirms real
  1573. * changes. This works in non-git projects and catches committed changes from
  1574. * `git pull`/`checkout`/`merge`/`rebase` that `git status` cannot see.
  1575. */
  async sync(onProgress?: (progress: IndexProgress) => void): Promise<SyncResult> {
    await initGrammars(); // Initialize WASM runtime (grammars loaded lazily below)
    const startTime = Date.now();
    let filesAdded = 0;
    let filesModified = 0;
    let filesRemoved = 0;
    let nodesUpdated = 0;
    const changedFilePaths: string[] = [];

    onProgress?.({
      phase: 'scanning',
      current: 0,
      total: 0,
    });

    const filesToIndex: string[] = [];

    // === Filesystem reconcile (git-independent) ===
    // The source of truth for "what changed" is the filesystem vs the indexed
    // state — never git. We enumerate the current source files and reconcile
    // each against the DB. A cheap (size, mtime) stat pre-filter skips unchanged
    // files without reading or hashing them, so the expensive read+hash+parse
    // only runs for files that actually changed. This catches edits/adds/deletes
    // whether or not the project uses git, and crucially also catches committed
    // changes from `git pull`/`checkout`/`merge`/`rebase` — which `git status`
    // cannot see, because the working tree is clean afterward.
    const currentFiles = scanDirectory(this.rootDir);
    const filesChecked = currentFiles.length;
    const currentSet = new Set(currentFiles);

    const trackedFiles = this.queries.getAllFiles();
    const trackedMap = new Map<string, FileRecord>();
    for (const f of trackedFiles) {
      trackedMap.set(f.path, f);
    }

    // Removals: tracked in the DB but no longer a present source file. Check the
    // filesystem directly — `scanDirectory` (via `git ls-files`) still lists a
    // file deleted from disk but not yet staged, so set membership alone misses it.
    for (const tracked of trackedFiles) {
      if (!currentSet.has(tracked.path) || !fs.existsSync(path.join(this.rootDir, tracked.path))) {
        this.queries.deleteFile(tracked.path);
        filesRemoved++;
      }
    }

    // Adds / modifications.
    for (const filePath of currentFiles) {
      const fullPath = path.join(this.rootDir, filePath);
      const tracked = trackedMap.get(filePath);

      // Cheap pre-filter: an already-indexed file whose size AND mtime both match
      // the DB is unchanged — skip it without reading or hashing. (A content
      // change that preserves both exactly is the blind spot every mtime-based
      // incremental tool accepts; `index --force` is the escape hatch. Git bumps
      // mtime on every file it writes during checkout/merge, so pulls are caught.)
      // The stat is kept so the stored record can be refreshed further down.
      let stat: fs.Stats | undefined;
      if (tracked) {
        try {
          stat = fs.statSync(fullPath);
          if (stat.size === tracked.size && Math.floor(stat.mtimeMs) === Math.floor(tracked.modifiedAt)) {
            continue;
          }
        } catch (error) {
          logDebug('Skipping unstattable file during sync', { filePath, error: String(error) });
          continue;
        }
      }

      // New, or size/mtime changed — read + hash to confirm a real content change.
      let content: string;
      try {
        content = fs.readFileSync(fullPath, 'utf-8');
      } catch (error) {
        logDebug('Skipping unreadable file during sync', { filePath, error: String(error) });
        continue;
      }
      const contentHash = hashContent(content);

      if (!tracked) {
        filesToIndex.push(filePath);
        changedFilePaths.push(filePath);
        filesAdded++;
      } else if (tracked.contentHash !== contentHash) {
        filesToIndex.push(filePath);
        changedFilePaths.push(filePath);
        filesModified++;
      } else if (stat) {
        // Same content, but size/mtime drifted from the stored record (e.g. the
        // file was touched or rewritten with identical bytes). Refresh the stored
        // stat so the pre-filter above matches on the next sync; otherwise this
        // file would be re-read and re-hashed on every sync. Nodes and edges are
        // left untouched, and the file is not reported as changed.
        this.queries.upsertFile({ ...tracked, size: stat.size, modifiedAt: stat.mtimeMs });
      }
    }

    // Load only grammars needed for changed files
    if (filesToIndex.length > 0) {
      const neededLanguages = [...new Set(filesToIndex.map((f) => detectLanguage(f)))];
      // .h files default to 'c' but may be C++ — ensure cpp grammar is loaded
      if (neededLanguages.includes('c') && !neededLanguages.includes('cpp')) {
        neededLanguages.push('cpp');
      }
      await loadGrammarsForLanguages(neededLanguages);
    }

    // Index changed files sequentially, reporting progress before each one.
    const total = filesToIndex.length;
    for (let i = 0; i < filesToIndex.length; i++) {
      const filePath = filesToIndex[i]!;
      onProgress?.({
        phase: 'parsing',
        current: i + 1,
        total,
        currentFile: filePath,
      });
      const result = await this.indexFile(filePath);
      nodesUpdated += result.nodes.length;
    }

    return {
      filesChecked,
      filesAdded,
      filesModified,
      filesRemoved,
      nodesUpdated,
      durationMs: Date.now() - startTime,
      // `undefined` (rather than an empty array) when nothing was added or modified.
      changedFilePaths: changedFilePaths.length > 0 ? changedFilePaths : undefined,
    };
  }
  1688. /**
  1689. * Get files that have changed since last index.
  1690. * Uses git status as a fast path when available, falling back to full scan.
  1691. */
  getChangedFiles(): { added: string[]; modified: string[]; removed: string[] } {
    const added: string[] = [];
    const modified: string[] = [];
    const removed: string[] = [];

    // Shared by both paths: read + hash one file and compare it with its
    // tracked record. Not tracked => added; tracked with a different hash =>
    // modified; unreadable files are logged and ignored.
    const classify = (
      filePath: string,
      lookupTracked: (p: string) => FileRecord | null | undefined
    ): void => {
      let content: string;
      try {
        content = fs.readFileSync(path.join(this.rootDir, filePath), 'utf-8');
      } catch (error) {
        logDebug('Skipping unreadable file while detecting changes', { filePath, error: String(error) });
        return;
      }
      const currentHash = hashContent(content);
      const record = lookupTracked(filePath);
      if (!record) {
        added.push(filePath);
      } else if (record.contentHash !== currentHash) {
        modified.push(filePath);
      }
    };

    const gitChanges = getGitChangedFiles(this.rootDir);
    if (gitChanges) {
      // === Git fast path ===
      const fromDb = (p: string) => this.queries.getFileByPath(p);

      // Deleted files are only reported when the DB actually tracks them.
      for (const filePath of gitChanges.deleted) {
        if (fromDb(filePath)) {
          removed.push(filePath);
        }
      }

      // Modified + added files are both hash-compared against the DB. Untracked
      // (`??`) files stay untracked in git even after indexing, so treating them
      // as always-added would report them as pending forever. (See issue #206.)
      for (const filePath of [...gitChanges.modified, ...gitChanges.added]) {
        classify(filePath, fromDb);
      }

      return { added, modified, removed };
    }

    // === Fallback: full scan (non-git project or git failure) ===
    const currentFiles = new Set(scanDirectory(this.rootDir));
    const trackedFiles = this.queries.getAllFiles();

    // Index tracked records by path for O(1) lookups.
    const trackedByPath = new Map<string, FileRecord>();
    for (const record of trackedFiles) {
      trackedByPath.set(record.path, record);
    }

    // Removed: tracked in the DB but absent from the scan.
    for (const record of trackedFiles) {
      if (!currentFiles.has(record.path)) {
        removed.push(record.path);
      }
    }

    // Added / modified: every scanned file is read and hash-compared.
    const fromMap = (p: string) => trackedByPath.get(p);
    for (const filePath of currentFiles) {
      classify(filePath, fromMap);
    }

    return { added, modified, removed };
  }
  1766. }
  1767. // Re-export useful types and functions
  1768. export { extractFromSource } from './tree-sitter';
  1769. export { detectLanguage, isSourceFile, isLanguageSupported, isGrammarLoaded, getSupportedLanguages, initGrammars, loadGrammarsForLanguages, loadAllGrammars } from './grammars';