index.ts 53 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537
  1. /**
  2. * Extraction Orchestrator
  3. *
  4. * Coordinates file scanning, parsing, and database storage.
  5. */
  6. import * as fs from 'fs';
  7. import * as fsp from 'fs/promises';
  8. import * as path from 'path';
  9. import * as crypto from 'crypto';
  10. import { execFileSync } from 'child_process';
  11. import {
  12. Language,
  13. FileRecord,
  14. ExtractionResult,
  15. ExtractionError,
  16. } from '../types';
  17. import { QueryBuilder } from '../db/queries';
  18. import { extractFromSource } from './tree-sitter';
  19. import { detectLanguage, isSourceFile, isLanguageSupported, initGrammars, loadGrammarsForLanguages } from './grammars';
  20. import { logDebug, logWarn } from '../errors';
  21. import { validatePathWithinRoot, normalizePath } from '../utils';
  22. import ignore, { Ignore } from 'ignore';
  23. import { detectFrameworks } from '../resolution/frameworks';
  24. import type { ResolutionContext } from '../resolution/types';
  25. /**
  26. * Number of files to read in parallel during indexing.
  27. * File reads are I/O-bound; batching overlaps I/O wait with CPU parse work.
  28. */
  29. const FILE_IO_BATCH_SIZE = 10;
  30. // PARSER_RESET_INTERVAL moved to parse-worker.ts (runs in worker thread)
  31. /**
  32. * Maximum time (ms) to wait for a single file to parse in the worker thread.
  33. * If tree-sitter hangs or WASM runs out of memory, this prevents the entire
  34. * indexing run from freezing. The worker is restarted after a timeout.
  35. */
  36. const PARSE_TIMEOUT_MS = 10_000;
  37. /**
  38. * Number of files to parse before recycling the worker thread.
  39. * WASM linear memory can grow but NEVER shrink (WebAssembly spec limitation).
  40. * The only way to reclaim tree-sitter's WASM heap is to destroy the entire
  41. * V8 isolate by terminating the worker thread and spawning a fresh one.
  42. * This interval balances memory usage against the cost of reloading grammars.
  43. */
  44. const WORKER_RECYCLE_INTERVAL = 250;
/**
 * Progress callback payload for indexing operations.
 */
export interface IndexProgress {
  /** Which stage of the run is reporting. */
  phase: 'scanning' | 'parsing' | 'storing' | 'resolving';
  /** Items handled so far in this phase (during scanning: source files found so far). */
  current: number;
  /** Total items expected in this phase; reported as 0 while scanning, when it is not yet known. */
  total: number;
  /** Path of the file most recently handled, when the phase reports one. */
  currentFile?: string;
}
/**
 * Result of an indexing operation.
 */
export interface IndexResult {
  /** False when the run did not complete, e.g. it was aborted via the AbortSignal. */
  success: boolean;
  // The counters below are presumably per-run totals — confirm against the
  // part of indexAll() that populates them.
  filesIndexed: number;
  filesSkipped: number;
  filesErrored: number;
  nodesCreated: number;
  edgesCreated: number;
  /** Errors collected during the run; an aborted run reports a single 'Aborted' entry. */
  errors: ExtractionError[];
  /** Wall-clock duration of the run in milliseconds. */
  durationMs: number;
}
/**
 * Result of a sync operation.
 *
 * NOTE(review): the code that fills this in is not visible here; the field
 * notes below are inferred from the names and should be confirmed.
 */
export interface SyncResult {
  filesChecked: number;
  filesAdded: number;
  filesModified: number;
  filesRemoved: number;
  nodesUpdated: number;
  /** Presumably wall-clock duration in milliseconds, as in IndexResult — confirm. */
  durationMs: number;
  /** Optional list of paths that changed; presumably omitted when nothing changed — confirm. */
  changedFilePaths?: string[];
}
  79. /**
  80. * Calculate SHA256 hash of file contents
  81. */
  82. export function hashContent(content: string): string {
  83. return crypto.createHash('sha256').update(content).digest('hex');
  84. }
  85. /**
  86. * Skip files larger than this (bytes). Generated bundles, minified JS, and
  87. * vendored blobs blow the WASM heap and the worker-recycle budget for no useful
  88. * symbols. 1 MB covers essentially all hand-written source.
  89. */
  90. const MAX_FILE_SIZE = 1024 * 1024;
  91. /**
  92. * Directory names that are dependency, build, cache, or tooling output across the
  93. * languages/frameworks CodeGraph supports — curated from the canonical
  94. * github/gitignore templates. Excluded by default so the graph reflects your code,
  95. * not third-party noise, without requiring a `.gitignore` (issue #407). The
  96. * exclusion applies uniformly (git or not, tracked or not); the only opt-in is an
  97. * explicit `.gitignore` negation (e.g. `!vendor/`). First-party-prone or generic
  98. * names (`packages`, `lib`, `app`, `bin`, `src`, `deps`, `env`, `tmp`, `storage`,
  99. * `Library`) are deliberately NOT listed, to avoid ever hiding real source.
  100. *
  101. * Only dirs that actually contain *indexable source* (or are enormous) earn a slot
  102. * — IDE/state dirs like `.idea`/`.vs` are omitted because CodeGraph indexes only
  103. * recognized source extensions, so they produce no symbols regardless.
  104. */
  105. const DEFAULT_IGNORE_DIRS: ReadonlySet<string> = new Set([
  106. // JS / TS — dependency directories
  107. 'node_modules', 'bower_components', 'jspm_packages', 'web_modules',
  108. '.yarn', '.pnpm-store',
  109. // JS / TS — framework & bundler build / cache / deploy output
  110. '.next', '.nuxt', '.svelte-kit', '.turbo', '.vite', '.parcel-cache', '.angular',
  111. '.docusaurus', 'storybook-static', '.vinxi', '.nitro', 'out-tsc',
  112. '.vercel', '.netlify', '.wrangler',
  113. // Build output (common across ecosystems)
  114. 'dist', 'build', 'out', '.output',
  115. // Test / coverage
  116. 'coverage', '.nyc_output',
  117. // Python
  118. '__pycache__', '__pypackages__', '.venv', 'venv', '.pixi', '.pdm-build',
  119. '.mypy_cache', '.pytest_cache', '.ruff_cache', '.tox', '.nox', '.hypothesis',
  120. '.ipynb_checkpoints', '.eggs',
  121. // Rust / JVM (Maven, Gradle, Scala)
  122. 'target', '.gradle',
  123. // .NET
  124. 'obj',
  125. // Vendored deps (Go, PHP/Composer, Ruby/Bundler)
  126. 'vendor',
  127. // Swift / iOS
  128. '.build', 'Pods', 'Carthage', 'DerivedData', '.swiftpm',
  129. // Dart / Flutter
  130. '.dart_tool', '.pub-cache',
  131. // Native (Android NDK, C/C++ deps)
  132. '.cxx', '.externalNativeBuild', 'vcpkg_installed',
  133. // Scala tooling
  134. '.bloop', '.metals',
  135. // Lua / Luau (LuaRocks)
  136. 'lua_modules', '.luarocks',
  137. // Delphi / RAD Studio IDE backups (duplicate .pas source — would double-count)
  138. '__history', '__recovery',
  139. // Generic cache
  140. '.cache',
  141. ]);
  142. /** Gitignore-style patterns for the `ignore` matcher: the dirs above plus a few globs. */
  143. const DEFAULT_IGNORE_PATTERNS: string[] = [
  144. ...Array.from(DEFAULT_IGNORE_DIRS, (d) => `${d}/`),
  145. '*.egg-info/', // Python packaging metadata
  146. 'cmake-build-*/', // CLion / CMake build trees
  147. 'bazel-*/', // Bazel output symlink trees
  148. ];
  149. /**
  150. * An `ignore` matcher seeded with the built-in defaults, merged with the project's
  151. * root .gitignore so a negation there (e.g. `!vendor/`) overrides a default. Shared
  152. * by both enumeration paths so behavior is identical with or without git — and so
  153. * the defaults apply to tracked files too (committing a dependency dir doesn't make
  154. * it project code; the explicit `.gitignore` negation is the only opt-in).
  155. */
  156. export function buildDefaultIgnore(rootDir: string): Ignore {
  157. const ig = ignore().add(DEFAULT_IGNORE_PATTERNS);
  158. try {
  159. const rootGitignore = path.join(rootDir, '.gitignore');
  160. if (fs.existsSync(rootGitignore)) ig.add(fs.readFileSync(rootGitignore, 'utf-8'));
  161. } catch {
  162. // Unreadable root .gitignore — the built-in defaults still apply.
  163. }
  164. return ig;
  165. }
  166. /**
  167. * Collect git-visible files (tracked + untracked, .gitignore-respected) from the
  168. * git repository rooted at `repoDir`, adding each to `files` with `prefix`
  169. * prepended so paths stay relative to the original scan root.
  170. *
  171. * Recurses into embedded git repositories — nested repos that are NOT submodules
  172. * (independent clones living inside the workspace, common in CMake "super-repo"
  173. * layouts). The parent repo's `git ls-files` cannot see into them: tracked output
  174. * skips them entirely, and untracked output reports them only as an opaque
  175. * "subdir/" entry (trailing slash) rather than expanding their files. Each
  176. * embedded repo is its own git boundary, so we re-run `git ls-files` inside it.
  177. * (See issue #193.)
  178. */
  179. function collectGitFiles(repoDir: string, prefix: string, files: Set<string>): void {
  180. const gitOpts = { cwd: repoDir, encoding: 'utf-8' as const, timeout: 30000, maxBuffer: 50 * 1024 * 1024, stdio: ['pipe', 'pipe', 'pipe'] as ['pipe', 'pipe', 'pipe'] };
  181. // Tracked files. --recurse-submodules pulls in files from active submodules,
  182. // which the index would otherwise represent only as a commit pointer.
  183. // Without this, monorepos using submodules index 0 files. (See issue #147.)
  184. // Note: --recurse-submodules only supports -c/--cached and --stage modes — it
  185. // can't be combined with -o, so untracked files are gathered separately below.
  186. const tracked = execFileSync('git', ['ls-files', '-c', '--recurse-submodules'], gitOpts);
  187. for (const line of tracked.split('\n')) {
  188. const trimmed = line.trim();
  189. if (trimmed) {
  190. files.add(normalizePath(prefix + trimmed));
  191. }
  192. }
  193. // Untracked files (submodules manage their own untracked state). Embedded git
  194. // repos surface here as a single "subdir/" entry that git refuses to descend
  195. // into — recurse into those as their own repos so their source gets indexed.
  196. const untracked = execFileSync('git', ['ls-files', '-o', '--exclude-standard'], gitOpts);
  197. for (const line of untracked.split('\n')) {
  198. const trimmed = line.trim();
  199. if (!trimmed) continue;
  200. if (trimmed.endsWith('/')) {
  201. // git only emits a trailing-slash directory entry for an embedded repo.
  202. // Guard with a .git check anyway, and skip anything else exactly as git
  203. // itself skips it (we never descend into a non-repo opaque dir).
  204. const childDir = path.join(repoDir, trimmed);
  205. if (fs.existsSync(path.join(childDir, '.git'))) {
  206. collectGitFiles(childDir, prefix + trimmed, files);
  207. }
  208. continue;
  209. }
  210. files.add(normalizePath(prefix + trimmed));
  211. }
  212. }
  213. /**
  214. * Get all files visible to git (tracked + untracked but not ignored).
  215. * Respects .gitignore at all levels (root, subdirectories) and descends into
  216. * embedded (nested, non-submodule) git repos. Returns null on failure
  217. * (non-git project) so callers can fall back to a filesystem walk.
  218. */
  219. function getGitVisibleFiles(rootDir: string): Set<string> | null {
  220. try {
  221. // Check if the project directory is gitignored by a parent repo.
  222. // When rootDir lives inside a parent git repo that ignores it,
  223. // `git ls-files` returns nothing — fall back to filesystem walk.
  224. const gitRoot = execFileSync(
  225. 'git',
  226. ['rev-parse', '--show-toplevel'],
  227. { cwd: rootDir, encoding: 'utf-8', timeout: 5000, stdio: ['pipe', 'pipe', 'pipe'] }
  228. ).trim();
  229. if (path.resolve(gitRoot) !== path.resolve(rootDir)) {
  230. try {
  231. // git check-ignore exits 0 if the path IS ignored, 1 if not
  232. execFileSync(
  233. 'git',
  234. ['check-ignore', '-q', path.resolve(rootDir)],
  235. { cwd: rootDir, encoding: 'utf-8', timeout: 5000, stdio: ['pipe', 'pipe', 'pipe'] }
  236. );
  237. // Directory is gitignored by parent repo — fall back to filesystem walk
  238. return null;
  239. } catch {
  240. // Not ignored — safe to use git ls-files
  241. }
  242. }
  243. const files = new Set<string>();
  244. collectGitFiles(rootDir, '', files);
  245. // Apply built-in default ignores uniformly — to tracked files too, since
  246. // committing a dependency/build dir doesn't make it project code. A
  247. // `.gitignore` negation (e.g. `!vendor/`) is the explicit opt-in. (issue #407)
  248. const ig = buildDefaultIgnore(rootDir);
  249. return new Set([...files].filter((f) => !ig.ignores(f)));
  250. } catch {
  251. return null;
  252. }
  253. }
/**
 * Changed source files reported by git, bucketed by kind of change.
 * Produced by getGitChangedFiles(), which returns null instead of this object
 * when git is unavailable (non-git project or command failure), signaling the
 * caller to fall back to a full filesystem scan.
 */
interface GitChanges {
  /** Any status other than untracked/deleted (M, MM, AM, staged A, …) — files to re-hash + re-index. */
  modified: string[];
  /** Status `??` — new untracked files to index. */
  added: string[];
  /** Any status containing `D` — files to remove from the DB. */
  deleted: string[];
}
  264. /**
  265. * Use `git status` to detect changed files instead of scanning every file.
  266. * Returns null on failure so callers fall back to full scan.
  267. */
  268. function getGitChangedFiles(rootDir: string): GitChanges | null {
  269. try {
  270. const output = execFileSync(
  271. 'git',
  272. ['status', '--porcelain', '--no-renames'],
  273. { cwd: rootDir, encoding: 'utf-8', timeout: 10000, stdio: ['pipe', 'pipe', 'pipe'] }
  274. );
  275. const modified: string[] = [];
  276. const added: string[] = [];
  277. const deleted: string[] = [];
  278. for (const line of output.split('\n')) {
  279. if (line.length < 4) continue; // Minimum: "XY file"
  280. const statusCode = line.substring(0, 2);
  281. const filePath = normalizePath(line.substring(3));
  282. // Skip non-source files (git status already omits .gitignored paths).
  283. if (!isSourceFile(filePath)) continue;
  284. if (statusCode === '??') {
  285. added.push(filePath);
  286. } else if (statusCode.includes('D')) {
  287. deleted.push(filePath);
  288. } else {
  289. // M, MM, AM, A (staged), etc. — treat as modified
  290. modified.push(filePath);
  291. }
  292. }
  293. return { modified, added, deleted };
  294. } catch {
  295. return null;
  296. }
  297. }
  298. /**
  299. * Recursively scan a directory for source files.
  300. *
  301. * In git repos, uses `git ls-files` (inherently respects .gitignore at all
  302. * levels), then keeps files with a supported source extension. For non-git
  303. * projects, falls back to a filesystem walk that parses .gitignore itself.
  304. */
  305. export function scanDirectory(
  306. rootDir: string,
  307. onProgress?: (current: number, file: string) => void
  308. ): string[] {
  309. // Fast path: use git to get all visible files (respects .gitignore everywhere)
  310. const gitFiles = getGitVisibleFiles(rootDir);
  311. if (gitFiles) {
  312. const files: string[] = [];
  313. let count = 0;
  314. for (const filePath of gitFiles) {
  315. if (isSourceFile(filePath)) {
  316. files.push(filePath);
  317. count++;
  318. onProgress?.(count, filePath);
  319. }
  320. }
  321. return files;
  322. }
  323. // Fallback: walk filesystem for non-git projects
  324. return scanDirectoryWalk(rootDir, onProgress);
  325. }
  326. /**
  327. * Async variant of scanDirectory that yields to the event loop periodically,
  328. * allowing worker threads to receive and render progress messages.
  329. */
  330. export async function scanDirectoryAsync(
  331. rootDir: string,
  332. onProgress?: (current: number, file: string) => void
  333. ): Promise<string[]> {
  334. const gitFiles = getGitVisibleFiles(rootDir);
  335. if (gitFiles) {
  336. const files: string[] = [];
  337. let count = 0;
  338. for (const filePath of gitFiles) {
  339. if (isSourceFile(filePath)) {
  340. files.push(filePath);
  341. count++;
  342. onProgress?.(count, filePath);
  343. // Yield every 100 files so worker threads can render progress
  344. if (count % 100 === 0) {
  345. await new Promise<void>(r => setImmediate(r));
  346. }
  347. }
  348. }
  349. return files;
  350. }
  351. return scanDirectoryWalk(rootDir, onProgress);
  352. }
  353. /**
  354. * Filesystem walk fallback for non-git projects.
  355. */
  356. function scanDirectoryWalk(
  357. rootDir: string,
  358. onProgress?: (current: number, file: string) => void
  359. ): string[] {
  360. const files: string[] = [];
  361. let count = 0;
  362. const visitedDirs = new Set<string>();
  363. // A .gitignore matcher scoped to the directory that declared it. Patterns in
  364. // a nested .gitignore are relative to that directory, so we keep the dir
  365. // alongside the matcher and test paths relative to it — mirroring how git
  366. // applies .gitignore files at every level.
  367. interface ScopedIgnore {
  368. dir: string;
  369. ig: Ignore;
  370. }
  371. const loadIgnore = (dir: string): ScopedIgnore | null => {
  372. try {
  373. const giPath = path.join(dir, '.gitignore');
  374. if (fs.existsSync(giPath)) {
  375. return { dir, ig: ignore().add(fs.readFileSync(giPath, 'utf-8')) };
  376. }
  377. } catch {
  378. // Unreadable .gitignore — treat as absent.
  379. }
  380. return null;
  381. };
  382. const isIgnored = (fullPath: string, isDir: boolean, matchers: ScopedIgnore[]): boolean => {
  383. for (const { dir, ig } of matchers) {
  384. let rel = normalizePath(path.relative(dir, fullPath));
  385. if (!rel || rel.startsWith('..')) continue; // not under this matcher's dir
  386. if (isDir) rel += '/'; // dir-only rules (e.g. `build/`) only match with the slash
  387. if (ig.ignores(rel)) return true;
  388. }
  389. return false;
  390. };
  391. function walk(dir: string, matchers: ScopedIgnore[]): void {
  392. let realDir: string;
  393. try {
  394. realDir = fs.realpathSync(dir);
  395. } catch {
  396. logDebug('Skipping unresolvable directory', { dir });
  397. return;
  398. }
  399. if (visitedDirs.has(realDir)) {
  400. logDebug('Skipping already-visited directory (symlink cycle)', { dir, realDir });
  401. return;
  402. }
  403. visitedDirs.add(realDir);
  404. // This directory's own .gitignore (if present) applies to everything below it.
  405. // The root's .gitignore is already merged into the seeded base matcher (so a
  406. // negation there can override a built-in default), so skip it here.
  407. const own = dir === rootDir ? null : loadIgnore(dir);
  408. const active = own ? [...matchers, own] : matchers;
  409. let entries: fs.Dirent[];
  410. try {
  411. entries = fs.readdirSync(dir, { withFileTypes: true });
  412. } catch (error) {
  413. logDebug('Skipping unreadable directory', { dir, error: String(error) });
  414. return;
  415. }
  416. for (const entry of entries) {
  417. // Never descend into git internals or our own data directory.
  418. if (entry.name === '.git' || entry.name === '.codegraph') continue;
  419. const fullPath = path.join(dir, entry.name);
  420. const relativePath = normalizePath(path.relative(rootDir, fullPath));
  421. if (entry.isSymbolicLink()) {
  422. try {
  423. const realTarget = fs.realpathSync(fullPath);
  424. const stat = fs.statSync(realTarget);
  425. if (stat.isDirectory()) {
  426. if (!isIgnored(fullPath, true, active)) {
  427. walk(fullPath, active);
  428. }
  429. } else if (stat.isFile()) {
  430. if (!isIgnored(fullPath, false, active) && isSourceFile(relativePath)) {
  431. files.push(relativePath);
  432. count++;
  433. onProgress?.(count, relativePath);
  434. }
  435. }
  436. } catch {
  437. logDebug('Skipping broken symlink', { path: fullPath });
  438. }
  439. continue;
  440. }
  441. if (entry.isDirectory()) {
  442. if (!isIgnored(fullPath, true, active)) {
  443. walk(fullPath, active);
  444. }
  445. } else if (entry.isFile()) {
  446. if (!isIgnored(fullPath, false, active) && isSourceFile(relativePath)) {
  447. files.push(relativePath);
  448. count++;
  449. onProgress?.(count, relativePath);
  450. }
  451. }
  452. }
  453. }
  454. // Seed a base matcher with the built-in default ignores (merged with the root
  455. // .gitignore so a negation can override). Nested .gitignores still layer per-dir.
  456. walk(rootDir, [{ dir: rootDir, ig: buildDefaultIgnore(rootDir) }]);
  457. return files;
  458. }
  459. /**
  460. * Extraction orchestrator
  461. */
  462. export class ExtractionOrchestrator {
  463. private rootDir: string;
  464. private queries: QueryBuilder;
  465. /**
  466. * Names of frameworks detected for this project, populated by indexAll().
  467. * Passed to extractFromSource so framework-specific extractors (route nodes,
  468. * middleware, etc.) run after the tree-sitter pass. Cleared if detection
  469. * hasn't run yet so single-file re-index paths can detect on the spot.
  470. */
  471. private detectedFrameworkNames: string[] | null = null;
  /**
   * @param rootDir Project root; scans and framework detection are run against it.
   * @param queries Query builder stored for later database access.
   */
  constructor(rootDir: string, queries: QueryBuilder) {
    this.rootDir = rootDir;
    this.queries = queries;
  }
  /**
   * Assemble a filesystem-backed ResolutionContext that is just capable enough
   * for framework detection. The graph-query methods (getNodesByName etc.)
   * return empty arrays because the DB hasn't been populated yet; detect() only
   * relies on readFile, fileExists, and getAllFiles, so that is sufficient.
   *
   * @param files File list exposed through getAllFiles().
   */
  private buildDetectionContext(files: string[]): ResolutionContext {
    const projectRoot = this.rootDir;

    return {
      // Graph lookups: nothing is indexed yet.
      getNodesInFile: () => [],
      getNodesByName: () => [],
      getNodesByQualifiedName: () => [],
      getNodesByKind: () => [],
      getNodesByLowerName: () => [],
      getImportMappings: () => [],

      getAllFiles: () => files,
      getProjectRoot: () => projectRoot,

      fileExists: (relativePath: string) => {
        const absolute = validatePathWithinRoot(projectRoot, relativePath);
        if (absolute) {
          try {
            return fs.existsSync(absolute);
          } catch {
            return false;
          }
        }
        return false;
      },

      readFile: (relativePath: string) => {
        const absolute = validatePathWithinRoot(projectRoot, relativePath);
        if (absolute) {
          try {
            return fs.readFileSync(absolute, 'utf-8');
          } catch {
            return null;
          }
        }
        return null;
      },

      // Monorepo support — needed by framework detect()s that probe
      // subpackage manifests (e.g. fabric-view looking at
      // packages/<sub>/package.json when the root manifest is just a
      // workspace declaration). Matches the resolver-context shape.
      listDirectories: (relativePath: string) => {
        const wantsRoot = relativePath === '.' || relativePath === '';
        const target = wantsRoot ? projectRoot : path.join(projectRoot, relativePath);
        try {
          const names: string[] = [];
          for (const entry of fs.readdirSync(target, { withFileTypes: true })) {
            if (entry.isDirectory()) names.push(entry.name);
          }
          return names;
        } catch {
          return [];
        }
      },
    };
  }
  /**
   * Return the detected framework names, running detection on first use.
   * Detection works from the supplied file list, or from a fresh scan of the
   * project root when none is given. The result is cached on the orchestrator,
   * so repeat calls within a run don't re-scan.
   *
   * @param files Already-scanned file list, if the caller has one.
   * @returns Names of the detected frameworks.
   */
  private ensureDetectedFrameworks(files?: string[]): string[] {
    const cached = this.detectedFrameworkNames;
    if (cached !== null) {
      return cached;
    }

    const context = this.buildDetectionContext(files ?? scanDirectory(this.rootDir));
    const names = detectFrameworks(context).map((framework) => framework.name);
    this.detectedFrameworkNames = names;
    return names;
  }
  543. /**
  544. * Index all files in the project
  545. */
  546. async indexAll(
  547. onProgress?: (progress: IndexProgress) => void,
  548. signal?: AbortSignal,
  549. verbose?: boolean
  550. ): Promise<IndexResult> {
  551. await initGrammars();
  552. const startTime = Date.now();
  553. const errors: ExtractionError[] = [];
  554. let filesIndexed = 0;
  555. let filesSkipped = 0;
  556. let filesErrored = 0;
  557. let totalNodes = 0;
  558. let totalEdges = 0;
  559. const log = verbose
  560. ? (msg: string) => { console.log(`[worker] ${msg}`); }
  561. : (_msg: string) => {};
  562. // Phase 1: Scan for files
  563. onProgress?.({
  564. phase: 'scanning',
  565. current: 0,
  566. total: 0,
  567. });
  568. const files = await scanDirectoryAsync(this.rootDir, (current, file) => {
  569. onProgress?.({
  570. phase: 'scanning',
  571. current,
  572. total: 0,
  573. currentFile: file,
  574. });
  575. });
  576. // Detect frameworks once per indexAll run using the scanned file list.
  577. // Names are passed to each parse call so framework-specific extractors
  578. // (route nodes, middleware, etc.) run after the tree-sitter pass.
  579. // Framework detection is reset each run so adding e.g. requirements.txt
  580. // between runs is picked up without restarting the process.
  581. this.detectedFrameworkNames = null;
  582. const frameworkNames = this.ensureDetectedFrameworks(files);
  583. if (signal?.aborted) {
  584. return {
  585. success: false,
  586. filesIndexed: 0,
  587. filesSkipped: 0,
  588. filesErrored: 0,
  589. nodesCreated: 0,
  590. edgesCreated: 0,
  591. errors: [{ message: 'Aborted', severity: 'error' }],
  592. durationMs: Date.now() - startTime,
  593. };
  594. }
  595. // Phase 2: Parse files in a worker thread (keeps main thread unblocked for UI)
  596. const total = files.length;
  597. let processed = 0;
  598. // Emit parsing phase immediately so the progress bar appears during worker setup.
  599. // The yield lets the shimmer worker flush the phase transition to stdout before
  600. // the main thread starts synchronous grammar detection work.
  601. onProgress?.({
  602. phase: 'parsing',
  603. current: 0,
  604. total,
  605. });
  606. await new Promise(resolve => setImmediate(resolve));
  607. // Detect needed languages and load grammars in the parse worker
  608. const neededLanguages = [...new Set(files.map((f) => detectLanguage(f)))];
  609. // .h files default to 'c' but may be C++ — ensure cpp grammar is loaded when c is needed
  610. if (neededLanguages.includes('c') && !neededLanguages.includes('cpp')) {
  611. neededLanguages.push('cpp');
  612. }
  613. // Try to use a worker thread for parsing (keeps main thread unblocked for UI).
  614. // Falls back to in-process parsing if the compiled worker is unavailable (e.g. tests).
  615. const parseWorkerPath = path.join(__dirname, 'parse-worker.js');
  616. const useWorker = fs.existsSync(parseWorkerPath);
  617. let WorkerClass: typeof import('worker_threads').Worker | null = null;
  618. if (useWorker) {
  619. const { Worker } = await import('worker_threads');
  620. WorkerClass = Worker;
  621. } else {
  622. // In-process fallback: load grammars locally
  623. await loadGrammarsForLanguages(neededLanguages);
  624. }
  625. // --- Worker lifecycle management ---
  626. // The worker can crash (OOM in WASM) or hang on pathological files.
  627. // We track pending parse promises and handle both cases:
  628. // - Timeout: terminate + restart the worker, reject the timed-out request
  629. // - Crash: reject all pending promises, restart for remaining files
  630. let parseWorker: import('worker_threads').Worker | null = null;
  631. let nextId = 0;
  632. let workerParseCount = 0;
  633. const pendingParses = new Map<number, {
  634. resolve: (result: ExtractionResult) => void;
  635. reject: (err: Error) => void;
  636. timer: ReturnType<typeof setTimeout>;
  637. }>();
  /**
   * Fail every in-flight parse request with the same error message.
   *
   * For each entry in `pendingParses` the timeout timer is cancelled and the
   * entry is removed from the map before its promise is rejected, so a late
   * timer can never fire for an already-settled request.
   *
   * @param reason Message used for the `Error` each pending promise is rejected with.
   */
  function rejectAllPending(reason: string): void {
    pendingParses.forEach((entry, requestId) => {
      clearTimeout(entry.timer);
      pendingParses.delete(requestId);
      entry.reject(new Error(reason));
    });
  }
  /**
   * Wire the `message`, `error` and `exit` listeners of a parse worker to the
   * pending-request bookkeeping.
   *
   * - `message`: a `parse-result` carrying a known id resolves that request.
   * - `error` / non-zero `exit`: every pending request is rejected, but only
   *   when `w` is still the current `parseWorker`.
   *
   * The "still current" check matters because workers are replaced with a
   * fire-and-forget `terminate()` (on recycle and on timeout). A terminated
   * worker emits `exit` with a non-zero code some time later; without the
   * check that late event would reject requests that have since been posted to
   * the replacement worker.
   *
   * @param w Worker to attach the listeners to.
   */
  function attachWorkerHandlers(w: import('worker_threads').Worker): void {
    // True while `w` is the worker that new requests are being posted to.
    const isCurrent = (): boolean => parseWorker === w;

    w.on('message', (msg: { type: string; id?: number; result?: ExtractionResult }) => {
      if (msg.type !== 'parse-result' || msg.id === undefined) return;
      const pending = pendingParses.get(msg.id);
      // Unknown id: the request already timed out or was rejected.
      if (!pending) return;
      clearTimeout(pending.timer);
      pendingParses.delete(msg.id);
      pending.resolve(msg.result!);
    });

    w.on('error', (err) => {
      logWarn('Parse worker error', { error: err.message });
      // A stale worker's error must not fail requests owned by its successor.
      if (isCurrent()) {
        rejectAllPending(`Worker error: ${err.message}`);
      }
    });

    w.on('exit', (code) => {
      // Exit of an already-replaced worker: nothing pending belongs to it.
      if (!isCurrent()) return;
      if (code !== 0 && pendingParses.size > 0) {
        logWarn('Parse worker exited unexpectedly', { code });
        rejectAllPending(`Worker exited with code ${code}`);
      }
      // Clear the reference so ensureWorker() respawns, and reset the count so
      // the fresh worker gets a full cycle before it is recycled.
      parseWorker = null;
      workerParseCount = 0;
    });
  }
  /**
   * Return the current parse worker, spawning one and loading the needed
   * grammars into it first if there is none.
   *
   * The grammar handshake waits for the first message from the new worker and
   * expects it to be `grammars-loaded`. It also watches `error` and `exit`, so
   * a worker that dies during start-up rejects the call instead of leaving it
   * pending forever. On any handshake failure the half-initialised worker is
   * dropped from `parseWorker` and terminated, so the next call starts clean.
   *
   * @returns The ready-to-use worker.
   * @throws Error if the worker errors, exits, or sends an unexpected first
   *   message before the grammars are loaded.
   */
  async function ensureWorker(): Promise<import('worker_threads').Worker> {
    if (parseWorker) return parseWorker;

    log('Spawning new parse worker...');
    const worker = new WorkerClass!(parseWorkerPath);
    parseWorker = worker;
    attachWorkerHandlers(worker);

    try {
      // Load grammars in the new worker
      await new Promise<void>((resolve, reject) => {
        // Detach all three one-shot listeners as soon as any of them fires.
        const cleanup = (): void => {
          worker.off('message', onMessage);
          worker.off('error', onError);
          worker.off('exit', onExit);
        };
        const onMessage = (msg: { type: string }): void => {
          cleanup();
          if (msg.type === 'grammars-loaded') resolve();
          else reject(new Error(`Unexpected message: ${msg.type}`));
        };
        const onError = (err: Error): void => {
          cleanup();
          reject(new Error(`Parse worker failed while loading grammars: ${err.message}`));
        };
        const onExit = (code: number): void => {
          cleanup();
          reject(new Error(`Parse worker terminated (exit code ${code}) while loading grammars`));
        };
        worker.on('message', onMessage);
        worker.on('error', onError);
        worker.on('exit', onExit);
        worker.postMessage({ type: 'load-grammars', languages: neededLanguages });
      });
    } catch (err) {
      // Do not keep handing out a worker that never finished initialising.
      if (parseWorker === worker) {
        parseWorker = null;
        workerParseCount = 0;
      }
      // Fire-and-forget, same as the other terminate() call sites.
      worker.terminate().catch(() => {});
      throw err;
    }

    return worker;
  }
  688. if (WorkerClass) {
  689. await ensureWorker();
  690. }
  /**
   * Retire the current parse worker so its WASM memory can be reclaimed.
   *
   * The worker reference and the parse counter are reset immediately, which
   * makes the next ensureWorker() call spawn a fresh worker. Termination of the
   * old worker is started but deliberately not awaited. Does nothing when no
   * worker is running.
   */
  function recycleWorker(): void {
    const retiring = parseWorker;
    if (!retiring) return;

    const rssMb = Math.round(process.memoryUsage().rss / 1024 / 1024);
    log(`Recycling worker after ${workerParseCount} parses (heap: ${rssMb}MB RSS)`);

    parseWorker = null;
    workerParseCount = 0;

    // Not awaited: terminate() can hang if WASM is stuck, and a rejection is
    // of no interest for a worker that is being thrown away anyway.
    retiring.terminate().catch(() => {});
  }
  /**
   * Parse one file and return its extraction result.
   *
   * Without a worker class (in-process fallback) the source is parsed directly
   * on the calling thread. Otherwise the request is posted to the parse worker
   * and tracked in `pendingParses` until the worker answers, crashes, or the
   * per-request timeout fires.
   *
   * @param filePath Path of the file, forwarded to the parser and used in logs.
   * @param content  Full text of the file.
   * @returns The extraction result produced by the parser.
   * @throws Error when the parse times out, the worker fails, or the request
   *   cannot be posted to the worker.
   */
  async function requestParse(filePath: string, content: string): Promise<ExtractionResult> {
    if (!WorkerClass) {
      // In-process fallback
      return extractFromSource(
        filePath,
        content,
        detectLanguage(filePath, content),
        frameworkNames
      );
    }

    // Recycle the worker before the next parse once the threshold is reached.
    // This destroys the WASM linear memory (which can grow but never shrink)
    // and lets ensureWorker() start a fresh worker with a clean heap.
    // recycleWorker() is synchronous, so there is nothing to await.
    if (workerParseCount >= WORKER_RECYCLE_INTERVAL) {
      recycleWorker();
    }

    const worker = await ensureWorker();
    const id = nextId++;
    workerParseCount++;

    // Scale the timeout with input size: PARSE_TIMEOUT_MS plus 10 s for every
    // full 100,000 characters of content.
    const timeoutMs = PARSE_TIMEOUT_MS + Math.floor(content.length / 100_000) * 10_000;

    return new Promise<ExtractionResult>((resolve, reject) => {
      const timer = setTimeout(() => {
        pendingParses.delete(id);
        log(`TIMEOUT: ${filePath} exceeded ${timeoutMs}ms — killing worker`);
        // Drop the reference only if it still points at the stuck worker, so a
        // replacement that is already running is never discarded by mistake.
        if (parseWorker === worker) {
          parseWorker = null;
          workerParseCount = 0;
        }
        // Reject FIRST — worker.terminate() can hang if WASM is stuck
        reject(new Error(`Parse timed out after ${timeoutMs}ms`));
        // Fire-and-forget: kill the stuck worker in the background
        worker.terminate().catch(() => {});
      }, timeoutMs);

      pendingParses.set(id, { resolve, reject, timer });

      try {
        worker.postMessage({ type: 'parse', id, filePath, content, frameworkNames });
      } catch (err) {
        // The request never reached the worker: disarm the timeout so it does
        // not later terminate a healthy worker, then report the failure.
        clearTimeout(timer);
        pendingParses.delete(id);
        reject(err instanceof Error ? err : new Error(String(err)));
      }
    });
  }
  741. for (let i = 0; i < files.length; i += FILE_IO_BATCH_SIZE) {
  742. if (signal?.aborted) {
  743. if (parseWorker) (parseWorker as import('worker_threads').Worker).terminate().catch(() => {});
  744. return {
  745. success: false,
  746. filesIndexed,
  747. filesSkipped,
  748. filesErrored,
  749. nodesCreated: totalNodes,
  750. edgesCreated: totalEdges,
  751. errors: [{ message: 'Aborted', severity: 'error' }, ...errors],
  752. durationMs: Date.now() - startTime,
  753. };
  754. }
  755. const batch = files.slice(i, i + FILE_IO_BATCH_SIZE);
  756. // Read files in parallel (with path validation before any I/O)
  757. const fileContents = await Promise.all(
  758. batch.map(async (fp) => {
  759. try {
  760. const fullPath = validatePathWithinRoot(this.rootDir, fp);
  761. if (!fullPath) {
  762. logWarn('Path traversal blocked in batch reader', { filePath: fp });
  763. return { filePath: fp, content: null as string | null, stats: null as fs.Stats | null, error: new Error('Path traversal blocked') };
  764. }
  765. const content = await fsp.readFile(fullPath, 'utf-8');
  766. const stats = await fsp.stat(fullPath);
  767. return { filePath: fp, content, stats, error: null as Error | null };
  768. } catch (err) {
  769. return { filePath: fp, content: null as string | null, stats: null as fs.Stats | null, error: err as Error };
  770. }
  771. })
  772. );
  773. // Send to worker for parsing, store results on main thread
  774. for (const { filePath, content, stats, error } of fileContents) {
  775. if (signal?.aborted) {
  776. if (parseWorker) (parseWorker as import('worker_threads').Worker).terminate().catch(() => {});
  777. return {
  778. success: false,
  779. filesIndexed,
  780. filesSkipped,
  781. filesErrored,
  782. nodesCreated: totalNodes,
  783. edgesCreated: totalEdges,
  784. errors: [{ message: 'Aborted', severity: 'error' }, ...errors],
  785. durationMs: Date.now() - startTime,
  786. };
  787. }
  788. // Report progress before parsing (show current file being worked on)
  789. onProgress?.({
  790. phase: 'parsing',
  791. current: processed,
  792. total,
  793. currentFile: filePath,
  794. });
  795. if (error || content === null || stats === null) {
  796. processed++;
  797. filesErrored++;
  798. errors.push({
  799. message: `Failed to read file: ${error instanceof Error ? error.message : String(error)}`,
  800. filePath,
  801. severity: 'error',
  802. code: 'read_error',
  803. });
  804. continue;
  805. }
  806. // Honour MAX_FILE_SIZE. Without this check, vendored generated
  807. // headers, minified bundles, and other multi-MB files get indexed,
  808. // wasting WASM heap and the worker recycle budget on inputs with no
  809. // useful symbols. The single-file extractFile path already enforces
  810. // this; the bulk path used to silently skip the check.
  811. if (stats.size > MAX_FILE_SIZE) {
  812. processed++;
  813. filesSkipped++;
  814. errors.push({
  815. message: `File exceeds max size (${stats.size} > ${MAX_FILE_SIZE})`,
  816. filePath,
  817. severity: 'warning',
  818. code: 'size_exceeded',
  819. });
  820. onProgress?.({ phase: 'parsing', current: processed, total });
  821. continue;
  822. }
  823. // Parse in worker thread (main thread stays unblocked).
  824. // Wrapped in try/catch to handle worker timeouts and crashes gracefully.
  825. let result: ExtractionResult;
  826. try {
  827. result = await requestParse(filePath, content);
  828. } catch (parseErr) {
  829. processed++;
  830. filesErrored++;
  831. errors.push({
  832. message: parseErr instanceof Error ? parseErr.message : String(parseErr),
  833. filePath,
  834. severity: 'error',
  835. code: 'parse_error',
  836. });
  837. continue;
  838. }
  839. processed++;
  840. // Store in database on main thread (SQLite is not thread-safe)
  841. if (result.nodes.length > 0 || result.errors.length === 0) {
  842. const language = detectLanguage(filePath, content);
  843. this.storeExtractionResult(filePath, content, language, stats, result);
  844. }
  845. if (result.errors.length > 0) {
  846. for (const err of result.errors) {
  847. if (!err.filePath) err.filePath = filePath;
  848. }
  849. errors.push(...result.errors);
  850. }
  851. if (result.nodes.length > 0) {
  852. filesIndexed++;
  853. totalNodes += result.nodes.length;
  854. totalEdges += result.edges.length;
  855. } else if (result.errors.some((e) => e.severity === 'error')) {
  856. filesErrored++;
  857. } else {
  858. filesSkipped++;
  859. }
  860. }
  861. }
  862. // Report 100% so the progress bar doesn't hang at 99%
  863. onProgress?.({
  864. phase: 'parsing',
  865. current: total,
  866. total,
  867. });
  868. // Yield so the shimmer worker's buffered stdout writes can flush.
  869. // Worker thread stdout is proxied through the main thread's event loop,
  870. // so synchronous work here blocks the animation from rendering.
  871. await new Promise(resolve => setImmediate(resolve));
  872. // Retry pass: files that failed due to WASM memory corruption may succeed
  873. // on a fresh worker with a clean heap. Recycle before each attempt so
  874. // every file gets the absolute cleanest WASM state possible.
  875. const retryableErrors = errors.filter(
  876. (e) => e.code === 'parse_error' && e.filePath &&
  877. (e.message.includes('Worker exited') || e.message.includes('memory access out of bounds'))
  878. );
  879. if (retryableErrors.length > 0 && WorkerClass) {
  880. log(`Retrying ${retryableErrors.length} files that failed due to WASM memory errors...`);
  881. const stillFailing: typeof retryableErrors = [];
  882. for (const errEntry of retryableErrors) {
  883. const filePath = errEntry.filePath!;
  884. if (signal?.aborted) break;
  885. // Fresh worker for every retry — maximum WASM headroom
  886. recycleWorker();
  887. let content: string;
  888. try {
  889. const fullPath = validatePathWithinRoot(this.rootDir, filePath);
  890. if (!fullPath) continue;
  891. content = await fsp.readFile(fullPath, 'utf-8');
  892. } catch {
  893. continue;
  894. }
  895. let result: ExtractionResult;
  896. try {
  897. result = await requestParse(filePath, content);
  898. } catch {
  899. stillFailing.push(errEntry);
  900. continue;
  901. }
  902. if (result.nodes.length > 0 || result.errors.length === 0) {
  903. const language = detectLanguage(filePath, content);
  904. const stats = await fsp.stat(path.join(this.rootDir, filePath));
  905. this.storeExtractionResult(filePath, content, language, stats, result);
  906. const idx = errors.indexOf(errEntry);
  907. if (idx >= 0) errors.splice(idx, 1);
  908. filesErrored--;
  909. filesIndexed++;
  910. totalNodes += result.nodes.length;
  911. totalEdges += result.edges.length;
  912. log(`Retry OK: ${filePath} (${result.nodes.length} nodes)`);
  913. }
  914. }
  915. // Last resort: for files that still crash on a clean worker, strip
  916. // comment-only lines to reduce WASM memory pressure. Many compiler
  917. // test files are 90%+ comments (CHECK directives) that don't contribute
  918. // code nodes but consume parser memory.
  919. if (stillFailing.length > 0) {
  920. log(`${stillFailing.length} files still failing — retrying with comments stripped...`);
  921. for (const errEntry of stillFailing) {
  922. const filePath = errEntry.filePath!;
  923. if (signal?.aborted) break;
  924. recycleWorker();
  925. let fullContent: string;
  926. try {
  927. const fullPath = validatePathWithinRoot(this.rootDir, filePath);
  928. if (!fullPath) continue;
  929. fullContent = await fsp.readFile(fullPath, 'utf-8');
  930. } catch {
  931. continue;
  932. }
  933. // Strip lines that are entirely comments (preserving line numbers
  934. // by replacing with empty lines so node positions stay correct)
  935. const stripped = fullContent
  936. .split('\n')
  937. .map(line => /^\s*\/\//.test(line) ? '' : line)
  938. .join('\n');
  939. let result: ExtractionResult;
  940. try {
  941. result = await requestParse(filePath, stripped);
  942. } catch {
  943. continue;
  944. }
  945. if (result.nodes.length > 0 || result.errors.length === 0) {
  946. const language = detectLanguage(filePath, fullContent);
  947. const stats = await fsp.stat(path.join(this.rootDir, filePath));
  948. this.storeExtractionResult(filePath, fullContent, language, stats, result);
  949. const idx = errors.indexOf(errEntry);
  950. if (idx >= 0) errors.splice(idx, 1);
  951. filesErrored--;
  952. filesIndexed++;
  953. totalNodes += result.nodes.length;
  954. totalEdges += result.edges.length;
  955. log(`Retry (stripped) OK: ${filePath} (${result.nodes.length} nodes)`);
  956. }
  957. }
  958. }
  959. }
  960. // Shut down parse worker and clear any pending timers
  961. rejectAllPending('Indexing complete');
  962. if (parseWorker) {
  963. (parseWorker as import('worker_threads').Worker).terminate().catch(() => {});
  964. }
  965. return {
  966. success: filesIndexed > 0 || errors.filter((e) => e.severity === 'error').length === 0,
  967. filesIndexed,
  968. filesSkipped,
  969. filesErrored,
  970. nodesCreated: totalNodes,
  971. edgesCreated: totalEdges,
  972. errors,
  973. durationMs: Date.now() - startTime,
  974. };
  975. }
  /**
   * Index an explicit list of files, one after another.
   *
   * Each path is passed to indexFile(). A file counts as indexed when it
   * produced at least one node, as errored when it produced no nodes but at
   * least one error of severity `error`, and as skipped otherwise.
   *
   * @param filePaths Paths to index, in the form indexFile() expects.
   * @returns Aggregate counters, all collected errors and the elapsed time.
   *   `success` is true when at least one file was indexed or no
   *   `error`-severity entry was collected.
   */
  async indexFiles(filePaths: string[]): Promise<IndexResult> {
    const startedAt = Date.now();
    const errors: ExtractionError[] = [];
    const tally = { indexed: 0, skipped: 0, errored: 0, nodes: 0, edges: 0 };

    for (const filePath of filePaths) {
      const { nodes, edges, errors: fileErrors } = await this.indexFile(filePath);

      if (fileErrors.length > 0) {
        errors.push(...fileErrors);
      }

      if (nodes.length > 0) {
        tally.indexed++;
        tally.nodes += nodes.length;
        tally.edges += edges.length;
        continue;
      }

      const failedHard = fileErrors.some((e) => e.severity === 'error');
      if (failedHard) {
        tally.errored++;
      } else {
        tally.skipped++;
      }
    }

    const anyHardError = errors.some((e) => e.severity === 'error');

    return {
      success: tally.indexed > 0 || !anyHardError,
      filesIndexed: tally.indexed,
      filesSkipped: tally.skipped,
      filesErrored: tally.errored,
      nodesCreated: tally.nodes,
      edgesCreated: tally.edges,
      errors,
      durationMs: Date.now() - startedAt,
    };
  }
  /**
   * Index a single file identified by its path relative to the project root.
   *
   * The path is validated against the root first; then the file is stat'ed and
   * read, and the work is handed to indexFileWithContent(). Failures are
   * reported inside the returned result rather than thrown.
   *
   * @param relativePath Path of the file relative to `this.rootDir`.
   * @returns The extraction result. For a rejected path or an unreadable file
   *   it is an empty result with one `error`-severity entry
   *   (`path_traversal` or `read_error`).
   */
  async indexFile(relativePath: string): Promise<ExtractionResult> {
    // Build an empty result that carries only the given errors.
    const bail = (errors: ExtractionResult['errors']): ExtractionResult => ({
      nodes: [],
      edges: [],
      unresolvedReferences: [],
      errors,
      durationMs: 0,
    });

    const fullPath = validatePathWithinRoot(this.rootDir, relativePath);
    if (!fullPath) {
      return bail([
        { message: `Path traversal blocked: ${relativePath}`, filePath: relativePath, severity: 'error', code: 'path_traversal' },
      ]);
    }

    // Stat first, then read; either failure is reported as a read error.
    let loaded: { stats: fs.Stats; content: string };
    try {
      const stats = await fsp.stat(fullPath);
      const content = await fsp.readFile(fullPath, 'utf-8');
      loaded = { stats, content };
    } catch (error) {
      const detail = error instanceof Error ? error.message : String(error);
      return bail([
        {
          message: `Failed to read file: ${detail}`,
          filePath: relativePath,
          severity: 'error',
          code: 'read_error',
        },
      ]);
    }

    return this.indexFileWithContent(relativePath, loaded.content, loaded.stats);
  }
  /**
   * Index a single file whose content and stats have already been read,
   * avoiding a second round of file I/O.
   *
   * Checks, in order: the path stays within the root, the file is not larger
   * than MAX_FILE_SIZE, and the detected language is supported. Only then is
   * the source parsed and, when worthwhile, stored.
   *
   * @param relativePath Path of the file relative to `this.rootDir`.
   * @param content      Full text of the file.
   * @param stats        File stats; `size` is used for the size limit.
   * @returns The extraction result. It is empty with a `path_traversal` error
   *   for a rejected path, empty with a `size_exceeded` warning for an
   *   oversized file, and empty without errors for an unsupported language.
   */
  async indexFileWithContent(
    relativePath: string,
    content: string,
    stats: fs.Stats
  ): Promise<ExtractionResult> {
    // Build a result with no nodes, edges or references.
    const nothingExtracted = (errors: ExtractionResult['errors'] = []): ExtractionResult => ({
      nodes: [],
      edges: [],
      unresolvedReferences: [],
      errors,
      durationMs: 0,
    });

    // Prevent path traversal
    if (!validatePathWithinRoot(this.rootDir, relativePath)) {
      logWarn('Path traversal blocked in indexFileWithContent', { relativePath });
      return nothingExtracted([
        { message: 'Path traversal blocked', filePath: relativePath, severity: 'error', code: 'path_traversal' },
      ]);
    }

    // Oversized files are reported as a warning, not parsed.
    if (stats.size > MAX_FILE_SIZE) {
      return nothingExtracted([
        {
          message: `File exceeds max size (${stats.size} > ${MAX_FILE_SIZE})`,
          filePath: relativePath,
          severity: 'warning',
          code: 'size_exceeded',
        },
      ]);
    }

    // Unsupported languages yield an empty, error-free result.
    const language = detectLanguage(relativePath, content);
    if (!isLanguageSupported(language)) {
      return nothingExtracted();
    }

    // Use cached framework names if indexAll has run, otherwise detect on the
    // spot so single-file re-index paths still emit route nodes / middleware.
    const frameworkNames = this.ensureDetectedFrameworks();
    const result = extractFromSource(relativePath, content, language, frameworkNames);

    // Persist unless the parse produced errors and no nodes at all.
    const worthStoring = result.nodes.length > 0 || result.errors.length === 0;
    if (worthStoring) {
      this.storeExtractionResult(relativePath, content, language, stats, result);
    }

    return result;
  }
  /**
   * Persist the extraction result of one file to the database.
   *
   * Nothing is written when the stored content hash equals the new one.
   * Otherwise any existing data for the file is deleted and replaced by the
   * valid nodes, the edges and unresolved references that refer to those
   * nodes, and a fresh file record.
   *
   * @param filePath Path used as the file's key in the database.
   * @param content  File text; only its hash is stored.
   * @param language Detected language, stored on the file record and used as
   *   the default for unresolved references.
   * @param stats    File stats; `size` and `mtimeMs` go into the file record.
   * @param result   Extraction output to persist.
   */
  private storeExtractionResult(
    filePath: string,
    content: string,
    language: Language,
    stats: fs.Stats,
    result: ExtractionResult
  ): void {
    const contentHash = hashContent(content);

    // Unchanged content: keep what is stored. Changed content: start clean.
    const previous = this.queries.getFileByPath(filePath);
    if (previous) {
      if (previous.contentHash === contentHash) {
        return;
      }
      this.queries.deleteFile(filePath);
    }

    // Drop nodes with missing required fields before insertion. This prevents
    // FK violations when edges reference nodes that insertNode() would
    // silently skip (see issue #42).
    const validNodes = result.nodes.filter((n) => n.id && n.kind && n.name && n.filePath && n.language);
    if (validNodes.length > 0) {
      this.queries.insertNodes(validNodes);
    }

    // Ids of the nodes that were kept; edges and references must point at these.
    const insertedIds = new Set(validNodes.map((n) => n.id));

    // Keep only edges whose both endpoints were inserted.
    const validEdges = result.edges.filter(
      (e) => insertedIds.has(e.source) && insertedIds.has(e.target)
    );
    if (validEdges.length > 0) {
      this.queries.insertEdges(validEdges);
    }

    // Unresolved references are inserted in one batch, with filePath/language
    // denormalized onto each row (falling back to this file's values).
    const refsWithContext = result.unresolvedReferences
      .filter((ref) => insertedIds.has(ref.fromNodeId))
      .map((ref) => ({
        ...ref,
        filePath: ref.filePath ?? filePath,
        language: ref.language ?? language,
      }));
    if (refsWithContext.length > 0) {
      this.queries.insertUnresolvedRefsBatch(refsWithContext);
    }

    // NOTE(review): nodeCount records every extracted node, including any that
    // were filtered out above and never inserted — confirm this is intended.
    const fileRecord: FileRecord = {
      path: filePath,
      contentHash,
      language,
      size: stats.size,
      modifiedAt: stats.mtimeMs,
      indexedAt: Date.now(),
      nodeCount: result.nodes.length,
      errors: result.errors.length > 0 ? result.errors : undefined,
    };
    this.queries.upsertFile(fileRecord);
  }
  /**
   * Bring the index in line with the files currently on disk.
   *
   * Change detection compares the filesystem with the indexed state and never
   * consults git: a (size, mtime) stat check skips files that look unchanged,
   * and a content-hash comparison confirms real changes for the rest. That
   * works in non-git projects and also picks up committed changes from
   * `git pull`/`checkout`/`merge`/`rebase`, which `git status` cannot see
   * because the working tree is clean afterwards.
   *
   * @param onProgress Optional callback; receives one `scanning` event up
   *   front and one `parsing` event per file that is re-indexed.
   * @returns Counts of checked, added, modified and removed files, the number
   *   of nodes produced by re-indexing, the elapsed time, and the list of
   *   changed paths (`undefined` when nothing changed).
   */
  async sync(onProgress?: (progress: IndexProgress) => void): Promise<SyncResult> {
    await initGrammars(); // Initialize WASM runtime (grammars loaded lazily below)

    const startTime = Date.now();
    let filesAdded = 0;
    let filesModified = 0;
    let filesRemoved = 0;
    let nodesUpdated = 0;
    // Paths that need (re-)indexing; also reported back to the caller.
    const changed: string[] = [];

    onProgress?.({
      phase: 'scanning',
      current: 0,
      total: 0,
    });

    // === Filesystem reconcile (git-independent) ===
    // Enumerate the current source files and compare each with the database.
    const currentFiles = scanDirectory(this.rootDir);
    const onDisk = new Set(currentFiles);

    const trackedFiles = this.queries.getAllFiles();
    const trackedByPath = new Map<string, FileRecord>();
    for (const record of trackedFiles) {
      trackedByPath.set(record.path, record);
    }

    // Removals: tracked in the DB but no longer a present source file. The
    // filesystem is checked directly because `scanDirectory` (via
    // `git ls-files`) still lists a file deleted from disk but not yet staged,
    // so set membership alone would miss it.
    for (const record of trackedFiles) {
      const stillPresent =
        onDisk.has(record.path) && fs.existsSync(path.join(this.rootDir, record.path));
      if (stillPresent) continue;
      this.queries.deleteFile(record.path);
      filesRemoved++;
    }

    // Adds / modifications.
    for (const filePath of currentFiles) {
      const fullPath = path.join(this.rootDir, filePath);
      const tracked = trackedByPath.get(filePath);

      // Cheap pre-filter: an indexed file whose size AND mtime both match the
      // DB is treated as unchanged and is neither read nor hashed. A content
      // change that preserves both exactly is the accepted blind spot of
      // mtime-based tools (`index --force` is the escape hatch). Git bumps
      // mtime on every file it writes during checkout/merge, so pulls are caught.
      if (tracked) {
        let stat: fs.Stats;
        try {
          stat = fs.statSync(fullPath);
        } catch (error) {
          logDebug('Skipping unstattable file during sync', { filePath, error: String(error) });
          continue;
        }
        const sameSize = stat.size === tracked.size;
        const sameMtime = Math.floor(stat.mtimeMs) === Math.floor(tracked.modifiedAt);
        if (sameSize && sameMtime) continue;
      }

      // New, or size/mtime changed — read + hash to confirm a real change.
      let content: string;
      try {
        content = fs.readFileSync(fullPath, 'utf-8');
      } catch (error) {
        logDebug('Skipping unreadable file during sync', { filePath, error: String(error) });
        continue;
      }
      const contentHash = hashContent(content);

      // Tracked file with identical content: only its metadata moved.
      if (tracked && tracked.contentHash === contentHash) continue;

      changed.push(filePath);
      if (tracked) {
        filesModified++;
      } else {
        filesAdded++;
      }
    }

    // Load only the grammars needed for the changed files.
    if (changed.length > 0) {
      const languages = new Set(changed.map((f) => detectLanguage(f)));
      // .h files default to 'c' but may be C++ — make sure cpp is loaded too.
      if (languages.has('c')) {
        languages.add('cpp');
      }
      await loadGrammarsForLanguages([...languages]);
    }

    // Re-index the changed files sequentially, reporting progress before each.
    const total = changed.length;
    for (const [index, filePath] of changed.entries()) {
      onProgress?.({
        phase: 'parsing',
        current: index + 1,
        total,
        currentFile: filePath,
      });
      const { nodes } = await this.indexFile(filePath);
      nodesUpdated += nodes.length;
    }

    return {
      filesChecked: currentFiles.length,
      filesAdded,
      filesModified,
      filesRemoved,
      nodesUpdated,
      durationMs: Date.now() - startTime,
      changedFilePaths: changed.length > 0 ? changed : undefined,
    };
  }
  /**
   * Report which files differ from the indexed state, without changing it.
   *
   * When getGitChangedFiles() returns a result, only the paths it lists are
   * examined (fast path). Otherwise every file from scanDirectory() is compared
   * with the database (non-git project or git failure).
   *
   * @returns Paths split into `added` (not in the DB), `modified` (in the DB
   *   with a different content hash) and `removed`. Unreadable files are
   *   logged and left out.
   */
  getChangedFiles(): { added: string[]; modified: string[]; removed: string[] } {
    const added: string[] = [];
    const modified: string[] = [];
    const removed: string[] = [];

    // Read + hash one file and file it under `added` or `modified`. The DB
    // lookup is a callback so it only runs once the file proved readable.
    const classify = (
      filePath: string,
      lookup: () => Pick<FileRecord, 'contentHash'> | null | undefined
    ): void => {
      const fullPath = path.join(this.rootDir, filePath);
      let content: string;
      try {
        content = fs.readFileSync(fullPath, 'utf-8');
      } catch (error) {
        logDebug('Skipping unreadable file while detecting changes', { filePath, error: String(error) });
        return;
      }
      const contentHash = hashContent(content);
      const tracked = lookup();
      if (!tracked) {
        added.push(filePath);
      } else if (tracked.contentHash !== contentHash) {
        modified.push(filePath);
      }
    };

    const gitChanges = getGitChangedFiles(this.rootDir);
    if (gitChanges) {
      // === Git fast path ===
      // Deleted files — only reported when the DB still tracks them.
      for (const filePath of gitChanges.deleted) {
        if (this.queries.getFileByPath(filePath)) {
          removed.push(filePath);
        }
      }

      // Modified + added files are all hash-compared against the DB.
      // Untracked (`??`) files stay untracked in git even after indexing, so
      // counting them as added unconditionally would report them as pending
      // forever. (See issue #206.)
      for (const filePath of [...gitChanges.modified, ...gitChanges.added]) {
        classify(filePath, () => this.queries.getFileByPath(filePath));
      }

      return { added, modified, removed };
    }

    // === Fallback: full scan (non-git project or git failure) ===
    const currentFiles = new Set(scanDirectory(this.rootDir));
    const trackedFiles = this.queries.getAllFiles();

    // Path -> record map for O(1) lookups.
    const trackedMap = new Map<string, FileRecord>();
    for (const record of trackedFiles) {
      trackedMap.set(record.path, record);
    }

    // Removed: tracked in the DB but absent from the scan.
    for (const record of trackedFiles) {
      if (!currentFiles.has(record.path)) {
        removed.push(record.path);
      }
    }

    // Added / modified: every scanned file is read and hash-compared.
    for (const filePath of currentFiles) {
      classify(filePath, () => trackedMap.get(filePath));
    }

    return { added, modified, removed };
  }
  1374. }
  1375. // Re-export useful types and functions
  1376. export { extractFromSource } from './tree-sitter';
  1377. export { detectLanguage, isSourceFile, isLanguageSupported, isGrammarLoaded, getSupportedLanguages, initGrammars, loadGrammarsForLanguages, loadAllGrammars } from './grammars';