// index.ts
  1. /**
  2. * Extraction Orchestrator
  3. *
  4. * Coordinates file scanning, parsing, and database storage.
  5. */
  6. import * as fs from 'fs';
  7. import * as fsp from 'fs/promises';
  8. import * as path from 'path';
  9. import * as crypto from 'crypto';
  10. import { execFileSync } from 'child_process';
  11. import {
  12. Language,
  13. FileRecord,
  14. ExtractionResult,
  15. ExtractionError,
  16. CodeGraphConfig,
  17. } from '../types';
  18. import { QueryBuilder } from '../db/queries';
  19. import { extractFromSource } from './tree-sitter';
  20. import { detectLanguage, isLanguageSupported, initGrammars, loadGrammarsForLanguages } from './grammars';
  21. import { logDebug, logWarn } from '../errors';
  22. import { validatePathWithinRoot, normalizePath } from '../utils';
  23. import picomatch from 'picomatch';
/**
 * Number of files to read in parallel during indexing.
 * File reads are I/O-bound; batching overlaps I/O wait with CPU parse work.
 */
const FILE_IO_BATCH_SIZE = 10;

// PARSER_RESET_INTERVAL moved to parse-worker.ts (runs in worker thread)

/**
 * Base time budget (ms) for a single file to parse in the worker thread.
 * If tree-sitter hangs or WASM runs out of memory, this prevents the entire
 * indexing run from freezing. The worker is restarted after a timeout.
 * The indexer scales this base value up for large files.
 */
const PARSE_TIMEOUT_MS = 10_000;

/**
 * Number of files to parse before recycling the worker thread.
 * WASM linear memory can grow but NEVER shrink (WebAssembly spec limitation).
 * The only way to reclaim tree-sitter's WASM heap is to destroy the entire
 * V8 isolate by terminating the worker thread and spawning a fresh one.
 * This interval balances memory usage against the cost of reloading grammars.
 */
const WORKER_RECYCLE_INTERVAL = 250;
/**
 * Progress notification emitted while an indexing operation runs.
 */
export interface IndexProgress {
  /** Stage of the pipeline the notification belongs to. */
  phase: 'scanning' | 'parsing' | 'storing' | 'resolving';
  /** Number of items handled so far in this phase. */
  current: number;
  /** Total number of items in this phase; reported as 0 while scanning, when the total is not yet known. */
  total: number;
  /** Path of the file currently being worked on, when applicable. */
  currentFile?: string;
}
/**
 * Summary of an indexing operation.
 */
export interface IndexResult {
  /** True when at least one file was indexed or no error-severity errors occurred; false when aborted. */
  success: boolean;
  /** Files whose extraction produced at least one node. */
  filesIndexed: number;
  /** Files that produced no nodes and no error-severity errors. */
  filesSkipped: number;
  /** Files that could not be read or parsed, or produced only errors. */
  filesErrored: number;
  /** Total number of nodes across all indexed files. */
  nodesCreated: number;
  /** Total number of edges across all indexed files. */
  edgesCreated: number;
  /** Every error collected during the run (read, parse and extraction errors). */
  errors: ExtractionError[];
  /** Wall-clock duration of the operation in milliseconds. */
  durationMs: number;
}
/**
 * Summary of a sync operation.
 *
 * NOTE(review): the producer of this type is not in this part of the file;
 * the field notes below are inferred from the names — confirm against the
 * sync implementation.
 */
export interface SyncResult {
  /** Presumably the number of files examined for changes. */
  filesChecked: number;
  /** Presumably the number of newly indexed files. */
  filesAdded: number;
  /** Presumably the number of re-indexed files. */
  filesModified: number;
  /** Presumably the number of files dropped from the index. */
  filesRemoved: number;
  /** Presumably the number of nodes written during the sync. */
  nodesUpdated: number;
  /** Duration of the sync in milliseconds. */
  durationMs: number;
  /** Optional list of the paths that changed. */
  changedFilePaths?: string[];
}
  78. /**
  79. * Calculate SHA256 hash of file contents
  80. */
  81. export function hashContent(content: string): string {
  82. return crypto.createHash('sha256').update(content).digest('hex');
  83. }
  84. /**
  85. * Check if a path matches any glob pattern (simplified)
  86. */
  87. function matchesGlob(filePath: string, pattern: string): boolean {
  88. filePath = normalizePath(filePath);
  89. return picomatch.isMatch(filePath, pattern, { dot: true });
  90. }
  91. /**
  92. * Check if a file should be included based on config
  93. */
  94. export function shouldIncludeFile(
  95. filePath: string,
  96. config: CodeGraphConfig
  97. ): boolean {
  98. // Check exclude patterns first
  99. for (const pattern of config.exclude) {
  100. if (matchesGlob(filePath, pattern)) {
  101. return false;
  102. }
  103. }
  104. // Check include patterns
  105. for (const pattern of config.include) {
  106. if (matchesGlob(filePath, pattern)) {
  107. return true;
  108. }
  109. }
  110. return false;
  111. }
  112. /**
  113. * Get all files visible to git (tracked + untracked but not ignored).
  114. * Respects .gitignore at all levels (root, subdirectories).
  115. * Returns null on failure (non-git project) so callers can fall back.
  116. */
  117. function getGitVisibleFiles(rootDir: string): Set<string> | null {
  118. try {
  119. // Check if the project directory is gitignored by a parent repo.
  120. // When rootDir lives inside a parent git repo that ignores it,
  121. // `git ls-files` returns nothing — fall back to filesystem walk.
  122. const gitRoot = execFileSync(
  123. 'git',
  124. ['rev-parse', '--show-toplevel'],
  125. { cwd: rootDir, encoding: 'utf-8', timeout: 5000, stdio: ['pipe', 'pipe', 'pipe'] }
  126. ).trim();
  127. if (path.resolve(gitRoot) !== path.resolve(rootDir)) {
  128. try {
  129. // git check-ignore exits 0 if the path IS ignored, 1 if not
  130. execFileSync(
  131. 'git',
  132. ['check-ignore', '-q', path.resolve(rootDir)],
  133. { cwd: rootDir, encoding: 'utf-8', timeout: 5000, stdio: ['pipe', 'pipe', 'pipe'] }
  134. );
  135. // Directory is gitignored by parent repo — fall back to filesystem walk
  136. return null;
  137. } catch {
  138. // Not ignored — safe to use git ls-files
  139. }
  140. }
  141. // -c = cached (tracked), -o = others (untracked), --exclude-standard = respect .gitignore
  142. const output = execFileSync(
  143. 'git',
  144. ['ls-files', '-co', '--exclude-standard'],
  145. { cwd: rootDir, encoding: 'utf-8', timeout: 30000, maxBuffer: 50 * 1024 * 1024, stdio: ['pipe', 'pipe', 'pipe'] }
  146. );
  147. const files = new Set<string>();
  148. for (const line of output.split('\n')) {
  149. const trimmed = line.trim();
  150. if (trimmed) {
  151. files.add(normalizePath(trimmed));
  152. }
  153. }
  154. return files;
  155. } catch {
  156. return null;
  157. }
  158. }
/**
 * Result of git-based change detection: changed paths grouped by what the
 * caller has to do with them.
 *
 * The producer, `getGitChangedFiles`, returns null instead of a `GitChanges`
 * when git is unavailable (non-git project or command failure), signaling
 * the caller to fall back to a full filesystem scan.
 */
interface GitChanges {
  modified: string[]; // M, MM, AM — files to re-hash + re-index
  added: string[]; // ?? — new untracked files to index
  deleted: string[]; // D — files to remove from DB
}
  169. /**
  170. * Use `git status` to detect changed files instead of scanning every file.
  171. * Returns null on failure so callers fall back to full scan.
  172. */
  173. function getGitChangedFiles(rootDir: string, config: CodeGraphConfig): GitChanges | null {
  174. try {
  175. const output = execFileSync(
  176. 'git',
  177. ['status', '--porcelain', '--no-renames'],
  178. { cwd: rootDir, encoding: 'utf-8', timeout: 10000, stdio: ['pipe', 'pipe', 'pipe'] }
  179. );
  180. const modified: string[] = [];
  181. const added: string[] = [];
  182. const deleted: string[] = [];
  183. for (const line of output.split('\n')) {
  184. if (line.length < 4) continue; // Minimum: "XY file"
  185. const statusCode = line.substring(0, 2);
  186. const filePath = normalizePath(line.substring(3));
  187. // Skip files that don't match include/exclude config
  188. if (!shouldIncludeFile(filePath, config)) continue;
  189. if (statusCode === '??') {
  190. added.push(filePath);
  191. } else if (statusCode.includes('D')) {
  192. deleted.push(filePath);
  193. } else {
  194. // M, MM, AM, A (staged), etc. — treat as modified
  195. modified.push(filePath);
  196. }
  197. }
  198. return { modified, added, deleted };
  199. } catch {
  200. return null;
  201. }
  202. }
/**
 * Marker file name that indicates a directory (and all children) should be skipped.
 * It is checked by the filesystem walk: a directory containing a file with
 * this name is not descended into.
 */
const CODEGRAPH_IGNORE_MARKER = '.codegraphignore';
  207. /**
  208. * Recursively scan directory for source files.
  209. *
  210. * In git repos, uses `git ls-files` to get the file list (inherently
  211. * respects .gitignore at all levels), then filters by config include patterns.
  212. * Falls back to filesystem walk for non-git projects.
  213. */
  214. export function scanDirectory(
  215. rootDir: string,
  216. config: CodeGraphConfig,
  217. onProgress?: (current: number, file: string) => void
  218. ): string[] {
  219. // Fast path: use git to get all visible files (respects .gitignore everywhere)
  220. const gitFiles = getGitVisibleFiles(rootDir);
  221. if (gitFiles) {
  222. const files: string[] = [];
  223. let count = 0;
  224. for (const filePath of gitFiles) {
  225. if (shouldIncludeFile(filePath, config)) {
  226. files.push(filePath);
  227. count++;
  228. onProgress?.(count, filePath);
  229. }
  230. }
  231. return files;
  232. }
  233. // Fallback: walk filesystem for non-git projects
  234. return scanDirectoryWalk(rootDir, config, onProgress);
  235. }
  236. /**
  237. * Async variant of scanDirectory that yields to the event loop periodically,
  238. * allowing worker threads to receive and render progress messages.
  239. */
  240. export async function scanDirectoryAsync(
  241. rootDir: string,
  242. config: CodeGraphConfig,
  243. onProgress?: (current: number, file: string) => void
  244. ): Promise<string[]> {
  245. const gitFiles = getGitVisibleFiles(rootDir);
  246. if (gitFiles) {
  247. const files: string[] = [];
  248. let count = 0;
  249. for (const filePath of gitFiles) {
  250. if (shouldIncludeFile(filePath, config)) {
  251. files.push(filePath);
  252. count++;
  253. onProgress?.(count, filePath);
  254. // Yield every 100 files so worker threads can render progress
  255. if (count % 100 === 0) {
  256. await new Promise<void>(r => setImmediate(r));
  257. }
  258. }
  259. }
  260. return files;
  261. }
  262. return scanDirectoryWalk(rootDir, config, onProgress);
  263. }
  264. /**
  265. * Filesystem walk fallback for non-git projects.
  266. */
  267. function scanDirectoryWalk(
  268. rootDir: string,
  269. config: CodeGraphConfig,
  270. onProgress?: (current: number, file: string) => void
  271. ): string[] {
  272. const files: string[] = [];
  273. let count = 0;
  274. const visitedDirs = new Set<string>();
  275. function walk(dir: string): void {
  276. let realDir: string;
  277. try {
  278. realDir = fs.realpathSync(dir);
  279. } catch {
  280. logDebug('Skipping unresolvable directory', { dir });
  281. return;
  282. }
  283. if (visitedDirs.has(realDir)) {
  284. logDebug('Skipping already-visited directory (symlink cycle)', { dir, realDir });
  285. return;
  286. }
  287. visitedDirs.add(realDir);
  288. // Check for .codegraphignore marker file
  289. const ignoreMarker = path.join(dir, CODEGRAPH_IGNORE_MARKER);
  290. if (fs.existsSync(ignoreMarker)) {
  291. logDebug('Skipping directory due to .codegraphignore marker', { dir });
  292. return;
  293. }
  294. let entries: fs.Dirent[];
  295. try {
  296. entries = fs.readdirSync(dir, { withFileTypes: true });
  297. } catch (error) {
  298. logDebug('Skipping unreadable directory', { dir, error: String(error) });
  299. return;
  300. }
  301. for (const entry of entries) {
  302. const fullPath = path.join(dir, entry.name);
  303. const relativePath = normalizePath(path.relative(rootDir, fullPath));
  304. if (entry.isSymbolicLink()) {
  305. try {
  306. const realTarget = fs.realpathSync(fullPath);
  307. const stat = fs.statSync(realTarget);
  308. if (stat.isDirectory()) {
  309. const dirPattern = relativePath + '/';
  310. let excluded = false;
  311. for (const pattern of config.exclude) {
  312. if (matchesGlob(dirPattern, pattern) || matchesGlob(relativePath, pattern)) {
  313. excluded = true;
  314. break;
  315. }
  316. }
  317. if (!excluded) {
  318. walk(fullPath);
  319. }
  320. } else if (stat.isFile()) {
  321. if (shouldIncludeFile(relativePath, config)) {
  322. files.push(relativePath);
  323. count++;
  324. onProgress?.(count, relativePath);
  325. }
  326. }
  327. } catch {
  328. logDebug('Skipping broken symlink', { path: fullPath });
  329. }
  330. continue;
  331. }
  332. if (entry.isDirectory()) {
  333. const dirPattern = relativePath + '/';
  334. let excluded = false;
  335. for (const pattern of config.exclude) {
  336. if (matchesGlob(dirPattern, pattern) || matchesGlob(relativePath, pattern)) {
  337. excluded = true;
  338. break;
  339. }
  340. }
  341. if (!excluded) {
  342. walk(fullPath);
  343. }
  344. } else if (entry.isFile()) {
  345. if (shouldIncludeFile(relativePath, config)) {
  346. files.push(relativePath);
  347. count++;
  348. onProgress?.(count, relativePath);
  349. }
  350. }
  351. }
  352. }
  353. walk(rootDir);
  354. return files;
  355. }
  356. /**
  357. * Extraction orchestrator
  358. */
  359. export class ExtractionOrchestrator {
  360. private rootDir: string;
  361. private config: CodeGraphConfig;
  362. private queries: QueryBuilder;
  /**
   * Create an orchestrator bound to one project.
   *
   * @param rootDir - Project root directory that is scanned and against which file paths are validated.
   * @param config - Configuration passed to the directory scan (include/exclude globs).
   * @param queries - Query builder kept for database access by this instance.
   */
  constructor(rootDir: string, config: CodeGraphConfig, queries: QueryBuilder) {
    this.rootDir = rootDir;
    this.config = config;
    this.queries = queries;
  }
  /**
   * Index all files in the project.
   *
   * Pipeline:
   *  1. Scan the project for files matching the configuration.
   *  2. Read files in batches of FILE_IO_BATCH_SIZE and parse them one at a
   *     time — in a worker thread when the compiled `parse-worker.js` exists
   *     next to this module, otherwise in-process.
   *  3. Store each usable extraction result on the calling thread.
   *  4. Retry files whose parse failed with a worker crash / WASM memory
   *     error: first on a fresh worker, then on a fresh worker with
   *     `//` comment-only lines blanked out.
   *
   * Worker failures (crash, hang, timeout) are converted into per-file
   * `parse_error` entries; they do not reject the returned promise.
   *
   * @param onProgress - Optional callback receiving scanning/parsing progress.
   * @param signal - Optional abort signal. When it fires before or during the
   *   main parse loop the method returns `success: false` with an 'Aborted'
   *   error; when it fires during the retry pass the retries stop and the
   *   result accumulated so far is returned normally.
   * @param verbose - When true, worker lifecycle messages are written to stdout.
   * @returns Counters, collected errors and duration of the run.
   */
  async indexAll(
    onProgress?: (progress: IndexProgress) => void,
    signal?: AbortSignal,
    verbose?: boolean
  ): Promise<IndexResult> {
    type ParseWorker = import('worker_threads').Worker;
    type RetryableError = ExtractionError & { filePath: string };
    type RetryOutcome = 'stored' | 'parse-failed' | 'skipped';
    interface LoadedFile {
      filePath: string;
      content: string | null;
      stats: fs.Stats | null;
      error: Error | null;
    }

    await initGrammars();

    const startTime = Date.now();
    const errors: ExtractionError[] = [];
    let filesIndexed = 0;
    let filesSkipped = 0;
    let filesErrored = 0;
    let totalNodes = 0;
    let totalEdges = 0;

    const log = verbose
      ? (msg: string) => { console.log(`[worker] ${msg}`); }
      : (_msg: string) => {};

    // Snapshot of the current counters, flagged as aborted.
    const buildAbortedResult = (): IndexResult => ({
      success: false,
      filesIndexed,
      filesSkipped,
      filesErrored,
      nodesCreated: totalNodes,
      edgesCreated: totalEdges,
      errors: [{ message: 'Aborted', severity: 'error' }, ...errors],
      durationMs: Date.now() - startTime,
    });

    // Phase 1: Scan for files
    onProgress?.({
      phase: 'scanning',
      current: 0,
      total: 0,
    });

    const files = await scanDirectoryAsync(this.rootDir, this.config, (current, file) => {
      onProgress?.({
        phase: 'scanning',
        current,
        total: 0,
        currentFile: file,
      });
    });

    if (signal?.aborted) {
      return buildAbortedResult();
    }

    // Phase 2: Parse files in a worker thread (keeps main thread unblocked for UI)
    const total = files.length;
    let processed = 0;

    // Emit parsing phase immediately so the progress bar appears during worker setup.
    // The yield lets the shimmer worker flush the phase transition to stdout before
    // the main thread starts synchronous grammar detection work.
    onProgress?.({
      phase: 'parsing',
      current: 0,
      total,
    });
    await new Promise(resolve => setImmediate(resolve));

    // Detect needed languages and load grammars in the parse worker
    const neededLanguages = [...new Set(files.map((f) => detectLanguage(f)))];

    // Try to use a worker thread for parsing (keeps main thread unblocked for UI).
    // Falls back to in-process parsing if the compiled worker is unavailable (e.g. tests).
    const parseWorkerPath = path.join(__dirname, 'parse-worker.js');
    const useWorker = fs.existsSync(parseWorkerPath);

    let WorkerClass: typeof import('worker_threads').Worker | null = null;
    if (useWorker) {
      const { Worker } = await import('worker_threads');
      WorkerClass = Worker;
    } else {
      // In-process fallback: load grammars locally
      await loadGrammarsForLanguages(neededLanguages);
    }

    // --- Worker lifecycle management ---
    // The worker can crash (OOM in WASM) or hang on pathological files.
    // We track pending parse promises and handle both cases:
    // - Timeout: terminate + restart the worker, reject the timed-out request
    // - Crash: reject all pending promises, restart for remaining files
    // `parseWorker` always refers to the CURRENT worker (or null). Handlers of
    // a worker that has been replaced must never touch shared state.
    let parseWorker: ParseWorker | null = null;
    let nextId = 0;
    let workerParseCount = 0;
    const pendingParses = new Map<number, {
      resolve: (result: ExtractionResult) => void;
      reject: (err: Error) => void;
      timer: ReturnType<typeof setTimeout>;
    }>();

    function rejectAllPending(reason: string): void {
      for (const [id, pending] of pendingParses) {
        clearTimeout(pending.timer);
        pendingParses.delete(id);
        pending.reject(new Error(reason));
      }
    }

    function attachWorkerHandlers(w: ParseWorker): void {
      w.on('message', (msg: { type: string; id?: number; result?: ExtractionResult }) => {
        if (msg.type !== 'parse-result' || msg.id === undefined) return;
        const pending = pendingParses.get(msg.id);
        if (!pending) return;
        clearTimeout(pending.timer);
        pendingParses.delete(msg.id);
        if (msg.result === undefined) {
          // A malformed reply must surface as a parse error, not as an
          // undefined result that crashes the caller.
          pending.reject(new Error('Parse worker returned no result'));
        } else {
          pending.resolve(msg.result);
        }
      });

      w.on('error', (err) => {
        logWarn('Parse worker error', { error: err.message });
        // A worker that has already been replaced (recycled / timed out)
        // does not own any pending request.
        if (parseWorker !== w) return;
        // An uncaught exception terminates the worker: drop the reference so
        // the next parse spawns a fresh one instead of posting to a dead thread.
        parseWorker = null;
        workerParseCount = 0;
        rejectAllPending(`Worker error: ${err.message}`);
      });

      w.on('exit', (code) => {
        // terminate() makes the worker exit with code 1. Without this guard
        // the exit of a recycled worker would reject requests that are
        // pending on its replacement.
        if (parseWorker !== w) return;
        if (code !== 0 && pendingParses.size > 0) {
          logWarn('Parse worker exited unexpectedly', { code });
          rejectAllPending(`Worker exited with code ${code}`);
        }
        // Clear reference so we know to respawn, reset count so
        // the fresh worker gets a full cycle before recycling.
        parseWorker = null;
        workerParseCount = 0;
      });
    }

    /** Detach and terminate the current worker, if any (fire-and-forget). */
    function terminateWorker(): void {
      const w = parseWorker;
      if (!w) return;
      parseWorker = null;
      workerParseCount = 0;
      // Fire-and-forget: worker.terminate() can hang if WASM is stuck
      void w.terminate().catch(() => {});
    }

    /**
     * Return the current worker, spawning one and loading the needed
     * grammars in it first when none is alive. Rejects when the worker dies
     * or answers unexpectedly before the grammars are loaded.
     */
    async function ensureWorker(): Promise<ParseWorker> {
      if (parseWorker) return parseWorker;
      if (!WorkerClass) {
        throw new Error('Parse worker is not available');
      }

      log('Spawning new parse worker...');
      const w = new WorkerClass(parseWorkerPath);
      parseWorker = w;
      attachWorkerHandlers(w);

      // Load grammars in the new worker. Also watch for the worker dying
      // during the handshake; otherwise this promise would never settle.
      try {
        await new Promise<void>((resolve, reject) => {
          const cleanup = (): void => {
            w.off('message', onMessage);
            w.off('error', onError);
            w.off('exit', onExit);
          };
          const onMessage = (msg: { type: string }): void => {
            cleanup();
            if (msg.type === 'grammars-loaded') resolve();
            else reject(new Error(`Unexpected message: ${msg.type}`));
          };
          const onError = (err: Error): void => {
            cleanup();
            reject(new Error(`Worker error while loading grammars: ${err.message}`));
          };
          const onExit = (code: number): void => {
            cleanup();
            reject(new Error(`Worker exited with code ${code} while loading grammars`));
          };
          w.on('message', onMessage);
          w.on('error', onError);
          w.on('exit', onExit);
          w.postMessage({ type: 'load-grammars', languages: neededLanguages });
        });
      } catch (err) {
        // Never leave a half-initialized worker installed as the current one.
        if (parseWorker === w) {
          parseWorker = null;
          workerParseCount = 0;
        }
        void w.terminate().catch(() => {});
        throw err;
      }

      return w;
    }

    if (WorkerClass) {
      try {
        await ensureWorker();
      } catch (err) {
        // Not fatal: every parse request tries to spawn a worker again and
        // reports its own error if that keeps failing.
        logWarn('Parse worker failed to start', {
          error: err instanceof Error ? err.message : String(err),
        });
      }
    }

    /**
     * Recycle the worker thread to reclaim WASM memory.
     * Terminates the current worker and clears the reference so
     * ensureWorker() will spawn a fresh one on the next call.
     */
    function recycleWorker(): void {
      if (!parseWorker) return;
      log(`Recycling worker after ${workerParseCount} parses (heap: ${Math.round(process.memoryUsage().rss / 1024 / 1024)}MB RSS)`);
      terminateWorker();
    }

    async function requestParse(filePath: string, content: string): Promise<ExtractionResult> {
      if (!WorkerClass) {
        // In-process fallback
        return extractFromSource(filePath, content, detectLanguage(filePath));
      }

      // Recycle the worker before the next parse if we've hit the threshold.
      // This destroys the WASM linear memory (which can grow but never shrink)
      // and starts a fresh worker with a clean heap.
      if (workerParseCount >= WORKER_RECYCLE_INTERVAL) {
        recycleWorker();
      }

      const worker = await ensureWorker();
      const id = nextId++;
      workerParseCount++;

      // Scale timeout for large files: base 10s + 10s per 100KB
      const timeoutMs = PARSE_TIMEOUT_MS + Math.floor(content.length / 100_000) * 10_000;

      return new Promise<ExtractionResult>((resolve, reject) => {
        const timer = setTimeout(() => {
          pendingParses.delete(id);
          log(`TIMEOUT: ${filePath} exceeded ${timeoutMs}ms — killing worker`);
          // Detach and reject FIRST — worker.terminate() can hang if WASM is stuck
          if (parseWorker === worker) {
            parseWorker = null;
            workerParseCount = 0;
          }
          reject(new Error(`Parse timed out after ${timeoutMs}ms`));
          // Fire-and-forget: kill the stuck worker in the background
          void worker.terminate().catch(() => {});
        }, timeoutMs);

        pendingParses.set(id, { resolve, reject, timer });
        worker.postMessage({ type: 'parse', id, filePath, content });
      });
    }

    for (let i = 0; i < files.length; i += FILE_IO_BATCH_SIZE) {
      if (signal?.aborted) {
        terminateWorker();
        return buildAbortedResult();
      }

      const batch = files.slice(i, i + FILE_IO_BATCH_SIZE);

      // Read files in parallel (with path validation before any I/O)
      const fileContents = await Promise.all(
        batch.map(async (fp): Promise<LoadedFile> => {
          try {
            const fullPath = validatePathWithinRoot(this.rootDir, fp);
            if (!fullPath) {
              logWarn('Path traversal blocked in batch reader', { filePath: fp });
              return { filePath: fp, content: null, stats: null, error: new Error('Path traversal blocked') };
            }
            const content = await fsp.readFile(fullPath, 'utf-8');
            const stats = await fsp.stat(fullPath);
            return { filePath: fp, content, stats, error: null };
          } catch (err) {
            return {
              filePath: fp,
              content: null,
              stats: null,
              error: err instanceof Error ? err : new Error(String(err)),
            };
          }
        })
      );

      // Send to worker for parsing, store results on main thread
      for (const { filePath, content, stats, error } of fileContents) {
        if (signal?.aborted) {
          terminateWorker();
          return buildAbortedResult();
        }

        // Report progress before parsing (show current file being worked on)
        onProgress?.({
          phase: 'parsing',
          current: processed,
          total,
          currentFile: filePath,
        });

        if (error || content === null || stats === null) {
          processed++;
          filesErrored++;
          errors.push({
            message: `Failed to read file: ${error instanceof Error ? error.message : String(error)}`,
            filePath,
            severity: 'error',
            code: 'read_error',
          });
          continue;
        }

        // Parse in worker thread (main thread stays unblocked).
        // Wrapped in try/catch to handle worker timeouts and crashes gracefully.
        let result: ExtractionResult;
        try {
          result = await requestParse(filePath, content);
        } catch (parseErr) {
          processed++;
          filesErrored++;
          errors.push({
            message: parseErr instanceof Error ? parseErr.message : String(parseErr),
            filePath,
            severity: 'error',
            code: 'parse_error',
          });
          continue;
        }

        processed++;

        // Store in database on main thread (SQLite is not thread-safe)
        if (result.nodes.length > 0 || result.errors.length === 0) {
          const language = detectLanguage(filePath);
          this.storeExtractionResult(filePath, content, language, stats, result);
        }

        if (result.errors.length > 0) {
          for (const err of result.errors) {
            if (!err.filePath) err.filePath = filePath;
          }
          errors.push(...result.errors);
        }

        if (result.nodes.length > 0) {
          filesIndexed++;
          totalNodes += result.nodes.length;
          totalEdges += result.edges.length;
        } else if (result.errors.some((e) => e.severity === 'error')) {
          filesErrored++;
        } else {
          filesSkipped++;
        }
      }
    }

    // Report 100% so the progress bar doesn't hang at 99%
    onProgress?.({
      phase: 'parsing',
      current: total,
      total,
    });

    // Yield so the shimmer worker's buffered stdout writes can flush.
    // Worker thread stdout is proxied through the main thread's event loop,
    // so synchronous work here blocks the animation from rendering.
    await new Promise(resolve => setImmediate(resolve));

    // Retry pass: files that failed due to WASM memory corruption may succeed
    // on a fresh worker with a clean heap. Recycle before each attempt so
    // every file gets the absolute cleanest WASM state possible.
    const retryableErrors = errors.filter(
      (e): e is RetryableError =>
        e.code === 'parse_error' &&
        typeof e.filePath === 'string' && e.filePath.length > 0 &&
        (e.message.includes('Worker exited') || e.message.includes('memory access out of bounds'))
    );

    if (retryableErrors.length > 0 && WorkerClass) {
      /**
       * Re-parse one previously failed file on a fresh worker.
       * `prepare` transforms the text sent to the parser; the ORIGINAL file
       * content is what gets stored. Returns 'parse-failed' only when the
       * parse request itself rejected (crash/timeout), so the caller can
       * escalate to a more aggressive strategy.
       */
      const retryFile = async (
        errEntry: RetryableError,
        prepare: (source: string) => string,
        label: string
      ): Promise<RetryOutcome> => {
        const filePath = errEntry.filePath;

        // Fresh worker for every retry — maximum WASM headroom
        recycleWorker();

        let source: string;
        let stats: fs.Stats;
        try {
          const fullPath = validatePathWithinRoot(this.rootDir, filePath);
          if (!fullPath) return 'skipped';
          source = await fsp.readFile(fullPath, 'utf-8');
          // Stat the validated path inside the try block: a file that vanished
          // since the first pass must not reject the whole indexing run.
          stats = await fsp.stat(fullPath);
        } catch {
          return 'skipped';
        }

        let result: ExtractionResult;
        try {
          result = await requestParse(filePath, prepare(source));
        } catch {
          return 'parse-failed';
        }

        if (result.nodes.length === 0 && result.errors.length > 0) {
          return 'skipped';
        }

        this.storeExtractionResult(filePath, source, detectLanguage(filePath), stats, result);

        // The file no longer counts as failed.
        const idx = errors.indexOf(errEntry);
        if (idx >= 0) errors.splice(idx, 1);
        filesErrored--;

        // Same classification as the main pass: no nodes means skipped.
        if (result.nodes.length > 0) {
          filesIndexed++;
          totalNodes += result.nodes.length;
          totalEdges += result.edges.length;
        } else {
          filesSkipped++;
        }
        log(`${label} OK: ${filePath} (${result.nodes.length} nodes)`);
        return 'stored';
      };

      log(`Retrying ${retryableErrors.length} files that failed due to WASM memory errors...`);
      const stillFailing: RetryableError[] = [];

      for (const errEntry of retryableErrors) {
        if (signal?.aborted) break;
        const outcome = await retryFile(errEntry, (source) => source, 'Retry');
        if (outcome === 'parse-failed') {
          stillFailing.push(errEntry);
        }
      }

      // Last resort: for files that still crash on a clean worker, strip
      // comment-only lines to reduce WASM memory pressure. Many compiler
      // test files are 90%+ comments (CHECK directives) that don't contribute
      // code nodes but consume parser memory.
      if (stillFailing.length > 0) {
        log(`${stillFailing.length} files still failing — retrying with comments stripped...`);

        // Blank out lines that are entirely `//` comments. Replacing them
        // with empty lines (instead of removing them) preserves line numbers
        // so node positions stay correct. Only `//` comments are recognized.
        const commentOnlyLine = /^\s*\/\//;
        const stripCommentLines = (source: string): string =>
          source
            .split('\n')
            .map((line) => (commentOnlyLine.test(line) ? '' : line))
            .join('\n');

        for (const errEntry of stillFailing) {
          if (signal?.aborted) break;
          await retryFile(errEntry, stripCommentLines, 'Retry (stripped)');
        }
      }
    }

    // Shut down parse worker and clear any pending timers
    rejectAllPending('Indexing complete');
    terminateWorker();

    return {
      success: filesIndexed > 0 || errors.filter((e) => e.severity === 'error').length === 0,
      filesIndexed,
      filesSkipped,
      filesErrored,
      nodesCreated: totalNodes,
      edgesCreated: totalEdges,
      errors,
      durationMs: Date.now() - startTime,
    };
  }
  /**
   * Index an explicit list of files, one at a time and in the given order.
   *
   * Each path is handed to `indexFile` and the per-file results are folded
   * into a single summary. A file counts as:
   * - indexed when extraction produced at least one node,
   * - errored when it produced no nodes and reported an error of severity `'error'`,
   * - skipped otherwise (no nodes and nothing worse than a warning).
   *
   * @param filePaths Paths to index; each is passed unchanged to `indexFile`.
   * @returns Aggregate counts, every error reported by any file, and the
   *   elapsed wall-clock time. `success` is true when at least one file was
   *   indexed or when no error of severity `'error'` was collected.
   */
  async indexFiles(filePaths: string[]): Promise<IndexResult> {
    const begunAt = Date.now();
    const collectedErrors: ExtractionError[] = [];
    const tally = { indexed: 0, skipped: 0, errored: 0, nodes: 0, edges: 0 };
    const isHardError = (e: ExtractionError): boolean => e.severity === 'error';

    for (const filePath of filePaths) {
      const outcome = await this.indexFile(filePath);

      // Keep every reported problem, warnings included.
      for (const reported of outcome.errors) {
        collectedErrors.push(reported);
      }

      // Producing nodes wins over any errors the file also reported.
      if (outcome.nodes.length > 0) {
        tally.indexed += 1;
        tally.nodes += outcome.nodes.length;
        tally.edges += outcome.edges.length;
        continue;
      }

      if (outcome.errors.some(isHardError)) {
        tally.errored += 1;
      } else {
        tally.skipped += 1;
      }
    }

    const hardErrorCount = collectedErrors.filter(isHardError).length;

    return {
      success: tally.indexed > 0 || hardErrorCount === 0,
      filesIndexed: tally.indexed,
      filesSkipped: tally.skipped,
      filesErrored: tally.errored,
      nodesCreated: tally.nodes,
      edgesCreated: tally.edges,
      errors: collectedErrors,
      durationMs: Date.now() - begunAt,
    };
  }
  /**
   * Index a single file given its path relative to the project root.
   *
   * The path is first checked with `validatePathWithinRoot`; the file is then
   * stat'ed and read as UTF-8 before being passed to `indexFileWithContent`.
   * Problems are reported through the returned result rather than thrown:
   * a rejected path yields a `path_traversal` error, and a failed stat/read
   * yields a `read_error` error.
   *
   * @param relativePath Path of the file, relative to `this.rootDir`.
   * @returns The extraction result, or an empty result carrying one error.
   */
  async indexFile(relativePath: string): Promise<ExtractionResult> {
    // Builds the "nothing extracted, one error" result shared by both failure paths.
    const failedWith = (problem: ExtractionError): ExtractionResult => ({
      nodes: [],
      edges: [],
      unresolvedReferences: [],
      errors: [problem],
      durationMs: 0,
    });

    const absolutePath = validatePathWithinRoot(this.rootDir, relativePath);
    if (!absolutePath) {
      return failedWith({
        message: `Path traversal blocked: ${relativePath}`,
        filePath: relativePath,
        severity: 'error',
        code: 'path_traversal',
      });
    }

    // Stat first, then read; a failure in either step is reported the same way.
    let loaded: { content: string; stats: fs.Stats };
    try {
      const stats = await fsp.stat(absolutePath);
      const content = await fsp.readFile(absolutePath, 'utf-8');
      loaded = { content, stats };
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      return failedWith({
        message: `Failed to read file: ${reason}`,
        filePath: relativePath,
        severity: 'error',
        code: 'read_error',
      });
    }

    return this.indexFileWithContent(relativePath, loaded.content, loaded.stats);
  }
  /**
   * Index a single file whose content and stats were already read by the caller.
   * Used by the parallel batch reader to avoid redundant file I/O.
   *
   * Checks run in this order, each returning an empty result when it fails:
   * 1. the path must pass `validatePathWithinRoot` (otherwise a `path_traversal` error),
   * 2. `stats.size` must not exceed `config.maxFileSize` (otherwise a `size_exceeded` warning),
   * 3. the detected language must be supported (otherwise no errors at all).
   *
   * The extraction result is stored unless it has errors and no nodes.
   *
   * @param relativePath Path of the file, relative to `this.rootDir`.
   * @param content Full text of the file.
   * @param stats File stats; `size` is used for the size limit here.
   * @returns The extraction result, or an empty result describing why the file was not processed.
   */
  async indexFileWithContent(
    relativePath: string,
    content: string,
    stats: fs.Stats
  ): Promise<ExtractionResult> {
    // Shared shape for every early exit: nothing extracted, zero duration.
    const emptyResult = (problems: ExtractionError[]): ExtractionResult => ({
      nodes: [],
      edges: [],
      unresolvedReferences: [],
      errors: problems,
      durationMs: 0,
    });

    // Prevent path traversal
    if (!validatePathWithinRoot(this.rootDir, relativePath)) {
      logWarn('Path traversal blocked in indexFileWithContent', { relativePath });
      return emptyResult([
        { message: 'Path traversal blocked', filePath: relativePath, severity: 'error', code: 'path_traversal' },
      ]);
    }

    // Oversized files are reported as a warning, not an error.
    const sizeLimit = this.config.maxFileSize;
    if (stats.size > sizeLimit) {
      return emptyResult([
        {
          message: `File exceeds max size (${stats.size} > ${this.config.maxFileSize})`,
          filePath: relativePath,
          severity: 'warning',
          code: 'size_exceeded',
        },
      ]);
    }

    // Unsupported languages are silently ignored.
    const language = detectLanguage(relativePath);
    if (!isLanguageSupported(language)) {
      return emptyResult([]);
    }

    const extracted = extractFromSource(relativePath, content, language);

    // Persist unless extraction failed outright (errors and not a single node).
    const failedOutright = extracted.nodes.length === 0 && extracted.errors.length > 0;
    if (!failedOutright) {
      this.storeExtractionResult(relativePath, content, language, stats, extracted);
    }

    return extracted;
  }
  /**
   * Store an extraction result in the database.
   *
   * Does nothing when a record for `filePath` already exists with the same
   * content hash. Otherwise any existing data for the file is deleted and the
   * nodes, edges, unresolved references and file record are written.
   *
   * Only nodes that have all of `id`, `kind`, `name`, `filePath` and `language`
   * are inserted; edges and unresolved references that point at a node that
   * was not inserted are dropped.
   *
   * @param filePath Path used as the file's key in the database.
   * @param content File text; only used to compute the content hash.
   * @param language Detected language, stored on the file record and used as
   *   the fallback language for unresolved references.
   * @param stats File stats; `size` and `mtimeMs` are stored on the file record.
   * @param result Extraction output to persist.
   */
  private storeExtractionResult(
    filePath: string,
    content: string,
    language: Language,
    stats: fs.Stats,
    result: ExtractionResult
  ): void {
    const contentHash = hashContent(content);

    // Check if file already exists and hasn't changed
    const existingFile = this.queries.getFileByPath(filePath);
    if (existingFile && existingFile.contentHash === contentHash) {
      return; // No changes
    }

    // Delete existing data for this file
    if (existingFile) {
      this.queries.deleteFile(filePath);
    }

    // Filter out nodes with missing required fields before insertion.
    // This prevents FK violations when edges reference nodes that would
    // be silently skipped by insertNode() (see issue #42).
    const validNodes = result.nodes.filter((n) => n.id && n.kind && n.name && n.filePath && n.language);

    // Built once and shared by the edge and unresolved-reference filters below.
    const insertedIds = new Set(validNodes.map((n) => n.id));

    // Insert nodes
    if (validNodes.length > 0) {
      this.queries.insertNodes(validNodes);
    }

    // Keep only edges whose both endpoints were actually inserted
    if (result.edges.length > 0) {
      const validEdges = result.edges.filter(
        (e) => insertedIds.has(e.source) && insertedIds.has(e.target)
      );
      if (validEdges.length > 0) {
        this.queries.insertEdges(validEdges);
      }
    }

    // Insert unresolved references in batch with denormalized filePath/language
    if (result.unresolvedReferences.length > 0) {
      const refsWithContext = result.unresolvedReferences
        .filter((ref) => insertedIds.has(ref.fromNodeId))
        .map((ref) => ({
          ...ref,
          filePath: ref.filePath ?? filePath,
          language: ref.language ?? language,
        }));
      if (refsWithContext.length > 0) {
        this.queries.insertUnresolvedRefsBatch(refsWithContext);
      }
    }

    // Insert file record
    const fileRecord: FileRecord = {
      path: filePath,
      contentHash,
      language,
      size: stats.size,
      modifiedAt: stats.mtimeMs,
      indexedAt: Date.now(),
      // Count what was inserted, not what was extracted, so the record
      // matches the rows that exist for this file.
      nodeCount: validNodes.length,
      errors: result.errors.length > 0 ? result.errors : undefined,
    };
    this.queries.upsertFile(fileRecord);
  }
  /**
   * Sync the index with the current file state.
   * Uses git status as a fast path when available, falling back to full scan.
   *
   * Removed files are deleted from the database. Every other candidate file
   * is read, hashed and compared with its stored record: files with no record
   * count as added, files whose hash differs count as modified, and files
   * whose hash matches are left alone. Added and modified files are then
   * re-indexed one by one through `indexFile`.
   *
   * Files that git reports as added go through the same comparison as
   * modified files, so a file that is already indexed and unchanged is not
   * re-parsed or re-reported on every sync.
   * NOTE(review): this assumes `getGitChangedFiles` keeps listing such files
   * until their git state changes — confirm against its implementation.
   *
   * @param onProgress Optional callback; invoked once with phase `'scanning'`
   *   and then once per re-indexed file with phase `'parsing'`.
   * @returns Counts of checked/added/modified/removed files, the number of
   *   nodes produced by re-indexing, the elapsed time, and the list of
   *   changed paths (`undefined` when nothing changed).
   */
  async sync(onProgress?: (progress: IndexProgress) => void): Promise<SyncResult> {
    await initGrammars(); // Initialize WASM runtime (grammars loaded lazily below)

    const startTime = Date.now();
    let filesChecked = 0;
    let filesAdded = 0;
    let filesModified = 0;
    let filesRemoved = 0;
    let nodesUpdated = 0;
    const changedFilePaths: string[] = [];
    const filesToIndex: string[] = [];

    onProgress?.({
      phase: 'scanning',
      current: 0,
      total: 0,
    });

    // Reads and hashes one file, compares it with its stored record and, when
    // it is new or changed, queues it for indexing and bumps the right counter.
    // The record lookup is injected so both the git path (per-file DB query)
    // and the full-scan path (pre-built map) can share this logic.
    const queueIfChanged = (
      filePath: string,
      lookup: (p: string) => FileRecord | null | undefined
    ): void => {
      let content: string;
      try {
        content = fs.readFileSync(path.join(this.rootDir, filePath), 'utf-8');
      } catch (error) {
        logDebug('Skipping unreadable file during sync', { filePath, error: String(error) });
        return;
      }

      const contentHash = hashContent(content);
      const tracked = lookup(filePath);
      if (!tracked) {
        filesAdded++;
      } else if (tracked.contentHash !== contentHash) {
        filesModified++;
      } else {
        return; // Already indexed with identical content
      }
      filesToIndex.push(filePath);
      changedFilePaths.push(filePath);
    };

    const gitChanges = getGitChangedFiles(this.rootDir, this.config);

    if (gitChanges) {
      // === Git fast path ===
      // Only inspect the files git reports as changed instead of scanning everything.
      filesChecked = gitChanges.modified.length + gitChanges.added.length + gitChanges.deleted.length;

      // Handle deleted files — only those the index knows about
      for (const filePath of gitChanges.deleted) {
        const tracked = this.queries.getFileByPath(filePath);
        if (tracked) {
          this.queries.deleteFile(filePath);
          filesRemoved++;
        }
      }

      const lookupInDb = (p: string): FileRecord | null | undefined => this.queries.getFileByPath(p);

      // Handle modified files — read + hash only these files
      for (const filePath of gitChanges.modified) {
        queueIfChanged(filePath, lookupInDb);
      }

      // Handle added (untracked) files — compared with the index as well,
      // because git may keep reporting them after they have been indexed.
      for (const filePath of gitChanges.added) {
        queueIfChanged(filePath, lookupInDb);
      }
    } else {
      // === Fallback: full scan (non-git project or git failure) ===
      const currentFiles = new Set(scanDirectory(this.rootDir, this.config));
      filesChecked = currentFiles.size;

      // Build Map for O(1) lookups instead of .find() per file
      const trackedFiles = this.queries.getAllFiles();
      const trackedMap = new Map<string, FileRecord>();
      for (const f of trackedFiles) {
        trackedMap.set(f.path, f);
      }

      // Find files to remove (in DB but not on disk)
      for (const tracked of trackedFiles) {
        if (!currentFiles.has(tracked.path)) {
          this.queries.deleteFile(tracked.path);
          filesRemoved++;
        }
      }

      // Find files to add or update
      const lookupInMap = (p: string): FileRecord | undefined => trackedMap.get(p);
      for (const filePath of currentFiles) {
        queueIfChanged(filePath, lookupInMap);
      }
    }

    // Load only grammars needed for changed files
    if (filesToIndex.length > 0) {
      const neededLanguages = [...new Set(filesToIndex.map((f) => detectLanguage(f)))];
      await loadGrammarsForLanguages(neededLanguages);
    }

    // Index changed files
    const total = filesToIndex.length;
    for (let i = 0; i < filesToIndex.length; i++) {
      const filePath = filesToIndex[i]!;
      onProgress?.({
        phase: 'parsing',
        current: i + 1,
        total,
        currentFile: filePath,
      });
      const result = await this.indexFile(filePath);
      nodesUpdated += result.nodes.length;
    }

    return {
      filesChecked,
      filesAdded,
      filesModified,
      filesRemoved,
      nodesUpdated,
      durationMs: Date.now() - startTime,
      changedFilePaths: changedFilePaths.length > 0 ? changedFilePaths : undefined,
    };
  }
  /**
   * Get files that have changed since last index.
   * Uses git status as a fast path when available, falling back to full scan.
   *
   * This only reports; nothing is written to the database. Each candidate
   * file is read, hashed and compared with its stored record: no record means
   * added, a different hash means modified, an identical hash means unchanged.
   * Unreadable files are skipped with a debug log.
   *
   * Files that git reports as added go through the same comparison as
   * modified files, so a file that is already indexed and unchanged is not
   * reported again.
   * NOTE(review): this assumes `getGitChangedFiles` keeps listing such files
   * until their git state changes — confirm against its implementation.
   *
   * @returns Paths grouped into `added`, `modified` and `removed`. On the git
   *   path `removed` only contains deleted files that the index knows about.
   */
  getChangedFiles(): { added: string[]; modified: string[]; removed: string[] } {
    const added: string[] = [];
    const modified: string[] = [];
    const removed: string[] = [];

    // Reads and hashes one file and files it under `added` or `modified`
    // depending on its stored record. The lookup is injected so the git path
    // (per-file DB query) and the full-scan path (pre-built map) share this.
    const classify = (
      filePath: string,
      lookup: (p: string) => FileRecord | null | undefined
    ): void => {
      let content: string;
      try {
        content = fs.readFileSync(path.join(this.rootDir, filePath), 'utf-8');
      } catch (error) {
        logDebug('Skipping unreadable file while detecting changes', { filePath, error: String(error) });
        return;
      }

      const contentHash = hashContent(content);
      const tracked = lookup(filePath);
      if (!tracked) {
        added.push(filePath);
      } else if (tracked.contentHash !== contentHash) {
        modified.push(filePath);
      }
    };

    const gitChanges = getGitChangedFiles(this.rootDir, this.config);

    if (gitChanges) {
      // === Git fast path ===
      const lookupInDb = (p: string): FileRecord | null | undefined => this.queries.getFileByPath(p);

      // Deleted files — only report if tracked in DB
      for (const filePath of gitChanges.deleted) {
        if (this.queries.getFileByPath(filePath)) {
          removed.push(filePath);
        }
      }

      // Modified files — read + hash only these, compare with DB
      for (const filePath of gitChanges.modified) {
        classify(filePath, lookupInDb);
      }

      // Added (untracked) files — compared with the DB as well, because git
      // may keep reporting them after they have been indexed.
      for (const filePath of gitChanges.added) {
        classify(filePath, lookupInDb);
      }

      return { added, modified, removed };
    }

    // === Fallback: full scan (non-git project or git failure) ===
    const currentFiles = new Set(scanDirectory(this.rootDir, this.config));
    const trackedFiles = this.queries.getAllFiles();

    // Build Map for O(1) lookups
    const trackedMap = new Map<string, FileRecord>();
    for (const f of trackedFiles) {
      trackedMap.set(f.path, f);
    }

    // Find removed files (in DB but no longer on disk)
    for (const tracked of trackedFiles) {
      if (!currentFiles.has(tracked.path)) {
        removed.push(tracked.path);
      }
    }

    // Find added and modified files
    const lookupInMap = (p: string): FileRecord | undefined => trackedMap.get(p);
    for (const filePath of currentFiles) {
      classify(filePath, lookupInMap);
    }

    return { added, modified, removed };
  }
  1172. }
  1173. // Re-export useful types and functions
  1174. export { extractFromSource } from './tree-sitter';
  1175. export { detectLanguage, isLanguageSupported, isGrammarLoaded, getSupportedLanguages, initGrammars, loadGrammarsForLanguages, loadAllGrammars } from './grammars';