| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761
277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331 |
- /**
- * Extraction Orchestrator
- *
- * Coordinates file scanning, parsing, and database storage.
- */
- import * as fs from 'fs';
- import * as fsp from 'fs/promises';
- import * as path from 'path';
- import * as crypto from 'crypto';
- import { execFileSync } from 'child_process';
- import {
- Language,
- FileRecord,
- ExtractionResult,
- ExtractionError,
- CodeGraphConfig,
- } from '../types';
- import { QueryBuilder } from '../db/queries';
- import { extractFromSource } from './tree-sitter';
- import { detectLanguage, isLanguageSupported, initGrammars, loadGrammarsForLanguages } from './grammars';
- import { logDebug, logWarn } from '../errors';
- import { validatePathWithinRoot, normalizePath } from '../utils';
- import picomatch from 'picomatch';
/**
 * How many files are read from disk concurrently in each indexing batch.
 * Reads are I/O-bound, so batching lets I/O wait overlap with CPU parse work.
 */
const FILE_IO_BATCH_SIZE = 10;

// PARSER_RESET_INTERVAL moved to parse-worker.ts (runs in worker thread)

/**
 * Base time budget (ms) for parsing one file in the worker thread.
 * Guards the indexing run against a hung tree-sitter parse or WASM running
 * out of memory; when the budget is exceeded the worker is killed and a new
 * one is spawned for the following file.
 */
const PARSE_TIMEOUT_MS = 10 * 1000;

/**
 * Number of parses a worker thread performs before it is recycled.
 * WASM linear memory can grow but NEVER shrink (WebAssembly spec limitation),
 * so the only way to give tree-sitter's heap back is to terminate the worker
 * (destroying its V8 isolate) and start a fresh one. The value trades memory
 * usage against the cost of reloading grammars.
 */
const WORKER_RECYCLE_INTERVAL = 250;
/**
 * Payload handed to the progress callback while an indexing run is underway.
 */
export interface IndexProgress {
  /** Stage of the run that is currently executing. */
  phase: 'scanning' | 'parsing' | 'storing' | 'resolving';
  /** Items handled so far within the current phase. */
  current: number;
  /** Items expected in the current phase (reported as 0 while scanning). */
  total: number;
  /** File being worked on, when the phase has one. */
  currentFile?: string;
}
/**
 * Summary returned when an indexing operation finishes (or is aborted).
 */
export interface IndexResult {
  /** True when at least one file was indexed or no 'error'-severity entry was recorded. */
  success: boolean;
  /** Files that produced at least one node. */
  filesIndexed: number;
  /** Files that produced no nodes and no 'error'-severity entry. */
  filesSkipped: number;
  /** Files that could not be read or parsed. */
  filesErrored: number;
  /** Total nodes extracted across indexed files. */
  nodesCreated: number;
  /** Total edges extracted across indexed files. */
  edgesCreated: number;
  /** Every error and warning collected during the run. */
  errors: ExtractionError[];
  /** Wall-clock duration of the operation in milliseconds. */
  durationMs: number;
}
/**
 * Summary returned by a sync operation.
 */
export interface SyncResult {
  /** Count of files checked. */
  filesChecked: number;
  /** Count of files added. */
  filesAdded: number;
  /** Count of files modified. */
  filesModified: number;
  /** Count of files removed. */
  filesRemoved: number;
  /** Count of nodes updated. */
  nodesUpdated: number;
  /** Duration of the sync in milliseconds. */
  durationMs: number;
  /** Paths of the changed files, when provided. */
  changedFilePaths?: string[];
}
/**
 * Compute the SHA-256 digest of a string.
 *
 * @param content Text to hash.
 * @returns The digest as a lowercase hexadecimal string.
 */
export function hashContent(content: string): string {
  const hasher = crypto.createHash('sha256');
  hasher.update(content);
  return hasher.digest('hex');
}
/**
 * Test a single path against a single glob pattern.
 *
 * The path is passed through normalizePath() before matching, and dotfiles
 * are allowed to match (`dot: true`).
 */
function matchesGlob(filePath: string, pattern: string): boolean {
  const normalized = normalizePath(filePath);
  const options = { dot: true };
  return picomatch.isMatch(normalized, pattern, options);
}
/**
 * Decide whether a file belongs in the index according to the config.
 *
 * Exclude patterns win: a path matching any `config.exclude` glob is rejected
 * even if it also matches an include glob. Otherwise the path must match at
 * least one `config.include` glob.
 */
export function shouldIncludeFile(
  filePath: string,
  config: CodeGraphConfig
): boolean {
  const hits = (pattern: string): boolean => matchesGlob(filePath, pattern);

  if (config.exclude.some(hits)) {
    return false;
  }
  return config.include.some(hits);
}
/**
 * Get all files visible to git (tracked + untracked but not ignored).
 * Respects .gitignore at all levels (root, subdirectories).
 *
 * Paths are requested NUL-delimited (`-z`) so git emits them verbatim:
 * without it, names containing non-ASCII bytes, quotes or backslashes are
 * C-quoted (`"\346\227\245.ts"`) and would never match a file on disk.
 *
 * @param rootDir Directory the git commands are run from; returned paths are
 *                relative to it.
 * @returns The set of normalized paths, or null on any failure (non-git
 *          project, git missing, timeout, rootDir ignored by a parent repo)
 *          so callers can fall back to a filesystem walk.
 */
function getGitVisibleFiles(rootDir: string): Set<string> | null {
  try {
    // Check if the project directory is gitignored by a parent repo.
    // When rootDir lives inside a parent git repo that ignores it,
    // `git ls-files` returns nothing — fall back to filesystem walk.
    const gitRoot = execFileSync(
      'git',
      ['rev-parse', '--show-toplevel'],
      { cwd: rootDir, encoding: 'utf-8', timeout: 5000, stdio: ['pipe', 'pipe', 'pipe'] }
    ).trim();

    if (path.resolve(gitRoot) !== path.resolve(rootDir)) {
      try {
        // git check-ignore exits 0 if the path IS ignored, 1 if not
        execFileSync(
          'git',
          ['check-ignore', '-q', path.resolve(rootDir)],
          { cwd: rootDir, encoding: 'utf-8', timeout: 5000, stdio: ['pipe', 'pipe', 'pipe'] }
        );
        // Directory is gitignored by parent repo — fall back to filesystem walk
        return null;
      } catch {
        // Not ignored — safe to use git ls-files
      }
    }

    // -z = NUL-terminated, unquoted paths
    // -c = cached (tracked), -o = others (untracked)
    // --exclude-standard = respect .gitignore
    const output = execFileSync(
      'git',
      ['ls-files', '-z', '-co', '--exclude-standard'],
      { cwd: rootDir, encoding: 'utf-8', timeout: 30000, maxBuffer: 50 * 1024 * 1024, stdio: ['pipe', 'pipe', 'pipe'] }
    );

    const files = new Set<string>();
    for (const entry of output.split('\0')) {
      // No trim(): leading/trailing whitespace is a legal part of a file name.
      if (entry.length > 0) {
        files.add(normalizePath(entry));
      }
    }
    return files;
  } catch {
    return null;
  }
}
/**
 * Files reported as changed by git, bucketed by the action the index has to
 * take for them.
 */
interface GitChanges {
  /** M, MM, AM — files to re-hash + re-index. */
  modified: string[];
  /** ?? — new untracked files to index. */
  added: string[];
  /** D — files to remove from DB. */
  deleted: string[];
}
/**
 * Use `git status` to detect changed files instead of scanning every file.
 *
 * Details that matter for correctness:
 * - `--porcelain` always prints paths relative to the REPOSITORY root, not
 *   the working directory, so the prefix of `rootDir` inside the repo
 *   (`git rev-parse --show-prefix`) is stripped to obtain rootDir-relative
 *   paths. The `-- .` pathspec limits the report to rootDir.
 * - `--untracked-files=all` lists every untracked file individually; the
 *   default collapses a new directory into a single `dir/` entry, which
 *   would hide the files inside it.
 * - `-z` emits NUL-terminated, unquoted paths so names with spaces or
 *   non-ASCII characters are not C-quoted.
 *
 * @param rootDir Project root the commands are run from.
 * @param config  Include/exclude configuration; paths rejected by
 *                shouldIncludeFile() are dropped.
 * @returns Changed paths relative to rootDir, or null on failure (non-git
 *          project or command failure) so callers fall back to a full scan.
 */
function getGitChangedFiles(rootDir: string, config: CodeGraphConfig): GitChanges | null {
  try {
    // e.g. "packages/app/" when rootDir is a subdirectory, "" at the repo root.
    const prefix = execFileSync(
      'git',
      ['rev-parse', '--show-prefix'],
      { cwd: rootDir, encoding: 'utf-8', timeout: 5000, stdio: ['pipe', 'pipe', 'pipe'] }
    ).trim();

    const output = execFileSync(
      'git',
      ['status', '--porcelain', '-z', '--no-renames', '--untracked-files=all', '--', '.'],
      { cwd: rootDir, encoding: 'utf-8', timeout: 10000, maxBuffer: 50 * 1024 * 1024, stdio: ['pipe', 'pipe', 'pipe'] }
    );

    const modified: string[] = [];
    const added: string[] = [];
    const deleted: string[] = [];

    for (const entry of output.split('\0')) {
      if (entry.length < 4) continue; // Minimum: "XY file"

      const statusCode = entry.substring(0, 2);
      const repoRelative = entry.substring(3);

      // Defensive: ignore anything git reports outside rootDir.
      if (prefix && !repoRelative.startsWith(prefix)) continue;
      const filePath = normalizePath(repoRelative.slice(prefix.length));
      if (!filePath) continue;

      // Skip files that don't match include/exclude config
      if (!shouldIncludeFile(filePath, config)) continue;

      if (statusCode === '??') {
        added.push(filePath);
      } else if (statusCode.includes('D')) {
        deleted.push(filePath);
      } else {
        // M, MM, AM, A (staged), etc. — treat as modified
        modified.push(filePath);
      }
    }

    return { modified, added, deleted };
  } catch {
    return null;
  }
}
/**
 * Name of the marker file whose presence tells the filesystem walk to skip
 * the directory containing it, together with everything below it.
 */
const CODEGRAPH_IGNORE_MARKER = '.codegraphignore';
/**
 * Collect the source files under `rootDir` that the config selects.
 *
 * When git can list the files (see getGitVisibleFiles) that list is used, so
 * .gitignore rules are honoured at every level, and it is then filtered with
 * shouldIncludeFile(). When git is unavailable the directory tree is walked
 * instead.
 *
 * @param rootDir    Project root.
 * @param config     Include/exclude configuration.
 * @param onProgress Called after each accepted file with the running count
 *                   and the file's path.
 * @returns Accepted paths, relative to `rootDir`.
 */
export function scanDirectory(
  rootDir: string,
  config: CodeGraphConfig,
  onProgress?: (current: number, file: string) => void
): string[] {
  const visible = getGitVisibleFiles(rootDir);

  // No git listing available: walk the filesystem for non-git projects.
  if (visible === null) {
    return scanDirectoryWalk(rootDir, config, onProgress);
  }

  // Git listing available: keep only what the config selects.
  const accepted: string[] = [];
  visible.forEach((candidate) => {
    if (!shouldIncludeFile(candidate, config)) return;
    accepted.push(candidate);
    onProgress?.(accepted.length, candidate);
  });
  return accepted;
}
/**
 * Promise-returning counterpart of scanDirectory.
 *
 * On the git-backed path it hands control back to the event loop after every
 * 100 accepted files so that progress messages can be delivered and rendered
 * while the scan is running. The filesystem-walk fallback is the same
 * synchronous walk scanDirectory uses.
 */
export async function scanDirectoryAsync(
  rootDir: string,
  config: CodeGraphConfig,
  onProgress?: (current: number, file: string) => void
): Promise<string[]> {
  const visible = getGitVisibleFiles(rootDir);
  if (visible === null) {
    return scanDirectoryWalk(rootDir, config, onProgress);
  }

  const yieldEvery = 100;
  const accepted: string[] = [];

  for (const candidate of visible) {
    if (!shouldIncludeFile(candidate, config)) {
      continue;
    }
    accepted.push(candidate);
    onProgress?.(accepted.length, candidate);

    // Periodic yield so progress can be rendered
    if (accepted.length % yieldEvery === 0) {
      await new Promise<void>((resume) => setImmediate(resume));
    }
  }

  return accepted;
}
/**
 * Recursive filesystem walk used when git cannot supply the file list.
 *
 * Directories are skipped when they cannot be resolved or read, when their
 * real path was already visited (symlink cycles), when they contain the
 * .codegraphignore marker, or when they match an exclude pattern. Symlinks
 * are followed: their targets are treated as directories or files as
 * appropriate, and broken links are skipped.
 *
 * @returns Accepted file paths, normalized and relative to `rootDir`.
 */
function scanDirectoryWalk(
  rootDir: string,
  config: CodeGraphConfig,
  onProgress?: (current: number, file: string) => void
): string[] {
  const collected: string[] = [];
  const seenRealDirs = new Set<string>();

  // A directory is excluded when either "path/" or "path" matches an exclude glob.
  const isExcludedDir = (relPath: string): boolean =>
    config.exclude.some(
      (pattern) => matchesGlob(relPath + '/', pattern) || matchesGlob(relPath, pattern)
    );

  // Record a file if the config selects it, and report the running count.
  const collectFile = (relPath: string): void => {
    if (!shouldIncludeFile(relPath, config)) return;
    collected.push(relPath);
    onProgress?.(collected.length, relPath);
  };

  const visit = (dir: string): void => {
    let realDir: string;
    try {
      realDir = fs.realpathSync(dir);
    } catch {
      logDebug('Skipping unresolvable directory', { dir });
      return;
    }

    if (seenRealDirs.has(realDir)) {
      logDebug('Skipping already-visited directory (symlink cycle)', { dir, realDir });
      return;
    }
    seenRealDirs.add(realDir);

    // A marker file opts the whole subtree out of indexing.
    if (fs.existsSync(path.join(dir, CODEGRAPH_IGNORE_MARKER))) {
      logDebug('Skipping directory due to .codegraphignore marker', { dir });
      return;
    }

    let entries: fs.Dirent[];
    try {
      entries = fs.readdirSync(dir, { withFileTypes: true });
    } catch (error) {
      logDebug('Skipping unreadable directory', { dir, error: String(error) });
      return;
    }

    for (const entry of entries) {
      const fullPath = path.join(dir, entry.name);
      const relativePath = normalizePath(path.relative(rootDir, fullPath));

      if (entry.isSymbolicLink()) {
        // Classify the link by what it points at.
        try {
          const target = fs.statSync(fs.realpathSync(fullPath));
          if (target.isDirectory()) {
            if (!isExcludedDir(relativePath)) {
              visit(fullPath);
            }
          } else if (target.isFile()) {
            collectFile(relativePath);
          }
        } catch {
          logDebug('Skipping broken symlink', { path: fullPath });
        }
      } else if (entry.isDirectory()) {
        if (!isExcludedDir(relativePath)) {
          visit(fullPath);
        }
      } else if (entry.isFile()) {
        collectFile(relativePath);
      }
    }
  };

  visit(rootDir);
  return collected;
}
- /**
- * Extraction orchestrator
- */
- export class ExtractionOrchestrator {
/**
 * @param rootDir Project root; relative file paths are validated against it.
 * @param config  Configuration (include/exclude globs, size limit).
 * @param queries Database access object used to store extraction results.
 */
constructor(
  private rootDir: string,
  private config: CodeGraphConfig,
  private queries: QueryBuilder
) {}
/**
 * Index all files in the project.
 *
 * Phases:
 *  1. Scan for candidate files (scanDirectoryAsync).
 *  2. Read files in batches of FILE_IO_BATCH_SIZE, parse each one (in a
 *     worker thread when the compiled `parse-worker.js` exists next to this
 *     module, otherwise in-process) and store the result on this thread.
 *  3. Retry files whose parse failed with a worker crash / WASM memory
 *     error: first on a fresh worker, then with comment-only lines blanked.
 *
 * Worker failures (crash, timeout) are recorded as per-file `parse_error`
 * entries instead of failing the run.
 *
 * NOTE(review): unlike indexFileWithContent(), this path applies neither the
 * `maxFileSize` limit nor the isLanguageSupported() check before parsing —
 * confirm whether the parse worker enforces them.
 *
 * @param onProgress Receives 'scanning' and 'parsing' progress events.
 * @param signal     When aborted, the run stops at the next checkpoint and an
 *                   unsuccessful result containing an 'Aborted' error is
 *                   returned.
 * @param verbose    When true, worker lifecycle messages go to console.log.
 * @returns Aggregated counts, collected errors and the elapsed time.
 * @throws If the initial parse worker fails during start-up (grammar load).
 */
async indexAll(
  onProgress?: (progress: IndexProgress) => void,
  signal?: AbortSignal,
  verbose?: boolean
): Promise<IndexResult> {
  type ParseWorker = import('worker_threads').Worker;
  type PendingParse = {
    resolve: (result: ExtractionResult) => void;
    reject: (err: Error) => void;
    timer: ReturnType<typeof setTimeout>;
  };

  await initGrammars();

  const startTime = Date.now();
  const errors: ExtractionError[] = [];
  let filesIndexed = 0;
  let filesSkipped = 0;
  let filesErrored = 0;
  let totalNodes = 0;
  let totalEdges = 0;

  const log = verbose
    ? (msg: string) => { console.log(`[worker] ${msg}`); }
    : (_msg: string) => {};

  // Result returned from every abort checkpoint: current counters plus an
  // 'Aborted' marker in front of whatever errors were collected so far.
  const abortedResult = (): IndexResult => ({
    success: false,
    filesIndexed,
    filesSkipped,
    filesErrored,
    nodesCreated: totalNodes,
    edgesCreated: totalEdges,
    errors: [{ message: 'Aborted', severity: 'error' }, ...errors],
    durationMs: Date.now() - startTime,
  });

  // A result is stored when it has nodes, or when it is cleanly empty.
  const isStorable = (r: ExtractionResult): boolean =>
    r.nodes.length > 0 || r.errors.length === 0;

  // Phase 1: Scan for files
  onProgress?.({
    phase: 'scanning',
    current: 0,
    total: 0,
  });

  const files = await scanDirectoryAsync(this.rootDir, this.config, (current, file) => {
    onProgress?.({
      phase: 'scanning',
      current,
      total: 0,
      currentFile: file,
    });
  });

  if (signal?.aborted) {
    return abortedResult();
  }

  // Phase 2: Parse files in a worker thread (keeps main thread unblocked for UI)
  const total = files.length;
  let processed = 0;

  // Emit parsing phase immediately so the progress bar appears during worker setup.
  // The yield lets the shimmer worker flush the phase transition to stdout before
  // the main thread starts synchronous grammar detection work.
  onProgress?.({
    phase: 'parsing',
    current: 0,
    total,
  });
  await new Promise(resolve => setImmediate(resolve));

  // Detect needed languages and load grammars in the parse worker
  const neededLanguages = [...new Set(files.map((f) => detectLanguage(f)))];

  // Try to use a worker thread for parsing (keeps main thread unblocked for UI).
  // Falls back to in-process parsing if the compiled worker is unavailable (e.g. tests).
  const parseWorkerPath = path.join(__dirname, 'parse-worker.js');
  const useWorker = fs.existsSync(parseWorkerPath);

  let WorkerClass: typeof import('worker_threads').Worker | null = null;
  if (useWorker) {
    const { Worker } = await import('worker_threads');
    WorkerClass = Worker;
  } else {
    // In-process fallback: load grammars locally
    await loadGrammarsForLanguages(neededLanguages);
  }

  // --- Worker lifecycle management ---
  // The worker can crash (OOM in WASM) or hang on pathological files.
  // We track pending parse promises and handle both cases:
  //   - Timeout: terminate + restart the worker, reject the timed-out request
  //   - Crash: reject all pending promises, restart for remaining files
  //
  // Invariant: `parseWorker` is cleared BEFORE a worker is deliberately
  // terminated. Events arriving later from such a stale worker must not
  // touch state that now belongs to its replacement.
  let parseWorker: ParseWorker | null = null;
  let nextId = 0;
  let workerParseCount = 0;
  const pendingParses = new Map<number, PendingParse>();

  function rejectAllPending(reason: string): void {
    for (const [id, pending] of pendingParses) {
      clearTimeout(pending.timer);
      pendingParses.delete(id);
      pending.reject(new Error(reason));
    }
  }

  // Detach `w` if it is still the active worker. Returns whether it was.
  function releaseIfCurrent(w: ParseWorker): boolean {
    if (parseWorker !== w) return false;
    parseWorker = null;
    // Reset count so the fresh worker gets a full cycle before recycling.
    workerParseCount = 0;
    return true;
  }

  // Detach and kill the active worker, if any.
  function terminateWorker(): void {
    const w = parseWorker;
    if (!w) return;
    releaseIfCurrent(w);
    // Fire-and-forget: worker.terminate() can hang if WASM is stuck
    w.terminate().catch(() => {});
  }

  function attachWorkerHandlers(w: ParseWorker): void {
    w.on('message', (msg: { type: string; id?: number; result?: ExtractionResult }) => {
      if (msg.type !== 'parse-result' || msg.id === undefined) return;
      const pending = pendingParses.get(msg.id);
      if (!pending) return;
      clearTimeout(pending.timer);
      pendingParses.delete(msg.id);
      if (msg.result === undefined) {
        pending.reject(new Error('Worker returned an empty parse result'));
      } else {
        pending.resolve(msg.result);
      }
    });

    w.on('error', (err) => {
      logWarn('Parse worker error', { error: err.message });
      // Only the active worker owns the pending parses. Releasing it here
      // makes the next parse spawn a fresh worker instead of posting to a
      // dead one while its 'exit' event is still in flight.
      if (releaseIfCurrent(w)) {
        rejectAllPending(`Worker error: ${err.message}`);
      }
    });

    w.on('exit', (code) => {
      // A recycled / timed-out / errored worker was already released; its
      // late exit must not reject parses sent to the replacement worker.
      if (!releaseIfCurrent(w)) return;
      if (pendingParses.size > 0) {
        logWarn('Parse worker exited unexpectedly', { code });
        rejectAllPending(`Worker exited with code ${code}`);
      }
    });
  }

  async function ensureWorker(): Promise<ParseWorker> {
    if (parseWorker) return parseWorker;
    if (!WorkerClass) throw new Error('Parse worker is unavailable');

    log('Spawning new parse worker...');
    const spawned = new WorkerClass(parseWorkerPath);
    parseWorker = spawned;
    attachWorkerHandlers(spawned);

    // Load grammars in the new worker. The handshake also settles when the
    // worker dies first, so a crash during start-up cannot hang the run.
    try {
      await new Promise<void>((resolve, reject) => {
        const detach = (): void => {
          spawned.off('message', onMessage);
          spawned.off('error', onError);
          spawned.off('exit', onExit);
        };
        const onMessage = (msg: { type: string }): void => {
          detach();
          if (msg.type === 'grammars-loaded') resolve();
          else reject(new Error(`Unexpected message: ${msg.type}`));
        };
        const onError = (err: Error): void => {
          detach();
          reject(new Error(`Worker error: ${err.message}`));
        };
        const onExit = (code: number): void => {
          detach();
          reject(new Error(`Worker exited with code ${code}`));
        };
        spawned.on('message', onMessage);
        spawned.on('error', onError);
        spawned.on('exit', onExit);
        spawned.postMessage({ type: 'load-grammars', languages: neededLanguages });
      });
    } catch (err) {
      // Discard the half-initialised worker so the next call starts clean.
      releaseIfCurrent(spawned);
      spawned.terminate().catch(() => {});
      throw err;
    }

    return spawned;
  }

  if (WorkerClass) {
    await ensureWorker();
  }

  /**
   * Recycle the worker thread to reclaim WASM memory.
   * Terminates the current worker and clears the reference so
   * ensureWorker() will spawn a fresh one on the next call.
   */
  function recycleWorker(): void {
    if (!parseWorker) return;
    log(`Recycling worker after ${workerParseCount} parses (heap: ${Math.round(process.memoryUsage().rss / 1024 / 1024)}MB RSS)`);
    terminateWorker();
  }

  async function requestParse(filePath: string, content: string): Promise<ExtractionResult> {
    if (!WorkerClass) {
      // In-process fallback
      return extractFromSource(filePath, content, detectLanguage(filePath));
    }

    // Recycle the worker before the next parse if we've hit the threshold.
    // This destroys the WASM linear memory (which can grow but never shrink)
    // and starts a fresh worker with a clean heap.
    if (workerParseCount >= WORKER_RECYCLE_INTERVAL) {
      recycleWorker();
    }

    const worker = await ensureWorker();
    const id = nextId++;
    workerParseCount++;

    // Scale timeout for large files: base 10s + 10s per 100KB
    const timeoutMs = PARSE_TIMEOUT_MS + Math.floor(content.length / 100_000) * 10_000;

    return new Promise<ExtractionResult>((resolve, reject) => {
      const timer = setTimeout(() => {
        pendingParses.delete(id);
        log(`TIMEOUT: ${filePath} exceeded ${timeoutMs}ms — killing worker`);
        // Release and reject FIRST — worker.terminate() can hang if WASM is stuck
        releaseIfCurrent(worker);
        reject(new Error(`Parse timed out after ${timeoutMs}ms`));
        // Fire-and-forget: kill the stuck worker in the background
        worker.terminate().catch(() => {});
      }, timeoutMs);

      pendingParses.set(id, { resolve, reject, timer });
      worker.postMessage({ type: 'parse', id, filePath, content });
    });
  }

  // Count a file as failed and remember why.
  const recordFailure = (
    filePath: string,
    message: string,
    code: 'read_error' | 'parse_error'
  ): void => {
    processed++;
    filesErrored++;
    errors.push({ message, filePath, severity: 'error', code });
  };

  for (let i = 0; i < files.length; i += FILE_IO_BATCH_SIZE) {
    if (signal?.aborted) {
      terminateWorker();
      return abortedResult();
    }

    const batch = files.slice(i, i + FILE_IO_BATCH_SIZE);

    // Read files in parallel (with path validation before any I/O)
    const fileContents = await Promise.all(
      batch.map(async (fp) => {
        try {
          const fullPath = validatePathWithinRoot(this.rootDir, fp);
          if (!fullPath) {
            logWarn('Path traversal blocked in batch reader', { filePath: fp });
            return { filePath: fp, content: null as string | null, stats: null as fs.Stats | null, error: new Error('Path traversal blocked') };
          }
          const content = await fsp.readFile(fullPath, 'utf-8');
          const stats = await fsp.stat(fullPath);
          return { filePath: fp, content, stats, error: null as Error | null };
        } catch (err) {
          return { filePath: fp, content: null as string | null, stats: null as fs.Stats | null, error: err as Error };
        }
      })
    );

    // Send to worker for parsing, store results on main thread
    for (const { filePath, content, stats, error } of fileContents) {
      if (signal?.aborted) {
        terminateWorker();
        return abortedResult();
      }

      // Report progress before parsing (show current file being worked on)
      onProgress?.({
        phase: 'parsing',
        current: processed,
        total,
        currentFile: filePath,
      });

      if (error || content === null || stats === null) {
        recordFailure(
          filePath,
          `Failed to read file: ${error instanceof Error ? error.message : String(error)}`,
          'read_error'
        );
        continue;
      }

      // Parse in worker thread (main thread stays unblocked).
      // Wrapped in try/catch to handle worker timeouts and crashes gracefully.
      let result: ExtractionResult;
      try {
        result = await requestParse(filePath, content);
      } catch (parseErr) {
        recordFailure(
          filePath,
          parseErr instanceof Error ? parseErr.message : String(parseErr),
          'parse_error'
        );
        continue;
      }

      processed++;

      // Store in database on main thread (SQLite is not thread-safe)
      if (isStorable(result)) {
        this.storeExtractionResult(filePath, content, detectLanguage(filePath), stats, result);
      }

      if (result.errors.length > 0) {
        for (const err of result.errors) {
          if (!err.filePath) err.filePath = filePath;
        }
        errors.push(...result.errors);
      }

      if (result.nodes.length > 0) {
        filesIndexed++;
        totalNodes += result.nodes.length;
        totalEdges += result.edges.length;
      } else if (result.errors.some((e) => e.severity === 'error')) {
        filesErrored++;
      } else {
        filesSkipped++;
      }
    }
  }

  // Report 100% so the progress bar doesn't hang at 99%
  onProgress?.({
    phase: 'parsing',
    current: total,
    total,
  });
  // Yield so the shimmer worker's buffered stdout writes can flush.
  // Worker thread stdout is proxied through the main thread's event loop,
  // so synchronous work here blocks the animation from rendering.
  await new Promise(resolve => setImmediate(resolve));

  /**
   * Re-parse one previously failed file on a brand-new worker.
   *
   * `prepare` maps the file's text to what is actually parsed; the ORIGINAL
   * text is what gets stored (and hashed). On success the old error entry is
   * removed and the file moves from "errored" to "indexed".
   *
   * @returns 'ok' when stored, 'crashed' when the parse request itself
   *          failed again, 'skipped' when the file could not be read or
   *          the result was not storable.
   */
  const retryParse = async (
    errEntry: ExtractionError,
    okLabel: string,
    prepare: (raw: string) => string
  ): Promise<'ok' | 'crashed' | 'skipped'> => {
    const filePath = errEntry.filePath;
    if (!filePath) return 'skipped';

    // Fresh worker for every retry — maximum WASM headroom
    recycleWorker();

    let content: string;
    let stats: fs.Stats;
    try {
      const fullPath = validatePathWithinRoot(this.rootDir, filePath);
      if (!fullPath) return 'skipped';
      content = await fsp.readFile(fullPath, 'utf-8');
      // Stat the validated path inside the try: a file that vanished since
      // the first pass must not reject the whole indexing run.
      stats = await fsp.stat(fullPath);
    } catch {
      return 'skipped';
    }

    let result: ExtractionResult;
    try {
      result = await requestParse(filePath, prepare(content));
    } catch {
      return 'crashed';
    }
    if (!isStorable(result)) return 'skipped';

    this.storeExtractionResult(filePath, content, detectLanguage(filePath), stats, result);
    const idx = errors.indexOf(errEntry);
    if (idx >= 0) errors.splice(idx, 1);
    filesErrored--;
    filesIndexed++;
    totalNodes += result.nodes.length;
    totalEdges += result.edges.length;
    log(`${okLabel}: ${filePath} (${result.nodes.length} nodes)`);
    return 'ok';
  };

  // Retry pass: files that failed due to WASM memory corruption may succeed
  // on a fresh worker with a clean heap. Recycle before each attempt so
  // every file gets the absolute cleanest WASM state possible.
  const retryableErrors = errors.filter(
    (e) => e.code === 'parse_error' && e.filePath &&
      (e.message.includes('Worker exited') || e.message.includes('memory access out of bounds'))
  );

  if (retryableErrors.length > 0 && WorkerClass) {
    log(`Retrying ${retryableErrors.length} files that failed due to WASM memory errors...`);

    const stillFailing: ExtractionError[] = [];
    for (const errEntry of retryableErrors) {
      if (signal?.aborted) break;
      const outcome = await retryParse(errEntry, 'Retry OK', (raw) => raw);
      if (outcome === 'crashed') stillFailing.push(errEntry);
    }

    // Last resort: for files that still crash on a clean worker, strip
    // comment-only lines to reduce WASM memory pressure. Many compiler
    // test files are 90%+ comments (CHECK directives) that don't contribute
    // code nodes but consume parser memory.
    if (stillFailing.length > 0) {
      log(`${stillFailing.length} files still failing — retrying with comments stripped...`);

      // Lines that are entirely `//` comments are replaced with empty lines
      // (not removed) so node positions keep their original line numbers.
      const commentOnlyLine = /^\s*\/\//;
      const stripCommentOnlyLines = (raw: string): string =>
        raw
          .split('\n')
          .map((line) => (commentOnlyLine.test(line) ? '' : line))
          .join('\n');

      for (const errEntry of stillFailing) {
        if (signal?.aborted) break;
        await retryParse(errEntry, 'Retry (stripped) OK', stripCommentOnlyLines);
      }
    }
  }

  // Shut down parse worker and clear any pending timers
  rejectAllPending('Indexing complete');
  terminateWorker();

  return {
    success: filesIndexed > 0 || errors.filter((e) => e.severity === 'error').length === 0,
    filesIndexed,
    filesSkipped,
    filesErrored,
    nodesCreated: totalNodes,
    edgesCreated: totalEdges,
    errors,
    durationMs: Date.now() - startTime,
  };
}
/**
 * Index an explicit list of files, one after another, via indexFile().
 *
 * A file counts as indexed when it yields at least one node, as errored when
 * it yields none and reports an 'error'-severity entry, and as skipped
 * otherwise.
 *
 * @param filePaths Paths relative to the project root.
 * @returns Aggregated counts, all collected errors and the elapsed time.
 */
async indexFiles(filePaths: string[]): Promise<IndexResult> {
  const startedAt = Date.now();
  const collected: ExtractionError[] = [];
  const tally = { indexed: 0, skipped: 0, errored: 0, nodes: 0, edges: 0 };

  for (const target of filePaths) {
    const outcome = await this.indexFile(target);
    collected.push(...outcome.errors);

    if (outcome.nodes.length > 0) {
      tally.indexed += 1;
      tally.nodes += outcome.nodes.length;
      tally.edges += outcome.edges.length;
      continue;
    }

    const hardFailure = outcome.errors.some((e) => e.severity === 'error');
    if (hardFailure) {
      tally.errored += 1;
    } else {
      tally.skipped += 1;
    }
  }

  const anyHardError = collected.some((e) => e.severity === 'error');

  return {
    success: tally.indexed > 0 || !anyHardError,
    filesIndexed: tally.indexed,
    filesSkipped: tally.skipped,
    filesErrored: tally.errored,
    nodesCreated: tally.nodes,
    edgesCreated: tally.edges,
    errors: collected,
    durationMs: Date.now() - startedAt,
  };
}
/**
 * Index one file identified by its path relative to the project root.
 *
 * The path is validated against the root first; the file's stats and UTF-8
 * content are then read and passed to indexFileWithContent(). Validation and
 * read failures are reported through the returned result's `errors` rather
 * than thrown.
 */
async indexFile(relativePath: string): Promise<ExtractionResult> {
  // Result carrying nothing but the given errors.
  const failure = (errors: ExtractionError[]): ExtractionResult => ({
    nodes: [],
    edges: [],
    unresolvedReferences: [],
    errors,
    durationMs: 0,
  });

  const fullPath = validatePathWithinRoot(this.rootDir, relativePath);
  if (!fullPath) {
    return failure([
      { message: `Path traversal blocked: ${relativePath}`, filePath: relativePath, severity: 'error', code: 'path_traversal' },
    ]);
  }

  let stats: fs.Stats;
  let content: string;
  try {
    stats = await fsp.stat(fullPath);
    content = await fsp.readFile(fullPath, 'utf-8');
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    return failure([
      {
        message: `Failed to read file: ${reason}`,
        filePath: relativePath,
        severity: 'error',
        code: 'read_error',
      },
    ]);
  }

  return this.indexFileWithContent(relativePath, content, stats);
}
/**
 * Index one file whose content and stats the caller has already read, so no
 * further file I/O is performed here.
 *
 * Checks, in order: the path stays inside the project root, the file size
 * does not exceed `config.maxFileSize` (reported as a warning), and the
 * detected language is supported (otherwise an empty result without errors
 * is returned). The source is then extracted in-process and stored when it
 * produced nodes or no errors.
 */
async indexFileWithContent(
  relativePath: string,
  content: string,
  stats: fs.Stats
): Promise<ExtractionResult> {
  // Result with no nodes/edges, carrying only the given errors.
  const emptyResult = (errors: ExtractionError[]): ExtractionResult => ({
    nodes: [],
    edges: [],
    unresolvedReferences: [],
    errors,
    durationMs: 0,
  });

  // Guard 1: path traversal
  if (!validatePathWithinRoot(this.rootDir, relativePath)) {
    logWarn('Path traversal blocked in indexFileWithContent', { relativePath });
    return emptyResult([
      { message: 'Path traversal blocked', filePath: relativePath, severity: 'error', code: 'path_traversal' },
    ]);
  }

  // Guard 2: size limit
  const limit = this.config.maxFileSize;
  if (stats.size > limit) {
    return emptyResult([
      {
        message: `File exceeds max size (${stats.size} > ${limit})`,
        filePath: relativePath,
        severity: 'warning',
        code: 'size_exceeded',
      },
    ]);
  }

  // Guard 3: language support
  const language = detectLanguage(relativePath);
  if (!isLanguageSupported(language)) {
    return emptyResult([]);
  }

  const extracted = extractFromSource(relativePath, content, language);

  const worthStoring = extracted.nodes.length > 0 || extracted.errors.length === 0;
  if (worthStoring) {
    this.storeExtractionResult(relativePath, content, language, stats, extracted);
  }

  return extracted;
}
/**
 * Persist an extraction result for one file.
 *
 * Does nothing when the file is already recorded with the same content hash.
 * Otherwise any existing data for the file is deleted, then the valid nodes,
 * the edges and unresolved references that point at those nodes, and finally
 * the file record are written.
 *
 * @param filePath - Path used as the file's key in the database.
 * @param content - File content; hashed to detect changes.
 * @param language - Language recorded on the file and used as the fallback
 *                   for unresolved references that carry none.
 * @param stats - File stats; `size` and `mtimeMs` go into the file record.
 * @param result - Extraction output to store.
 */
private storeExtractionResult(
  filePath: string,
  content: string,
  language: Language,
  stats: fs.Stats,
  result: ExtractionResult
): void {
  const contentHash = hashContent(content);

  // Check if file already exists and hasn't changed
  const existingFile = this.queries.getFileByPath(filePath);
  if (existingFile) {
    if (existingFile.contentHash === contentHash) {
      return; // No changes
    }
    // Delete existing data for this file
    this.queries.deleteFile(filePath);
  }

  // Filter out nodes with missing required fields before insertion.
  // This prevents FK violations when edges reference nodes that would
  // be silently skipped by insertNode() (see issue #42).
  const validNodes = result.nodes.filter((n) => n.id && n.kind && n.name && n.filePath && n.language);

  // Built once and shared by the edge and unresolved-reference filters.
  const insertedIds = new Set(validNodes.map((n) => n.id));

  // Insert nodes
  if (validNodes.length > 0) {
    this.queries.insertNodes(validNodes);
  }

  // Keep only edges whose endpoints were both inserted
  const validEdges = result.edges.filter(
    (e) => insertedIds.has(e.source) && insertedIds.has(e.target)
  );
  if (validEdges.length > 0) {
    this.queries.insertEdges(validEdges);
  }

  // Insert unresolved references in batch with denormalized filePath/language
  const refsWithContext = result.unresolvedReferences
    .filter((ref) => insertedIds.has(ref.fromNodeId))
    .map((ref) => ({
      ...ref,
      filePath: ref.filePath ?? filePath,
      language: ref.language ?? language,
    }));
  if (refsWithContext.length > 0) {
    this.queries.insertUnresolvedRefsBatch(refsWithContext);
  }

  // Insert file record
  const fileRecord: FileRecord = {
    path: filePath,
    contentHash,
    language,
    size: stats.size,
    modifiedAt: stats.mtimeMs,
    indexedAt: Date.now(),
    // Count the nodes actually inserted, not the raw extraction output,
    // so the record agrees with what is stored for this file.
    nodeCount: validNodes.length,
    errors: result.errors.length > 0 ? result.errors : undefined,
  };
  this.queries.upsertFile(fileRecord);
}
/**
 * Sync the index with the current file state.
 * Uses git status as a fast path when available, falling back to full scan.
 *
 * Files removed from disk are deleted from the index. Files that are new to
 * the index, or whose content hash differs from the indexed one, are
 * re-indexed through `indexFile`.
 *
 * Files git reports as added are compared against the index as well: a file
 * that is already indexed with an identical hash is skipped, so a file that
 * stays untracked in git is not re-parsed and re-counted on every sync.
 *
 * @param onProgress - Optional callback invoked once for the scanning phase
 *                     and once per file before it is parsed.
 * @returns Counters for checked/added/modified/removed files, the number of
 *          nodes extracted from re-indexed files, the elapsed time, and the
 *          list of changed paths (`undefined` when nothing changed).
 */
async sync(onProgress?: (progress: IndexProgress) => void): Promise<SyncResult> {
  await initGrammars(); // Initialize WASM runtime (grammars loaded lazily below)
  const startTime = Date.now();
  let filesChecked = 0;
  let filesAdded = 0;
  let filesModified = 0;
  let filesRemoved = 0;
  let nodesUpdated = 0;
  // Every queued file is also a changed file, so one list serves both roles.
  const changedFilePaths: string[] = [];

  onProgress?.({
    phase: 'scanning',
    current: 0,
    total: 0,
  });

  // Queue a file that the index has never seen.
  const queueAsAdded = (filePath: string): void => {
    changedFilePaths.push(filePath);
    filesAdded++;
  };

  // Read + hash a file and queue it when it is new to the index or differs
  // from the indexed content. Unreadable files are skipped.
  const queueIfChanged = (
    filePath: string,
    tracked: Pick<FileRecord, 'contentHash'> | null | undefined
  ): void => {
    let content: string;
    try {
      content = fs.readFileSync(path.join(this.rootDir, filePath), 'utf-8');
    } catch (error) {
      logDebug('Skipping unreadable file during sync', { filePath, error: String(error) });
      return;
    }
    if (!tracked) {
      queueAsAdded(filePath);
    } else if (tracked.contentHash !== hashContent(content)) {
      changedFilePaths.push(filePath);
      filesModified++;
    }
  };

  const gitChanges = getGitChangedFiles(this.rootDir, this.config);
  if (gitChanges) {
    // === Git fast path ===
    // Only inspect the files git reports as changed instead of scanning everything.
    filesChecked = gitChanges.modified.length + gitChanges.added.length + gitChanges.deleted.length;

    // Handle deleted files
    for (const filePath of gitChanges.deleted) {
      if (this.queries.getFileByPath(filePath)) {
        this.queries.deleteFile(filePath);
        filesRemoved++;
      }
    }

    // Handle modified files — read + hash only these files
    for (const filePath of gitChanges.modified) {
      queueIfChanged(filePath, this.queries.getFileByPath(filePath));
    }

    // Handle added (untracked) files. A file unknown to the index is queued
    // directly; one that is already indexed is only queued if its content
    // actually differs.
    for (const filePath of gitChanges.added) {
      const tracked = this.queries.getFileByPath(filePath);
      if (!tracked) {
        queueAsAdded(filePath);
      } else {
        queueIfChanged(filePath, tracked);
      }
    }
  } else {
    // === Fallback: full scan (non-git project or git failure) ===
    const currentFiles = new Set(scanDirectory(this.rootDir, this.config));
    filesChecked = currentFiles.size;

    // Build Map for O(1) lookups instead of .find() per file
    const trackedFiles = this.queries.getAllFiles();
    const trackedMap = new Map<string, FileRecord>();
    for (const f of trackedFiles) {
      trackedMap.set(f.path, f);
    }

    // Find files to remove (in DB but not on disk)
    for (const tracked of trackedFiles) {
      if (!currentFiles.has(tracked.path)) {
        this.queries.deleteFile(tracked.path);
        filesRemoved++;
      }
    }

    // Find files to add or update
    for (const filePath of currentFiles) {
      queueIfChanged(filePath, trackedMap.get(filePath));
    }
  }

  // Load only grammars needed for changed files
  if (changedFilePaths.length > 0) {
    const neededLanguages = [...new Set(changedFilePaths.map((f) => detectLanguage(f)))];
    await loadGrammarsForLanguages(neededLanguages);
  }

  // Index changed files
  const total = changedFilePaths.length;
  for (const [index, filePath] of changedFilePaths.entries()) {
    onProgress?.({
      phase: 'parsing',
      current: index + 1,
      total,
      currentFile: filePath,
    });
    const result = await this.indexFile(filePath);
    nodesUpdated += result.nodes.length;
  }

  return {
    filesChecked,
    filesAdded,
    filesModified,
    filesRemoved,
    nodesUpdated,
    durationMs: Date.now() - startTime,
    changedFilePaths: changedFilePaths.length > 0 ? changedFilePaths : undefined,
  };
}
/**
 * Get files that have changed since last index.
 * Uses git status as a fast path when available, falling back to full scan.
 *
 * This only reports changes; the index itself is not modified.
 *
 * Files git reports as added are compared against the index as well: one
 * that is already indexed with an identical content hash is not reported,
 * and one that is indexed with a different hash is reported as modified.
 *
 * @returns Paths grouped into `added` (not in the index), `modified`
 *          (content hash differs from the indexed one) and `removed`
 *          (in the index but gone from disk / reported deleted by git).
 */
getChangedFiles(): { added: string[]; modified: string[]; removed: string[] } {
  const added: string[] = [];
  const modified: string[] = [];
  const removed: string[] = [];

  // Read + hash a file and record it as added (unknown to the index) or
  // modified (hash differs). Unreadable files are skipped.
  const classify = (
    filePath: string,
    tracked: Pick<FileRecord, 'contentHash'> | null | undefined
  ): void => {
    let content: string;
    try {
      content = fs.readFileSync(path.join(this.rootDir, filePath), 'utf-8');
    } catch (error) {
      logDebug('Skipping unreadable file while detecting changes', { filePath, error: String(error) });
      return;
    }
    if (!tracked) {
      added.push(filePath);
    } else if (tracked.contentHash !== hashContent(content)) {
      modified.push(filePath);
    }
  };

  const gitChanges = getGitChangedFiles(this.rootDir, this.config);
  if (gitChanges) {
    // === Git fast path ===
    // Deleted files — only report if tracked in DB
    for (const filePath of gitChanges.deleted) {
      if (this.queries.getFileByPath(filePath)) {
        removed.push(filePath);
      }
    }

    // Modified files — read + hash only these, compare with DB
    for (const filePath of gitChanges.modified) {
      classify(filePath, this.queries.getFileByPath(filePath));
    }

    // Added (untracked) files. Unknown to the index: report directly.
    // Already indexed: report only if the content actually differs.
    for (const filePath of gitChanges.added) {
      const tracked = this.queries.getFileByPath(filePath);
      if (!tracked) {
        added.push(filePath);
      } else {
        classify(filePath, tracked);
      }
    }

    return { added, modified, removed };
  }

  // === Fallback: full scan (non-git project or git failure) ===
  const currentFiles = new Set(scanDirectory(this.rootDir, this.config));
  const trackedFiles = this.queries.getAllFiles();

  // Build Map for O(1) lookups
  const trackedMap = new Map<string, FileRecord>();
  for (const f of trackedFiles) {
    trackedMap.set(f.path, f);
  }

  // Find removed files
  for (const tracked of trackedFiles) {
    if (!currentFiles.has(tracked.path)) {
      removed.push(tracked.path);
    }
  }

  // Find added and modified files
  for (const filePath of currentFiles) {
    classify(filePath, trackedMap.get(filePath));
  }

  return { added, modified, removed };
}
- }
- // Re-export useful types and functions
- export { extractFromSource } from './tree-sitter';
- export { detectLanguage, isLanguageSupported, isGrammarLoaded, getSupportedLanguages, initGrammars, loadGrammarsForLanguages, loadAllGrammars } from './grammars';
|