| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761
27712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537 |
- /**
- * Extraction Orchestrator
- *
- * Coordinates file scanning, parsing, and database storage.
- */
- import * as fs from 'fs';
- import * as fsp from 'fs/promises';
- import * as path from 'path';
- import * as crypto from 'crypto';
- import { execFileSync } from 'child_process';
- import {
- Language,
- FileRecord,
- ExtractionResult,
- ExtractionError,
- } from '../types';
- import { QueryBuilder } from '../db/queries';
- import { extractFromSource } from './tree-sitter';
- import { detectLanguage, isSourceFile, isLanguageSupported, initGrammars, loadGrammarsForLanguages } from './grammars';
- import { logDebug, logWarn } from '../errors';
- import { validatePathWithinRoot, normalizePath } from '../utils';
- import ignore, { Ignore } from 'ignore';
- import { detectFrameworks } from '../resolution/frameworks';
- import type { ResolutionContext } from '../resolution/types';
/**
 * How many files are read concurrently in one indexing batch.
 * Reads are I/O-bound, so batching lets I/O wait overlap with CPU parse work.
 */
const FILE_IO_BATCH_SIZE = 10;

// PARSER_RESET_INTERVAL moved to parse-worker.ts (runs in worker thread)

/**
 * Base budget (ms) for parsing one file in the worker thread.
 * A hung tree-sitter parse or an out-of-memory WASM heap would otherwise
 * freeze the whole indexing run; on timeout the worker is restarted.
 */
const PARSE_TIMEOUT_MS = 10 * 1000;

/**
 * Parse count after which the worker thread is recycled.
 * WASM linear memory can grow but never shrink (WebAssembly spec limitation),
 * so the only way to give tree-sitter's heap back is to terminate the worker
 * (destroying its V8 isolate) and spawn a new one. The value trades memory
 * growth against the cost of reloading grammars.
 */
const WORKER_RECYCLE_INTERVAL = 250;
/**
 * Payload handed to the progress callback while indexing.
 */
export interface IndexProgress {
  /** Pipeline stage this update belongs to. */
  phase:
    | 'scanning'
    | 'parsing'
    | 'storing'
    | 'resolving';
  /** Items handled so far in the current phase. */
  current: number;
  /** Expected item count for the phase (reported as 0 while scanning). */
  total: number;
  /** Path of the file being handled, when the update concerns a single file. */
  currentFile?: string;
}
/**
 * Summary returned by an indexing run.
 */
export interface IndexResult {
  /** False when the run did not complete (e.g. it was aborted). */
  success: boolean;

  // Per-file outcome counters.
  filesIndexed: number;
  filesSkipped: number;
  filesErrored: number;

  // Graph totals accumulated during the run.
  nodesCreated: number;
  edgesCreated: number;

  /** Errors collected during the run. */
  errors: ExtractionError[];
  /** Wall-clock duration of the run in milliseconds. */
  durationMs: number;
}
/**
 * Summary returned by a sync operation.
 */
export interface SyncResult {
  /** Number of files examined. */
  filesChecked: number;

  // Change counters, by kind of change.
  filesAdded: number;
  filesModified: number;
  filesRemoved: number;

  /** Number of graph nodes updated as a result of the sync. */
  nodesUpdated: number;
  /** Duration of the sync in milliseconds. */
  durationMs: number;
  /** Paths of the files that changed, when reported. */
  changedFilePaths?: string[];
}
/**
 * Compute the SHA-256 digest of a file's text content.
 *
 * @param content - Text to hash.
 * @returns The digest as a lowercase hexadecimal string.
 */
export function hashContent(content: string): string {
  const hasher = crypto.createHash('sha256');
  hasher.update(content);
  return hasher.digest('hex');
}
/**
 * Size ceiling in bytes (1 MB); larger files are skipped. Generated bundles,
 * minified JS, and vendored blobs blow the WASM heap and the worker-recycle
 * budget without yielding useful symbols, while 1 MB covers essentially all
 * hand-written source.
 */
const MAX_FILE_SIZE = 1_048_576;
/**
 * Directory names excluded by default because they hold dependencies, build
 * output, caches, or tooling artifacts for the languages/frameworks CodeGraph
 * supports (curated from the canonical github/gitignore templates). Excluding
 * them keeps the graph about your code rather than third-party noise, even
 * when no `.gitignore` exists (issue #407).
 *
 * The exclusion is uniform — git or not, tracked or not. The single opt-in is
 * an explicit `.gitignore` negation such as `!vendor/`.
 *
 * Deliberately absent: generic or first-party-prone names (`packages`, `lib`,
 * `app`, `bin`, `src`, `deps`, `env`, `tmp`, `storage`, `Library`), so real
 * source is never hidden. Also absent: IDE/state dirs like `.idea`/`.vs` —
 * only recognized source extensions are indexed, so those yield no symbols
 * anyway. A name earns a slot only if the dir contains indexable source or is
 * enormous.
 */
const DEFAULT_IGNORE_DIRS: ReadonlySet<string> = new Set<string>([
  // --- JavaScript / TypeScript: installed dependencies ---
  'node_modules',
  'bower_components',
  'jspm_packages',
  'web_modules',
  '.yarn',
  '.pnpm-store',

  // --- JavaScript / TypeScript: framework & bundler build / cache / deploy output ---
  '.next',
  '.nuxt',
  '.svelte-kit',
  '.turbo',
  '.vite',
  '.parcel-cache',
  '.angular',
  '.docusaurus',
  'storybook-static',
  '.vinxi',
  '.nitro',
  'out-tsc',
  '.vercel',
  '.netlify',
  '.wrangler',

  // --- Build output shared by many ecosystems ---
  'dist',
  'build',
  'out',
  '.output',

  // --- Test / coverage artifacts ---
  'coverage',
  '.nyc_output',

  // --- Python ---
  '__pycache__',
  '__pypackages__',
  '.venv',
  'venv',
  '.pixi',
  '.pdm-build',
  '.mypy_cache',
  '.pytest_cache',
  '.ruff_cache',
  '.tox',
  '.nox',
  '.hypothesis',
  '.ipynb_checkpoints',
  '.eggs',

  // --- Rust / JVM (Maven, Gradle, Scala) ---
  'target',
  '.gradle',

  // --- .NET ---
  'obj',

  // --- Vendored dependencies (Go, PHP/Composer, Ruby/Bundler) ---
  'vendor',

  // --- Swift / iOS ---
  '.build',
  'Pods',
  'Carthage',
  'DerivedData',
  '.swiftpm',

  // --- Dart / Flutter ---
  '.dart_tool',
  '.pub-cache',

  // --- Native (Android NDK, C/C++ deps) ---
  '.cxx',
  '.externalNativeBuild',
  'vcpkg_installed',

  // --- Scala tooling ---
  '.bloop',
  '.metals',

  // --- Lua / Luau (LuaRocks) ---
  'lua_modules',
  '.luarocks',

  // --- Delphi / RAD Studio IDE backups (duplicate .pas source; would double-count) ---
  '__history',
  '__recovery',

  // --- Generic cache ---
  '.cache',
]);
/**
 * Gitignore-style patterns fed to the `ignore` matcher: every default directory
 * name as a dir-only rule (`name/`), followed by a few glob rules.
 */
const DEFAULT_IGNORE_PATTERNS: string[] = [...DEFAULT_IGNORE_DIRS]
  .map((dirName) => dirName + '/')
  .concat([
    '*.egg-info/', // Python packaging metadata
    'cmake-build-*/', // CLion / CMake build trees
    'bazel-*/', // Bazel output symlink trees
  ]);
/**
 * Create an `ignore` matcher preloaded with the built-in default patterns and
 * then extended with the project's root `.gitignore`, so a negation in that
 * file (e.g. `!vendor/`) can override a default. Both enumeration paths share
 * this matcher, so results are the same with or without git, and the defaults
 * also cover tracked files (committing a dependency dir doesn't make it project
 * code; the `.gitignore` negation is the only opt-in).
 *
 * @param rootDir - Project root whose `.gitignore` is merged in, if present.
 * @returns The seeded matcher; only the defaults apply when the root
 *          `.gitignore` is missing or unreadable.
 */
export function buildDefaultIgnore(rootDir: string): Ignore {
  const matcher = ignore().add(DEFAULT_IGNORE_PATTERNS);

  try {
    const gitignorePath = path.join(rootDir, '.gitignore');
    if (fs.existsSync(gitignorePath)) {
      const projectRules = fs.readFileSync(gitignorePath, 'utf-8');
      matcher.add(projectRules);
    }
  } catch {
    // Root .gitignore could not be read; keep going with the defaults only.
  }

  return matcher;
}
/**
 * Collect git-visible files (tracked + untracked, .gitignore-respected) from the
 * git repository rooted at `repoDir`, adding each to `files` with `prefix`
 * prepended so paths stay relative to the original scan root.
 *
 * Recurses into embedded git repositories — nested repos that are NOT submodules
 * (independent clones living inside the workspace, common in CMake "super-repo"
 * layouts). The parent repo's `git ls-files` cannot see into them: tracked output
 * skips them entirely, and untracked output reports them only as an opaque
 * "subdir/" entry (trailing slash) rather than expanding their files. Each
 * embedded repo is its own git boundary, so we re-run `git ls-files` inside it.
 * (See issue #193.)
 *
 * Both listings use `-z` (NUL-terminated output). Without it git C-quotes any
 * path containing non-ASCII bytes (with the default `core.quotePath`), quotes,
 * tabs, or backslashes — e.g. `"caf\303\251.ts"` — and such a quoted name no
 * longer ends in a recognizable source extension, so the file would be dropped
 * from the index. `-z` emits paths verbatim, which also means no trimming is
 * needed (or wanted: trimming would corrupt names with edge whitespace).
 *
 * @param repoDir - Absolute path of the git repository (or embedded repo) to list.
 * @param prefix - Path prefix (relative to the scan root, '' at the top level,
 *                 otherwise ending in '/') prepended to every listed path.
 * @param files - Accumulator receiving the normalized relative paths.
 * @throws Whatever `execFileSync` throws when git is missing, fails, or times
 *         out; the caller treats that as "not a git project".
 */
function collectGitFiles(repoDir: string, prefix: string, files: Set<string>): void {
  const gitOpts = {
    cwd: repoDir,
    encoding: 'utf-8' as const,
    timeout: 30000,
    maxBuffer: 50 * 1024 * 1024,
    stdio: ['pipe', 'pipe', 'pipe'] as ['pipe', 'pipe', 'pipe'],
  };

  // Tracked files. --recurse-submodules pulls in files from active submodules,
  // which the index would otherwise represent only as a commit pointer.
  // Without this, monorepos using submodules index 0 files. (See issue #147.)
  // Note: --recurse-submodules only supports -c/--cached and --stage modes — it
  // can't be combined with -o, so untracked files are gathered separately below.
  const tracked = execFileSync('git', ['ls-files', '-z', '-c', '--recurse-submodules'], gitOpts);
  for (const entry of tracked.split('\0')) {
    // The output ends with a NUL, so the final split element is empty.
    if (entry) {
      files.add(normalizePath(prefix + entry));
    }
  }

  // Untracked files (submodules manage their own untracked state). Embedded git
  // repos surface here as a single "subdir/" entry that git refuses to descend
  // into — recurse into those as their own repos so their source gets indexed.
  const untracked = execFileSync('git', ['ls-files', '-z', '-o', '--exclude-standard'], gitOpts);
  for (const entry of untracked.split('\0')) {
    if (!entry) continue;

    if (entry.endsWith('/')) {
      // git only emits a trailing-slash directory entry for an embedded repo.
      // Guard with a .git check anyway, and skip anything else exactly as git
      // itself skips it (we never descend into a non-repo opaque dir).
      const childDir = path.join(repoDir, entry);
      if (fs.existsSync(path.join(childDir, '.git'))) {
        collectGitFiles(childDir, prefix + entry, files);
      }
      continue;
    }

    files.add(normalizePath(prefix + entry));
  }
}
/**
 * List every file git can see under `rootDir` (tracked, plus untracked files
 * that are not ignored), with the built-in default ignores applied on top.
 * `.gitignore` files at every level are honored and embedded (nested,
 * non-submodule) repos are descended into.
 *
 * @param rootDir - Directory to enumerate.
 * @returns Paths relative to `rootDir`, or null when git cannot be used
 *          (non-git project, git failure, or `rootDir` is ignored by an
 *          enclosing repo) so the caller can fall back to a filesystem walk.
 */
function getGitVisibleFiles(rootDir: string): Set<string> | null {
  const probeOpts = {
    cwd: rootDir,
    encoding: 'utf-8' as const,
    timeout: 5000,
    stdio: ['pipe', 'pipe', 'pipe'] as ['pipe', 'pipe', 'pipe'],
  };

  try {
    const topLevel = execFileSync('git', ['rev-parse', '--show-toplevel'], probeOpts).trim();
    const absoluteRoot = path.resolve(rootDir);

    // When rootDir sits inside a parent repo that ignores it, `git ls-files`
    // would report nothing, so that situation must use the filesystem walk.
    if (path.resolve(topLevel) !== absoluteRoot) {
      let ignoredByParent: boolean;
      try {
        // check-ignore exits 0 when the path IS ignored and 1 when it is not;
        // a non-zero exit surfaces here as a thrown error.
        execFileSync('git', ['check-ignore', '-q', absoluteRoot], probeOpts);
        ignoredByParent = true;
      } catch {
        ignoredByParent = false;
      }
      if (ignoredByParent) {
        return null;
      }
    }

    const visible = new Set<string>();
    collectGitFiles(rootDir, '', visible);

    // The built-in defaults apply to tracked files as well: committing a
    // dependency/build dir doesn't make it project code. A `.gitignore`
    // negation (e.g. `!vendor/`) is the explicit opt-in. (issue #407)
    const matcher = buildDefaultIgnore(rootDir);
    const kept = new Set<string>();
    for (const candidate of visible) {
      if (!matcher.ignores(candidate)) {
        kept.add(candidate);
      }
    }
    return kept;
  } catch {
    return null;
  }
}
/**
 * Files that git-based change detection reports as changed, grouped by the
 * action the index has to take. The detector returns null instead of this
 * shape when git is unavailable (non-git project or command failure), which
 * tells the caller to fall back to a full filesystem scan.
 */
interface GitChanges {
  /** Status M, MM, AM, … — files to re-hash and re-index. */
  modified: string[];
  /** Status ?? — new untracked files to index. */
  added: string[];
  /** Status containing D — files to remove from the DB. */
  deleted: string[];
}
/**
 * Use `git status` to detect changed source files instead of scanning every file.
 *
 * The status is requested with `-z`, so entries are NUL-terminated and paths
 * are emitted verbatim. In the plain porcelain format git wraps any path that
 * contains spaces or special/non-ASCII characters in double quotes with
 * backslash escapes; such a quoted path neither matches the file on disk nor
 * passes the source-extension check, so the change would be missed.
 * `--no-renames` keeps every entry in the single-path `XY <path>` form.
 *
 * NOTE(review): with git's default untracked mode, a brand-new directory is
 * reported as one collapsed `dir/` entry, which the source-file filter drops —
 * confirm whether the caller discovers files in new directories another way.
 * NOTE(review): porcelain paths are documented as relative to the repository
 * root, not to `cwd` — confirm `rootDir` is always the repository top level.
 *
 * @param rootDir - Directory in which `git status` is run.
 * @returns Changed source files grouped by required action, or null when git
 *          fails so the caller can fall back to a full scan.
 */
function getGitChangedFiles(rootDir: string): GitChanges | null {
  try {
    const output = execFileSync(
      'git',
      ['status', '--porcelain', '-z', '--no-renames'],
      { cwd: rootDir, encoding: 'utf-8', timeout: 10000, stdio: ['pipe', 'pipe', 'pipe'] }
    );

    const modified: string[] = [];
    const added: string[] = [];
    const deleted: string[] = [];

    for (const entry of output.split('\0')) {
      // Minimum entry is "XY f": two status chars, a space, and a path. This
      // also skips the empty element after the trailing NUL.
      if (entry.length < 4) continue;

      const statusCode = entry.substring(0, 2);
      const filePath = normalizePath(entry.substring(3));

      // Skip non-source files (git status already omits .gitignored paths).
      if (!isSourceFile(filePath)) continue;

      if (statusCode === '??') {
        added.push(filePath);
      } else if (statusCode.includes('D')) {
        deleted.push(filePath);
      } else {
        // M, MM, AM, A (staged), etc. — treat as modified
        modified.push(filePath);
      }
    }

    return { modified, added, deleted };
  } catch {
    return null;
  }
}
/**
 * Enumerate the source files of a project.
 *
 * Inside a git repo the list comes from `git ls-files` (which inherently
 * respects .gitignore at every level) and is then narrowed to files with a
 * supported source extension. Outside git, a filesystem walk that parses
 * .gitignore itself is used instead.
 *
 * @param rootDir - Project root to scan.
 * @param onProgress - Optional callback invoked once per accepted file with
 *                     the running count and the file's path.
 * @returns Source file paths relative to `rootDir`.
 */
export function scanDirectory(
  rootDir: string,
  onProgress?: (current: number, file: string) => void
): string[] {
  const gitFiles = getGitVisibleFiles(rootDir);

  // No usable git listing: walk the filesystem instead.
  if (!gitFiles) {
    return scanDirectoryWalk(rootDir, onProgress);
  }

  // Git listing available: keep only recognized source files.
  const sources: string[] = [];
  for (const candidate of gitFiles) {
    if (!isSourceFile(candidate)) continue;
    sources.push(candidate);
    onProgress?.(sources.length, candidate);
  }
  return sources;
}
/**
 * Promise-returning counterpart of scanDirectory. On the git path it hands
 * control back to the event loop after every 100 accepted files, so worker
 * threads get a chance to receive and render progress messages.
 *
 * @param rootDir - Project root to scan.
 * @param onProgress - Optional callback invoked once per accepted file with
 *                     the running count and the file's path.
 * @returns Source file paths relative to `rootDir`.
 */
export async function scanDirectoryAsync(
  rootDir: string,
  onProgress?: (current: number, file: string) => void
): Promise<string[]> {
  const gitFiles = getGitVisibleFiles(rootDir);
  if (!gitFiles) {
    return scanDirectoryWalk(rootDir, onProgress);
  }

  const sources: string[] = [];
  for (const candidate of gitFiles) {
    if (!isSourceFile(candidate)) continue;

    sources.push(candidate);
    const accepted = sources.length;
    onProgress?.(accepted, candidate);

    // Pause briefly every 100 files so progress can be rendered.
    if (accepted % 100 === 0) {
      await new Promise<void>((resume) => setImmediate(resume));
    }
  }
  return sources;
}
/**
 * Filesystem-walk enumeration used when git is not available.
 *
 * Walks `rootDir` recursively, following symlinks while guarding against
 * cycles, and applies gitignore rules itself: the built-in defaults merged
 * with the root .gitignore, plus each nested .gitignore scoped to its own
 * directory.
 *
 * @param rootDir - Directory to walk.
 * @param onProgress - Optional callback invoked once per accepted file with
 *                     the running count and the file's path.
 * @returns Source file paths relative to `rootDir`.
 */
function scanDirectoryWalk(
  rootDir: string,
  onProgress?: (current: number, file: string) => void
): string[] {
  // A matcher paired with the directory that declared it. Rules in a nested
  // .gitignore are relative to that directory, so paths are tested relative
  // to `dir` — the same way git applies .gitignore files at each level.
  interface ScopedIgnore {
    dir: string;
    ig: Ignore;
  }

  const found: string[] = [];
  const seenRealDirs = new Set<string>();
  let emitted = 0;

  // Load `dir/.gitignore` as a scoped matcher; null when absent or unreadable.
  const readScopedIgnore = (dir: string): ScopedIgnore | null => {
    try {
      const gitignoreFile = path.join(dir, '.gitignore');
      if (!fs.existsSync(gitignoreFile)) return null;
      const rules = fs.readFileSync(gitignoreFile, 'utf-8');
      return { dir, ig: ignore().add(rules) };
    } catch {
      // An unreadable .gitignore counts as no .gitignore.
      return null;
    }
  };

  // True when any matcher that covers `fullPath` ignores it.
  const isIgnored = (fullPath: string, isDir: boolean, matchers: ScopedIgnore[]): boolean =>
    matchers.some(({ dir, ig }) => {
      const rel = normalizePath(path.relative(dir, fullPath));
      // Paths outside this matcher's directory are not its business.
      if (!rel || rel.startsWith('..')) return false;
      // Dir-only rules (e.g. `build/`) match only when the slash is present.
      return ig.ignores(isDir ? rel + '/' : rel);
    });

  // Accept a file: store it and notify the progress callback.
  const recordFile = (relativePath: string): void => {
    found.push(relativePath);
    emitted += 1;
    onProgress?.(emitted, relativePath);
  };

  // Shared handling for an entry once its kind (directory vs file) is known.
  const visit = (
    fullPath: string,
    relativePath: string,
    isDir: boolean,
    isFile: boolean,
    matchers: ScopedIgnore[]
  ): void => {
    if (isDir) {
      if (!isIgnored(fullPath, true, matchers)) {
        walk(fullPath, matchers);
      }
    } else if (isFile) {
      if (!isIgnored(fullPath, false, matchers) && isSourceFile(relativePath)) {
        recordFile(relativePath);
      }
    }
  };

  function walk(dir: string, matchers: ScopedIgnore[]): void {
    let realDir: string;
    try {
      realDir = fs.realpathSync(dir);
    } catch {
      logDebug('Skipping unresolvable directory', { dir });
      return;
    }

    // Track resolved paths so a symlink loop cannot recurse forever.
    if (seenRealDirs.has(realDir)) {
      logDebug('Skipping already-visited directory (symlink cycle)', { dir, realDir });
      return;
    }
    seenRealDirs.add(realDir);

    // A directory's own .gitignore governs everything beneath it. The root's
    // file is already folded into the seeded base matcher (so that a negation
    // there can override a built-in default) and is therefore not loaded again.
    const own = dir === rootDir ? null : readScopedIgnore(dir);
    const active = own ? [...matchers, own] : matchers;

    let entries: fs.Dirent[];
    try {
      entries = fs.readdirSync(dir, { withFileTypes: true });
    } catch (error) {
      logDebug('Skipping unreadable directory', { dir, error: String(error) });
      return;
    }

    for (const entry of entries) {
      // Git internals and our own data directory are never entered.
      if (entry.name === '.git' || entry.name === '.codegraph') continue;

      const fullPath = path.join(dir, entry.name);
      const relativePath = normalizePath(path.relative(rootDir, fullPath));

      if (!entry.isSymbolicLink()) {
        visit(fullPath, relativePath, entry.isDirectory(), entry.isFile(), active);
        continue;
      }

      // Symlink: classify by what the link resolves to.
      try {
        const stat = fs.statSync(fs.realpathSync(fullPath));
        visit(fullPath, relativePath, stat.isDirectory(), stat.isFile(), active);
      } catch {
        logDebug('Skipping broken symlink', { path: fullPath });
      }
    }
  }

  // The base matcher carries the built-in default ignores merged with the root
  // .gitignore (so a negation can override); nested .gitignores layer per-dir.
  walk(rootDir, [{ dir: rootDir, ig: buildDefaultIgnore(rootDir) }]);
  return found;
}
- /**
- * Extraction orchestrator
- */
- export class ExtractionOrchestrator {
- private rootDir: string;
- private queries: QueryBuilder;
- /**
- * Names of frameworks detected for this project, populated by indexAll().
- * Passed to extractFromSource so framework-specific extractors (route nodes,
- * middleware, etc.) run after the tree-sitter pass. Cleared if detection
- * hasn't run yet so single-file re-index paths can detect on the spot.
- */
- private detectedFrameworkNames: string[] | null = null;
/**
 * @param rootDir - Project root directory that will be scanned and indexed.
 * @param queries - Query builder used for database access.
 */
constructor(rootDir: string, queries: QueryBuilder) {
  this.queries = queries;
  this.rootDir = rootDir;
}
/**
 * Build a filesystem-backed ResolutionContext sufficient for framework
 * detection. Graph-query methods (getNodesByName etc.) return empty because
 * the DB hasn't been populated yet, but detect() only uses readFile,
 * fileExists, and getAllFiles, so that's fine.
 *
 * Every filesystem accessor that takes a caller-supplied relative path
 * (fileExists, readFile, listDirectories) resolves it through
 * validatePathWithinRoot and reports "nothing there" when the path is
 * rejected, so a detector cannot probe outside the project root.
 *
 * @param files - Scanned project files, returned as-is by getAllFiles().
 * @returns A context whose graph queries are empty and whose file accessors
 *          read from disk under the project root.
 */
private buildDetectionContext(files: string[]): ResolutionContext {
  const rootDir = this.rootDir;
  return {
    getNodesInFile: () => [],
    getNodesByName: () => [],
    getNodesByQualifiedName: () => [],
    getNodesByKind: () => [],
    getNodesByLowerName: () => [],
    getImportMappings: () => [],
    getAllFiles: () => files,
    getProjectRoot: () => rootDir,
    fileExists: (relativePath: string) => {
      const full = validatePathWithinRoot(rootDir, relativePath);
      if (!full) return false;
      try {
        return fs.existsSync(full);
      } catch {
        return false;
      }
    },
    readFile: (relativePath: string) => {
      const full = validatePathWithinRoot(rootDir, relativePath);
      if (!full) return null;
      try {
        return fs.readFileSync(full, 'utf-8');
      } catch {
        return null;
      }
    },
    // Monorepo support — needed by framework detect()s that probe
    // subpackage manifests (e.g. fabric-view looking at
    // packages/<sub>/package.json when the root manifest is just a
    // workspace declaration). Matches the resolver-context shape.
    listDirectories: (relativePath: string) => {
      // '.' and '' both mean the project root itself; anything else must pass
      // the same containment check as fileExists/readFile.
      const isRoot = relativePath === '.' || relativePath === '';
      const target = isRoot ? rootDir : validatePathWithinRoot(rootDir, relativePath);
      if (!target) return [];
      try {
        return fs
          .readdirSync(target, { withFileTypes: true })
          .filter((entry) => entry.isDirectory())
          .map((entry) => entry.name);
      } catch {
        // Missing or unreadable directory: report no subdirectories.
        return [];
      }
    },
  };
}
/**
 * Return the names of the frameworks detected for this project, running
 * detection only when no result is cached on the orchestrator yet.
 *
 * @param files - Already-scanned file list to detect against; when omitted,
 *                the project directory is scanned.
 * @returns Detected framework names (cached for subsequent calls).
 */
private ensureDetectedFrameworks(files?: string[]): string[] {
  const cached = this.detectedFrameworkNames;
  if (cached !== null) {
    return cached;
  }

  const context = this.buildDetectionContext(files ?? scanDirectory(this.rootDir));
  const names = detectFrameworks(context).map((framework) => framework.name);
  this.detectedFrameworkNames = names;
  return names;
}
- /**
- * Index all files in the project
- */
- async indexAll(
- onProgress?: (progress: IndexProgress) => void,
- signal?: AbortSignal,
- verbose?: boolean
- ): Promise<IndexResult> {
- await initGrammars();
- const startTime = Date.now();
- const errors: ExtractionError[] = [];
- let filesIndexed = 0;
- let filesSkipped = 0;
- let filesErrored = 0;
- let totalNodes = 0;
- let totalEdges = 0;
- const log = verbose
- ? (msg: string) => { console.log(`[worker] ${msg}`); }
- : (_msg: string) => {};
- // Phase 1: Scan for files
- onProgress?.({
- phase: 'scanning',
- current: 0,
- total: 0,
- });
- const files = await scanDirectoryAsync(this.rootDir, (current, file) => {
- onProgress?.({
- phase: 'scanning',
- current,
- total: 0,
- currentFile: file,
- });
- });
- // Detect frameworks once per indexAll run using the scanned file list.
- // Names are passed to each parse call so framework-specific extractors
- // (route nodes, middleware, etc.) run after the tree-sitter pass.
- // Framework detection is reset each run so adding e.g. requirements.txt
- // between runs is picked up without restarting the process.
- this.detectedFrameworkNames = null;
- const frameworkNames = this.ensureDetectedFrameworks(files);
- if (signal?.aborted) {
- return {
- success: false,
- filesIndexed: 0,
- filesSkipped: 0,
- filesErrored: 0,
- nodesCreated: 0,
- edgesCreated: 0,
- errors: [{ message: 'Aborted', severity: 'error' }],
- durationMs: Date.now() - startTime,
- };
- }
- // Phase 2: Parse files in a worker thread (keeps main thread unblocked for UI)
- const total = files.length;
- let processed = 0;
- // Emit parsing phase immediately so the progress bar appears during worker setup.
- // The yield lets the shimmer worker flush the phase transition to stdout before
- // the main thread starts synchronous grammar detection work.
- onProgress?.({
- phase: 'parsing',
- current: 0,
- total,
- });
- await new Promise(resolve => setImmediate(resolve));
- // Detect needed languages and load grammars in the parse worker
- const neededLanguages = [...new Set(files.map((f) => detectLanguage(f)))];
- // .h files default to 'c' but may be C++ — ensure cpp grammar is loaded when c is needed
- if (neededLanguages.includes('c') && !neededLanguages.includes('cpp')) {
- neededLanguages.push('cpp');
- }
- // Try to use a worker thread for parsing (keeps main thread unblocked for UI).
- // Falls back to in-process parsing if the compiled worker is unavailable (e.g. tests).
- const parseWorkerPath = path.join(__dirname, 'parse-worker.js');
- const useWorker = fs.existsSync(parseWorkerPath);
- let WorkerClass: typeof import('worker_threads').Worker | null = null;
- if (useWorker) {
- const { Worker } = await import('worker_threads');
- WorkerClass = Worker;
- } else {
- // In-process fallback: load grammars locally
- await loadGrammarsForLanguages(neededLanguages);
- }
- // --- Worker lifecycle management ---
- // The worker can crash (OOM in WASM) or hang on pathological files.
- // We track pending parse promises and handle both cases:
- // - Timeout: terminate + restart the worker, reject the timed-out request
- // - Crash: reject all pending promises, restart for remaining files
- let parseWorker: import('worker_threads').Worker | null = null;
- let nextId = 0;
- let workerParseCount = 0;
- const pendingParses = new Map<number, {
- resolve: (result: ExtractionResult) => void;
- reject: (err: Error) => void;
- timer: ReturnType<typeof setTimeout>;
- }>();
- function rejectAllPending(reason: string): void {
- for (const [id, pending] of pendingParses) {
- clearTimeout(pending.timer);
- pendingParses.delete(id);
- pending.reject(new Error(reason));
- }
- }
- function attachWorkerHandlers(w: import('worker_threads').Worker): void {
- w.on('message', (msg: { type: string; id?: number; result?: ExtractionResult }) => {
- if (msg.type === 'parse-result' && msg.id !== undefined) {
- const pending = pendingParses.get(msg.id);
- if (pending) {
- clearTimeout(pending.timer);
- pendingParses.delete(msg.id);
- pending.resolve(msg.result!);
- }
- }
- });
- w.on('error', (err) => {
- logWarn('Parse worker error', { error: err.message });
- rejectAllPending(`Worker error: ${err.message}`);
- });
- w.on('exit', (code) => {
- if (code !== 0 && pendingParses.size > 0) {
- logWarn('Parse worker exited unexpectedly', { code });
- rejectAllPending(`Worker exited with code ${code}`);
- }
- // Clear reference so we know to respawn, reset count so
- // the fresh worker gets a full cycle before recycling.
- if (parseWorker === w) {
- parseWorker = null;
- workerParseCount = 0;
- }
- });
- }
- async function ensureWorker(): Promise<import('worker_threads').Worker> {
- if (parseWorker) return parseWorker;
- log('Spawning new parse worker...');
- parseWorker = new WorkerClass!(parseWorkerPath);
- attachWorkerHandlers(parseWorker);
- // Load grammars in the new worker
- await new Promise<void>((resolve, reject) => {
- parseWorker!.once('message', (msg: { type: string }) => {
- if (msg.type === 'grammars-loaded') resolve();
- else reject(new Error(`Unexpected message: ${msg.type}`));
- });
- parseWorker!.postMessage({ type: 'load-grammars', languages: neededLanguages });
- });
- return parseWorker;
- }
- if (WorkerClass) {
- await ensureWorker();
- }
- /**
- * Recycle the worker thread to reclaim WASM memory.
- * Terminates the current worker and clears the reference so
- * ensureWorker() will spawn a fresh one on the next call.
- */
- function recycleWorker(): void {
- if (!parseWorker) return;
- log(`Recycling worker after ${workerParseCount} parses (heap: ${Math.round(process.memoryUsage().rss / 1024 / 1024)}MB RSS)`);
- const w = parseWorker;
- parseWorker = null;
- workerParseCount = 0;
- // Fire-and-forget: worker.terminate() can hang if WASM is stuck
- w.terminate().catch(() => {});
- }
/**
 * Parse one file and return its extraction result.
 *
 * Without a worker implementation (`WorkerClass` unset) the file is parsed
 * in-process. Otherwise a `parse` message is posted to the worker thread and
 * the returned promise is registered in `pendingParses` under a fresh id.
 * Settling that entry on success happens outside this function (presumably in
 * the handlers installed by `attachWorkerHandlers`, which are not visible here).
 *
 * @param filePath - Path of the file, forwarded to the parser.
 * @param content - Full source text to parse.
 * @returns The extraction result for the file.
 * @throws If the worker does not answer within the size-scaled timeout; the
 *   stuck worker is then dropped and terminated in the background.
 */
async function requestParse(filePath: string, content: string): Promise<ExtractionResult> {
  if (!WorkerClass) {
    // In-process fallback
    return extractFromSource(
      filePath,
      content,
      detectLanguage(filePath, content),
      frameworkNames
    );
  }

  // Recycle the worker before the next parse if we've hit the threshold.
  // This destroys the WASM linear memory (which can grow but never shrink)
  // and starts a fresh worker with a clean heap.
  // recycleWorker() is synchronous, so there is nothing to await.
  if (workerParseCount >= WORKER_RECYCLE_INTERVAL) {
    recycleWorker();
  }

  const worker = await ensureWorker();
  const id = nextId++;
  workerParseCount++;

  // Scale timeout for large files: PARSE_TIMEOUT_MS plus 10s for every full
  // 100,000 characters of content.
  const timeoutMs = PARSE_TIMEOUT_MS + Math.floor(content.length / 100_000) * 10_000;

  return new Promise<ExtractionResult>((resolve, reject) => {
    const timer = setTimeout(() => {
      pendingParses.delete(id);
      log(`TIMEOUT: ${filePath} exceeded ${timeoutMs}ms — killing worker`);
      // Reject FIRST — worker.terminate() can hang if WASM is stuck
      parseWorker = null;
      workerParseCount = 0;
      reject(new Error(`Parse timed out after ${timeoutMs}ms`));
      // Fire-and-forget: kill the stuck worker in the background
      worker.terminate().catch(() => {});
    }, timeoutMs);

    pendingParses.set(id, { resolve, reject, timer });
    worker.postMessage({ type: 'parse', id, filePath, content, frameworkNames });
  });
}
- for (let i = 0; i < files.length; i += FILE_IO_BATCH_SIZE) {
- if (signal?.aborted) {
- if (parseWorker) (parseWorker as import('worker_threads').Worker).terminate().catch(() => {});
- return {
- success: false,
- filesIndexed,
- filesSkipped,
- filesErrored,
- nodesCreated: totalNodes,
- edgesCreated: totalEdges,
- errors: [{ message: 'Aborted', severity: 'error' }, ...errors],
- durationMs: Date.now() - startTime,
- };
- }
- const batch = files.slice(i, i + FILE_IO_BATCH_SIZE);
- // Read files in parallel (with path validation before any I/O)
- const fileContents = await Promise.all(
- batch.map(async (fp) => {
- try {
- const fullPath = validatePathWithinRoot(this.rootDir, fp);
- if (!fullPath) {
- logWarn('Path traversal blocked in batch reader', { filePath: fp });
- return { filePath: fp, content: null as string | null, stats: null as fs.Stats | null, error: new Error('Path traversal blocked') };
- }
- const content = await fsp.readFile(fullPath, 'utf-8');
- const stats = await fsp.stat(fullPath);
- return { filePath: fp, content, stats, error: null as Error | null };
- } catch (err) {
- return { filePath: fp, content: null as string | null, stats: null as fs.Stats | null, error: err as Error };
- }
- })
- );
- // Send to worker for parsing, store results on main thread
- for (const { filePath, content, stats, error } of fileContents) {
- if (signal?.aborted) {
- if (parseWorker) (parseWorker as import('worker_threads').Worker).terminate().catch(() => {});
- return {
- success: false,
- filesIndexed,
- filesSkipped,
- filesErrored,
- nodesCreated: totalNodes,
- edgesCreated: totalEdges,
- errors: [{ message: 'Aborted', severity: 'error' }, ...errors],
- durationMs: Date.now() - startTime,
- };
- }
- // Report progress before parsing (show current file being worked on)
- onProgress?.({
- phase: 'parsing',
- current: processed,
- total,
- currentFile: filePath,
- });
- if (error || content === null || stats === null) {
- processed++;
- filesErrored++;
- errors.push({
- message: `Failed to read file: ${error instanceof Error ? error.message : String(error)}`,
- filePath,
- severity: 'error',
- code: 'read_error',
- });
- continue;
- }
- // Honour MAX_FILE_SIZE. Without this check, vendored generated
- // headers, minified bundles, and other multi-MB files get indexed,
- // wasting WASM heap and the worker recycle budget on inputs with no
- // useful symbols. The single-file extractFile path already enforces
- // this; the bulk path used to silently skip the check.
- if (stats.size > MAX_FILE_SIZE) {
- processed++;
- filesSkipped++;
- errors.push({
- message: `File exceeds max size (${stats.size} > ${MAX_FILE_SIZE})`,
- filePath,
- severity: 'warning',
- code: 'size_exceeded',
- });
- onProgress?.({ phase: 'parsing', current: processed, total });
- continue;
- }
- // Parse in worker thread (main thread stays unblocked).
- // Wrapped in try/catch to handle worker timeouts and crashes gracefully.
- let result: ExtractionResult;
- try {
- result = await requestParse(filePath, content);
- } catch (parseErr) {
- processed++;
- filesErrored++;
- errors.push({
- message: parseErr instanceof Error ? parseErr.message : String(parseErr),
- filePath,
- severity: 'error',
- code: 'parse_error',
- });
- continue;
- }
- processed++;
- // Store in database on main thread (SQLite is not thread-safe)
- if (result.nodes.length > 0 || result.errors.length === 0) {
- const language = detectLanguage(filePath, content);
- this.storeExtractionResult(filePath, content, language, stats, result);
- }
- if (result.errors.length > 0) {
- for (const err of result.errors) {
- if (!err.filePath) err.filePath = filePath;
- }
- errors.push(...result.errors);
- }
- if (result.nodes.length > 0) {
- filesIndexed++;
- totalNodes += result.nodes.length;
- totalEdges += result.edges.length;
- } else if (result.errors.some((e) => e.severity === 'error')) {
- filesErrored++;
- } else {
- filesSkipped++;
- }
- }
- }
- // Report 100% so the progress bar doesn't hang at 99%
- onProgress?.({
- phase: 'parsing',
- current: total,
- total,
- });
- // Yield so the shimmer worker's buffered stdout writes can flush.
- // Worker thread stdout is proxied through the main thread's event loop,
- // so synchronous work here blocks the animation from rendering.
- await new Promise(resolve => setImmediate(resolve));
- // Retry pass: files that failed due to WASM memory corruption may succeed
- // on a fresh worker with a clean heap. Recycle before each attempt so
- // every file gets the absolute cleanest WASM state possible.
- const retryableErrors = errors.filter(
- (e) => e.code === 'parse_error' && e.filePath &&
- (e.message.includes('Worker exited') || e.message.includes('memory access out of bounds'))
- );
- if (retryableErrors.length > 0 && WorkerClass) {
- log(`Retrying ${retryableErrors.length} files that failed due to WASM memory errors...`);
- const stillFailing: typeof retryableErrors = [];
- for (const errEntry of retryableErrors) {
- const filePath = errEntry.filePath!;
- if (signal?.aborted) break;
- // Fresh worker for every retry — maximum WASM headroom
- recycleWorker();
- let content: string;
- try {
- const fullPath = validatePathWithinRoot(this.rootDir, filePath);
- if (!fullPath) continue;
- content = await fsp.readFile(fullPath, 'utf-8');
- } catch {
- continue;
- }
- let result: ExtractionResult;
- try {
- result = await requestParse(filePath, content);
- } catch {
- stillFailing.push(errEntry);
- continue;
- }
- if (result.nodes.length > 0 || result.errors.length === 0) {
- const language = detectLanguage(filePath, content);
- const stats = await fsp.stat(path.join(this.rootDir, filePath));
- this.storeExtractionResult(filePath, content, language, stats, result);
- const idx = errors.indexOf(errEntry);
- if (idx >= 0) errors.splice(idx, 1);
- filesErrored--;
- filesIndexed++;
- totalNodes += result.nodes.length;
- totalEdges += result.edges.length;
- log(`Retry OK: ${filePath} (${result.nodes.length} nodes)`);
- }
- }
- // Last resort: for files that still crash on a clean worker, strip
- // comment-only lines to reduce WASM memory pressure. Many compiler
- // test files are 90%+ comments (CHECK directives) that don't contribute
- // code nodes but consume parser memory.
- if (stillFailing.length > 0) {
- log(`${stillFailing.length} files still failing — retrying with comments stripped...`);
- for (const errEntry of stillFailing) {
- const filePath = errEntry.filePath!;
- if (signal?.aborted) break;
- recycleWorker();
- let fullContent: string;
- try {
- const fullPath = validatePathWithinRoot(this.rootDir, filePath);
- if (!fullPath) continue;
- fullContent = await fsp.readFile(fullPath, 'utf-8');
- } catch {
- continue;
- }
- // Strip lines that are entirely comments (preserving line numbers
- // by replacing with empty lines so node positions stay correct)
- const stripped = fullContent
- .split('\n')
- .map(line => /^\s*\/\//.test(line) ? '' : line)
- .join('\n');
- let result: ExtractionResult;
- try {
- result = await requestParse(filePath, stripped);
- } catch {
- continue;
- }
- if (result.nodes.length > 0 || result.errors.length === 0) {
- const language = detectLanguage(filePath, fullContent);
- const stats = await fsp.stat(path.join(this.rootDir, filePath));
- this.storeExtractionResult(filePath, fullContent, language, stats, result);
- const idx = errors.indexOf(errEntry);
- if (idx >= 0) errors.splice(idx, 1);
- filesErrored--;
- filesIndexed++;
- totalNodes += result.nodes.length;
- totalEdges += result.edges.length;
- log(`Retry (stripped) OK: ${filePath} (${result.nodes.length} nodes)`);
- }
- }
- }
- }
- // Shut down parse worker and clear any pending timers
- rejectAllPending('Indexing complete');
- if (parseWorker) {
- (parseWorker as import('worker_threads').Worker).terminate().catch(() => {});
- }
- return {
- success: filesIndexed > 0 || errors.filter((e) => e.severity === 'error').length === 0,
- filesIndexed,
- filesSkipped,
- filesErrored,
- nodesCreated: totalNodes,
- edgesCreated: totalEdges,
- errors,
- durationMs: Date.now() - startTime,
- };
- }
/**
 * Index specific files, one after another, and aggregate the outcome.
 *
 * Each file is counted as indexed when it produced at least one node, as
 * errored when it produced none and reported an error of severity `error`,
 * and as skipped otherwise.
 *
 * @param filePaths - Paths to index; each is passed to `indexFile`.
 * @returns Aggregate counts, every collected error, and the elapsed time.
 *   `success` is true when at least one file was indexed or no error of
 *   severity `error` was collected.
 */
async indexFiles(filePaths: string[]): Promise<IndexResult> {
  const startTime = Date.now();
  const errors: ExtractionError[] = [];
  let filesIndexed = 0;
  let filesSkipped = 0;
  let filesErrored = 0;
  let totalNodes = 0;
  let totalEdges = 0;

  for (const filePath of filePaths) {
    const result = await this.indexFile(filePath);

    // Errors from several files end up in one list, so make sure each one
    // names its file. Copy rather than mutate the extraction result.
    for (const err of result.errors) {
      errors.push(err.filePath ? err : { ...err, filePath });
    }

    if (result.nodes.length > 0) {
      filesIndexed++;
      totalNodes += result.nodes.length;
      totalEdges += result.edges.length;
    } else if (result.errors.some((e) => e.severity === 'error')) {
      filesErrored++;
    } else {
      filesSkipped++;
    }
  }

  return {
    success: filesIndexed > 0 || !errors.some((e) => e.severity === 'error'),
    filesIndexed,
    filesSkipped,
    filesErrored,
    nodesCreated: totalNodes,
    edgesCreated: totalEdges,
    errors,
    durationMs: Date.now() - startTime,
  };
}
/**
 * Index a single file.
 *
 * Validates that the path stays inside the root directory, reads the file's
 * stats and content from disk, then delegates to `indexFileWithContent`.
 *
 * @param relativePath - Path of the file, resolved against the root directory.
 * @returns The extraction result. A blocked path or a failed read yields an
 *   empty result carrying a single error (`path_traversal` or `read_error`).
 */
async indexFile(relativePath: string): Promise<ExtractionResult> {
  // Both early exits share this shape: nothing extracted, one error attached.
  const failedWith = (failure: ExtractionError): ExtractionResult => ({
    nodes: [],
    edges: [],
    unresolvedReferences: [],
    errors: [failure],
    durationMs: 0,
  });

  const resolvedPath = validatePathWithinRoot(this.rootDir, relativePath);
  if (!resolvedPath) {
    return failedWith({
      message: `Path traversal blocked: ${relativePath}`,
      filePath: relativePath,
      severity: 'error',
      code: 'path_traversal',
    });
  }

  // Read file stats and content
  let stats: fs.Stats;
  let content: string;
  try {
    stats = await fsp.stat(resolvedPath);
    content = await fsp.readFile(resolvedPath, 'utf-8');
  } catch (error) {
    return failedWith({
      message: `Failed to read file: ${error instanceof Error ? error.message : String(error)}`,
      filePath: relativePath,
      severity: 'error',
      code: 'read_error',
    });
  }

  return this.indexFileWithContent(relativePath, content, stats);
}
/**
 * Index a single file from content and stats the caller has already read,
 * so no further file I/O is needed here.
 *
 * @param relativePath - Path of the file, validated against the root directory.
 * @param content - Full source text of the file.
 * @param stats - Stats of the file; `size` is checked against MAX_FILE_SIZE.
 * @returns The extraction result. Blocked paths and oversized files yield an
 *   empty result with one error; unsupported languages yield an empty result
 *   with no errors.
 */
async indexFileWithContent(
  relativePath: string,
  content: string,
  stats: fs.Stats
): Promise<ExtractionResult> {
  // Every early exit returns an extraction with no nodes, edges or references.
  const emptyResult = (errors: ExtractionError[]): ExtractionResult => ({
    nodes: [],
    edges: [],
    unresolvedReferences: [],
    errors,
    durationMs: 0,
  });

  // Guard: the path must stay inside the root directory.
  if (!validatePathWithinRoot(this.rootDir, relativePath)) {
    logWarn('Path traversal blocked in indexFileWithContent', { relativePath });
    return emptyResult([
      { message: 'Path traversal blocked', filePath: relativePath, severity: 'error', code: 'path_traversal' },
    ]);
  }

  // Guard: oversized files are reported as a warning, not parsed.
  if (stats.size > MAX_FILE_SIZE) {
    return emptyResult([
      {
        message: `File exceeds max size (${stats.size} > ${MAX_FILE_SIZE})`,
        filePath: relativePath,
        severity: 'warning',
        code: 'size_exceeded',
      },
    ]);
  }

  // Guard: unsupported languages produce nothing and no error.
  const language = detectLanguage(relativePath, content);
  if (!isLanguageSupported(language)) {
    return emptyResult([]);
  }

  // Use cached framework names if indexAll has run, otherwise detect on the
  // spot so single-file re-index paths still emit route nodes / middleware / etc.
  const frameworkNames = this.ensureDetectedFrameworks();
  const extraction = extractFromSource(relativePath, content, language, frameworkNames);

  // Persist unless the extraction produced errors and no nodes at all.
  const shouldStore = extraction.nodes.length > 0 || extraction.errors.length === 0;
  if (shouldStore) {
    this.storeExtractionResult(relativePath, content, language, stats, extraction);
  }

  return extraction;
}
/**
 * Persist one file's extraction result to the database.
 *
 * Does nothing when the file is already recorded with the same content hash.
 * Otherwise any existing data for the file is deleted, then the valid nodes,
 * the edges and unresolved references that point at those nodes, and finally
 * the file record itself are written.
 *
 * @param filePath - Path the file is recorded under.
 * @param content - Source text; only its hash is stored.
 * @param language - Language recorded for the file and used as a fallback
 *   for unresolved references that carry none.
 * @param stats - Supplies the recorded size and modification time.
 * @param result - Extraction output to store.
 */
private storeExtractionResult(
  filePath: string,
  content: string,
  language: Language,
  stats: fs.Stats,
  result: ExtractionResult
): void {
  const contentHash = hashContent(content);

  // Unchanged content: nothing to do. Changed content: clear the old rows first.
  const previous = this.queries.getFileByPath(filePath);
  if (previous) {
    if (previous.contentHash === contentHash) return;
    this.queries.deleteFile(filePath);
  }

  // Drop nodes with missing required fields before insertion. This prevents
  // FK violations when edges reference nodes that would be silently skipped
  // by insertNode() (see issue #42).
  const validNodes = result.nodes.filter(
    (node) => node.id && node.kind && node.name && node.filePath && node.language
  );
  if (validNodes.length > 0) {
    this.queries.insertNodes(validNodes);
  }

  // Ids that were actually inserted; edges and references must point at these.
  const insertedIds = new Set(validNodes.map((node) => node.id));

  const validEdges = result.edges.filter(
    (edge) => insertedIds.has(edge.source) && insertedIds.has(edge.target)
  );
  if (validEdges.length > 0) {
    this.queries.insertEdges(validEdges);
  }

  // Unresolved references go in as one batch, with filePath/language
  // denormalized onto each row.
  const refsWithContext = result.unresolvedReferences
    .filter((ref) => insertedIds.has(ref.fromNodeId))
    .map((ref) => ({
      ...ref,
      filePath: ref.filePath ?? filePath,
      language: ref.language ?? language,
    }));
  if (refsWithContext.length > 0) {
    this.queries.insertUnresolvedRefsBatch(refsWithContext);
  }

  // NOTE(review): nodeCount uses result.nodes.length, which still includes
  // nodes rejected by the filter above — confirm whether that is intended.
  const fileRecord: FileRecord = {
    path: filePath,
    contentHash,
    language,
    size: stats.size,
    modifiedAt: stats.mtimeMs,
    indexedAt: Date.now(),
    nodeCount: result.nodes.length,
    errors: result.errors.length > 0 ? result.errors : undefined,
  };
  this.queries.upsertFile(fileRecord);
}
/**
 * Bring the index in line with the files currently on disk.
 *
 * Change detection is filesystem-based, never git: a (size, mtime) stat
 * pre-filter skips unchanged files, then a content-hash compare confirms real
 * changes. This works in non-git projects and catches committed changes from
 * `git pull`/`checkout`/`merge`/`rebase` that `git status` cannot see.
 *
 * @param onProgress - Optional callback; receives one `scanning` event up
 *   front and one `parsing` event per file that is re-indexed.
 * @returns Counts of checked, added, modified and removed files, the number
 *   of nodes produced by re-indexing, the elapsed time, and the changed paths
 *   (omitted when nothing changed).
 */
async sync(onProgress?: (progress: IndexProgress) => void): Promise<SyncResult> {
  await initGrammars(); // Initialize WASM runtime (grammars loaded lazily below)
  const startTime = Date.now();

  let filesAdded = 0;
  let filesModified = 0;
  let filesRemoved = 0;
  let nodesUpdated = 0;
  const changedFilePaths: string[] = [];
  const filesToIndex: string[] = [];

  onProgress?.({ phase: 'scanning', current: 0, total: 0 });

  // === Filesystem reconcile (git-independent) ===
  // "What changed" is decided by comparing the filesystem with the indexed
  // state, never by asking git. Unchanged files are skipped on a cheap
  // (size, mtime) check, so read + hash + parse only runs for real changes.
  const currentFiles = scanDirectory(this.rootDir);
  const filesChecked = currentFiles.length;
  const currentSet = new Set(currentFiles);

  const trackedFiles = this.queries.getAllFiles();
  const trackedMap = new Map<string, FileRecord>(
    trackedFiles.map((record): [string, FileRecord] => [record.path, record])
  );

  // Removals: tracked in the DB but no longer a present source file. The
  // filesystem is checked directly because `scanDirectory` (via `git ls-files`)
  // still lists a file deleted from disk but not yet staged.
  for (const record of trackedFiles) {
    const stillPresent =
      currentSet.has(record.path) && fs.existsSync(path.join(this.rootDir, record.path));
    if (stillPresent) continue;
    this.queries.deleteFile(record.path);
    filesRemoved++;
  }

  // Adds / modifications.
  for (const filePath of currentFiles) {
    const fullPath = path.join(this.rootDir, filePath);
    const tracked = trackedMap.get(filePath);

    // Cheap pre-filter: an already-indexed file whose size AND mtime both
    // match the DB is treated as unchanged and is neither read nor hashed.
    // (A content change that preserves both exactly is the accepted blind
    // spot; `index --force` is the escape hatch.)
    if (tracked) {
      let stat: fs.Stats;
      try {
        stat = fs.statSync(fullPath);
      } catch (error) {
        logDebug('Skipping unstattable file during sync', { filePath, error: String(error) });
        continue;
      }
      const sameSize = stat.size === tracked.size;
      const sameMtime = Math.floor(stat.mtimeMs) === Math.floor(tracked.modifiedAt);
      if (sameSize && sameMtime) continue;
    }

    // New, or size/mtime changed — read + hash to confirm a real content change.
    let content: string;
    try {
      content = fs.readFileSync(fullPath, 'utf-8');
    } catch (error) {
      logDebug('Skipping unreadable file during sync', { filePath, error: String(error) });
      continue;
    }

    const contentHash = hashContent(content);
    if (tracked && tracked.contentHash === contentHash) continue;

    filesToIndex.push(filePath);
    changedFilePaths.push(filePath);
    if (tracked) {
      filesModified++;
    } else {
      filesAdded++;
    }
  }

  // Load only grammars needed for changed files
  if (filesToIndex.length > 0) {
    const languages = new Set(filesToIndex.map((f) => detectLanguage(f)));
    // .h files default to 'c' but may be C++ — ensure cpp grammar is loaded
    if (languages.has('c')) languages.add('cpp');
    await loadGrammarsForLanguages([...languages]);
  }

  // Index changed files
  const total = filesToIndex.length;
  for (const [position, filePath] of filesToIndex.entries()) {
    onProgress?.({
      phase: 'parsing',
      current: position + 1,
      total,
      currentFile: filePath,
    });
    const { nodes } = await this.indexFile(filePath);
    nodesUpdated += nodes.length;
  }

  return {
    filesChecked,
    filesAdded,
    filesModified,
    filesRemoved,
    nodesUpdated,
    durationMs: Date.now() - startTime,
    changedFilePaths: changedFilePaths.length > 0 ? changedFilePaths : undefined,
  };
}
/**
 * Report which files differ from the indexed state, without re-indexing.
 *
 * When `getGitChangedFiles` returns a change set it is used as a fast path;
 * otherwise every file from `scanDirectory` is compared against the database.
 * In both paths a file counts as added when the database has no record of it
 * and as modified when its content hash differs from the recorded one.
 * Unreadable files are skipped.
 *
 * @returns Paths grouped into `added`, `modified` and `removed`.
 */
getChangedFiles(): { added: string[]; modified: string[]; removed: string[] } {
  const added: string[] = [];
  const modified: string[] = [];
  const removed: string[] = [];

  // Read a file's text, or log and return null when it cannot be read.
  const readOrSkip = (filePath: string): string | null => {
    const fullPath = path.join(this.rootDir, filePath);
    try {
      return fs.readFileSync(fullPath, 'utf-8');
    } catch (error) {
      logDebug('Skipping unreadable file while detecting changes', { filePath, error: String(error) });
      return null;
    }
  };

  const gitChanges = getGitChangedFiles(this.rootDir);
  if (gitChanges) {
    // === Git fast path ===
    // Deleted files — only report if tracked in DB
    for (const filePath of gitChanges.deleted) {
      if (this.queries.getFileByPath(filePath)) {
        removed.push(filePath);
      }
    }

    // Modified + added files — read + hash, compare with DB. Untracked (`??`)
    // files stay untracked in git even after indexing, so they must be
    // hash-compared like modified files instead of always counting as added —
    // otherwise status reports them as pending forever. (See issue #206.)
    for (const filePath of [...gitChanges.modified, ...gitChanges.added]) {
      const content = readOrSkip(filePath);
      if (content === null) continue;

      const contentHash = hashContent(content);
      const tracked = this.queries.getFileByPath(filePath);
      if (!tracked) {
        added.push(filePath);
      } else if (tracked.contentHash !== contentHash) {
        modified.push(filePath);
      }
    }

    return { added, modified, removed };
  }

  // === Fallback: full scan (non-git project or git failure) ===
  const currentFiles = new Set(scanDirectory(this.rootDir));
  const trackedFiles = this.queries.getAllFiles();

  // Index tracked records by path for O(1) lookups
  const trackedByPath = new Map<string, FileRecord>();
  for (const record of trackedFiles) {
    trackedByPath.set(record.path, record);
  }

  // Removed: recorded in the DB but absent from the scan
  for (const record of trackedFiles) {
    if (!currentFiles.has(record.path)) {
      removed.push(record.path);
    }
  }

  // Added and modified: compare each scanned file's hash with its record
  for (const filePath of currentFiles) {
    const content = readOrSkip(filePath);
    if (content === null) continue;

    const contentHash = hashContent(content);
    const tracked = trackedByPath.get(filePath);
    if (!tracked) {
      added.push(filePath);
    } else if (tracked.contentHash !== contentHash) {
      modified.push(filePath);
    }
  }

  return { added, modified, removed };
}
- }
- // Re-export useful types and functions
- export { extractFromSource } from './tree-sitter';
- export { detectLanguage, isSourceFile, isLanguageSupported, isGrammarLoaded, getSupportedLanguages, initGrammars, loadGrammarsForLanguages, loadAllGrammars } from './grammars';
|