| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008 |
- /**
- * Extraction Orchestrator
- *
- * Coordinates file scanning, parsing, and database storage.
- */
- import * as fs from 'fs';
- import * as fsp from 'fs/promises';
- import * as path from 'path';
- import * as crypto from 'crypto';
- import { execFileSync } from 'child_process';
- import {
- Language,
- FileRecord,
- ExtractionResult,
- ExtractionError,
- CodeGraphConfig,
- } from '../types';
- import { QueryBuilder } from '../db/queries';
- import { extractFromSource } from './tree-sitter';
- import { detectLanguage, isLanguageSupported, initGrammars, loadGrammarsForLanguages, resetParser } from './grammars';
- import { logDebug, logWarn } from '../errors';
- import { validatePathWithinRoot, normalizePath } from '../utils';
- import picomatch from 'picomatch';
/**
 * How many files `indexAll` reads concurrently in one batch.
 * Reads are issued together via `Promise.all`; parsing and storing of the
 * batch then happen one file at a time.
 */
const FILE_IO_BATCH_SIZE = 10;

/**
 * After this many processed files of a given language, `indexAll` calls
 * `resetParser` for that language. The intent is to reclaim tree-sitter's
 * WASM heap and avoid "memory access out of bounds" crashes on long runs.
 */
const PARSER_RESET_INTERVAL = 5000;
/**
 * Payload handed to progress callbacks while indexing or syncing.
 */
export interface IndexProgress {
  /** Stage of the run the report belongs to. */
  phase: 'scanning' | 'parsing' | 'storing' | 'resolving';
  /** Number of items handled so far within the current phase. */
  current: number;
  /** Total number of items expected in the phase; reported as 0 while scanning. */
  total: number;
  /** Project-relative path of the file being handled, when there is one. */
  currentFile?: string;
}
/**
 * Summary returned by `indexAll` and `indexFiles`.
 */
export interface IndexResult {
  /**
   * True when at least one file was indexed or no error-severity problem was
   * recorded; always false when the run was aborted.
   */
  success: boolean;
  /** Files whose extraction produced at least one node. */
  filesIndexed: number;
  /** Files that produced no nodes and no error-severity problems. */
  filesSkipped: number;
  /** Files that could not be read, or produced no nodes plus an error-severity problem. */
  filesErrored: number;
  /** Sum of extracted node counts over the indexed files. */
  nodesCreated: number;
  /** Sum of extracted edge counts over the indexed files. */
  edgesCreated: number;
  /** Every problem collected during the run (warnings included). */
  errors: ExtractionError[];
  /** Wall-clock duration of the run in milliseconds. */
  durationMs: number;
}
/**
 * Summary returned by `sync`.
 */
export interface SyncResult {
  /** Number of candidate files inspected while looking for changes. */
  filesChecked: number;
  /** Number of files classified as new. */
  filesAdded: number;
  /** Number of files whose content hash differed from the stored one. */
  filesModified: number;
  /** Number of file records deleted from the database. */
  filesRemoved: number;
  /** Sum of node counts extracted from the files that were re-indexed. */
  nodesUpdated: number;
  /** Wall-clock duration of the sync in milliseconds. */
  durationMs: number;
  /** Paths classified as new or modified; left undefined when there are none. */
  changedFilePaths?: string[];
}
/**
 * Compute the SHA-256 digest of a string.
 *
 * @param content - Text to hash.
 * @returns The digest as a lowercase hexadecimal string.
 */
export function hashContent(content: string): string {
  const hasher = crypto.createHash('sha256');
  hasher.update(content);
  return hasher.digest('hex');
}
/**
 * Test a single path against a single glob pattern.
 *
 * The path is passed through `normalizePath` first, and matching is done by
 * picomatch with `dot: true`.
 *
 * @param filePath - Path to test.
 * @param pattern - Glob pattern to test against.
 * @returns True when the normalized path matches the pattern.
 */
function matchesGlob(filePath: string, pattern: string): boolean {
  const candidate = normalizePath(filePath);
  return picomatch.isMatch(candidate, pattern, { dot: true });
}
/**
 * Decide whether a file belongs to the index according to the config.
 *
 * Exclusion wins: a path matching any `config.exclude` pattern is rejected
 * even if it also matches an include pattern. Otherwise the path is accepted
 * only when at least one `config.include` pattern matches.
 *
 * @param filePath - Project-relative path of the file.
 * @param config - Configuration supplying the `include` / `exclude` globs.
 * @returns True when the file should be indexed.
 */
export function shouldIncludeFile(
  filePath: string,
  config: CodeGraphConfig
): boolean {
  const matchesPattern = (pattern: string): boolean => matchesGlob(filePath, pattern);

  if (config.exclude.some(matchesPattern)) {
    return false;
  }
  return config.include.some(matchesPattern);
}
/**
 * List every file git can see under `rootDir`: tracked files plus untracked
 * files that are not ignored (`git ls-files -co --exclude-standard`).
 *
 * The listing is requested with `-z` so entries are NUL-terminated and never
 * C-quoted; without it git escapes non-ASCII and special characters
 * (`"caf\303\251.ts"`), producing paths that do not exist on disk.
 *
 * @param rootDir - Directory to list; also used as the git working directory,
 *   so returned paths are relative to it.
 * @returns The set of normalized paths, or null when git cannot be used
 *   (not a repository, git missing, command failure) or when `rootDir` is
 *   ignored by an enclosing repository. Callers fall back to a filesystem
 *   walk on null.
 */
function getGitVisibleFiles(rootDir: string): Set<string> | null {
  try {
    const resolvedRoot = path.resolve(rootDir);

    // When rootDir lives inside a parent repository that ignores it,
    // `git ls-files` would report nothing, so detect that case up front.
    const gitRoot = execFileSync(
      'git',
      ['rev-parse', '--show-toplevel'],
      { cwd: rootDir, encoding: 'utf-8', timeout: 5000, stdio: ['pipe', 'pipe', 'pipe'] }
    ).trim();

    if (path.resolve(gitRoot) !== resolvedRoot) {
      try {
        // `git check-ignore -q` exits 0 when the path IS ignored; a non-zero
        // exit makes execFileSync throw.
        execFileSync(
          'git',
          ['check-ignore', '-q', resolvedRoot],
          { cwd: rootDir, encoding: 'utf-8', timeout: 5000, stdio: ['pipe', 'pipe', 'pipe'] }
        );
        // Ignored by the parent repo: let the caller walk the filesystem.
        return null;
      } catch {
        // Not ignored (or the check itself failed): keep using git ls-files.
      }
    }

    // -c = cached (tracked), -o = others (untracked),
    // --exclude-standard = honour .gitignore, -z = NUL-terminated raw paths.
    const output = execFileSync(
      'git',
      ['ls-files', '-co', '--exclude-standard', '-z'],
      { cwd: rootDir, encoding: 'utf-8', timeout: 30000, maxBuffer: 50 * 1024 * 1024, stdio: ['pipe', 'pipe', 'pipe'] }
    );

    const files = new Set<string>();
    for (const entry of output.split('\0')) {
      // The trailing NUL yields one empty entry; names are otherwise kept
      // verbatim (no trimming) so unusual file names survive.
      if (entry.length > 0) {
        files.add(normalizePath(entry));
      }
    }
    return files;
  } catch {
    // Deliberate best-effort: any git failure means "use the fallback".
    return null;
  }
}
/**
 * Paths reported by `git status`, grouped by how they must be handled.
 * Produced by `getGitChangedFiles`, which returns null instead of this shape
 * when git cannot be used, so that callers fall back to a full scan.
 */
interface GitChanges {
  /** Entries with any other status (M, MM, AM, A, ...): candidates to re-hash and re-index. */
  modified: string[];
  /** Untracked entries (`??`). */
  added: string[];
  /** Entries whose status contains `D`: candidates for removal from the database. */
  deleted: string[];
}
/**
 * Use `git status` to find changed files instead of scanning every file.
 *
 * Details that matter for correctness:
 * - Porcelain output is always relative to the repository top-level, not to
 *   the working directory. The status is therefore limited to `rootDir`
 *   (`-- .`) and the `git rev-parse --show-prefix` prefix is stripped so the
 *   returned paths are relative to `rootDir`, like every other path here.
 * - `--untracked-files=all` lists individual files inside new directories;
 *   the default collapses them into a single `dir/` entry that never matches
 *   the include patterns.
 * - `-z` yields NUL-terminated, unquoted paths, so names with spaces or
 *   non-ASCII characters are reported verbatim.
 * - When `rootDir` sits inside a parent repository that ignores it, null is
 *   returned, mirroring `getGitVisibleFiles`.
 *
 * @param rootDir - Project root; used as the git working directory.
 * @param config - Supplies the include/exclude globs used to filter paths.
 * @returns The changed paths grouped by kind, or null when git cannot be used
 *   so callers fall back to a full scan.
 */
function getGitChangedFiles(rootDir: string, config: CodeGraphConfig): GitChanges | null {
  try {
    // Path of rootDir relative to the repository top-level: '' at the
    // top-level, otherwise 'sub/dir/' with a trailing slash.
    const prefix = execFileSync(
      'git',
      ['rev-parse', '--show-prefix'],
      { cwd: rootDir, encoding: 'utf-8', timeout: 5000, stdio: ['pipe', 'pipe', 'pipe'] }
    ).replace(/\r?\n$/, '');

    if (prefix !== '') {
      try {
        // Exit code 0 means the directory IS ignored by the enclosing repo;
        // git status would then report nothing useful for it.
        execFileSync(
          'git',
          ['check-ignore', '-q', path.resolve(rootDir)],
          { cwd: rootDir, encoding: 'utf-8', timeout: 5000, stdio: ['pipe', 'pipe', 'pipe'] }
        );
        return null;
      } catch {
        // Not ignored: safe to rely on git status.
      }
    }

    const output = execFileSync(
      'git',
      ['status', '--porcelain', '--no-renames', '--untracked-files=all', '-z', '--', '.'],
      { cwd: rootDir, encoding: 'utf-8', timeout: 10000, maxBuffer: 50 * 1024 * 1024, stdio: ['pipe', 'pipe', 'pipe'] }
    );

    const modified: string[] = [];
    const added: string[] = [];
    const deleted: string[] = [];

    let skipNext = false;
    for (const entry of output.split('\0')) {
      if (skipNext) {
        // Second field of a rename/copy record (the original path).
        skipNext = false;
        continue;
      }
      if (entry.length < 4) continue; // Minimum: "XY f"

      const statusCode = entry.substring(0, 2);
      // With -z a rename/copy record is "XY to\0from\0". --no-renames should
      // prevent these, but stay in sync with the stream if one shows up.
      if (statusCode.includes('R') || statusCode.includes('C')) {
        skipNext = true;
      }

      const repoRelative = entry.substring(3);
      if (!repoRelative.startsWith(prefix)) continue; // outside rootDir
      const filePath = normalizePath(repoRelative.slice(prefix.length));

      // Skip files that don't match include/exclude config
      if (!shouldIncludeFile(filePath, config)) continue;

      if (statusCode === '??') {
        added.push(filePath);
      } else if (statusCode.includes('D')) {
        deleted.push(filePath);
      } else {
        // M, MM, AM, A (staged), etc. are all treated as modified
        modified.push(filePath);
      }
    }

    return { modified, added, deleted };
  } catch {
    // Deliberate best-effort: any git failure means "use the full scan".
    return null;
  }
}
/**
 * Name of the marker file that makes the filesystem walk skip the directory
 * containing it, together with everything beneath that directory.
 */
const CODEGRAPH_IGNORE_MARKER = '.codegraphignore';
/**
 * Collect the project-relative paths of all source files to index.
 *
 * When git can list the files under `rootDir`, that listing (which already
 * honours .gitignore) is filtered through the configured include/exclude
 * patterns. Otherwise the directory tree is walked on disk.
 *
 * @param rootDir - Project root directory.
 * @param config - Supplies the include/exclude globs.
 * @param onProgress - Called for each accepted file with the running count
 *   and the file's path.
 * @returns Accepted paths, in discovery order.
 */
export function scanDirectory(
  rootDir: string,
  config: CodeGraphConfig,
  onProgress?: (current: number, file: string) => void
): string[] {
  const gitFiles = getGitVisibleFiles(rootDir);

  // No usable git listing: walk the filesystem instead.
  if (gitFiles === null) {
    return scanDirectoryWalk(rootDir, config, onProgress);
  }

  const accepted: string[] = [];
  gitFiles.forEach((candidate) => {
    if (!shouldIncludeFile(candidate, config)) {
      return;
    }
    accepted.push(candidate);
    onProgress?.(accepted.length, candidate);
  });
  return accepted;
}
/**
 * Walk the directory tree on disk and collect files to index; used when git
 * cannot provide the file list.
 *
 * Directories are identified by their real path so that symlink cycles are
 * visited only once. A directory is skipped when it is unreadable, contains a
 * `.codegraphignore` marker, or matches an exclude pattern (tested both with
 * and without a trailing slash). Symlinks are followed: a link to a directory
 * is walked, a link to a file is treated like a regular file.
 *
 * @param rootDir - Project root directory; returned paths are relative to it.
 * @param config - Supplies the include/exclude globs.
 * @param onProgress - Called for each accepted file with the running count
 *   and the file's path.
 * @returns Accepted paths, in traversal order.
 */
function scanDirectoryWalk(
  rootDir: string,
  config: CodeGraphConfig,
  onProgress?: (current: number, file: string) => void
): string[] {
  const collected: string[] = [];
  const seenRealDirs = new Set<string>();
  let found = 0;

  // True when an exclude pattern matches the directory, with or without a trailing slash.
  const isDirExcluded = (relativeDir: string): boolean => {
    const withSlash = relativeDir + '/';
    return config.exclude.some(
      (pattern) => matchesGlob(withSlash, pattern) || matchesGlob(relativeDir, pattern)
    );
  };

  // Records the file and reports progress when the config accepts it.
  const collectIfIncluded = (relativeFile: string): void => {
    if (!shouldIncludeFile(relativeFile, config)) {
      return;
    }
    collected.push(relativeFile);
    found++;
    onProgress?.(found, relativeFile);
  };

  const visit = (dir: string): void => {
    let realDir: string;
    try {
      realDir = fs.realpathSync(dir);
    } catch {
      logDebug('Skipping unresolvable directory', { dir });
      return;
    }

    if (seenRealDirs.has(realDir)) {
      logDebug('Skipping already-visited directory (symlink cycle)', { dir, realDir });
      return;
    }
    seenRealDirs.add(realDir);

    // A marker file opts the whole subtree out of indexing.
    if (fs.existsSync(path.join(dir, CODEGRAPH_IGNORE_MARKER))) {
      logDebug('Skipping directory due to .codegraphignore marker', { dir });
      return;
    }

    let entries: fs.Dirent[];
    try {
      entries = fs.readdirSync(dir, { withFileTypes: true });
    } catch (error) {
      logDebug('Skipping unreadable directory', { dir, error: String(error) });
      return;
    }

    for (const entry of entries) {
      const fullPath = path.join(dir, entry.name);
      const relativePath = normalizePath(path.relative(rootDir, fullPath));

      if (entry.isSymbolicLink()) {
        try {
          // Resolve the link to learn what it points at; a dangling link throws.
          const target = fs.statSync(fs.realpathSync(fullPath));
          if (target.isDirectory()) {
            if (!isDirExcluded(relativePath)) {
              visit(fullPath);
            }
          } else if (target.isFile()) {
            collectIfIncluded(relativePath);
          }
        } catch {
          logDebug('Skipping broken symlink', { path: fullPath });
        }
      } else if (entry.isDirectory()) {
        if (!isDirExcluded(relativePath)) {
          visit(fullPath);
        }
      } else if (entry.isFile()) {
        collectIfIncluded(relativePath);
      }
    }
  };

  visit(rootDir);
  return collected;
}
/** Running counters accumulated while indexing a set of files. */
interface IndexTally {
  filesIndexed: number;
  filesSkipped: number;
  filesErrored: number;
  nodesCreated: number;
  edgesCreated: number;
}

/** Outcome of reading one file for the batch indexer; `loaded` is null on failure. */
interface BatchReadOutcome {
  filePath: string;
  loaded: { content: string; stats: fs.Stats } | null;
  error: unknown;
}

/** Differences found between the files on disk and the records in the database. */
interface DetectedChanges {
  /** Readable files with no database record. */
  added: string[];
  /** Readable files whose content hash differs from the stored one. */
  modified: string[];
  /** Files that have a database record but are gone from disk. */
  removed: string[];
  /** `added` and `modified` merged, in discovery order. */
  changed: string[];
  /** Number of candidate files that were inspected. */
  filesChecked: number;
}

/**
 * Extraction orchestrator.
 *
 * Ties together file discovery (`scanDirectory`), parsing
 * (`extractFromSource`) and persistence (`QueryBuilder`) for one project root.
 */
export class ExtractionOrchestrator {
  private rootDir: string;
  private config: CodeGraphConfig;
  private queries: QueryBuilder;

  /**
   * @param rootDir - Project root; all file paths handled here are relative to it.
   * @param config - Include/exclude globs and the maximum file size.
   * @param queries - Database access layer used to read and write records.
   */
  constructor(rootDir: string, config: CodeGraphConfig, queries: QueryBuilder) {
    this.rootDir = rootDir;
    this.config = config;
    this.queries = queries;
  }

  /**
   * Index all files in the project.
   *
   * Files are read in parallel batches of `FILE_IO_BATCH_SIZE` and then parsed
   * and stored one at a time. The abort signal is checked after scanning,
   * before each batch and before each file.
   *
   * @param onProgress - Receives 'scanning', 'parsing' and 'resolving' reports.
   * @param signal - When aborted, the run stops and returns the partial
   *   counters with `success: false` and an 'Aborted' error first in `errors`.
   * @returns Counters, collected errors and duration of the run.
   */
  async indexAll(
    onProgress?: (progress: IndexProgress) => void,
    signal?: AbortSignal
  ): Promise<IndexResult> {
    await initGrammars();

    const startTime = Date.now();
    const errors: ExtractionError[] = [];
    const tally: IndexTally = {
      filesIndexed: 0,
      filesSkipped: 0,
      filesErrored: 0,
      nodesCreated: 0,
      edgesCreated: 0,
    };

    // Snapshot of the run so far, reported when the caller aborts.
    const aborted = (): IndexResult => ({
      success: false,
      ...tally,
      errors: [{ message: 'Aborted', severity: 'error' }, ...errors],
      durationMs: Date.now() - startTime,
    });

    // Phase 1: Scan for files
    onProgress?.({ phase: 'scanning', current: 0, total: 0 });
    const files = scanDirectory(this.rootDir, this.config, (current, file) => {
      onProgress?.({ phase: 'scanning', current, total: 0, currentFile: file });
    });

    if (signal?.aborted) {
      return aborted();
    }

    // Load only the grammars needed for languages actually present in the project.
    // This avoids compiling all 16+ WASM grammar modules upfront, which can cause
    // V8 WASM Zone OOM on large codebases (see issue #54).
    const neededLanguages = [...new Set(files.map((f) => detectLanguage(f)))];
    await loadGrammarsForLanguages(neededLanguages);

    // Phase 2: Parse files (read in parallel batches, parse/store sequentially)
    const total = files.length;
    let processed = 0;
    const parseCounts = new Map<Language, number>(); // files handled per language, drives parser resets

    for (let i = 0; i < files.length; i += FILE_IO_BATCH_SIZE) {
      if (signal?.aborted) {
        return aborted();
      }

      const batch = files.slice(i, i + FILE_IO_BATCH_SIZE);
      const outcomes = await Promise.all(batch.map((fp) => this.readBatchEntry(fp)));

      for (const { filePath, loaded, error } of outcomes) {
        if (signal?.aborted) {
          return aborted();
        }

        processed++;
        onProgress?.({
          phase: 'parsing',
          current: processed,
          total,
          currentFile: filePath,
        });

        if (loaded === null) {
          tally.filesErrored++;
          errors.push({
            message: `Failed to read file: ${error instanceof Error ? error.message : String(error)}`,
            filePath,
            severity: 'error',
            code: 'read_error',
          });
          continue;
        }

        const result = await this.indexFileWithContent(filePath, loaded.content, loaded.stats);

        // Periodically reset the parser to reclaim WASM heap memory.
        // Without this, tree-sitter's WASM runtime fragments its heap
        // across thousands of parses and eventually crashes.
        const lang = detectLanguage(filePath);
        const count = (parseCounts.get(lang) ?? 0) + 1;
        parseCounts.set(lang, count);
        if (count % PARSER_RESET_INTERVAL === 0) {
          resetParser(lang);
        }

        this.recordResult(filePath, result, tally, errors);
      }
    }

    // Phase 3: Resolve references
    onProgress?.({ phase: 'resolving', current: 0, total: 1 });
    // TODO: Implement reference resolution in Phase 3

    return {
      success: tally.filesIndexed > 0 || !errors.some((e) => e.severity === 'error'),
      ...tally,
      errors,
      durationMs: Date.now() - startTime,
    };
  }

  /**
   * Index specific files, one after another.
   *
   * NOTE(review): unlike `indexAll` and `sync`, this method does not call
   * `initGrammars` / `loadGrammarsForLanguages`; presumably callers are
   * expected to have loaded the grammars already. Confirm.
   *
   * @param filePaths - Project-relative paths to index.
   * @returns Counters, collected errors and duration of the run.
   */
  async indexFiles(filePaths: string[]): Promise<IndexResult> {
    const startTime = Date.now();
    const errors: ExtractionError[] = [];
    const tally: IndexTally = {
      filesIndexed: 0,
      filesSkipped: 0,
      filesErrored: 0,
      nodesCreated: 0,
      edgesCreated: 0,
    };

    for (const filePath of filePaths) {
      const result = await this.indexFile(filePath);
      this.recordResult(filePath, result, tally, errors);
    }

    return {
      success: tally.filesIndexed > 0 || !errors.some((e) => e.severity === 'error'),
      ...tally,
      errors,
      durationMs: Date.now() - startTime,
    };
  }

  /**
   * Index a single file: validate its path, read it, then extract and store.
   *
   * @param relativePath - Project-relative path of the file.
   * @returns The extraction result. Path-validation and read failures are
   *   reported through `errors` ('path_traversal' / 'read_error'), not thrown.
   */
  async indexFile(relativePath: string): Promise<ExtractionResult> {
    const fullPath = validatePathWithinRoot(this.rootDir, relativePath);
    if (!fullPath) {
      return this.emptyResult([
        { message: `Path traversal blocked: ${relativePath}`, filePath: relativePath, severity: 'error', code: 'path_traversal' },
      ]);
    }

    let loaded: { content: string; stats: fs.Stats };
    try {
      loaded = await this.readSourceFile(fullPath);
    } catch (error) {
      return this.emptyResult([
        {
          message: `Failed to read file: ${error instanceof Error ? error.message : String(error)}`,
          filePath: relativePath,
          severity: 'error',
          code: 'read_error',
        },
      ]);
    }

    return this.indexFileWithContent(relativePath, loaded.content, loaded.stats);
  }

  /**
   * Index a single file with pre-read content and stats.
   * Used by the parallel batch reader to avoid redundant file I/O.
   *
   * Checks run in this order: path validation, size limit (against
   * `stats.size`, before `content` is looked at), language support. Files in
   * an unsupported language yield an empty result without errors.
   *
   * @param relativePath - Project-relative path of the file.
   * @param content - File text; not inspected when the size limit rejects the file.
   * @param stats - File stats supplying size and modification time.
   * @returns The extraction result, or an empty result describing why the file
   *   was not processed.
   */
  async indexFileWithContent(
    relativePath: string,
    content: string,
    stats: fs.Stats
  ): Promise<ExtractionResult> {
    // Prevent path traversal
    const fullPath = validatePathWithinRoot(this.rootDir, relativePath);
    if (!fullPath) {
      logWarn('Path traversal blocked in indexFileWithContent', { relativePath });
      return this.emptyResult([
        { message: 'Path traversal blocked', filePath: relativePath, severity: 'error', code: 'path_traversal' },
      ]);
    }

    // Check file size
    if (stats.size > this.config.maxFileSize) {
      return this.emptyResult([
        {
          message: `File exceeds max size (${stats.size} > ${this.config.maxFileSize})`,
          filePath: relativePath,
          severity: 'warning',
          code: 'size_exceeded',
        },
      ]);
    }

    // Detect language
    const language = detectLanguage(relativePath);
    if (!isLanguageSupported(language)) {
      return this.emptyResult();
    }

    // Extract from source
    const result = extractFromSource(relativePath, content, language);

    // Store unless extraction produced nothing but errors
    if (result.nodes.length > 0 || result.errors.length === 0) {
      this.storeExtractionResult(relativePath, content, language, stats, result);
    }

    return result;
  }

  /**
   * Store an extraction result in the database.
   *
   * Nothing is written when a record with the same content hash already
   * exists. Otherwise any existing data for the file is deleted and the
   * nodes, edges, unresolved references and file record are inserted.
   */
  private storeExtractionResult(
    filePath: string,
    content: string,
    language: Language,
    stats: fs.Stats,
    result: ExtractionResult
  ): void {
    const contentHash = hashContent(content);

    // Check if file already exists and hasn't changed
    const existingFile = this.queries.getFileByPath(filePath);
    if (existingFile && existingFile.contentHash === contentHash) {
      return; // No changes
    }

    // Delete existing data for this file
    if (existingFile) {
      this.queries.deleteFile(filePath);
    }

    // Filter out nodes with missing required fields before insertion.
    // This prevents FK violations when edges reference nodes that would
    // be silently skipped by insertNode() (see issue #42).
    const validNodes = result.nodes.filter((n) => n.id && n.kind && n.name && n.filePath && n.language);
    const insertedIds = new Set(validNodes.map((n) => n.id));

    if (validNodes.length > 0) {
      this.queries.insertNodes(validNodes);
    }

    // Keep only edges whose endpoints were both inserted
    const validEdges = result.edges.filter(
      (e) => insertedIds.has(e.source) && insertedIds.has(e.target)
    );
    if (validEdges.length > 0) {
      this.queries.insertEdges(validEdges);
    }

    // Insert unresolved references in batch with denormalized filePath/language
    const refsWithContext = result.unresolvedReferences
      .filter((ref) => insertedIds.has(ref.fromNodeId))
      .map((ref) => ({
        ...ref,
        filePath: ref.filePath ?? filePath,
        language: ref.language ?? language,
      }));
    if (refsWithContext.length > 0) {
      this.queries.insertUnresolvedRefsBatch(refsWithContext);
    }

    // Insert file record; nodeCount reflects the nodes actually inserted
    const fileRecord: FileRecord = {
      path: filePath,
      contentHash,
      language,
      size: stats.size,
      modifiedAt: stats.mtimeMs,
      indexedAt: Date.now(),
      nodeCount: validNodes.length,
      errors: result.errors.length > 0 ? result.errors : undefined,
    };
    this.queries.upsertFile(fileRecord);
  }

  /**
   * Sync the database with the current file state.
   * Uses git status as a fast path when available, falling back to a full scan.
   *
   * Records of removed files are deleted, and new or modified files are
   * re-indexed.
   *
   * NOTE(review): the git fast path only looks at files `git status` reports.
   * Changes that are already committed (for example after a pull or branch
   * switch) appear clean to git and would presumably go unnoticed. Confirm
   * whether callers handle that case.
   *
   * @param onProgress - Receives one 'scanning' report and one 'parsing'
   *   report per re-indexed file.
   * @returns Counters describing what changed, plus the changed paths.
   */
  async sync(onProgress?: (progress: IndexProgress) => void): Promise<SyncResult> {
    await initGrammars(); // Initialize WASM runtime (grammars loaded lazily below)

    const startTime = Date.now();
    onProgress?.({ phase: 'scanning', current: 0, total: 0 });

    const changes = this.detectChanges('Skipping unreadable file during sync');

    for (const filePath of changes.removed) {
      this.queries.deleteFile(filePath);
    }

    const filesToIndex = changes.changed;

    // Load only grammars needed for changed files
    if (filesToIndex.length > 0) {
      const neededLanguages = [...new Set(filesToIndex.map((f) => detectLanguage(f)))];
      await loadGrammarsForLanguages(neededLanguages);
    }

    // Index changed files
    let nodesUpdated = 0;
    const total = filesToIndex.length;
    let position = 0;
    for (const filePath of filesToIndex) {
      position++;
      onProgress?.({
        phase: 'parsing',
        current: position,
        total,
        currentFile: filePath,
      });
      const result = await this.indexFile(filePath);
      nodesUpdated += result.nodes.length;
    }

    return {
      filesChecked: changes.filesChecked,
      filesAdded: changes.added.length,
      filesModified: changes.modified.length,
      filesRemoved: changes.removed.length,
      nodesUpdated,
      durationMs: Date.now() - startTime,
      changedFilePaths: filesToIndex.length > 0 ? [...filesToIndex] : undefined,
    };
  }

  /**
   * Get files that have changed since the last index, without modifying the
   * database. Uses git status as a fast path when available, falling back to
   * a full scan.
   *
   * @returns Project-relative paths grouped into added, modified and removed.
   */
  getChangedFiles(): { added: string[]; modified: string[]; removed: string[] } {
    const { added, modified, removed } = this.detectChanges(
      'Skipping unreadable file while detecting changes'
    );
    return { added, modified, removed };
  }

  /**
   * Compare the files on disk with the database records (read-only).
   *
   * Every on-disk candidate, including files git reports as untracked, is
   * read, hashed and compared with its stored hash, so a file only counts as
   * added when it has no record and as modified when its hash differs.
   *
   * @param unreadableLogMessage - Debug-log message used when a candidate
   *   cannot be read; such files are skipped.
   */
  private detectChanges(unreadableLogMessage: string): DetectedChanges {
    const added: string[] = [];
    const modified: string[] = [];
    const removed: string[] = [];
    const changed: string[] = [];

    // Classifies one on-disk candidate against its database record (if any).
    const classify = (filePath: string, tracked: FileRecord | null | undefined): void => {
      const fullPath = validatePathWithinRoot(this.rootDir, filePath);
      if (!fullPath) {
        logWarn('Path traversal blocked during change detection', { filePath });
        return;
      }

      let content: string;
      try {
        content = fs.readFileSync(fullPath, 'utf-8');
      } catch (error) {
        logDebug(unreadableLogMessage, { filePath, error: String(error) });
        return;
      }

      if (!tracked) {
        added.push(filePath);
        changed.push(filePath);
      } else if (tracked.contentHash !== hashContent(content)) {
        modified.push(filePath);
        changed.push(filePath);
      }
    };

    const gitChanges = getGitChangedFiles(this.rootDir, this.config);
    if (gitChanges) {
      // === Git fast path ===
      // Only inspect the files git reports as changed instead of scanning everything.
      const onDisk = new Set<string>([...gitChanges.modified, ...gitChanges.added]);

      for (const filePath of gitChanges.deleted) {
        // A path can be reported both as deleted (from the index) and as
        // untracked (still on disk); such a file has not been removed.
        if (onDisk.has(filePath)) continue;
        if (this.queries.getFileByPath(filePath)) {
          removed.push(filePath);
        }
      }

      for (const filePath of onDisk) {
        classify(filePath, this.queries.getFileByPath(filePath));
      }

      return {
        added,
        modified,
        removed,
        changed,
        filesChecked: gitChanges.modified.length + gitChanges.added.length + gitChanges.deleted.length,
      };
    }

    // === Fallback: full scan (non-git project or git failure) ===
    const currentFiles = new Set(scanDirectory(this.rootDir, this.config));
    const trackedFiles = this.queries.getAllFiles();

    // Build Map for O(1) lookups instead of .find() per file
    const trackedMap = new Map<string, FileRecord>();
    for (const record of trackedFiles) {
      trackedMap.set(record.path, record);
    }

    // Removed: in the database but no longer on disk
    for (const record of trackedFiles) {
      if (!currentFiles.has(record.path)) {
        removed.push(record.path);
      }
    }

    // Added or modified: on disk, missing from or different to the database
    for (const filePath of currentFiles) {
      classify(filePath, trackedMap.get(filePath));
    }

    return { added, modified, removed, changed, filesChecked: currentFiles.size };
  }

  /**
   * Read one file for the batch indexer, validating its path before any I/O.
   * Never rejects: failures are returned with `loaded: null` and the error.
   */
  private async readBatchEntry(filePath: string): Promise<BatchReadOutcome> {
    try {
      const fullPath = validatePathWithinRoot(this.rootDir, filePath);
      if (!fullPath) {
        logWarn('Path traversal blocked in batch reader', { filePath });
        return { filePath, loaded: null, error: new Error('Path traversal blocked') };
      }
      return { filePath, loaded: await this.readSourceFile(fullPath), error: null };
    } catch (err) {
      return { filePath, loaded: null, error: err };
    }
  }

  /**
   * Stat a file and read it as UTF-8.
   *
   * A file larger than `config.maxFileSize` is not read at all; its content
   * is returned as an empty string. `indexFileWithContent` rejects such files
   * on `stats.size` before it touches the content, so the placeholder is
   * never parsed and huge files are never loaded into memory.
   */
  private async readSourceFile(fullPath: string): Promise<{ content: string; stats: fs.Stats }> {
    const stats = await fsp.stat(fullPath);
    if (stats.size > this.config.maxFileSize) {
      return { content: '', stats };
    }
    const content = await fsp.readFile(fullPath, 'utf-8');
    return { content, stats };
  }

  /**
   * Fold one file's extraction result into the running counters and error
   * list, filling in the file path on errors that lack one.
   */
  private recordResult(
    filePath: string,
    result: ExtractionResult,
    tally: IndexTally,
    errors: ExtractionError[]
  ): void {
    for (const err of result.errors) {
      if (!err.filePath) err.filePath = filePath;
      errors.push(err);
    }

    if (result.nodes.length > 0) {
      tally.filesIndexed++;
      tally.nodesCreated += result.nodes.length;
      tally.edgesCreated += result.edges.length;
    } else if (result.errors.some((e) => e.severity === 'error')) {
      tally.filesErrored++;
    } else {
      tally.filesSkipped++;
    }
  }

  /** Build an extraction result with no nodes, edges or references. */
  private emptyResult(errors: ExtractionError[] = []): ExtractionResult {
    return {
      nodes: [],
      edges: [],
      unresolvedReferences: [],
      errors,
      durationMs: 0,
    };
  }
}
- // Re-export useful types and functions
- export { extractFromSource } from './tree-sitter';
- export { detectLanguage, isLanguageSupported, isGrammarLoaded, getSupportedLanguages, initGrammars, loadGrammarsForLanguages, loadAllGrammars } from './grammars';
|