| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686 |
- /**
- * Extraction Orchestrator
- *
- * Coordinates file scanning, parsing, and database storage.
- */
- import * as fs from 'fs';
- import * as fsp from 'fs/promises';
- import * as path from 'path';
- import * as crypto from 'crypto';
- import { execFileSync } from 'child_process';
- import {
- Language,
- FileRecord,
- ExtractionResult,
- ExtractionError,
- CodeGraphConfig,
- } from '../types';
- import { QueryBuilder } from '../db/queries';
- import { extractFromSource } from './tree-sitter';
- import { detectLanguage, isLanguageSupported } from './grammars';
- import { logDebug } from '../errors';
- import { captureException } from '../sentry';
- import { validatePathWithinRoot, normalizePath } from '../utils';
/**
 * Progress report passed to callers while an indexing or sync operation runs.
 */
export interface IndexProgress {
  /** Stage of the operation this report belongs to. */
  phase: 'scanning' | 'parsing' | 'storing' | 'resolving';
  /** Number of items handled so far in the current phase. */
  current: number;
  /** Number of items expected in the current phase (reported as 0 during scanning). */
  total: number;
  /** File being handled when the report was emitted, when there is one. */
  currentFile?: string;
}
/**
 * Summary returned by an indexing run.
 */
export interface IndexResult {
  /** True when no error with severity 'error' was recorded (and the run was not aborted). */
  success: boolean;
  /** Number of files that produced at least one node. */
  filesIndexed: number;
  /** Number of files that produced no nodes and were counted as skipped. */
  filesSkipped: number;
  /** Total number of nodes extracted across all indexed files. */
  nodesCreated: number;
  /** Total number of edges extracted across all indexed files. */
  edgesCreated: number;
  /** Every error and warning collected during the run. */
  errors: ExtractionError[];
  /** Wall-clock duration of the run in milliseconds. */
  durationMs: number;
}
/**
 * Summary returned by a sync run, which reconciles the index with the files on disk.
 */
export interface SyncResult {
  /** Number of files found on disk by the scan. */
  filesChecked: number;
  /** Files present on disk that were not tracked in the index yet. */
  filesAdded: number;
  /** Tracked files whose content hash no longer matches the stored one. */
  filesModified: number;
  /** Tracked files that no longer exist on disk and were removed from the index. */
  filesRemoved: number;
  /** Total number of nodes extracted from the files that were re-indexed. */
  nodesUpdated: number;
  /** Wall-clock duration of the run in milliseconds. */
  durationMs: number;
}
/**
 * Compute the SHA-256 digest of a string.
 *
 * @param content - Text to hash
 * @returns The digest encoded as a hexadecimal string
 */
export function hashContent(content: string): string {
  const hasher = crypto.createHash('sha256');
  hasher.update(content);
  return hasher.digest('hex');
}
/**
 * Compiled regular expressions keyed by glob pattern.
 *
 * matchesGlob runs once per (path, pattern) pair during a scan, so the same
 * pattern would otherwise be recompiled for every file.
 */
const globRegexCache = new Map<string, RegExp>();

/**
 * Translate a simplified glob pattern into an anchored regular expression.
 *
 * Supported tokens:
 * - `**` followed by `/` matches zero or more leading directories
 * - `**` matches anything, including `/`
 * - `*` matches any run of characters except `/`
 * - `?` matches exactly one character except `/`
 *
 * Every other character is matched literally.
 */
function globToRegExp(pattern: string): RegExp {
  const source = pattern
    // Swap glob tokens for placeholders first so the escaping step leaves them alone.
    // Order matters: the longest token must be replaced before its prefixes.
    .replace(/\*\*\//g, '\x00GLOBSTAR_SLASH\x00')
    .replace(/\*\*/g, '\x00GLOBSTAR\x00')
    .replace(/\*/g, '\x00STAR\x00')
    .replace(/\?/g, '\x00QUESTION\x00')
    // Escape characters that are special in a regex but literal in a glob
    .replace(/[.+^${}()|[\]\\]/g, '\\$&')
    // Expand the placeholders into their regex equivalents
    .replace(/\x00GLOBSTAR_SLASH\x00/g, '(?:.*/)?')
    .replace(/\x00GLOBSTAR\x00/g, '.*')
    .replace(/\x00STAR\x00/g, '[^/]*')
    // A single-character wildcard must not cross a path separator
    .replace(/\x00QUESTION\x00/g, '[^/]');
  return new RegExp(`^${source}$`);
}

/**
 * Check if a path matches a glob pattern (simplified).
 *
 * The path is normalized with normalizePath before matching so Windows
 * backslash paths can match forward-slash patterns. The whole path must match.
 *
 * @param filePath - Path to test
 * @param pattern - Glob pattern using `**`, `*` and `?`
 * @returns true when the entire normalized path matches the pattern
 */
function matchesGlob(filePath: string, pattern: string): boolean {
  let regex = globRegexCache.get(pattern);
  if (regex === undefined) {
    regex = globToRegExp(pattern);
    globRegexCache.set(pattern, regex);
  }
  // The regex has no global/sticky flag, so sharing it between calls is stateless
  return regex.test(normalizePath(filePath));
}
/**
 * Decide whether a file takes part in indexing according to the config.
 *
 * Exclude patterns are evaluated first and always win; a file that is not
 * excluded is accepted only when at least one include pattern matches it.
 *
 * @param filePath - Path to test against the configured patterns
 * @param config - Configuration supplying the `include` and `exclude` globs
 * @returns true when the file should be indexed
 */
export function shouldIncludeFile(
  filePath: string,
  config: CodeGraphConfig
): boolean {
  const matchesPattern = (pattern: string): boolean => matchesGlob(filePath, pattern);

  // Exclusion takes precedence over inclusion
  if (config.exclude.some(matchesPattern)) {
    return false;
  }

  return config.include.some(matchesPattern);
}
/**
 * Get directories ignored by .gitignore using git ls-files.
 *
 * Runs `git ls-files -z -oi --exclude-standard --directory` in `rootDir` and
 * keeps the entries that end with `/`, i.e. the directory entries.
 *
 * `-z` is required for correctness: without it git quotes paths containing
 * non-ASCII or special characters, so such entries end with a quote instead
 * of `/` and would be dropped. NUL-separated output is emitted verbatim and
 * also removes the need to trim, which would corrupt names with leading or
 * trailing whitespace.
 *
 * @param rootDir - Directory in which git is executed
 * @returns A Set of normalized relative directory paths (forward slashes, no
 *          trailing slash). Gracefully returns an empty Set on any failure
 *          (git missing, not a repository, timeout, ...).
 */
function getGitIgnoredDirectories(rootDir: string): Set<string> {
  try {
    const output = execFileSync(
      'git',
      ['ls-files', '-z', '-oi', '--exclude-standard', '--directory'],
      { cwd: rootDir, encoding: 'utf-8', timeout: 10000, stdio: ['pipe', 'pipe', 'pipe'] }
    );

    const dirs = new Set<string>();
    for (const entry of output.split('\0')) {
      // Only directory entries carry a trailing slash; plain files are ignored here
      if (entry.endsWith('/')) {
        dirs.add(normalizePath(entry.slice(0, -1)));
      }
    }
    return dirs;
  } catch {
    // Best effort: scanning still works without gitignore information
    return new Set<string>();
  }
}
/**
 * Marker file name that indicates a directory (and all children) should be skipped
 */
const CODEGRAPH_IGNORE_MARKER = '.codegraphignore';

/**
 * Recursively scan a directory for source files.
 *
 * A directory is not descended into when any of the following holds:
 * - its real path was already visited (symlink cycle protection),
 * - it contains a `.codegraphignore` marker file,
 * - it cannot be read,
 * - git reports it as ignored, or
 * - it matches one of `config.exclude` (tested both with and without a trailing `/`).
 *
 * Symbolic links are resolved: links to directories are walked like regular
 * directories, links to files are treated like regular files, and links that
 * cannot be resolved are skipped.
 * NOTE(review): a symlinked file is not checked for pointing outside `rootDir`
 * here — confirm whether a later step is expected to reject such paths.
 *
 * @param rootDir - Directory to scan; returned paths are relative to it
 * @param config - Supplies the include/exclude glob patterns
 * @param onProgress - Optional callback invoked for every accepted file with the
 *                     running count of accepted files and the file's relative path
 * @returns Normalized relative paths of all accepted files, in discovery order
 */
export function scanDirectory(
  rootDir: string,
  config: CodeGraphConfig,
  onProgress?: (current: number, file: string) => void
): string[] {
  const files: string[] = [];
  const visitedRealPaths = new Set<string>(); // Symlink cycle detection
  const gitIgnoredDirs = getGitIgnoredDirectories(rootDir);

  /** True when a directory must not be descended into. */
  const isDirectorySkipped = (relativePath: string): boolean => {
    // Check gitignore first (fast O(1) lookup)
    if (gitIgnoredDirs.has(relativePath)) {
      return true;
    }
    // Patterns may be written with or without a trailing slash, so test both forms
    const dirPattern = relativePath + '/';
    for (const pattern of config.exclude) {
      if (matchesGlob(dirPattern, pattern) || matchesGlob(relativePath, pattern)) {
        return true;
      }
    }
    return false;
  };

  /** Record a file if the config accepts it and report progress. */
  const collectFile = (relativePath: string): void => {
    if (!shouldIncludeFile(relativePath, config)) {
      return;
    }
    files.push(relativePath);
    onProgress?.(files.length, relativePath);
  };

  function walk(dir: string): void {
    // Symlink cycle detection: resolve real path and skip if already visited
    try {
      const realDir = fs.realpathSync(dir);
      if (visitedRealPaths.has(realDir)) {
        logDebug('Skipping directory to prevent symlink cycle', { dir, realDir });
        return;
      }
      visitedRealPaths.add(realDir);
    } catch {
      // If realpath fails, skip this directory
      return;
    }

    // Check for .codegraphignore marker file - skip entire directory tree if present
    if (fs.existsSync(path.join(dir, CODEGRAPH_IGNORE_MARKER))) {
      logDebug('Skipping directory due to .codegraphignore marker', { dir });
      return;
    }

    let entries: fs.Dirent[];
    try {
      entries = fs.readdirSync(dir, { withFileTypes: true });
    } catch (error) {
      captureException(error, { operation: 'walk-directory', dir });
      logDebug('Skipping unreadable directory', { dir, error: String(error) });
      return;
    }

    for (const entry of entries) {
      const fullPath = path.join(dir, entry.name);
      const relativePath = normalizePath(path.relative(rootDir, fullPath));

      // For a symlink the Dirent reports neither file nor directory, so the
      // kind has to come from the link target instead.
      let isDirectory = entry.isDirectory();
      let isFile = entry.isFile();
      if (entry.isSymbolicLink()) {
        try {
          const targetStat = fs.statSync(fs.realpathSync(fullPath));
          isDirectory = targetStat.isDirectory();
          isFile = targetStat.isFile();
        } catch {
          // Only link resolution is guarded here, so failures in recursion or in
          // the progress callback are not mislabelled as a broken symlink.
          logDebug('Skipping broken symlink', { path: fullPath });
          continue;
        }
      }

      if (isDirectory) {
        if (!isDirectorySkipped(relativePath)) {
          // Cycle detection at the top of walk() handles links that loop back
          walk(fullPath);
        }
      } else if (isFile) {
        collectFile(relativePath);
      }
    }
  }

  walk(rootDir);
  return files;
}
/**
 * Extraction orchestrator
 *
 * Ties together file discovery (scanDirectory), parsing (extractFromSource)
 * and persistence (QueryBuilder) for a single project root.
 */
export class ExtractionOrchestrator {
  private readonly rootDir: string;
  private readonly config: CodeGraphConfig;
  private readonly queries: QueryBuilder;

  /**
   * @param rootDir - Project root; all file paths handled by this class are relative to it
   * @param config - Include/exclude patterns and the maximum file size
   * @param queries - Database access object used to read and write index data
   */
  constructor(rootDir: string, config: CodeGraphConfig, queries: QueryBuilder) {
    this.rootDir = rootDir;
    this.config = config;
    this.queries = queries;
  }

  /**
   * Index all files in the project.
   *
   * Scans the project, then indexes every discovered file in order. Files
   * that yield nodes count as indexed; files that yield neither nodes nor
   * errors count as skipped.
   *
   * @param onProgress - Optional callback receiving scanning, parsing and resolving reports
   * @param signal - Optional abort signal, checked after scanning and before each file
   * @returns Totals for the run; on abort `success` is false and the first
   *          error is `{ message: 'Aborted', severity: 'error' }`
   */
  async indexAll(
    onProgress?: (progress: IndexProgress) => void,
    signal?: AbortSignal
  ): Promise<IndexResult> {
    const startTime = Date.now();
    const errors: ExtractionError[] = [];
    let filesIndexed = 0;
    let filesSkipped = 0;
    let totalNodes = 0;
    let totalEdges = 0;

    // Snapshot of the counters at the moment the abort is observed
    const abortedResult = (): IndexResult => ({
      success: false,
      filesIndexed,
      filesSkipped,
      nodesCreated: totalNodes,
      edgesCreated: totalEdges,
      errors: [{ message: 'Aborted', severity: 'error' }, ...errors],
      durationMs: Date.now() - startTime,
    });

    // Phase 1: Scan for files
    onProgress?.({
      phase: 'scanning',
      current: 0,
      total: 0,
    });
    const files = scanDirectory(this.rootDir, this.config, (current, file) => {
      onProgress?.({
        phase: 'scanning',
        current,
        total: 0,
        currentFile: file,
      });
    });

    if (signal?.aborted) {
      return abortedResult();
    }

    // Phase 2: Parse files
    const total = files.length;
    for (const [index, filePath] of files.entries()) {
      if (signal?.aborted) {
        return abortedResult();
      }

      onProgress?.({
        phase: 'parsing',
        current: index + 1,
        total,
        currentFile: filePath,
      });

      const result = await this.indexFile(filePath);
      if (result.errors.length > 0) {
        errors.push(...result.errors);
      }
      if (result.nodes.length > 0) {
        filesIndexed++;
        totalNodes += result.nodes.length;
        totalEdges += result.edges.length;
      } else if (result.errors.length === 0) {
        filesSkipped++;
      }
    }

    // Phase 3: Resolve references
    onProgress?.({
      phase: 'resolving',
      current: 0,
      total: 1,
    });
    // TODO: Implement reference resolution in Phase 3

    return {
      success: !errors.some((e) => e.severity === 'error'),
      filesIndexed,
      filesSkipped,
      nodesCreated: totalNodes,
      edgesCreated: totalEdges,
      errors,
      durationMs: Date.now() - startTime,
    };
  }

  /**
   * Index specific files.
   *
   * NOTE(review): unlike indexAll, every file that yields no nodes is counted
   * as skipped here, including files that produced errors — confirm whether
   * the two methods are meant to differ.
   *
   * @param filePaths - Paths relative to the project root
   * @returns Totals for the processed files
   */
  async indexFiles(filePaths: string[]): Promise<IndexResult> {
    const startTime = Date.now();
    const errors: ExtractionError[] = [];
    let filesIndexed = 0;
    let filesSkipped = 0;
    let totalNodes = 0;
    let totalEdges = 0;

    for (const filePath of filePaths) {
      const result = await this.indexFile(filePath);
      if (result.errors.length > 0) {
        errors.push(...result.errors);
      }
      if (result.nodes.length > 0) {
        filesIndexed++;
        totalNodes += result.nodes.length;
        totalEdges += result.edges.length;
      } else {
        filesSkipped++;
      }
    }

    return {
      success: !errors.some((e) => e.severity === 'error'),
      filesIndexed,
      filesSkipped,
      nodesCreated: totalNodes,
      edgesCreated: totalEdges,
      errors,
      durationMs: Date.now() - startTime,
    };
  }

  /**
   * Index a single file.
   *
   * Steps: validate the path against the project root, stat the file, reject
   * it if it exceeds `config.maxFileSize`, read it, detect its language,
   * extract nodes/edges and store the result.
   *
   * The size check happens before the content is read so that oversized
   * files are never loaded into memory.
   *
   * @param relativePath - Path relative to the project root
   * @returns The extraction result. Failures are reported through `errors`
   *          with empty node/edge lists rather than by throwing: an 'error'
   *          for a blocked path or an unreadable file, a 'warning' for an
   *          oversized file, and no error at all for an unsupported language.
   */
  async indexFile(relativePath: string): Promise<ExtractionResult> {
    // Result shape shared by every early exit
    const emptyResult = (errors: ExtractionError[] = []): ExtractionResult => ({
      nodes: [],
      edges: [],
      unresolvedReferences: [],
      errors,
      durationMs: 0,
    });

    const fullPath = validatePathWithinRoot(this.rootDir, relativePath);
    if (!fullPath) {
      return emptyResult([
        { message: `Path traversal blocked: ${relativePath}`, severity: 'error' },
      ]);
    }

    // Check file exists, is within the size limit, and is readable
    let content: string;
    let stats: fs.Stats;
    try {
      stats = await fsp.stat(fullPath);
      if (stats.size > this.config.maxFileSize) {
        return emptyResult([
          {
            message: `File exceeds max size (${stats.size} > ${this.config.maxFileSize})`,
            severity: 'warning',
          },
        ]);
      }
      content = await fsp.readFile(fullPath, 'utf-8');
    } catch (error) {
      captureException(error, { operation: 'extract-file', filePath: fullPath });
      return emptyResult([
        {
          message: `Failed to read file: ${error instanceof Error ? error.message : String(error)}`,
          severity: 'error',
        },
      ]);
    }

    // Detect language
    const language = detectLanguage(relativePath);
    if (!isLanguageSupported(language)) {
      return emptyResult();
    }

    // Extract from source
    const result = extractFromSource(relativePath, content, language);

    // Store in database unless extraction failed without producing any nodes
    if (result.nodes.length > 0 || result.errors.length === 0) {
      this.storeExtractionResult(relativePath, content, language, stats, result);
    }

    return result;
  }

  /**
   * Store extraction result in database.
   *
   * Does nothing when the file is already tracked with the same content
   * hash. Otherwise any existing data for the file is deleted first, then
   * nodes, edges, unresolved references and the file record are written.
   *
   * @param filePath - Path relative to the project root, used as the file key
   * @param content - File content, used to compute the content hash
   * @param language - Detected language of the file
   * @param stats - File stats supplying size and modification time
   * @param result - Extraction output to persist
   */
  private storeExtractionResult(
    filePath: string,
    content: string,
    language: Language,
    stats: fs.Stats,
    result: ExtractionResult
  ): void {
    const contentHash = hashContent(content);

    // Check if file already exists and hasn't changed
    const existingFile = this.queries.getFileByPath(filePath);
    if (existingFile && existingFile.contentHash === contentHash) {
      return; // No changes
    }

    // Delete existing data for this file
    if (existingFile) {
      this.queries.deleteFile(filePath);
    }

    // Insert nodes
    if (result.nodes.length > 0) {
      this.queries.insertNodes(result.nodes);
    }

    // Insert edges
    if (result.edges.length > 0) {
      this.queries.insertEdges(result.edges);
    }

    // Insert unresolved references in batch with denormalized filePath/language
    if (result.unresolvedReferences.length > 0) {
      const refsWithContext = result.unresolvedReferences.map((ref) => ({
        ...ref,
        filePath: ref.filePath ?? filePath,
        language: ref.language ?? language,
      }));
      this.queries.insertUnresolvedRefsBatch(refsWithContext);
    }

    // Insert file record
    const fileRecord: FileRecord = {
      path: filePath,
      contentHash,
      language,
      size: stats.size,
      modifiedAt: stats.mtimeMs,
      indexedAt: Date.now(),
      nodeCount: result.nodes.length,
      errors: result.errors.length > 0 ? result.errors : undefined,
    };
    this.queries.upsertFile(fileRecord);
  }

  /**
   * Classify files on disk as added or modified relative to the tracked records.
   *
   * A file is 'added' when no tracked record has its path and 'modified' when
   * the tracked content hash differs from the hash of its current content.
   * Unchanged files are omitted. Files that cannot be read are reported to
   * `onReadError` and omitted.
   *
   * Tracked records are indexed by path once, so each file costs a single
   * map lookup instead of a scan over all tracked records.
   *
   * @param currentFiles - Relative paths currently on disk
   * @param trackedFiles - Records currently stored in the index
   * @param onReadError - Invoked with the path and the error for each unreadable file
   * @returns Changed files in the iteration order of `currentFiles`
   */
  private detectChangedFiles(
    currentFiles: Iterable<string>,
    trackedFiles: ReturnType<QueryBuilder['getAllFiles']>,
    onReadError: (filePath: string, error: unknown) => void
  ): Array<{ filePath: string; change: 'added' | 'modified' }> {
    const trackedByPath = new Map<string, (typeof trackedFiles)[number]>();
    for (const record of trackedFiles) {
      // Keep the first record per path, matching a first-match linear search
      if (!trackedByPath.has(record.path)) {
        trackedByPath.set(record.path, record);
      }
    }

    const changes: Array<{ filePath: string; change: 'added' | 'modified' }> = [];
    for (const filePath of currentFiles) {
      let content: string;
      try {
        content = fs.readFileSync(path.join(this.rootDir, filePath), 'utf-8');
      } catch (error) {
        onReadError(filePath, error);
        continue;
      }

      const tracked = trackedByPath.get(filePath);
      if (!tracked) {
        changes.push({ filePath, change: 'added' });
      } else if (tracked.contentHash !== hashContent(content)) {
        changes.push({ filePath, change: 'modified' });
      }
    }
    return changes;
  }

  /**
   * Sync with current file state.
   *
   * Removes tracked files that are no longer on disk, then re-indexes files
   * that are new or whose content hash changed.
   *
   * @param onProgress - Optional callback receiving scanning and parsing reports
   * @returns Counts of checked, added, modified and removed files, plus the
   *          number of nodes extracted from the re-indexed files
   */
  async sync(onProgress?: (progress: IndexProgress) => void): Promise<SyncResult> {
    const startTime = Date.now();

    // Get current files on disk
    onProgress?.({
      phase: 'scanning',
      current: 0,
      total: 0,
    });
    const currentFiles = new Set(scanDirectory(this.rootDir, this.config));
    const filesChecked = currentFiles.size;

    // Get tracked files from database
    const trackedFiles = this.queries.getAllFiles();

    // Find files to remove (in DB but not on disk)
    let filesRemoved = 0;
    for (const tracked of trackedFiles) {
      if (!currentFiles.has(tracked.path)) {
        this.queries.deleteFile(tracked.path);
        filesRemoved++;
      }
    }

    // Find files to add or update
    const changes = this.detectChangedFiles(currentFiles, trackedFiles, (filePath, error) => {
      captureException(error, { operation: 'sync-read-file', filePath });
      logDebug('Skipping unreadable file during sync', { filePath, error: String(error) });
    });

    let filesAdded = 0;
    let filesModified = 0;
    for (const { change } of changes) {
      if (change === 'added') {
        filesAdded++;
      } else {
        filesModified++;
      }
    }

    // Index changed files
    let nodesUpdated = 0;
    const total = changes.length;
    for (const [index, { filePath }] of changes.entries()) {
      onProgress?.({
        phase: 'parsing',
        current: index + 1,
        total,
        currentFile: filePath,
      });

      const result = await this.indexFile(filePath);
      nodesUpdated += result.nodes.length;
    }

    return {
      filesChecked,
      filesAdded,
      filesModified,
      filesRemoved,
      nodesUpdated,
      durationMs: Date.now() - startTime,
    };
  }

  /**
   * Get files that have changed since last index.
   *
   * Read-only counterpart of sync: nothing is written to the database.
   *
   * @returns Relative paths grouped into added (on disk, not tracked),
   *          modified (content hash differs) and removed (tracked, not on disk)
   */
  getChangedFiles(): { added: string[]; modified: string[]; removed: string[] } {
    const currentFiles = new Set(scanDirectory(this.rootDir, this.config));
    const trackedFiles = this.queries.getAllFiles();

    // Find removed files
    const removed: string[] = [];
    for (const tracked of trackedFiles) {
      if (!currentFiles.has(tracked.path)) {
        removed.push(tracked.path);
      }
    }

    // Find added and modified files
    const added: string[] = [];
    const modified: string[] = [];
    const changes = this.detectChangedFiles(currentFiles, trackedFiles, (filePath, error) => {
      captureException(error, { operation: 'detect-changes-read-file', filePath });
      logDebug('Skipping unreadable file while detecting changes', { filePath, error: String(error) });
    });
    for (const { filePath, change } of changes) {
      if (change === 'added') {
        added.push(filePath);
      } else {
        modified.push(filePath);
      }
    }

    return { added, modified, removed };
  }
}
- // Re-export useful types and functions
- export { extractFromSource } from './tree-sitter';
- export { detectLanguage, isLanguageSupported, getSupportedLanguages } from './grammars';
|