|
|
@@ -63,6 +63,7 @@ export interface SyncResult {
|
|
|
filesRemoved: number;
|
|
|
nodesUpdated: number;
|
|
|
durationMs: number;
|
|
|
+ changedFilePaths?: string[];
|
|
|
}
|
|
|
|
|
|
/**
|
|
|
@@ -105,27 +106,80 @@ export function shouldIncludeFile(
|
|
|
}
|
|
|
|
|
|
/**
|
|
|
- * Get directories ignored by .gitignore using git ls-files.
|
|
|
- * Returns a Set of normalized relative directory paths (forward slashes, no trailing slash).
|
|
|
- * Gracefully returns empty Set on any failure.
|
|
|
+ * Get all files visible to git (tracked + untracked but not ignored).
|
|
|
+ * Respects .gitignore at all levels (root, subdirectories).
|
|
|
+ * Returns null on failure (non-git project) so callers can fall back.
|
|
|
*/
|
|
|
-function getGitIgnoredDirectories(rootDir: string): Set<string> {
|
|
|
+function getGitVisibleFiles(rootDir: string): Set<string> | null {
|
|
|
try {
|
|
|
+ // -c = cached (tracked), -o = others (untracked), --exclude-standard = apply .gitignore, .git/info/exclude and the global excludes file
|
|
|
const output = execFileSync(
|
|
|
'git',
|
|
|
- ['ls-files', '-oi', '--exclude-standard', '--directory'],
|
|
|
- { cwd: rootDir, encoding: 'utf-8', timeout: 10000, stdio: ['pipe', 'pipe', 'pipe'] }
|
|
|
+ ['ls-files', '-co', '--exclude-standard'],
|
|
|
+ { cwd: rootDir, encoding: 'utf-8', timeout: 30000, maxBuffer: 50 * 1024 * 1024, stdio: ['pipe', 'pipe', 'pipe'] }
|
|
|
);
|
|
|
- const dirs = new Set<string>();
|
|
|
+ const files = new Set<string>();
|
|
|
for (const line of output.split('\n')) {
|
|
|
const trimmed = line.trim();
|
|
|
- if (trimmed.endsWith('/')) {
|
|
|
- dirs.add(normalizePath(trimmed.slice(0, -1)));
|
|
|
+ if (trimmed) {
|
|
|
+ files.add(normalizePath(trimmed));
|
|
|
}
|
|
|
}
|
|
|
- return dirs;
|
|
|
+ return files;
|
|
|
} catch {
|
|
|
- return new Set<string>();
|
|
|
+ return null;
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+/**
|
|
|
+ * Result of git-based change detection.
|
|
|
+ * getGitChangedFiles returns null instead when git is unavailable (non-git project or command failure),
|
|
|
+ * signaling the caller to fall back to full filesystem scan.
|
|
|
+ */
|
|
|
+interface GitChanges {
|
|
|
+ modified: string[]; // M, MM, AM — files to re-hash + re-index
|
|
|
+ added: string[]; // ?? — new untracked files to index
|
|
|
+ deleted: string[]; // D — files to remove from DB
|
|
|
+}
|
|
|
+
|
|
|
+/**
|
|
|
+ * Use `git status --porcelain` to find files with uncommitted changes (staged, unstaged, or untracked) instead of scanning every file.
|
|
|
+ * Returns null on failure so callers fall back to full scan.
|
|
|
+ */
|
|
|
+function getGitChangedFiles(rootDir: string, config: CodeGraphConfig): GitChanges | null {
|
|
|
+ try {
|
|
|
+ const output = execFileSync(
|
|
|
+ 'git',
|
|
|
+ ['status', '--porcelain', '--no-renames'],
|
|
|
+ { cwd: rootDir, encoding: 'utf-8', timeout: 10000, stdio: ['pipe', 'pipe', 'pipe'] }
|
|
|
+ );
|
|
|
+
|
|
|
+ const modified: string[] = [];
|
|
|
+ const added: string[] = [];
|
|
|
+ const deleted: string[] = [];
|
|
|
+
|
|
|
+ for (const line of output.split('\n')) {
|
|
|
+ if (line.length < 4) continue; // Minimum: "XY file"
|
|
|
+
|
|
|
+ const statusCode = line.substring(0, 2);
|
|
|
+ const filePath = normalizePath(line.substring(3));
|
|
|
+
|
|
|
+ // Skip files that don't match include/exclude config
|
|
|
+ if (!shouldIncludeFile(filePath, config)) continue;
|
|
|
+
|
|
|
+ if (statusCode === '??') {
|
|
|
+ added.push(filePath);
|
|
|
+ } else if (statusCode.includes('D')) {
|
|
|
+ deleted.push(filePath);
|
|
|
+ } else {
|
|
|
+ // M, MM, AM, A (staged), etc. — treat as modified
|
|
|
+ modified.push(filePath);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ return { modified, added, deleted };
|
|
|
+ } catch {
|
|
|
+ return null;
|
|
|
}
|
|
|
}
|
|
|
|
|
|
@@ -135,21 +189,49 @@ function getGitIgnoredDirectories(rootDir: string): Set<string> {
|
|
|
const CODEGRAPH_IGNORE_MARKER = '.codegraphignore';
|
|
|
|
|
|
/**
|
|
|
- * Recursively scan directory for source files
|
|
|
+ * Recursively scan directory for source files.
|
|
|
+ *
|
|
|
+ * In git repos, uses `git ls-files` to get the file list (inherently
|
|
|
+ * respects .gitignore at all levels), then filters each path through shouldIncludeFile.
|
|
|
+ * Falls back to filesystem walk for non-git projects.
|
|
|
*/
|
|
|
export function scanDirectory(
|
|
|
rootDir: string,
|
|
|
config: CodeGraphConfig,
|
|
|
onProgress?: (current: number, file: string) => void
|
|
|
+): string[] {
|
|
|
+ // Fast path: use git to get all visible files (respects .gitignore everywhere)
|
|
|
+ const gitFiles = getGitVisibleFiles(rootDir);
|
|
|
+ if (gitFiles) {
|
|
|
+ const files: string[] = [];
|
|
|
+ let count = 0;
|
|
|
+ for (const filePath of gitFiles) {
|
|
|
+ if (shouldIncludeFile(filePath, config)) {
|
|
|
+ files.push(filePath);
|
|
|
+ count++;
|
|
|
+ onProgress?.(count, filePath);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return files;
|
|
|
+ }
|
|
|
+
|
|
|
+ // Fallback: walk filesystem for non-git projects
|
|
|
+ return scanDirectoryWalk(rootDir, config, onProgress);
|
|
|
+}
|
|
|
+
|
|
|
+/**
|
|
|
+ * Filesystem walk fallback for non-git projects.
|
|
|
+ */
|
|
|
+function scanDirectoryWalk(
|
|
|
+ rootDir: string,
|
|
|
+ config: CodeGraphConfig,
|
|
|
+ onProgress?: (current: number, file: string) => void
|
|
|
): string[] {
|
|
|
const files: string[] = [];
|
|
|
let count = 0;
|
|
|
- // Track visited real paths to detect symlink cycles
|
|
|
const visitedDirs = new Set<string>();
|
|
|
- const gitIgnoredDirs = getGitIgnoredDirectories(rootDir);
|
|
|
|
|
|
function walk(dir: string): void {
|
|
|
- // Resolve real path to detect symlink cycles
|
|
|
let realDir: string;
|
|
|
try {
|
|
|
realDir = fs.realpathSync(dir);
|
|
|
@@ -164,7 +246,7 @@ export function scanDirectory(
|
|
|
}
|
|
|
visitedDirs.add(realDir);
|
|
|
|
|
|
- // Check for .codegraphignore marker file - skip entire directory tree if present
|
|
|
+ // Check for .codegraphignore marker file
|
|
|
const ignoreMarker = path.join(dir, CODEGRAPH_IGNORE_MARKER);
|
|
|
if (fs.existsSync(ignoreMarker)) {
|
|
|
logDebug('Skipping directory due to .codegraphignore marker', { dir });
|
|
|
@@ -184,17 +266,11 @@ export function scanDirectory(
|
|
|
const fullPath = path.join(dir, entry.name);
|
|
|
const relativePath = normalizePath(path.relative(rootDir, fullPath));
|
|
|
|
|
|
- // Follow symlinked directories, but skip symlinked files to non-project targets
|
|
|
if (entry.isSymbolicLink()) {
|
|
|
try {
|
|
|
const realTarget = fs.realpathSync(fullPath);
|
|
|
const stat = fs.statSync(realTarget);
|
|
|
if (stat.isDirectory()) {
|
|
|
- // Check gitignore first (fast O(1) lookup)
|
|
|
- if (gitIgnoredDirs.has(relativePath)) {
|
|
|
- continue;
|
|
|
- }
|
|
|
- // Check exclusion, then recurse (cycle detection handles the rest)
|
|
|
const dirPattern = relativePath + '/';
|
|
|
let excluded = false;
|
|
|
for (const pattern of config.exclude) {
|
|
|
@@ -210,9 +286,7 @@ export function scanDirectory(
|
|
|
if (shouldIncludeFile(relativePath, config)) {
|
|
|
files.push(relativePath);
|
|
|
count++;
|
|
|
- if (onProgress) {
|
|
|
- onProgress(count, relativePath);
|
|
|
- }
|
|
|
+ onProgress?.(count, relativePath);
|
|
|
}
|
|
|
}
|
|
|
} catch {
|
|
|
@@ -222,11 +296,6 @@ export function scanDirectory(
|
|
|
}
|
|
|
|
|
|
if (entry.isDirectory()) {
|
|
|
- // Check gitignore first (fast O(1) lookup)
|
|
|
- if (gitIgnoredDirs.has(relativePath)) {
|
|
|
- continue;
|
|
|
- }
|
|
|
- // Check if directory should be excluded
|
|
|
const dirPattern = relativePath + '/';
|
|
|
let excluded = false;
|
|
|
for (const pattern of config.exclude) {
|
|
|
@@ -242,9 +311,7 @@ export function scanDirectory(
|
|
|
if (shouldIncludeFile(relativePath, config)) {
|
|
|
files.push(relativePath);
|
|
|
count++;
|
|
|
- if (onProgress) {
|
|
|
- onProgress(count, relativePath);
|
|
|
- }
|
|
|
+ onProgress?.(count, relativePath);
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
@@ -611,7 +678,8 @@ export class ExtractionOrchestrator {
|
|
|
}
|
|
|
|
|
|
/**
|
|
|
- * Sync with current file state
|
|
|
+ * Sync with current file state.
|
|
|
+ * Uses git status as a fast path when available, falling back to full scan.
|
|
|
*/
|
|
|
async sync(onProgress?: (progress: IndexProgress) => void): Promise<SyncResult> {
|
|
|
const startTime = Date.now();
|
|
|
@@ -620,53 +688,107 @@ export class ExtractionOrchestrator {
|
|
|
let filesModified = 0;
|
|
|
let filesRemoved = 0;
|
|
|
let nodesUpdated = 0;
|
|
|
+ const changedFilePaths: string[] = [];
|
|
|
|
|
|
- // Get current files on disk
|
|
|
onProgress?.({
|
|
|
phase: 'scanning',
|
|
|
current: 0,
|
|
|
total: 0,
|
|
|
});
|
|
|
|
|
|
- const currentFiles = new Set(scanDirectory(this.rootDir, this.config));
|
|
|
- filesChecked = currentFiles.size;
|
|
|
+ const filesToIndex: string[] = [];
|
|
|
+ const gitChanges = getGitChangedFiles(this.rootDir, this.config);
|
|
|
+
|
|
|
+ if (gitChanges) {
|
|
|
+ // === Git fast path ===
|
|
|
+ // Only inspect the files git reports as changed instead of scanning everything.
|
|
|
+ filesChecked = gitChanges.modified.length + gitChanges.added.length + gitChanges.deleted.length;
|
|
|
+
|
|
|
+ // Handle deleted files
|
|
|
+ for (const filePath of gitChanges.deleted) {
|
|
|
+ const tracked = this.queries.getFileByPath(filePath);
|
|
|
+ if (tracked) {
|
|
|
+ this.queries.deleteFile(filePath);
|
|
|
+ filesRemoved++;
|
|
|
+ }
|
|
|
+ }
|
|
|
|
|
|
- // Get tracked files from database
|
|
|
- const trackedFiles = this.queries.getAllFiles();
|
|
|
+ // Handle modified files — read + hash only these files
|
|
|
+ for (const filePath of gitChanges.modified) {
|
|
|
+ const fullPath = path.join(this.rootDir, filePath);
|
|
|
+ let content: string;
|
|
|
+ try {
|
|
|
+ content = fs.readFileSync(fullPath, 'utf-8');
|
|
|
+ } catch (error) {
|
|
|
+ captureException(error, { operation: 'sync-read-file', filePath });
|
|
|
+ logDebug('Skipping unreadable file during sync', { filePath, error: String(error) });
|
|
|
+ continue;
|
|
|
+ }
|
|
|
|
|
|
- // Find files to remove (in DB but not on disk)
|
|
|
- for (const tracked of trackedFiles) {
|
|
|
- if (!currentFiles.has(tracked.path)) {
|
|
|
- this.queries.deleteFile(tracked.path);
|
|
|
- filesRemoved++;
|
|
|
+ const contentHash = hashContent(content);
|
|
|
+ const tracked = this.queries.getFileByPath(filePath);
|
|
|
+
|
|
|
+ if (!tracked) {
|
|
|
+ filesToIndex.push(filePath);
|
|
|
+ changedFilePaths.push(filePath);
|
|
|
+ filesAdded++;
|
|
|
+ } else if (tracked.contentHash !== contentHash) {
|
|
|
+ filesToIndex.push(filePath);
|
|
|
+ changedFilePaths.push(filePath);
|
|
|
+ filesModified++;
|
|
|
+ }
|
|
|
}
|
|
|
- }
|
|
|
|
|
|
- // Find files to add or update
|
|
|
- const filesToIndex: string[] = [];
|
|
|
+ // Handle added (untracked) files
|
|
|
+ for (const filePath of gitChanges.added) {
|
|
|
+ filesToIndex.push(filePath);
|
|
|
+ changedFilePaths.push(filePath);
|
|
|
+ filesAdded++;
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ // === Fallback: full scan (non-git project or git failure) ===
|
|
|
+ const currentFiles = new Set(scanDirectory(this.rootDir, this.config));
|
|
|
+ filesChecked = currentFiles.size;
|
|
|
+
|
|
|
+ // Build Map for O(1) lookups instead of .find() per file
|
|
|
+ const trackedFiles = this.queries.getAllFiles();
|
|
|
+ const trackedMap = new Map<string, FileRecord>();
|
|
|
+ for (const f of trackedFiles) {
|
|
|
+ trackedMap.set(f.path, f);
|
|
|
+ }
|
|
|
|
|
|
- for (const filePath of currentFiles) {
|
|
|
- const fullPath = path.join(this.rootDir, filePath);
|
|
|
- let content: string;
|
|
|
- try {
|
|
|
- content = fs.readFileSync(fullPath, 'utf-8');
|
|
|
- } catch (error) {
|
|
|
- captureException(error, { operation: 'sync-read-file', filePath });
|
|
|
- logDebug('Skipping unreadable file during sync', { filePath, error: String(error) });
|
|
|
- continue;
|
|
|
+ // Find files to remove (in DB but not on disk)
|
|
|
+ for (const tracked of trackedFiles) {
|
|
|
+ if (!currentFiles.has(tracked.path)) {
|
|
|
+ this.queries.deleteFile(tracked.path);
|
|
|
+ filesRemoved++;
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
- const contentHash = hashContent(content);
|
|
|
- const tracked = trackedFiles.find((f) => f.path === filePath);
|
|
|
+ // Find files to add or update
|
|
|
+ for (const filePath of currentFiles) {
|
|
|
+ const fullPath = path.join(this.rootDir, filePath);
|
|
|
+ let content: string;
|
|
|
+ try {
|
|
|
+ content = fs.readFileSync(fullPath, 'utf-8');
|
|
|
+ } catch (error) {
|
|
|
+ captureException(error, { operation: 'sync-read-file', filePath });
|
|
|
+ logDebug('Skipping unreadable file during sync', { filePath, error: String(error) });
|
|
|
+ continue;
|
|
|
+ }
|
|
|
|
|
|
- if (!tracked) {
|
|
|
- // New file
|
|
|
- filesToIndex.push(filePath);
|
|
|
- filesAdded++;
|
|
|
- } else if (tracked.contentHash !== contentHash) {
|
|
|
- // Modified file
|
|
|
- filesToIndex.push(filePath);
|
|
|
- filesModified++;
|
|
|
+ const contentHash = hashContent(content);
|
|
|
+ const tracked = trackedMap.get(filePath);
|
|
|
+
|
|
|
+ if (!tracked) {
|
|
|
+ filesToIndex.push(filePath);
|
|
|
+ changedFilePaths.push(filePath);
|
|
|
+ filesAdded++;
|
|
|
+ } else if (tracked.contentHash !== contentHash) {
|
|
|
+ filesToIndex.push(filePath);
|
|
|
+ changedFilePaths.push(filePath);
|
|
|
+ filesModified++;
|
|
|
+ }
|
|
|
}
|
|
|
}
|
|
|
|
|
|
@@ -692,16 +814,71 @@ export class ExtractionOrchestrator {
|
|
|
filesRemoved,
|
|
|
nodesUpdated,
|
|
|
durationMs: Date.now() - startTime,
|
|
|
+ changedFilePaths: changedFilePaths.length > 0 ? changedFilePaths : undefined,
|
|
|
};
|
|
|
}
|
|
|
|
|
|
/**
|
|
|
- * Get files that have changed since last index
|
|
|
+ * Get files that have changed since last index.
|
|
|
+ * Uses git status as a fast path when available, falling back to full scan.
|
|
|
*/
|
|
|
getChangedFiles(): { added: string[]; modified: string[]; removed: string[] } {
|
|
|
+ const gitChanges = getGitChangedFiles(this.rootDir, this.config);
|
|
|
+
|
|
|
+ if (gitChanges) {
|
|
|
+ // === Git fast path ===
|
|
|
+ const added: string[] = [];
|
|
|
+ const modified: string[] = [];
|
|
|
+ const removed: string[] = [];
|
|
|
+
|
|
|
+ // Deleted files — only report if tracked in DB
|
|
|
+ for (const filePath of gitChanges.deleted) {
|
|
|
+ const tracked = this.queries.getFileByPath(filePath);
|
|
|
+ if (tracked) {
|
|
|
+ removed.push(filePath);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // Modified files — read + hash only these, compare with DB
|
|
|
+ for (const filePath of gitChanges.modified) {
|
|
|
+ const fullPath = path.join(this.rootDir, filePath);
|
|
|
+ let content: string;
|
|
|
+ try {
|
|
|
+ content = fs.readFileSync(fullPath, 'utf-8');
|
|
|
+ } catch (error) {
|
|
|
+ captureException(error, { operation: 'detect-changes-read-file', filePath });
|
|
|
+ logDebug('Skipping unreadable file while detecting changes', { filePath, error: String(error) });
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
+ const contentHash = hashContent(content);
|
|
|
+ const tracked = this.queries.getFileByPath(filePath);
|
|
|
+
|
|
|
+ if (!tracked) {
|
|
|
+ added.push(filePath);
|
|
|
+ } else if (tracked.contentHash !== contentHash) {
|
|
|
+ modified.push(filePath);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ // Added (untracked) files
|
|
|
+ for (const filePath of gitChanges.added) {
|
|
|
+ added.push(filePath);
|
|
|
+ }
|
|
|
+
|
|
|
+ return { added, modified, removed };
|
|
|
+ }
|
|
|
+
|
|
|
+ // === Fallback: full scan (non-git project or git failure) ===
|
|
|
const currentFiles = new Set(scanDirectory(this.rootDir, this.config));
|
|
|
const trackedFiles = this.queries.getAllFiles();
|
|
|
|
|
|
+ // Build Map for O(1) lookups
|
|
|
+ const trackedMap = new Map<string, FileRecord>();
|
|
|
+ for (const f of trackedFiles) {
|
|
|
+ trackedMap.set(f.path, f);
|
|
|
+ }
|
|
|
+
|
|
|
const added: string[] = [];
|
|
|
const modified: string[] = [];
|
|
|
const removed: string[] = [];
|
|
|
@@ -726,7 +903,7 @@ export class ExtractionOrchestrator {
|
|
|
}
|
|
|
|
|
|
const contentHash = hashContent(content);
|
|
|
- const tracked = trackedFiles.find((f) => f.path === filePath);
|
|
|
+ const tracked = trackedMap.get(filePath);
|
|
|
|
|
|
if (!tracked) {
|
|
|
added.push(filePath);
|