index.ts 26 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923
  1. /**
  2. * Extraction Orchestrator
  3. *
  4. * Coordinates file scanning, parsing, and database storage.
  5. */
  6. import * as fs from 'fs';
  7. import * as fsp from 'fs/promises';
  8. import * as path from 'path';
  9. import * as crypto from 'crypto';
  10. import { execFileSync } from 'child_process';
  11. import {
  12. Language,
  13. FileRecord,
  14. ExtractionResult,
  15. ExtractionError,
  16. CodeGraphConfig,
  17. } from '../types';
  18. import { QueryBuilder } from '../db/queries';
  19. import { extractFromSource } from './tree-sitter';
  20. import { detectLanguage, isLanguageSupported, initGrammars } from './grammars';
  21. import { logDebug, logWarn } from '../errors';
  22. import { captureException } from '../sentry';
  23. import { validatePathWithinRoot, normalizePath } from '../utils';
  24. import picomatch from 'picomatch';
  /**
   * How many files are read concurrently in one batch while indexing.
   * Reads are I/O-bound, so issuing them in batches lets the I/O wait
   * overlap with the CPU-bound parse work.
   */
  const FILE_IO_BATCH_SIZE = 10;
  /**
   * Progress event passed to the callback of an indexing operation.
   */
  export interface IndexProgress {
    /** Stage of the operation this event belongs to. */
    phase: 'scanning' | 'parsing' | 'storing' | 'resolving';
    /** Number of items handled so far in this phase. */
    current: number;
    /** Total number of items in this phase (0 while it is not yet known). */
    total: number;
    /** File the event refers to, when there is one. */
    currentFile?: string;
  }
  /**
   * Summary returned once an indexing operation has finished (or was aborted).
   */
  export interface IndexResult {
    /** False when the run was aborted or produced at least one error-severity entry. */
    success: boolean;
    /** Files for which at least one node was extracted. */
    filesIndexed: number;
    /** Files that yielded no nodes. */
    filesSkipped: number;
    /** Sum of extracted nodes over all indexed files. */
    nodesCreated: number;
    /** Sum of extracted edges over all indexed files. */
    edgesCreated: number;
    /** Read, parse and size-limit problems collected during the run. */
    errors: ExtractionError[];
    /** Wall-clock duration of the operation in milliseconds. */
    durationMs: number;
  }
  /**
   * Summary returned by a sync (incremental re-index) operation.
   */
  export interface SyncResult {
    /** Number of files that were inspected for changes. */
    filesChecked: number;
    /** Files that were not in the index before. */
    filesAdded: number;
    /** Files whose content hash differs from the indexed one. */
    filesModified: number;
    /** Files dropped from the index. */
    filesRemoved: number;
    /** Sum of nodes extracted from the re-indexed files. */
    nodesUpdated: number;
    /** Wall-clock duration of the operation in milliseconds. */
    durationMs: number;
    /** Paths that were added or modified; left out when nothing changed. */
    changedFilePaths?: string[];
  }
  /**
   * Compute the SHA-256 digest of a string.
   *
   * @param content Text to hash (file contents in this module).
   * @returns The digest as a lowercase hexadecimal string.
   */
  export function hashContent(content: string): string {
    const hasher = crypto.createHash('sha256');
    hasher.update(content);
    return hasher.digest('hex');
  }
  /**
   * Test a single path against a single glob pattern.
   *
   * The path is passed through `normalizePath` before matching, and
   * picomatch runs with `dot: true`.
   *
   * @param filePath Path to test.
   * @param pattern Glob pattern in picomatch syntax.
   * @returns True when the normalized path matches the pattern.
   */
  function matchesGlob(filePath: string, pattern: string): boolean {
    const normalized = normalizePath(filePath);
    return picomatch.isMatch(normalized, pattern, { dot: true });
  }
  /**
   * Decide whether a file belongs to the index according to the config.
   *
   * Exclude patterns win: a path matching any `config.exclude` pattern is
   * rejected even if it also matches an include pattern. Otherwise the path is
   * accepted only when it matches at least one `config.include` pattern.
   *
   * @param filePath Path to test against the patterns.
   * @param config Configuration providing the `include` and `exclude` globs.
   * @returns True when the file should be indexed.
   */
  export function shouldIncludeFile(
    filePath: string,
    config: CodeGraphConfig
  ): boolean {
    const matches = (pattern: string): boolean => matchesGlob(filePath, pattern);

    if (config.exclude.some(matches)) {
      return false;
    }
    return config.include.some(matches);
  }
  /**
   * List every file git can see under `rootDir`: tracked files plus untracked
   * files that are not ignored. Ignore rules from `.gitignore` files at any
   * level are honoured by git itself (`--exclude-standard`).
   *
   * The command runs with `-z`, so entries are NUL-terminated and paths are
   * emitted verbatim. Without it git C-quotes names containing non-ASCII or
   * special characters, and those quoted names cannot be opened later.
   *
   * @param rootDir Directory in which git is executed; paths are relative to it.
   * @returns The set of normalized paths, or null when the git command fails
   *          (for example outside a repository) so callers can fall back to a
   *          filesystem walk.
   */
  function getGitVisibleFiles(rootDir: string): Set<string> | null {
    try {
      // -c = cached (tracked), -o = others (untracked), -z = NUL-separated raw paths
      const output = execFileSync(
        'git',
        ['ls-files', '-z', '-co', '--exclude-standard'],
        { cwd: rootDir, encoding: 'utf-8', timeout: 30000, maxBuffer: 50 * 1024 * 1024, stdio: ['pipe', 'pipe', 'pipe'] }
      );
      const files = new Set<string>();
      for (const entry of output.split('\0')) {
        // Entries are not trimmed: whitespace can be a legitimate part of a name.
        if (entry.length > 0) {
          files.add(normalizePath(entry));
        }
      }
      return files;
    } catch (error) {
      logDebug('git ls-files failed; falling back to filesystem walk', { rootDir, error: String(error) });
      return null;
    }
  }
  /**
   * Changed paths reported by git, bucketed by how the index should react.
   * The change detection that produces this value returns null instead when
   * git cannot be used, which tells the caller to do a full filesystem scan.
   */
  interface GitChanges {
    /** Changed, non-deleted, non-untracked paths (M, MM, AM, ...): re-hash and re-index. */
    modified: string[];
    /** Untracked paths (`??`): candidates for indexing. */
    added: string[];
    /** Paths whose status contains `D`: remove from the database. */
    deleted: string[];
  }
  /**
   * Ask `git status` which files changed instead of scanning every file.
   *
   * Details that matter for correctness:
   * - `--untracked-files=all` lists each untracked file individually. The
   *   default mode collapses a new directory into a single `?? dir/` entry,
   *   which never matches the include globs, so its files would be missed.
   * - `-z` gives NUL-terminated entries with raw, unquoted paths.
   * - Porcelain paths are relative to the repository root, while the rest of
   *   this module works with paths relative to `rootDir`. The prefix reported
   *   by `git rev-parse --show-prefix` is stripped, and the `-- .` pathspec
   *   limits the report to `rootDir`.
   *
   * @param rootDir Project root; git is executed in this directory.
   * @param config Include/exclude configuration used to filter the paths.
   * @returns The changed paths grouped by kind, or null when any git command
   *          fails so callers fall back to a full scan.
   */
  function getGitChangedFiles(rootDir: string, config: CodeGraphConfig): GitChanges | null {
    try {
      // Empty at the repository root, otherwise e.g. "packages/app/".
      const prefix = execFileSync(
        'git',
        ['rev-parse', '--show-prefix'],
        { cwd: rootDir, encoding: 'utf-8', timeout: 10000, stdio: ['pipe', 'pipe', 'pipe'] }
      ).replace(/\r?\n$/, '');

      const output = execFileSync(
        'git',
        ['status', '--porcelain', '-z', '--no-renames', '--untracked-files=all', '--', '.'],
        { cwd: rootDir, encoding: 'utf-8', timeout: 10000, maxBuffer: 50 * 1024 * 1024, stdio: ['pipe', 'pipe', 'pipe'] }
      );

      const modified: string[] = [];
      const added: string[] = [];
      const deleted: string[] = [];

      // With --no-renames every entry is exactly "XY <path>".
      for (const entry of output.split('\0')) {
        if (entry.length < 4) continue; // Minimum: "XY f"

        const statusCode = entry.substring(0, 2);
        const repoPath = entry.substring(3);
        if (!repoPath.startsWith(prefix)) continue;
        const filePath = normalizePath(repoPath.substring(prefix.length));

        // Skip files that don't match include/exclude config
        if (!shouldIncludeFile(filePath, config)) continue;

        if (statusCode === '??') {
          added.push(filePath);
        } else if (statusCode.includes('D')) {
          deleted.push(filePath);
        } else {
          // M, MM, AM, A (staged), etc. — treat as modified
          modified.push(filePath);
        }
      }
      return { modified, added, deleted };
    } catch (error) {
      logDebug('git status failed; falling back to full scan', { rootDir, error: String(error) });
      return null;
    }
  }
  /**
   * Name of the marker file whose presence in a directory makes the scanner
   * skip that directory together with everything below it.
   */
  const CODEGRAPH_IGNORE_MARKER = '.codegraphignore';
  /**
   * Collect the source files of a project.
   *
   * Inside a git repository the candidate list comes from `git ls-files`,
   * which already honours `.gitignore` at every level; the candidates are then
   * filtered with the config's include/exclude patterns. When git is not
   * usable, the directory tree is walked on the filesystem instead.
   *
   * @param rootDir Project root to scan.
   * @param config Include/exclude configuration.
   * @param onProgress Optional callback invoked for every accepted file with
   *                   the running count and the file's path.
   * @returns Paths of all accepted files.
   */
  export function scanDirectory(
    rootDir: string,
    config: CodeGraphConfig,
    onProgress?: (current: number, file: string) => void
  ): string[] {
    const gitFiles = getGitVisibleFiles(rootDir);

    // No git available: walk the filesystem instead.
    if (!gitFiles) {
      return scanDirectoryWalk(rootDir, config, onProgress);
    }

    const accepted: string[] = [];
    for (const candidate of gitFiles) {
      if (!shouldIncludeFile(candidate, config)) {
        continue;
      }
      accepted.push(candidate);
      onProgress?.(accepted.length, candidate);
    }
    return accepted;
  }
  /**
   * Filesystem-walk variant of the scanner, used when git is not available.
   *
   * Directories are visited depth-first. A directory is skipped when its real
   * path was already visited (protects against symlink cycles), when it
   * contains the `.codegraphignore` marker, when it cannot be read, or when it
   * matches an exclude pattern. Symlinks are followed; broken ones are skipped.
   *
   * @param rootDir Project root; returned paths are relative to it.
   * @param config Include/exclude configuration.
   * @param onProgress Optional callback invoked for every accepted file with
   *                   the running count and the file's relative path.
   * @returns Normalized relative paths of all accepted files.
   */
  function scanDirectoryWalk(
    rootDir: string,
    config: CodeGraphConfig,
    onProgress?: (current: number, file: string) => void
  ): string[] {
    const found: string[] = [];
    const seenRealDirs = new Set<string>();

    // A directory is excluded when either "dir/" or "dir" matches an exclude glob.
    const isExcludedDir = (relDir: string): boolean =>
      config.exclude.some(
        (pattern) => matchesGlob(relDir + '/', pattern) || matchesGlob(relDir, pattern)
      );

    // Record a file if the config accepts it, and report progress.
    const collectFile = (relFile: string): void => {
      if (!shouldIncludeFile(relFile, config)) {
        return;
      }
      found.push(relFile);
      onProgress?.(found.length, relFile);
    };

    const visit = (dir: string): void => {
      let realDir: string;
      try {
        realDir = fs.realpathSync(dir);
      } catch {
        logDebug('Skipping unresolvable directory', { dir });
        return;
      }

      if (seenRealDirs.has(realDir)) {
        logDebug('Skipping already-visited directory (symlink cycle)', { dir, realDir });
        return;
      }
      seenRealDirs.add(realDir);

      // A marker file opts the whole subtree out of scanning.
      if (fs.existsSync(path.join(dir, CODEGRAPH_IGNORE_MARKER))) {
        logDebug('Skipping directory due to .codegraphignore marker', { dir });
        return;
      }

      let entries: fs.Dirent[];
      try {
        entries = fs.readdirSync(dir, { withFileTypes: true });
      } catch (error) {
        captureException(error, { operation: 'walk-directory', dir });
        logDebug('Skipping unreadable directory', { dir, error: String(error) });
        return;
      }

      for (const entry of entries) {
        const fullPath = path.join(dir, entry.name);
        const relativePath = normalizePath(path.relative(rootDir, fullPath));

        if (entry.isSymbolicLink()) {
          // Resolve the link and treat it like whatever it points at.
          try {
            const target = fs.statSync(fs.realpathSync(fullPath));
            if (target.isDirectory()) {
              if (!isExcludedDir(relativePath)) {
                visit(fullPath);
              }
            } else if (target.isFile()) {
              collectFile(relativePath);
            }
          } catch {
            logDebug('Skipping broken symlink', { path: fullPath });
          }
          continue;
        }

        if (entry.isDirectory()) {
          if (!isExcludedDir(relativePath)) {
            visit(fullPath);
          }
        } else if (entry.isFile()) {
          collectFile(relativePath);
        }
      }
    };

    visit(rootDir);
    return found;
  }
  /**
   * Extraction orchestrator.
   *
   * Ties together file discovery (`scanDirectory`), parsing
   * (`extractFromSource`) and persistence (`QueryBuilder`) for one project root.
   */
  export class ExtractionOrchestrator {
    private rootDir: string;
    private config: CodeGraphConfig;
    private queries: QueryBuilder;

    /**
     * @param rootDir Project root; file paths handled by this class are relative to it.
     * @param config Include/exclude patterns and limits such as `maxFileSize`.
     * @param queries Database access layer used to read and write index data.
     */
    constructor(rootDir: string, config: CodeGraphConfig, queries: QueryBuilder) {
      this.rootDir = rootDir;
      this.config = config;
      this.queries = queries;
    }

    /**
     * Index all files in the project.
     *
     * Files are read in parallel batches of `FILE_IO_BATCH_SIZE`, then parsed
     * and stored one at a time.
     *
     * @param onProgress Optional callback receiving scanning/parsing/resolving events.
     * @param signal Optional abort signal, checked after scanning, before each
     *               batch and before each file. An aborted run returns the
     *               counters reached so far with `success: false` and an
     *               'Aborted' error as the first entry.
     * @returns Counters, collected errors and duration of the run.
     */
    async indexAll(
      onProgress?: (progress: IndexProgress) => void,
      signal?: AbortSignal
    ): Promise<IndexResult> {
      await initGrammars();

      const startTime = Date.now();
      const errors: ExtractionError[] = [];
      let filesIndexed = 0;
      let filesSkipped = 0;
      let totalNodes = 0;
      let totalEdges = 0;

      // Snapshot of the current counters for a run that was cancelled.
      const abortedResult = (): IndexResult => ({
        success: false,
        filesIndexed,
        filesSkipped,
        nodesCreated: totalNodes,
        edgesCreated: totalEdges,
        errors: [{ message: 'Aborted', severity: 'error' }, ...errors],
        durationMs: Date.now() - startTime,
      });

      // Phase 1: Scan for files
      onProgress?.({
        phase: 'scanning',
        current: 0,
        total: 0,
      });
      const files = scanDirectory(this.rootDir, this.config, (current, file) => {
        onProgress?.({
          phase: 'scanning',
          current,
          total: 0,
          currentFile: file,
        });
      });

      if (signal?.aborted) {
        return abortedResult();
      }

      type ReadOutcome = {
        filePath: string;
        content: string | null;
        stats: fs.Stats | null;
        error: Error | null;
      };

      // Phase 2: Parse files (read in parallel batches, parse/store sequentially)
      const total = files.length;
      let processed = 0;

      for (let i = 0; i < files.length; i += FILE_IO_BATCH_SIZE) {
        if (signal?.aborted) {
          return abortedResult();
        }

        const batch = files.slice(i, i + FILE_IO_BATCH_SIZE);

        // Read files in parallel (with path validation before any I/O)
        const fileContents = await Promise.all(
          batch.map(async (fp): Promise<ReadOutcome> => {
            try {
              const fullPath = validatePathWithinRoot(this.rootDir, fp);
              if (!fullPath) {
                logWarn('Path traversal blocked in batch reader', { filePath: fp });
                return { filePath: fp, content: null, stats: null, error: new Error('Path traversal blocked') };
              }
              const { content, stats } = await this.readForIndexing(fullPath);
              return { filePath: fp, content, stats, error: null };
            } catch (err) {
              const error = err instanceof Error ? err : new Error(String(err));
              return { filePath: fp, content: null, stats: null, error };
            }
          })
        );

        // Parse and store sequentially
        for (const { filePath, content, stats, error } of fileContents) {
          if (signal?.aborted) {
            return abortedResult();
          }

          processed++;
          onProgress?.({
            phase: 'parsing',
            current: processed,
            total,
            currentFile: filePath,
          });

          if (error || content === null || stats === null) {
            errors.push({
              message: `Failed to read file: ${error instanceof Error ? error.message : String(error)}`,
              severity: 'error',
            });
            continue;
          }

          const result = await this.indexFileWithContent(filePath, content, stats);
          if (result.errors.length > 0) {
            errors.push(...result.errors);
          }
          if (result.nodes.length > 0) {
            filesIndexed++;
            totalNodes += result.nodes.length;
            totalEdges += result.edges.length;
          } else if (result.errors.length === 0) {
            // Only error-free files without nodes count as skipped here.
            filesSkipped++;
          }
        }
      }

      // Phase 3: Resolve references
      onProgress?.({
        phase: 'resolving',
        current: 0,
        total: 1,
      });
      // TODO: Implement reference resolution in Phase 3

      return {
        success: errors.filter((e) => e.severity === 'error').length === 0,
        filesIndexed,
        filesSkipped,
        nodesCreated: totalNodes,
        edgesCreated: totalEdges,
        errors,
        durationMs: Date.now() - startTime,
      };
    }

    /**
     * Index specific files, one after another.
     *
     * NOTE(review): unlike `indexAll`, every file without nodes is counted as
     * skipped here, including files that produced errors. Confirm which of the
     * two counting rules is intended.
     *
     * @param filePaths Paths relative to the project root.
     * @returns Counters, collected errors and duration of the run.
     */
    async indexFiles(filePaths: string[]): Promise<IndexResult> {
      const startTime = Date.now();
      const errors: ExtractionError[] = [];
      let filesIndexed = 0;
      let filesSkipped = 0;
      let totalNodes = 0;
      let totalEdges = 0;

      for (const filePath of filePaths) {
        const result = await this.indexFile(filePath);
        if (result.errors.length > 0) {
          errors.push(...result.errors);
        }
        if (result.nodes.length > 0) {
          filesIndexed++;
          totalNodes += result.nodes.length;
          totalEdges += result.edges.length;
        } else {
          filesSkipped++;
        }
      }

      return {
        success: errors.filter((e) => e.severity === 'error').length === 0,
        filesIndexed,
        filesSkipped,
        nodesCreated: totalNodes,
        edgesCreated: totalEdges,
        errors,
        durationMs: Date.now() - startTime,
      };
    }

    /**
     * Index a single file: validate the path, read it, then parse and store.
     *
     * @param relativePath Path relative to the project root.
     * @returns The extraction result. A rejected path or a failed read yields
     *          an empty result carrying one error entry instead of throwing.
     */
    async indexFile(relativePath: string): Promise<ExtractionResult> {
      const fullPath = validatePathWithinRoot(this.rootDir, relativePath);
      if (!fullPath) {
        return {
          nodes: [],
          edges: [],
          unresolvedReferences: [],
          errors: [{ message: `Path traversal blocked: ${relativePath}`, severity: 'error' }],
          durationMs: 0,
        };
      }

      // Read file content and stats
      let content: string;
      let stats: fs.Stats;
      try {
        ({ content, stats } = await this.readForIndexing(fullPath));
      } catch (error) {
        captureException(error, { operation: 'extract-file', filePath: fullPath });
        return {
          nodes: [],
          edges: [],
          unresolvedReferences: [],
          errors: [
            {
              message: `Failed to read file: ${error instanceof Error ? error.message : String(error)}`,
              severity: 'error',
            },
          ],
          durationMs: 0,
        };
      }

      return this.indexFileWithContent(relativePath, content, stats);
    }

    /**
     * Index a single file with pre-read content and stats.
     * Used by the parallel batch reader to avoid redundant file I/O.
     *
     * Checks run in this order: path validation, size limit (`stats.size`
     * against `config.maxFileSize`), language support. `content` is only used
     * once all three have passed.
     *
     * @param relativePath Path relative to the project root.
     * @param content File text.
     * @param stats Filesystem stats of the file (size and mtime are used).
     * @returns The extraction result. Unsupported languages give an empty
     *          result without errors; oversized files give a warning entry.
     */
    async indexFileWithContent(
      relativePath: string,
      content: string,
      stats: fs.Stats
    ): Promise<ExtractionResult> {
      // Prevent path traversal
      const fullPath = validatePathWithinRoot(this.rootDir, relativePath);
      if (!fullPath) {
        logWarn('Path traversal blocked in indexFileWithContent', { relativePath });
        return {
          nodes: [],
          edges: [],
          unresolvedReferences: [],
          errors: [{ message: 'Path traversal blocked', severity: 'error' }],
          durationMs: 0,
        };
      }

      // Check file size
      if (stats.size > this.config.maxFileSize) {
        return {
          nodes: [],
          edges: [],
          unresolvedReferences: [],
          errors: [
            {
              message: `File exceeds max size (${stats.size} > ${this.config.maxFileSize})`,
              severity: 'warning',
            },
          ],
          durationMs: 0,
        };
      }

      // Detect language
      const language = detectLanguage(relativePath);
      if (!isLanguageSupported(language)) {
        return {
          nodes: [],
          edges: [],
          unresolvedReferences: [],
          errors: [],
          durationMs: 0,
        };
      }

      // Extract from source
      const result = extractFromSource(relativePath, content, language);

      // Store unless the extraction produced errors and no nodes at all
      if (result.nodes.length > 0 || result.errors.length === 0) {
        this.storeExtractionResult(relativePath, content, language, stats, result);
      }

      return result;
    }

    /**
     * Stat a file and read it as UTF-8 for indexing.
     *
     * The stat happens first so that a file larger than `config.maxFileSize`
     * is never loaded into memory: for such a file the content is returned as
     * an empty string, and `indexFileWithContent` rejects it on the size check
     * before looking at the content.
     *
     * @param fullPath Validated absolute path of the file.
     * @returns The file's text (empty when oversized) and its stats.
     * @throws Whatever `fs/promises` `stat` or `readFile` reject with.
     */
    private async readForIndexing(fullPath: string): Promise<{ content: string; stats: fs.Stats }> {
      const stats = await fsp.stat(fullPath);
      if (stats.size > this.config.maxFileSize) {
        return { content: '', stats };
      }
      const content = await fsp.readFile(fullPath, 'utf-8');
      return { content, stats };
    }

    /**
     * Store extraction result in database.
     *
     * Does nothing when the file is already recorded with the same content
     * hash. Otherwise any existing data for the file is deleted first, then
     * nodes, edges, unresolved references and the file record are written.
     *
     * @param filePath Path relative to the project root (database key).
     * @param content File text, used to compute the content hash.
     * @param language Detected language, stored on the file record and on references.
     * @param stats Filesystem stats supplying size and modification time.
     * @param result Extraction output to persist.
     */
    private storeExtractionResult(
      filePath: string,
      content: string,
      language: Language,
      stats: fs.Stats,
      result: ExtractionResult
    ): void {
      const contentHash = hashContent(content);

      // Check if file already exists and hasn't changed
      const existingFile = this.queries.getFileByPath(filePath);
      if (existingFile && existingFile.contentHash === contentHash) {
        return; // No changes
      }

      // Delete existing data for this file
      if (existingFile) {
        this.queries.deleteFile(filePath);
      }

      // Insert nodes
      if (result.nodes.length > 0) {
        this.queries.insertNodes(result.nodes);
      }

      // Insert edges
      if (result.edges.length > 0) {
        this.queries.insertEdges(result.edges);
      }

      // Insert unresolved references in batch with denormalized filePath/language
      if (result.unresolvedReferences.length > 0) {
        const refsWithContext = result.unresolvedReferences.map((ref) => ({
          ...ref,
          filePath: ref.filePath ?? filePath,
          language: ref.language ?? language,
        }));
        this.queries.insertUnresolvedRefsBatch(refsWithContext);
      }

      // Insert file record
      const fileRecord: FileRecord = {
        path: filePath,
        contentHash,
        language,
        size: stats.size,
        modifiedAt: stats.mtimeMs,
        indexedAt: Date.now(),
        nodeCount: result.nodes.length,
        errors: result.errors.length > 0 ? result.errors : undefined,
      };
      this.queries.upsertFile(fileRecord);
    }

    /**
     * Compare candidate files on disk with what the index knows about them.
     *
     * Each candidate is read and hashed. It is reported as 'added' when the
     * lookup finds no record, as 'modified' when the recorded hash differs, and
     * is left out when the hashes are equal. Unreadable files are reported to
     * Sentry, logged and left out.
     *
     * @param candidates Paths relative to the project root, in the order to report them.
     * @param lookupTracked Returns the indexed record for a path, if any.
     * @param context Sentry operation name and log message for unreadable files.
     * @returns The changed candidates in input order.
     */
    private classifyByContent(
      candidates: Iterable<string>,
      lookupTracked: (filePath: string) => Pick<FileRecord, 'contentHash'> | null | undefined,
      context: { operation: string; skipMessage: string }
    ): Array<{ filePath: string; change: 'added' | 'modified' }> {
      const changes: Array<{ filePath: string; change: 'added' | 'modified' }> = [];

      for (const filePath of candidates) {
        const fullPath = path.join(this.rootDir, filePath);
        let content: string;
        try {
          content = fs.readFileSync(fullPath, 'utf-8');
        } catch (error) {
          captureException(error, { operation: context.operation, filePath });
          logDebug(context.skipMessage, { filePath, error: String(error) });
          continue;
        }

        const tracked = lookupTracked(filePath);
        if (!tracked) {
          changes.push({ filePath, change: 'added' });
        } else if (tracked.contentHash !== hashContent(content)) {
          changes.push({ filePath, change: 'modified' });
        }
      }

      return changes;
    }

    /**
     * Sync with current file state.
     * Uses git status as a fast path when available, falling back to full scan.
     *
     * On the git path, untracked files go through the same database and hash
     * comparison as modified files. Git keeps listing a file as untracked until
     * it is staged, so counting such files as "added" unconditionally would
     * re-report and re-index them on every sync.
     *
     * NOTE(review): `git status` reports only uncommitted changes, so changes
     * that reach the working tree through a commit, checkout or pull do not
     * appear to be detected on the git path. Confirm how callers handle that.
     *
     * @param onProgress Optional callback receiving scanning and parsing events.
     * @returns Counters for checked/added/modified/removed files and the
     *          changed paths (omitted when nothing changed).
     */
    async sync(onProgress?: (progress: IndexProgress) => void): Promise<SyncResult> {
      await initGrammars();

      const startTime = Date.now();
      let filesChecked = 0;
      let filesAdded = 0;
      let filesModified = 0;
      let filesRemoved = 0;
      let nodesUpdated = 0;
      const changedFilePaths: string[] = [];

      onProgress?.({
        phase: 'scanning',
        current: 0,
        total: 0,
      });

      const readContext = {
        operation: 'sync-read-file',
        skipMessage: 'Skipping unreadable file during sync',
      };
      let changes: Array<{ filePath: string; change: 'added' | 'modified' }>;

      const gitChanges = getGitChangedFiles(this.rootDir, this.config);
      if (gitChanges) {
        // === Git fast path ===
        // Only inspect the files git reports as changed instead of scanning everything.
        filesChecked = gitChanges.modified.length + gitChanges.added.length + gitChanges.deleted.length;

        // Handle deleted files
        for (const filePath of gitChanges.deleted) {
          if (this.queries.getFileByPath(filePath)) {
            this.queries.deleteFile(filePath);
            filesRemoved++;
          }
        }

        // Modified and untracked files: read + hash only these, compare with DB
        changes = this.classifyByContent(
          [...gitChanges.modified, ...gitChanges.added],
          (filePath) => this.queries.getFileByPath(filePath),
          readContext
        );
      } else {
        // === Fallback: full scan (non-git project or git failure) ===
        const currentFiles = new Set(scanDirectory(this.rootDir, this.config));
        filesChecked = currentFiles.size;

        // Build Map for O(1) lookups instead of .find() per file
        const trackedFiles = this.queries.getAllFiles();
        const trackedMap = new Map<string, FileRecord>();
        for (const f of trackedFiles) {
          trackedMap.set(f.path, f);
        }

        // Find files to remove (in DB but not on disk)
        for (const tracked of trackedFiles) {
          if (!currentFiles.has(tracked.path)) {
            this.queries.deleteFile(tracked.path);
            filesRemoved++;
          }
        }

        // Find files to add or update
        changes = this.classifyByContent(
          currentFiles,
          (filePath) => trackedMap.get(filePath),
          readContext
        );
      }

      for (const { filePath, change } of changes) {
        changedFilePaths.push(filePath);
        if (change === 'added') {
          filesAdded++;
        } else {
          filesModified++;
        }
      }

      // Index changed files
      const total = changedFilePaths.length;
      for (const [index, filePath] of changedFilePaths.entries()) {
        onProgress?.({
          phase: 'parsing',
          current: index + 1,
          total,
          currentFile: filePath,
        });
        const result = await this.indexFile(filePath);
        nodesUpdated += result.nodes.length;
      }

      return {
        filesChecked,
        filesAdded,
        filesModified,
        filesRemoved,
        nodesUpdated,
        durationMs: Date.now() - startTime,
        changedFilePaths: changedFilePaths.length > 0 ? changedFilePaths : undefined,
      };
    }

    /**
     * Get files that have changed since last index, without modifying the index.
     * Uses git status as a fast path when available, falling back to full scan.
     *
     * As in `sync`, untracked files are compared with the database by content
     * hash, so files that are already indexed and unchanged are not reported.
     *
     * @returns Paths that are new to the index, paths whose content hash
     *          differs from the indexed one, and indexed paths that are gone.
     */
    getChangedFiles(): { added: string[]; modified: string[]; removed: string[] } {
      const added: string[] = [];
      const modified: string[] = [];
      const removed: string[] = [];

      const readContext = {
        operation: 'detect-changes-read-file',
        skipMessage: 'Skipping unreadable file while detecting changes',
      };
      let changes: Array<{ filePath: string; change: 'added' | 'modified' }>;

      const gitChanges = getGitChangedFiles(this.rootDir, this.config);
      if (gitChanges) {
        // === Git fast path ===
        // Deleted files — only report if tracked in DB
        for (const filePath of gitChanges.deleted) {
          if (this.queries.getFileByPath(filePath)) {
            removed.push(filePath);
          }
        }

        // Modified and untracked files — read + hash only these, compare with DB
        changes = this.classifyByContent(
          [...gitChanges.modified, ...gitChanges.added],
          (filePath) => this.queries.getFileByPath(filePath),
          readContext
        );
      } else {
        // === Fallback: full scan (non-git project or git failure) ===
        const currentFiles = new Set(scanDirectory(this.rootDir, this.config));
        const trackedFiles = this.queries.getAllFiles();

        // Build Map for O(1) lookups
        const trackedMap = new Map<string, FileRecord>();
        for (const f of trackedFiles) {
          trackedMap.set(f.path, f);
        }

        // Find removed files
        for (const tracked of trackedFiles) {
          if (!currentFiles.has(tracked.path)) {
            removed.push(tracked.path);
          }
        }

        // Find added and modified files
        changes = this.classifyByContent(
          currentFiles,
          (filePath) => trackedMap.get(filePath),
          readContext
        );
      }

      for (const { filePath, change } of changes) {
        if (change === 'added') {
          added.push(filePath);
        } else {
          modified.push(filePath);
        }
      }

      return { added, modified, removed };
    }
  }
  815. // Re-export useful types and functions
  816. export { extractFromSource } from './tree-sitter';
  817. export { detectLanguage, isLanguageSupported, getSupportedLanguages, initGrammars } from './grammars';