index.ts 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630
  1. /**
  2. * Extraction Orchestrator
  3. *
  4. * Coordinates file scanning, parsing, and database storage.
  5. */
  6. import * as fs from 'fs';
  7. import * as fsp from 'fs/promises';
  8. import * as path from 'path';
  9. import * as crypto from 'crypto';
  10. import {
  11. Language,
  12. FileRecord,
  13. ExtractionResult,
  14. ExtractionError,
  15. CodeGraphConfig,
  16. } from '../types';
  17. import { QueryBuilder } from '../db/queries';
  18. import { extractFromSource } from './tree-sitter';
  19. import { detectLanguage, isLanguageSupported } from './grammars';
  20. import { logDebug } from '../errors';
  21. import { captureException } from '../sentry';
  22. import { validatePathWithinRoot } from '../utils';
/**
 * Progress payload passed to the progress callbacks of indexing and sync
 * operations.
 */
export interface IndexProgress {
  // Stage the operation is in. Within this file only 'scanning', 'parsing'
  // and 'resolving' are emitted; 'storing' is declared but not emitted here.
  phase: 'scanning' | 'parsing' | 'storing' | 'resolving';
  // Progress counter for the current phase: number of files discovered so far
  // while scanning, 1-based index of the file being processed while parsing.
  current: number;
  // Number of items in the current phase. Reported as 0 while scanning,
  // because the total is not known until the scan has finished.
  total: number;
  // Root-relative path of the file being reported on, when there is one.
  currentFile?: string;
}
/**
 * Summary returned by `indexAll` and `indexFiles`.
 */
export interface IndexResult {
  // True when no collected error has severity 'error' (warnings do not fail
  // the run). Always false when the run was aborted.
  success: boolean;
  // Number of files whose extraction produced at least one node.
  filesIndexed: number;
  // Number of files that produced no nodes. NOTE(review): `indexAll` only
  // counts such files when they also reported no errors, while `indexFiles`
  // counts every file without nodes — confirm which is intended.
  filesSkipped: number;
  // Sum of the node counts of all extraction results.
  nodesCreated: number;
  // Sum of the edge counts of all extraction results that produced nodes.
  edgesCreated: number;
  // Every error and warning reported by the per-file extractions.
  errors: ExtractionError[];
  // Wall-clock duration of the operation in milliseconds.
  durationMs: number;
}
/**
 * Summary returned by `sync`.
 */
export interface SyncResult {
  // Number of distinct files found on disk by the directory scan.
  filesChecked: number;
  // Files found on disk that had no record in the database.
  filesAdded: number;
  // Files whose content hash differs from the hash stored in the database.
  filesModified: number;
  // Database records deleted because the file is no longer found on disk.
  filesRemoved: number;
  // Sum of the node counts returned when re-indexing added/modified files.
  nodesUpdated: number;
  // Wall-clock duration of the sync in milliseconds.
  durationMs: number;
}
  55. /**
  56. * Calculate SHA256 hash of file contents
  57. */
  58. export function hashContent(content: string): string {
  59. return crypto.createHash('sha256').update(content).digest('hex');
  60. }
  61. /**
  62. * Check if a path matches any glob pattern (simplified)
  63. */
  64. function matchesGlob(filePath: string, pattern: string): boolean {
  65. const picomatch = require('picomatch');
  66. return picomatch.isMatch(filePath, pattern, { dot: true });
  67. }
  68. /**
  69. * Check if a file should be included based on config
  70. */
  71. export function shouldIncludeFile(
  72. filePath: string,
  73. config: CodeGraphConfig
  74. ): boolean {
  75. // Check exclude patterns first
  76. for (const pattern of config.exclude) {
  77. if (matchesGlob(filePath, pattern)) {
  78. return false;
  79. }
  80. }
  81. // Check include patterns
  82. for (const pattern of config.include) {
  83. if (matchesGlob(filePath, pattern)) {
  84. return true;
  85. }
  86. }
  87. return false;
  88. }
/**
 * Name of the marker file that opts a directory out of scanning: when a
 * directory contains a file with this name, the scanner skips that directory
 * and everything beneath it.
 */
const CODEGRAPH_IGNORE_MARKER = '.codegraphignore';
  93. /**
  94. * Recursively scan directory for source files
  95. */
  96. export function scanDirectory(
  97. rootDir: string,
  98. config: CodeGraphConfig,
  99. onProgress?: (current: number, file: string) => void
  100. ): string[] {
  101. const files: string[] = [];
  102. let count = 0;
  103. const visitedRealPaths = new Set<string>(); // Symlink cycle detection
  104. function walk(dir: string): void {
  105. // Symlink cycle detection: resolve real path and skip if already visited
  106. try {
  107. const realDir = fs.realpathSync(dir);
  108. if (visitedRealPaths.has(realDir)) {
  109. logDebug('Skipping directory to prevent symlink cycle', { dir, realDir });
  110. return;
  111. }
  112. visitedRealPaths.add(realDir);
  113. } catch {
  114. // If realpath fails, skip this directory
  115. return;
  116. }
  117. // Check for .codegraphignore marker file - skip entire directory tree if present
  118. const ignoreMarker = path.join(dir, CODEGRAPH_IGNORE_MARKER);
  119. if (fs.existsSync(ignoreMarker)) {
  120. logDebug('Skipping directory due to .codegraphignore marker', { dir });
  121. return;
  122. }
  123. let entries: fs.Dirent[];
  124. try {
  125. entries = fs.readdirSync(dir, { withFileTypes: true });
  126. } catch (error) {
  127. captureException(error, { operation: 'walk-directory', dir });
  128. logDebug('Skipping unreadable directory', { dir, error: String(error) });
  129. return;
  130. }
  131. for (const entry of entries) {
  132. const fullPath = path.join(dir, entry.name);
  133. const relativePath = path.relative(rootDir, fullPath);
  134. // Follow symlinked directories, but skip symlinked files to non-project targets
  135. if (entry.isSymbolicLink()) {
  136. try {
  137. const realTarget = fs.realpathSync(fullPath);
  138. const stat = fs.statSync(realTarget);
  139. if (stat.isDirectory()) {
  140. // Check exclusion, then recurse (cycle detection handles the rest)
  141. const dirPattern = relativePath + '/';
  142. let excluded = false;
  143. for (const pattern of config.exclude) {
  144. if (matchesGlob(dirPattern, pattern) || matchesGlob(relativePath, pattern)) {
  145. excluded = true;
  146. break;
  147. }
  148. }
  149. if (!excluded) {
  150. walk(fullPath);
  151. }
  152. } else if (stat.isFile()) {
  153. if (shouldIncludeFile(relativePath, config)) {
  154. files.push(relativePath);
  155. count++;
  156. if (onProgress) {
  157. onProgress(count, relativePath);
  158. }
  159. }
  160. }
  161. } catch {
  162. logDebug('Skipping broken symlink', { path: fullPath });
  163. }
  164. continue;
  165. }
  166. if (entry.isDirectory()) {
  167. // Check if directory should be excluded
  168. const dirPattern = relativePath + '/';
  169. let excluded = false;
  170. for (const pattern of config.exclude) {
  171. if (matchesGlob(dirPattern, pattern) || matchesGlob(relativePath, pattern)) {
  172. excluded = true;
  173. break;
  174. }
  175. }
  176. if (!excluded) {
  177. walk(fullPath);
  178. }
  179. } else if (entry.isFile()) {
  180. if (shouldIncludeFile(relativePath, config)) {
  181. files.push(relativePath);
  182. count++;
  183. if (onProgress) {
  184. onProgress(count, relativePath);
  185. }
  186. }
  187. }
  188. }
  189. }
  190. walk(rootDir);
  191. return files;
  192. }
  193. /**
  194. * Extraction orchestrator
  195. */
  196. export class ExtractionOrchestrator {
  197. private rootDir: string;
  198. private config: CodeGraphConfig;
  199. private queries: QueryBuilder;
  200. constructor(rootDir: string, config: CodeGraphConfig, queries: QueryBuilder) {
  201. this.rootDir = rootDir;
  202. this.config = config;
  203. this.queries = queries;
  204. }
  205. /**
  206. * Index all files in the project
  207. */
  208. async indexAll(
  209. onProgress?: (progress: IndexProgress) => void,
  210. signal?: AbortSignal
  211. ): Promise<IndexResult> {
  212. const startTime = Date.now();
  213. const errors: ExtractionError[] = [];
  214. let filesIndexed = 0;
  215. let filesSkipped = 0;
  216. let totalNodes = 0;
  217. let totalEdges = 0;
  218. // Phase 1: Scan for files
  219. onProgress?.({
  220. phase: 'scanning',
  221. current: 0,
  222. total: 0,
  223. });
  224. const files = scanDirectory(this.rootDir, this.config, (current, file) => {
  225. onProgress?.({
  226. phase: 'scanning',
  227. current,
  228. total: 0,
  229. currentFile: file,
  230. });
  231. });
  232. if (signal?.aborted) {
  233. return {
  234. success: false,
  235. filesIndexed: 0,
  236. filesSkipped: 0,
  237. nodesCreated: 0,
  238. edgesCreated: 0,
  239. errors: [{ message: 'Aborted', severity: 'error' }],
  240. durationMs: Date.now() - startTime,
  241. };
  242. }
  243. // Phase 2: Parse files
  244. const total = files.length;
  245. for (let i = 0; i < files.length; i++) {
  246. if (signal?.aborted) {
  247. return {
  248. success: false,
  249. filesIndexed,
  250. filesSkipped,
  251. nodesCreated: totalNodes,
  252. edgesCreated: totalEdges,
  253. errors: [{ message: 'Aborted', severity: 'error' }, ...errors],
  254. durationMs: Date.now() - startTime,
  255. };
  256. }
  257. const filePath = files[i]!;
  258. onProgress?.({
  259. phase: 'parsing',
  260. current: i + 1,
  261. total,
  262. currentFile: filePath,
  263. });
  264. const result = await this.indexFile(filePath);
  265. if (result.errors.length > 0) {
  266. errors.push(...result.errors);
  267. }
  268. if (result.nodes.length > 0) {
  269. filesIndexed++;
  270. totalNodes += result.nodes.length;
  271. totalEdges += result.edges.length;
  272. } else if (result.errors.length === 0) {
  273. filesSkipped++;
  274. }
  275. }
  276. // Phase 3: Resolve references
  277. onProgress?.({
  278. phase: 'resolving',
  279. current: 0,
  280. total: 1,
  281. });
  282. // TODO: Implement reference resolution in Phase 3
  283. return {
  284. success: errors.filter((e) => e.severity === 'error').length === 0,
  285. filesIndexed,
  286. filesSkipped,
  287. nodesCreated: totalNodes,
  288. edgesCreated: totalEdges,
  289. errors,
  290. durationMs: Date.now() - startTime,
  291. };
  292. }
  293. /**
  294. * Index specific files
  295. */
  296. async indexFiles(filePaths: string[]): Promise<IndexResult> {
  297. const startTime = Date.now();
  298. const errors: ExtractionError[] = [];
  299. let filesIndexed = 0;
  300. let filesSkipped = 0;
  301. let totalNodes = 0;
  302. let totalEdges = 0;
  303. for (const filePath of filePaths) {
  304. const result = await this.indexFile(filePath);
  305. if (result.errors.length > 0) {
  306. errors.push(...result.errors);
  307. }
  308. if (result.nodes.length > 0) {
  309. filesIndexed++;
  310. totalNodes += result.nodes.length;
  311. totalEdges += result.edges.length;
  312. } else {
  313. filesSkipped++;
  314. }
  315. }
  316. return {
  317. success: errors.filter((e) => e.severity === 'error').length === 0,
  318. filesIndexed,
  319. filesSkipped,
  320. nodesCreated: totalNodes,
  321. edgesCreated: totalEdges,
  322. errors,
  323. durationMs: Date.now() - startTime,
  324. };
  325. }
  326. /**
  327. * Index a single file
  328. */
  329. async indexFile(relativePath: string): Promise<ExtractionResult> {
  330. const fullPath = validatePathWithinRoot(this.rootDir, relativePath);
  331. if (!fullPath) {
  332. return {
  333. nodes: [],
  334. edges: [],
  335. unresolvedReferences: [],
  336. errors: [{ message: `Path traversal blocked: ${relativePath}`, severity: 'error' }],
  337. durationMs: 0,
  338. };
  339. }
  340. // Check file exists and is readable
  341. let content: string;
  342. let stats: fs.Stats;
  343. try {
  344. stats = await fsp.stat(fullPath);
  345. content = await fsp.readFile(fullPath, 'utf-8');
  346. } catch (error) {
  347. captureException(error, { operation: 'extract-file', filePath: fullPath });
  348. return {
  349. nodes: [],
  350. edges: [],
  351. unresolvedReferences: [],
  352. errors: [
  353. {
  354. message: `Failed to read file: ${error instanceof Error ? error.message : String(error)}`,
  355. severity: 'error',
  356. },
  357. ],
  358. durationMs: 0,
  359. };
  360. }
  361. // Check file size
  362. if (stats.size > this.config.maxFileSize) {
  363. return {
  364. nodes: [],
  365. edges: [],
  366. unresolvedReferences: [],
  367. errors: [
  368. {
  369. message: `File exceeds max size (${stats.size} > ${this.config.maxFileSize})`,
  370. severity: 'warning',
  371. },
  372. ],
  373. durationMs: 0,
  374. };
  375. }
  376. // Detect language
  377. const language = detectLanguage(relativePath);
  378. if (!isLanguageSupported(language)) {
  379. return {
  380. nodes: [],
  381. edges: [],
  382. unresolvedReferences: [],
  383. errors: [],
  384. durationMs: 0,
  385. };
  386. }
  387. // Extract from source
  388. const result = extractFromSource(relativePath, content, language);
  389. // Store in database
  390. if (result.nodes.length > 0 || result.errors.length === 0) {
  391. this.storeExtractionResult(relativePath, content, language, stats, result);
  392. }
  393. return result;
  394. }
  395. /**
  396. * Store extraction result in database
  397. */
  398. private storeExtractionResult(
  399. filePath: string,
  400. content: string,
  401. language: Language,
  402. stats: fs.Stats,
  403. result: ExtractionResult
  404. ): void {
  405. const contentHash = hashContent(content);
  406. // Check if file already exists and hasn't changed
  407. const existingFile = this.queries.getFileByPath(filePath);
  408. if (existingFile && existingFile.contentHash === contentHash) {
  409. return; // No changes
  410. }
  411. // Delete existing data for this file
  412. if (existingFile) {
  413. this.queries.deleteFile(filePath);
  414. }
  415. // Insert nodes
  416. if (result.nodes.length > 0) {
  417. this.queries.insertNodes(result.nodes);
  418. }
  419. // Insert edges
  420. if (result.edges.length > 0) {
  421. this.queries.insertEdges(result.edges);
  422. }
  423. // Insert unresolved references in batch with denormalized filePath/language
  424. if (result.unresolvedReferences.length > 0) {
  425. const refsWithContext = result.unresolvedReferences.map((ref) => ({
  426. ...ref,
  427. filePath: ref.filePath ?? filePath,
  428. language: ref.language ?? language,
  429. }));
  430. this.queries.insertUnresolvedRefsBatch(refsWithContext);
  431. }
  432. // Insert file record
  433. const fileRecord: FileRecord = {
  434. path: filePath,
  435. contentHash,
  436. language,
  437. size: stats.size,
  438. modifiedAt: stats.mtimeMs,
  439. indexedAt: Date.now(),
  440. nodeCount: result.nodes.length,
  441. errors: result.errors.length > 0 ? result.errors : undefined,
  442. };
  443. this.queries.upsertFile(fileRecord);
  444. }
  445. /**
  446. * Sync with current file state
  447. */
  448. async sync(onProgress?: (progress: IndexProgress) => void): Promise<SyncResult> {
  449. const startTime = Date.now();
  450. let filesChecked = 0;
  451. let filesAdded = 0;
  452. let filesModified = 0;
  453. let filesRemoved = 0;
  454. let nodesUpdated = 0;
  455. // Get current files on disk
  456. onProgress?.({
  457. phase: 'scanning',
  458. current: 0,
  459. total: 0,
  460. });
  461. const currentFiles = new Set(scanDirectory(this.rootDir, this.config));
  462. filesChecked = currentFiles.size;
  463. // Get tracked files from database
  464. const trackedFiles = this.queries.getAllFiles();
  465. // Find files to remove (in DB but not on disk)
  466. for (const tracked of trackedFiles) {
  467. if (!currentFiles.has(tracked.path)) {
  468. this.queries.deleteFile(tracked.path);
  469. filesRemoved++;
  470. }
  471. }
  472. // Find files to add or update
  473. const filesToIndex: string[] = [];
  474. for (const filePath of currentFiles) {
  475. const fullPath = path.join(this.rootDir, filePath);
  476. let content: string;
  477. try {
  478. content = fs.readFileSync(fullPath, 'utf-8');
  479. } catch (error) {
  480. captureException(error, { operation: 'sync-read-file', filePath });
  481. logDebug('Skipping unreadable file during sync', { filePath, error: String(error) });
  482. continue;
  483. }
  484. const contentHash = hashContent(content);
  485. const tracked = trackedFiles.find((f) => f.path === filePath);
  486. if (!tracked) {
  487. // New file
  488. filesToIndex.push(filePath);
  489. filesAdded++;
  490. } else if (tracked.contentHash !== contentHash) {
  491. // Modified file
  492. filesToIndex.push(filePath);
  493. filesModified++;
  494. }
  495. }
  496. // Index changed files
  497. const total = filesToIndex.length;
  498. for (let i = 0; i < filesToIndex.length; i++) {
  499. const filePath = filesToIndex[i]!;
  500. onProgress?.({
  501. phase: 'parsing',
  502. current: i + 1,
  503. total,
  504. currentFile: filePath,
  505. });
  506. const result = await this.indexFile(filePath);
  507. nodesUpdated += result.nodes.length;
  508. }
  509. return {
  510. filesChecked,
  511. filesAdded,
  512. filesModified,
  513. filesRemoved,
  514. nodesUpdated,
  515. durationMs: Date.now() - startTime,
  516. };
  517. }
  518. /**
  519. * Get files that have changed since last index
  520. */
  521. getChangedFiles(): { added: string[]; modified: string[]; removed: string[] } {
  522. const currentFiles = new Set(scanDirectory(this.rootDir, this.config));
  523. const trackedFiles = this.queries.getAllFiles();
  524. const added: string[] = [];
  525. const modified: string[] = [];
  526. const removed: string[] = [];
  527. // Find removed files
  528. for (const tracked of trackedFiles) {
  529. if (!currentFiles.has(tracked.path)) {
  530. removed.push(tracked.path);
  531. }
  532. }
  533. // Find added and modified files
  534. for (const filePath of currentFiles) {
  535. const fullPath = path.join(this.rootDir, filePath);
  536. let content: string;
  537. try {
  538. content = fs.readFileSync(fullPath, 'utf-8');
  539. } catch (error) {
  540. captureException(error, { operation: 'detect-changes-read-file', filePath });
  541. logDebug('Skipping unreadable file while detecting changes', { filePath, error: String(error) });
  542. continue;
  543. }
  544. const contentHash = hashContent(content);
  545. const tracked = trackedFiles.find((f) => f.path === filePath);
  546. if (!tracked) {
  547. added.push(filePath);
  548. } else if (tracked.contentHash !== contentHash) {
  549. modified.push(filePath);
  550. }
  551. }
  552. return { added, modified, removed };
  553. }
  554. }
  555. // Re-export useful types and functions
  556. export { extractFromSource } from './tree-sitter';
  557. export { detectLanguage, isLanguageSupported, getSupportedLanguages } from './grammars';