index.ts 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686
  1. /**
  2. * Extraction Orchestrator
  3. *
  4. * Coordinates file scanning, parsing, and database storage.
  5. */
  6. import * as fs from 'fs';
  7. import * as fsp from 'fs/promises';
  8. import * as path from 'path';
  9. import * as crypto from 'crypto';
  10. import { execFileSync } from 'child_process';
  11. import {
  12. Language,
  13. FileRecord,
  14. ExtractionResult,
  15. ExtractionError,
  16. CodeGraphConfig,
  17. } from '../types';
  18. import { QueryBuilder } from '../db/queries';
  19. import { extractFromSource } from './tree-sitter';
  20. import { detectLanguage, isLanguageSupported } from './grammars';
  21. import { logDebug } from '../errors';
  22. import { captureException } from '../sentry';
  23. import { validatePathWithinRoot, normalizePath } from '../utils';
  24. /**
  25. * Progress callback for indexing operations
  26. */
  27. export interface IndexProgress {
  28. phase: 'scanning' | 'parsing' | 'storing' | 'resolving';
  29. current: number;
  30. total: number;
  31. currentFile?: string;
  32. }
  33. /**
  34. * Result of an indexing operation
  35. */
  36. export interface IndexResult {
  37. success: boolean;
  38. filesIndexed: number;
  39. filesSkipped: number;
  40. nodesCreated: number;
  41. edgesCreated: number;
  42. errors: ExtractionError[];
  43. durationMs: number;
  44. }
  45. /**
  46. * Result of a sync operation
  47. */
  48. export interface SyncResult {
  49. filesChecked: number;
  50. filesAdded: number;
  51. filesModified: number;
  52. filesRemoved: number;
  53. nodesUpdated: number;
  54. durationMs: number;
  55. }
  56. /**
  57. * Calculate SHA256 hash of file contents
  58. */
  59. export function hashContent(content: string): string {
  60. return crypto.createHash('sha256').update(content).digest('hex');
  61. }
  62. /**
  63. * Check if a path matches any glob pattern (simplified)
  64. */
  65. function matchesGlob(filePath: string, pattern: string): boolean {
  66. // Normalize to forward slashes so Windows backslash paths match glob patterns
  67. filePath = normalizePath(filePath);
  68. // Convert glob to regex using placeholders to avoid conflicts
  69. let regexStr = pattern;
  70. // Replace glob patterns with placeholders first
  71. regexStr = regexStr.replace(/\*\*\//g, '\x00GLOBSTAR_SLASH\x00');
  72. regexStr = regexStr.replace(/\*\*/g, '\x00GLOBSTAR\x00');
  73. regexStr = regexStr.replace(/\*/g, '\x00STAR\x00');
  74. regexStr = regexStr.replace(/\?/g, '\x00QUESTION\x00');
  75. // Escape regex special characters
  76. regexStr = regexStr.replace(/[.+^${}()|[\]\\]/g, '\\$&');
  77. // Replace placeholders with regex equivalents
  78. regexStr = regexStr.replace(/\x00GLOBSTAR_SLASH\x00/g, '(?:.*/)?'); // **/ = zero or more dirs
  79. regexStr = regexStr.replace(/\x00GLOBSTAR\x00/g, '.*'); // ** = anything
  80. regexStr = regexStr.replace(/\x00STAR\x00/g, '[^/]*'); // * = anything except /
  81. regexStr = regexStr.replace(/\x00QUESTION\x00/g, '.'); // ? = single char
  82. const regex = new RegExp(`^${regexStr}$`);
  83. return regex.test(filePath);
  84. }
  85. /**
  86. * Check if a file should be included based on config
  87. */
  88. export function shouldIncludeFile(
  89. filePath: string,
  90. config: CodeGraphConfig
  91. ): boolean {
  92. // Check exclude patterns first
  93. for (const pattern of config.exclude) {
  94. if (matchesGlob(filePath, pattern)) {
  95. return false;
  96. }
  97. }
  98. // Check include patterns
  99. for (const pattern of config.include) {
  100. if (matchesGlob(filePath, pattern)) {
  101. return true;
  102. }
  103. }
  104. return false;
  105. }
  106. /**
  107. * Get directories ignored by .gitignore using git ls-files.
  108. * Returns a Set of normalized relative directory paths (forward slashes, no trailing slash).
  109. * Gracefully returns empty Set on any failure.
  110. */
  111. function getGitIgnoredDirectories(rootDir: string): Set<string> {
  112. try {
  113. const output = execFileSync(
  114. 'git',
  115. ['ls-files', '-oi', '--exclude-standard', '--directory'],
  116. { cwd: rootDir, encoding: 'utf-8', timeout: 10000, stdio: ['pipe', 'pipe', 'pipe'] }
  117. );
  118. const dirs = new Set<string>();
  119. for (const line of output.split('\n')) {
  120. const trimmed = line.trim();
  121. if (trimmed.endsWith('/')) {
  122. dirs.add(normalizePath(trimmed.slice(0, -1)));
  123. }
  124. }
  125. return dirs;
  126. } catch {
  127. return new Set<string>();
  128. }
  129. }
  130. /**
  131. * Marker file name that indicates a directory (and all children) should be skipped
  132. */
  133. const CODEGRAPH_IGNORE_MARKER = '.codegraphignore';
  134. /**
  135. * Recursively scan directory for source files
  136. */
  137. export function scanDirectory(
  138. rootDir: string,
  139. config: CodeGraphConfig,
  140. onProgress?: (current: number, file: string) => void
  141. ): string[] {
  142. const files: string[] = [];
  143. let count = 0;
  144. const visitedRealPaths = new Set<string>(); // Symlink cycle detection
  145. const gitIgnoredDirs = getGitIgnoredDirectories(rootDir);
  146. function walk(dir: string): void {
  147. // Symlink cycle detection: resolve real path and skip if already visited
  148. try {
  149. const realDir = fs.realpathSync(dir);
  150. if (visitedRealPaths.has(realDir)) {
  151. logDebug('Skipping directory to prevent symlink cycle', { dir, realDir });
  152. return;
  153. }
  154. visitedRealPaths.add(realDir);
  155. } catch {
  156. // If realpath fails, skip this directory
  157. return;
  158. }
  159. // Check for .codegraphignore marker file - skip entire directory tree if present
  160. const ignoreMarker = path.join(dir, CODEGRAPH_IGNORE_MARKER);
  161. if (fs.existsSync(ignoreMarker)) {
  162. logDebug('Skipping directory due to .codegraphignore marker', { dir });
  163. return;
  164. }
  165. let entries: fs.Dirent[];
  166. try {
  167. entries = fs.readdirSync(dir, { withFileTypes: true });
  168. } catch (error) {
  169. captureException(error, { operation: 'walk-directory', dir });
  170. logDebug('Skipping unreadable directory', { dir, error: String(error) });
  171. return;
  172. }
  173. for (const entry of entries) {
  174. const fullPath = path.join(dir, entry.name);
  175. const relativePath = normalizePath(path.relative(rootDir, fullPath));
  176. // Follow symlinked directories, but skip symlinked files to non-project targets
  177. if (entry.isSymbolicLink()) {
  178. try {
  179. const realTarget = fs.realpathSync(fullPath);
  180. const stat = fs.statSync(realTarget);
  181. if (stat.isDirectory()) {
  182. // Check gitignore first (fast O(1) lookup)
  183. if (gitIgnoredDirs.has(relativePath)) {
  184. continue;
  185. }
  186. // Check exclusion, then recurse (cycle detection handles the rest)
  187. const dirPattern = relativePath + '/';
  188. let excluded = false;
  189. for (const pattern of config.exclude) {
  190. if (matchesGlob(dirPattern, pattern) || matchesGlob(relativePath, pattern)) {
  191. excluded = true;
  192. break;
  193. }
  194. }
  195. if (!excluded) {
  196. walk(fullPath);
  197. }
  198. } else if (stat.isFile()) {
  199. if (shouldIncludeFile(relativePath, config)) {
  200. files.push(relativePath);
  201. count++;
  202. if (onProgress) {
  203. onProgress(count, relativePath);
  204. }
  205. }
  206. }
  207. } catch {
  208. logDebug('Skipping broken symlink', { path: fullPath });
  209. }
  210. continue;
  211. }
  212. if (entry.isDirectory()) {
  213. // Check gitignore first (fast O(1) lookup)
  214. if (gitIgnoredDirs.has(relativePath)) {
  215. continue;
  216. }
  217. // Check if directory should be excluded
  218. const dirPattern = relativePath + '/';
  219. let excluded = false;
  220. for (const pattern of config.exclude) {
  221. if (matchesGlob(dirPattern, pattern) || matchesGlob(relativePath, pattern)) {
  222. excluded = true;
  223. break;
  224. }
  225. }
  226. if (!excluded) {
  227. walk(fullPath);
  228. }
  229. } else if (entry.isFile()) {
  230. if (shouldIncludeFile(relativePath, config)) {
  231. files.push(relativePath);
  232. count++;
  233. if (onProgress) {
  234. onProgress(count, relativePath);
  235. }
  236. }
  237. }
  238. }
  239. }
  240. walk(rootDir);
  241. return files;
  242. }
  243. /**
  244. * Extraction orchestrator
  245. */
  246. export class ExtractionOrchestrator {
  247. private rootDir: string;
  248. private config: CodeGraphConfig;
  249. private queries: QueryBuilder;
  250. constructor(rootDir: string, config: CodeGraphConfig, queries: QueryBuilder) {
  251. this.rootDir = rootDir;
  252. this.config = config;
  253. this.queries = queries;
  254. }
  255. /**
  256. * Index all files in the project
  257. */
  258. async indexAll(
  259. onProgress?: (progress: IndexProgress) => void,
  260. signal?: AbortSignal
  261. ): Promise<IndexResult> {
  262. const startTime = Date.now();
  263. const errors: ExtractionError[] = [];
  264. let filesIndexed = 0;
  265. let filesSkipped = 0;
  266. let totalNodes = 0;
  267. let totalEdges = 0;
  268. // Phase 1: Scan for files
  269. onProgress?.({
  270. phase: 'scanning',
  271. current: 0,
  272. total: 0,
  273. });
  274. const files = scanDirectory(this.rootDir, this.config, (current, file) => {
  275. onProgress?.({
  276. phase: 'scanning',
  277. current,
  278. total: 0,
  279. currentFile: file,
  280. });
  281. });
  282. if (signal?.aborted) {
  283. return {
  284. success: false,
  285. filesIndexed: 0,
  286. filesSkipped: 0,
  287. nodesCreated: 0,
  288. edgesCreated: 0,
  289. errors: [{ message: 'Aborted', severity: 'error' }],
  290. durationMs: Date.now() - startTime,
  291. };
  292. }
  293. // Phase 2: Parse files
  294. const total = files.length;
  295. for (let i = 0; i < files.length; i++) {
  296. if (signal?.aborted) {
  297. return {
  298. success: false,
  299. filesIndexed,
  300. filesSkipped,
  301. nodesCreated: totalNodes,
  302. edgesCreated: totalEdges,
  303. errors: [{ message: 'Aborted', severity: 'error' }, ...errors],
  304. durationMs: Date.now() - startTime,
  305. };
  306. }
  307. const filePath = files[i]!;
  308. onProgress?.({
  309. phase: 'parsing',
  310. current: i + 1,
  311. total,
  312. currentFile: filePath,
  313. });
  314. const result = await this.indexFile(filePath);
  315. if (result.errors.length > 0) {
  316. errors.push(...result.errors);
  317. }
  318. if (result.nodes.length > 0) {
  319. filesIndexed++;
  320. totalNodes += result.nodes.length;
  321. totalEdges += result.edges.length;
  322. } else if (result.errors.length === 0) {
  323. filesSkipped++;
  324. }
  325. }
  326. // Phase 3: Resolve references
  327. onProgress?.({
  328. phase: 'resolving',
  329. current: 0,
  330. total: 1,
  331. });
  332. // TODO: Implement reference resolution in Phase 3
  333. return {
  334. success: errors.filter((e) => e.severity === 'error').length === 0,
  335. filesIndexed,
  336. filesSkipped,
  337. nodesCreated: totalNodes,
  338. edgesCreated: totalEdges,
  339. errors,
  340. durationMs: Date.now() - startTime,
  341. };
  342. }
  343. /**
  344. * Index specific files
  345. */
  346. async indexFiles(filePaths: string[]): Promise<IndexResult> {
  347. const startTime = Date.now();
  348. const errors: ExtractionError[] = [];
  349. let filesIndexed = 0;
  350. let filesSkipped = 0;
  351. let totalNodes = 0;
  352. let totalEdges = 0;
  353. for (const filePath of filePaths) {
  354. const result = await this.indexFile(filePath);
  355. if (result.errors.length > 0) {
  356. errors.push(...result.errors);
  357. }
  358. if (result.nodes.length > 0) {
  359. filesIndexed++;
  360. totalNodes += result.nodes.length;
  361. totalEdges += result.edges.length;
  362. } else {
  363. filesSkipped++;
  364. }
  365. }
  366. return {
  367. success: errors.filter((e) => e.severity === 'error').length === 0,
  368. filesIndexed,
  369. filesSkipped,
  370. nodesCreated: totalNodes,
  371. edgesCreated: totalEdges,
  372. errors,
  373. durationMs: Date.now() - startTime,
  374. };
  375. }
  376. /**
  377. * Index a single file
  378. */
  379. async indexFile(relativePath: string): Promise<ExtractionResult> {
  380. const fullPath = validatePathWithinRoot(this.rootDir, relativePath);
  381. if (!fullPath) {
  382. return {
  383. nodes: [],
  384. edges: [],
  385. unresolvedReferences: [],
  386. errors: [{ message: `Path traversal blocked: ${relativePath}`, severity: 'error' }],
  387. durationMs: 0,
  388. };
  389. }
  390. // Check file exists and is readable
  391. let content: string;
  392. let stats: fs.Stats;
  393. try {
  394. stats = await fsp.stat(fullPath);
  395. content = await fsp.readFile(fullPath, 'utf-8');
  396. } catch (error) {
  397. captureException(error, { operation: 'extract-file', filePath: fullPath });
  398. return {
  399. nodes: [],
  400. edges: [],
  401. unresolvedReferences: [],
  402. errors: [
  403. {
  404. message: `Failed to read file: ${error instanceof Error ? error.message : String(error)}`,
  405. severity: 'error',
  406. },
  407. ],
  408. durationMs: 0,
  409. };
  410. }
  411. // Check file size
  412. if (stats.size > this.config.maxFileSize) {
  413. return {
  414. nodes: [],
  415. edges: [],
  416. unresolvedReferences: [],
  417. errors: [
  418. {
  419. message: `File exceeds max size (${stats.size} > ${this.config.maxFileSize})`,
  420. severity: 'warning',
  421. },
  422. ],
  423. durationMs: 0,
  424. };
  425. }
  426. // Detect language
  427. const language = detectLanguage(relativePath);
  428. if (!isLanguageSupported(language)) {
  429. return {
  430. nodes: [],
  431. edges: [],
  432. unresolvedReferences: [],
  433. errors: [],
  434. durationMs: 0,
  435. };
  436. }
  437. // Extract from source
  438. const result = extractFromSource(relativePath, content, language);
  439. // Store in database
  440. if (result.nodes.length > 0 || result.errors.length === 0) {
  441. this.storeExtractionResult(relativePath, content, language, stats, result);
  442. }
  443. return result;
  444. }
  445. /**
  446. * Store extraction result in database
  447. */
  448. private storeExtractionResult(
  449. filePath: string,
  450. content: string,
  451. language: Language,
  452. stats: fs.Stats,
  453. result: ExtractionResult
  454. ): void {
  455. const contentHash = hashContent(content);
  456. // Check if file already exists and hasn't changed
  457. const existingFile = this.queries.getFileByPath(filePath);
  458. if (existingFile && existingFile.contentHash === contentHash) {
  459. return; // No changes
  460. }
  461. // Delete existing data for this file
  462. if (existingFile) {
  463. this.queries.deleteFile(filePath);
  464. }
  465. // Insert nodes
  466. if (result.nodes.length > 0) {
  467. this.queries.insertNodes(result.nodes);
  468. }
  469. // Insert edges
  470. if (result.edges.length > 0) {
  471. this.queries.insertEdges(result.edges);
  472. }
  473. // Insert unresolved references in batch with denormalized filePath/language
  474. if (result.unresolvedReferences.length > 0) {
  475. const refsWithContext = result.unresolvedReferences.map((ref) => ({
  476. ...ref,
  477. filePath: ref.filePath ?? filePath,
  478. language: ref.language ?? language,
  479. }));
  480. this.queries.insertUnresolvedRefsBatch(refsWithContext);
  481. }
  482. // Insert file record
  483. const fileRecord: FileRecord = {
  484. path: filePath,
  485. contentHash,
  486. language,
  487. size: stats.size,
  488. modifiedAt: stats.mtimeMs,
  489. indexedAt: Date.now(),
  490. nodeCount: result.nodes.length,
  491. errors: result.errors.length > 0 ? result.errors : undefined,
  492. };
  493. this.queries.upsertFile(fileRecord);
  494. }
  495. /**
  496. * Sync with current file state
  497. */
  498. async sync(onProgress?: (progress: IndexProgress) => void): Promise<SyncResult> {
  499. const startTime = Date.now();
  500. let filesChecked = 0;
  501. let filesAdded = 0;
  502. let filesModified = 0;
  503. let filesRemoved = 0;
  504. let nodesUpdated = 0;
  505. // Get current files on disk
  506. onProgress?.({
  507. phase: 'scanning',
  508. current: 0,
  509. total: 0,
  510. });
  511. const currentFiles = new Set(scanDirectory(this.rootDir, this.config));
  512. filesChecked = currentFiles.size;
  513. // Get tracked files from database
  514. const trackedFiles = this.queries.getAllFiles();
  515. // Find files to remove (in DB but not on disk)
  516. for (const tracked of trackedFiles) {
  517. if (!currentFiles.has(tracked.path)) {
  518. this.queries.deleteFile(tracked.path);
  519. filesRemoved++;
  520. }
  521. }
  522. // Find files to add or update
  523. const filesToIndex: string[] = [];
  524. for (const filePath of currentFiles) {
  525. const fullPath = path.join(this.rootDir, filePath);
  526. let content: string;
  527. try {
  528. content = fs.readFileSync(fullPath, 'utf-8');
  529. } catch (error) {
  530. captureException(error, { operation: 'sync-read-file', filePath });
  531. logDebug('Skipping unreadable file during sync', { filePath, error: String(error) });
  532. continue;
  533. }
  534. const contentHash = hashContent(content);
  535. const tracked = trackedFiles.find((f) => f.path === filePath);
  536. if (!tracked) {
  537. // New file
  538. filesToIndex.push(filePath);
  539. filesAdded++;
  540. } else if (tracked.contentHash !== contentHash) {
  541. // Modified file
  542. filesToIndex.push(filePath);
  543. filesModified++;
  544. }
  545. }
  546. // Index changed files
  547. const total = filesToIndex.length;
  548. for (let i = 0; i < filesToIndex.length; i++) {
  549. const filePath = filesToIndex[i]!;
  550. onProgress?.({
  551. phase: 'parsing',
  552. current: i + 1,
  553. total,
  554. currentFile: filePath,
  555. });
  556. const result = await this.indexFile(filePath);
  557. nodesUpdated += result.nodes.length;
  558. }
  559. return {
  560. filesChecked,
  561. filesAdded,
  562. filesModified,
  563. filesRemoved,
  564. nodesUpdated,
  565. durationMs: Date.now() - startTime,
  566. };
  567. }
  568. /**
  569. * Get files that have changed since last index
  570. */
  571. getChangedFiles(): { added: string[]; modified: string[]; removed: string[] } {
  572. const currentFiles = new Set(scanDirectory(this.rootDir, this.config));
  573. const trackedFiles = this.queries.getAllFiles();
  574. const added: string[] = [];
  575. const modified: string[] = [];
  576. const removed: string[] = [];
  577. // Find removed files
  578. for (const tracked of trackedFiles) {
  579. if (!currentFiles.has(tracked.path)) {
  580. removed.push(tracked.path);
  581. }
  582. }
  583. // Find added and modified files
  584. for (const filePath of currentFiles) {
  585. const fullPath = path.join(this.rootDir, filePath);
  586. let content: string;
  587. try {
  588. content = fs.readFileSync(fullPath, 'utf-8');
  589. } catch (error) {
  590. captureException(error, { operation: 'detect-changes-read-file', filePath });
  591. logDebug('Skipping unreadable file while detecting changes', { filePath, error: String(error) });
  592. continue;
  593. }
  594. const contentHash = hashContent(content);
  595. const tracked = trackedFiles.find((f) => f.path === filePath);
  596. if (!tracked) {
  597. added.push(filePath);
  598. } else if (tracked.contentHash !== contentHash) {
  599. modified.push(filePath);
  600. }
  601. }
  602. return { added, modified, removed };
  603. }
  604. }
  605. // Re-export useful types and functions
  606. export { extractFromSource } from './tree-sitter';
  607. export { detectLanguage, isLanguageSupported, getSupportedLanguages } from './grammars';