index.ts 28 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968
  1. /**
  2. * Extraction Orchestrator
  3. *
  4. * Coordinates file scanning, parsing, and database storage.
  5. */
  6. import * as fs from 'fs';
  7. import * as fsp from 'fs/promises';
  8. import * as path from 'path';
  9. import * as crypto from 'crypto';
  10. import { execFileSync } from 'child_process';
  11. import {
  12. Language,
  13. FileRecord,
  14. ExtractionResult,
  15. ExtractionError,
  16. CodeGraphConfig,
  17. } from '../types';
  18. import { QueryBuilder } from '../db/queries';
  19. import { extractFromSource } from './tree-sitter';
  20. import { detectLanguage, isLanguageSupported, initGrammars, loadGrammarsForLanguages } from './grammars';
  21. import { logDebug, logWarn } from '../errors';
  22. import { validatePathWithinRoot, normalizePath } from '../utils';
  23. import picomatch from 'picomatch';
  24. /**
  25. * Number of files to read in parallel during indexing.
  26. * File reads are I/O-bound; batching overlaps I/O wait with CPU parse work.
  27. */
  28. const FILE_IO_BATCH_SIZE = 10;
  29. /**
  30. * Progress callback for indexing operations
  31. */
  32. export interface IndexProgress {
  33. phase: 'scanning' | 'parsing' | 'storing' | 'resolving';
  34. current: number;
  35. total: number;
  36. currentFile?: string;
  37. }
  38. /**
  39. * Result of an indexing operation
  40. */
  41. export interface IndexResult {
  42. success: boolean;
  43. filesIndexed: number;
  44. filesSkipped: number;
  45. nodesCreated: number;
  46. edgesCreated: number;
  47. errors: ExtractionError[];
  48. durationMs: number;
  49. }
  50. /**
  51. * Result of a sync operation
  52. */
  53. export interface SyncResult {
  54. filesChecked: number;
  55. filesAdded: number;
  56. filesModified: number;
  57. filesRemoved: number;
  58. nodesUpdated: number;
  59. durationMs: number;
  60. changedFilePaths?: string[];
  61. }
  62. /**
  63. * Calculate SHA256 hash of file contents
  64. */
  65. export function hashContent(content: string): string {
  66. return crypto.createHash('sha256').update(content).digest('hex');
  67. }
  68. /**
  69. * Check if a path matches any glob pattern (simplified)
  70. */
  71. function matchesGlob(filePath: string, pattern: string): boolean {
  72. filePath = normalizePath(filePath);
  73. return picomatch.isMatch(filePath, pattern, { dot: true });
  74. }
  75. /**
  76. * Check if a file should be included based on config
  77. */
  78. export function shouldIncludeFile(
  79. filePath: string,
  80. config: CodeGraphConfig
  81. ): boolean {
  82. // Check exclude patterns first
  83. for (const pattern of config.exclude) {
  84. if (matchesGlob(filePath, pattern)) {
  85. return false;
  86. }
  87. }
  88. // Check include patterns
  89. for (const pattern of config.include) {
  90. if (matchesGlob(filePath, pattern)) {
  91. return true;
  92. }
  93. }
  94. return false;
  95. }
  96. /**
  97. * Get all files visible to git (tracked + untracked but not ignored).
  98. * Respects .gitignore at all levels (root, subdirectories).
  99. * Returns null on failure (non-git project) so callers can fall back.
  100. */
  101. function getGitVisibleFiles(rootDir: string): Set<string> | null {
  102. try {
  103. // Check if the project directory is gitignored by a parent repo.
  104. // When rootDir lives inside a parent git repo that ignores it,
  105. // `git ls-files` returns nothing — fall back to filesystem walk.
  106. const gitRoot = execFileSync(
  107. 'git',
  108. ['rev-parse', '--show-toplevel'],
  109. { cwd: rootDir, encoding: 'utf-8', timeout: 5000, stdio: ['pipe', 'pipe', 'pipe'] }
  110. ).trim();
  111. if (path.resolve(gitRoot) !== path.resolve(rootDir)) {
  112. try {
  113. // git check-ignore exits 0 if the path IS ignored, 1 if not
  114. execFileSync(
  115. 'git',
  116. ['check-ignore', '-q', path.resolve(rootDir)],
  117. { cwd: rootDir, encoding: 'utf-8', timeout: 5000, stdio: ['pipe', 'pipe', 'pipe'] }
  118. );
  119. // Directory is gitignored by parent repo — fall back to filesystem walk
  120. return null;
  121. } catch {
  122. // Not ignored — safe to use git ls-files
  123. }
  124. }
  125. // -c = cached (tracked), -o = others (untracked), --exclude-standard = respect .gitignore
  126. const output = execFileSync(
  127. 'git',
  128. ['ls-files', '-co', '--exclude-standard'],
  129. { cwd: rootDir, encoding: 'utf-8', timeout: 30000, maxBuffer: 50 * 1024 * 1024, stdio: ['pipe', 'pipe', 'pipe'] }
  130. );
  131. const files = new Set<string>();
  132. for (const line of output.split('\n')) {
  133. const trimmed = line.trim();
  134. if (trimmed) {
  135. files.add(normalizePath(trimmed));
  136. }
  137. }
  138. return files;
  139. } catch {
  140. return null;
  141. }
  142. }
  143. /**
  144. * Result of git-based change detection.
  145. * Returns null when git is unavailable (non-git project or command failure),
  146. * signaling the caller to fall back to full filesystem scan.
  147. */
  148. interface GitChanges {
  149. modified: string[]; // M, MM, AM — files to re-hash + re-index
  150. added: string[]; // ?? — new untracked files to index
  151. deleted: string[]; // D — files to remove from DB
  152. }
  153. /**
  154. * Use `git status` to detect changed files instead of scanning every file.
  155. * Returns null on failure so callers fall back to full scan.
  156. */
  157. function getGitChangedFiles(rootDir: string, config: CodeGraphConfig): GitChanges | null {
  158. try {
  159. const output = execFileSync(
  160. 'git',
  161. ['status', '--porcelain', '--no-renames'],
  162. { cwd: rootDir, encoding: 'utf-8', timeout: 10000, stdio: ['pipe', 'pipe', 'pipe'] }
  163. );
  164. const modified: string[] = [];
  165. const added: string[] = [];
  166. const deleted: string[] = [];
  167. for (const line of output.split('\n')) {
  168. if (line.length < 4) continue; // Minimum: "XY file"
  169. const statusCode = line.substring(0, 2);
  170. const filePath = normalizePath(line.substring(3));
  171. // Skip files that don't match include/exclude config
  172. if (!shouldIncludeFile(filePath, config)) continue;
  173. if (statusCode === '??') {
  174. added.push(filePath);
  175. } else if (statusCode.includes('D')) {
  176. deleted.push(filePath);
  177. } else {
  178. // M, MM, AM, A (staged), etc. — treat as modified
  179. modified.push(filePath);
  180. }
  181. }
  182. return { modified, added, deleted };
  183. } catch {
  184. return null;
  185. }
  186. }
  187. /**
  188. * Marker file name that indicates a directory (and all children) should be skipped
  189. */
  190. const CODEGRAPH_IGNORE_MARKER = '.codegraphignore';
  191. /**
  192. * Recursively scan directory for source files.
  193. *
  194. * In git repos, uses `git ls-files` to get the file list (inherently
  195. * respects .gitignore at all levels), then filters by config include patterns.
  196. * Falls back to filesystem walk for non-git projects.
  197. */
  198. export function scanDirectory(
  199. rootDir: string,
  200. config: CodeGraphConfig,
  201. onProgress?: (current: number, file: string) => void
  202. ): string[] {
  203. // Fast path: use git to get all visible files (respects .gitignore everywhere)
  204. const gitFiles = getGitVisibleFiles(rootDir);
  205. if (gitFiles) {
  206. const files: string[] = [];
  207. let count = 0;
  208. for (const filePath of gitFiles) {
  209. if (shouldIncludeFile(filePath, config)) {
  210. files.push(filePath);
  211. count++;
  212. onProgress?.(count, filePath);
  213. }
  214. }
  215. return files;
  216. }
  217. // Fallback: walk filesystem for non-git projects
  218. return scanDirectoryWalk(rootDir, config, onProgress);
  219. }
  220. /**
  221. * Filesystem walk fallback for non-git projects.
  222. */
  223. function scanDirectoryWalk(
  224. rootDir: string,
  225. config: CodeGraphConfig,
  226. onProgress?: (current: number, file: string) => void
  227. ): string[] {
  228. const files: string[] = [];
  229. let count = 0;
  230. const visitedDirs = new Set<string>();
  231. function walk(dir: string): void {
  232. let realDir: string;
  233. try {
  234. realDir = fs.realpathSync(dir);
  235. } catch {
  236. logDebug('Skipping unresolvable directory', { dir });
  237. return;
  238. }
  239. if (visitedDirs.has(realDir)) {
  240. logDebug('Skipping already-visited directory (symlink cycle)', { dir, realDir });
  241. return;
  242. }
  243. visitedDirs.add(realDir);
  244. // Check for .codegraphignore marker file
  245. const ignoreMarker = path.join(dir, CODEGRAPH_IGNORE_MARKER);
  246. if (fs.existsSync(ignoreMarker)) {
  247. logDebug('Skipping directory due to .codegraphignore marker', { dir });
  248. return;
  249. }
  250. let entries: fs.Dirent[];
  251. try {
  252. entries = fs.readdirSync(dir, { withFileTypes: true });
  253. } catch (error) {
  254. logDebug('Skipping unreadable directory', { dir, error: String(error) });
  255. return;
  256. }
  257. for (const entry of entries) {
  258. const fullPath = path.join(dir, entry.name);
  259. const relativePath = normalizePath(path.relative(rootDir, fullPath));
  260. if (entry.isSymbolicLink()) {
  261. try {
  262. const realTarget = fs.realpathSync(fullPath);
  263. const stat = fs.statSync(realTarget);
  264. if (stat.isDirectory()) {
  265. const dirPattern = relativePath + '/';
  266. let excluded = false;
  267. for (const pattern of config.exclude) {
  268. if (matchesGlob(dirPattern, pattern) || matchesGlob(relativePath, pattern)) {
  269. excluded = true;
  270. break;
  271. }
  272. }
  273. if (!excluded) {
  274. walk(fullPath);
  275. }
  276. } else if (stat.isFile()) {
  277. if (shouldIncludeFile(relativePath, config)) {
  278. files.push(relativePath);
  279. count++;
  280. onProgress?.(count, relativePath);
  281. }
  282. }
  283. } catch {
  284. logDebug('Skipping broken symlink', { path: fullPath });
  285. }
  286. continue;
  287. }
  288. if (entry.isDirectory()) {
  289. const dirPattern = relativePath + '/';
  290. let excluded = false;
  291. for (const pattern of config.exclude) {
  292. if (matchesGlob(dirPattern, pattern) || matchesGlob(relativePath, pattern)) {
  293. excluded = true;
  294. break;
  295. }
  296. }
  297. if (!excluded) {
  298. walk(fullPath);
  299. }
  300. } else if (entry.isFile()) {
  301. if (shouldIncludeFile(relativePath, config)) {
  302. files.push(relativePath);
  303. count++;
  304. onProgress?.(count, relativePath);
  305. }
  306. }
  307. }
  308. }
  309. walk(rootDir);
  310. return files;
  311. }
  312. /**
  313. * Extraction orchestrator
  314. */
  315. export class ExtractionOrchestrator {
  316. private rootDir: string;
  317. private config: CodeGraphConfig;
  318. private queries: QueryBuilder;
  319. constructor(rootDir: string, config: CodeGraphConfig, queries: QueryBuilder) {
  320. this.rootDir = rootDir;
  321. this.config = config;
  322. this.queries = queries;
  323. }
  324. /**
  325. * Index all files in the project
  326. */
  327. async indexAll(
  328. onProgress?: (progress: IndexProgress) => void,
  329. signal?: AbortSignal
  330. ): Promise<IndexResult> {
  331. await initGrammars();
  332. const startTime = Date.now();
  333. const errors: ExtractionError[] = [];
  334. let filesIndexed = 0;
  335. let filesSkipped = 0;
  336. let totalNodes = 0;
  337. let totalEdges = 0;
  338. // Phase 1: Scan for files
  339. onProgress?.({
  340. phase: 'scanning',
  341. current: 0,
  342. total: 0,
  343. });
  344. const files = scanDirectory(this.rootDir, this.config, (current, file) => {
  345. onProgress?.({
  346. phase: 'scanning',
  347. current,
  348. total: 0,
  349. currentFile: file,
  350. });
  351. });
  352. if (signal?.aborted) {
  353. return {
  354. success: false,
  355. filesIndexed: 0,
  356. filesSkipped: 0,
  357. nodesCreated: 0,
  358. edgesCreated: 0,
  359. errors: [{ message: 'Aborted', severity: 'error' }],
  360. durationMs: Date.now() - startTime,
  361. };
  362. }
  363. // Load only the grammars needed for languages actually present in the project.
  364. // This avoids compiling all 16+ WASM grammar modules upfront, which can cause
  365. // V8 WASM Zone OOM on large codebases (see issue #54).
  366. const neededLanguages = [...new Set(files.map((f) => detectLanguage(f)))];
  367. await loadGrammarsForLanguages(neededLanguages);
  368. // Phase 2: Parse files (read in parallel batches, parse/store sequentially)
  369. const total = files.length;
  370. let processed = 0;
  371. for (let i = 0; i < files.length; i += FILE_IO_BATCH_SIZE) {
  372. if (signal?.aborted) {
  373. return {
  374. success: false,
  375. filesIndexed,
  376. filesSkipped,
  377. nodesCreated: totalNodes,
  378. edgesCreated: totalEdges,
  379. errors: [{ message: 'Aborted', severity: 'error' }, ...errors],
  380. durationMs: Date.now() - startTime,
  381. };
  382. }
  383. const batch = files.slice(i, i + FILE_IO_BATCH_SIZE);
  384. // Read files in parallel (with path validation before any I/O)
  385. const fileContents = await Promise.all(
  386. batch.map(async (fp) => {
  387. try {
  388. const fullPath = validatePathWithinRoot(this.rootDir, fp);
  389. if (!fullPath) {
  390. logWarn('Path traversal blocked in batch reader', { filePath: fp });
  391. return { filePath: fp, content: null as string | null, stats: null as fs.Stats | null, error: new Error('Path traversal blocked') };
  392. }
  393. const content = await fsp.readFile(fullPath, 'utf-8');
  394. const stats = await fsp.stat(fullPath);
  395. return { filePath: fp, content, stats, error: null as Error | null };
  396. } catch (err) {
  397. return { filePath: fp, content: null as string | null, stats: null as fs.Stats | null, error: err as Error };
  398. }
  399. })
  400. );
  401. // Parse and store sequentially
  402. for (const { filePath, content, stats, error } of fileContents) {
  403. if (signal?.aborted) {
  404. return {
  405. success: false,
  406. filesIndexed,
  407. filesSkipped,
  408. nodesCreated: totalNodes,
  409. edgesCreated: totalEdges,
  410. errors: [{ message: 'Aborted', severity: 'error' }, ...errors],
  411. durationMs: Date.now() - startTime,
  412. };
  413. }
  414. processed++;
  415. onProgress?.({
  416. phase: 'parsing',
  417. current: processed,
  418. total,
  419. currentFile: filePath,
  420. });
  421. if (error || content === null || stats === null) {
  422. errors.push({
  423. message: `Failed to read file: ${error instanceof Error ? error.message : String(error)}`,
  424. severity: 'error',
  425. });
  426. continue;
  427. }
  428. const result = await this.indexFileWithContent(filePath, content, stats);
  429. if (result.errors.length > 0) {
  430. errors.push(...result.errors);
  431. }
  432. if (result.nodes.length > 0) {
  433. filesIndexed++;
  434. totalNodes += result.nodes.length;
  435. totalEdges += result.edges.length;
  436. } else if (result.errors.length === 0) {
  437. filesSkipped++;
  438. }
  439. }
  440. }
  441. // Phase 3: Resolve references
  442. onProgress?.({
  443. phase: 'resolving',
  444. current: 0,
  445. total: 1,
  446. });
  447. // TODO: Implement reference resolution in Phase 3
  448. return {
  449. success: errors.filter((e) => e.severity === 'error').length === 0,
  450. filesIndexed,
  451. filesSkipped,
  452. nodesCreated: totalNodes,
  453. edgesCreated: totalEdges,
  454. errors,
  455. durationMs: Date.now() - startTime,
  456. };
  457. }
  458. /**
  459. * Index specific files
  460. */
  461. async indexFiles(filePaths: string[]): Promise<IndexResult> {
  462. const startTime = Date.now();
  463. const errors: ExtractionError[] = [];
  464. let filesIndexed = 0;
  465. let filesSkipped = 0;
  466. let totalNodes = 0;
  467. let totalEdges = 0;
  468. for (const filePath of filePaths) {
  469. const result = await this.indexFile(filePath);
  470. if (result.errors.length > 0) {
  471. errors.push(...result.errors);
  472. }
  473. if (result.nodes.length > 0) {
  474. filesIndexed++;
  475. totalNodes += result.nodes.length;
  476. totalEdges += result.edges.length;
  477. } else {
  478. filesSkipped++;
  479. }
  480. }
  481. return {
  482. success: errors.filter((e) => e.severity === 'error').length === 0,
  483. filesIndexed,
  484. filesSkipped,
  485. nodesCreated: totalNodes,
  486. edgesCreated: totalEdges,
  487. errors,
  488. durationMs: Date.now() - startTime,
  489. };
  490. }
  491. /**
  492. * Index a single file
  493. */
  494. async indexFile(relativePath: string): Promise<ExtractionResult> {
  495. const fullPath = validatePathWithinRoot(this.rootDir, relativePath);
  496. if (!fullPath) {
  497. return {
  498. nodes: [],
  499. edges: [],
  500. unresolvedReferences: [],
  501. errors: [{ message: `Path traversal blocked: ${relativePath}`, severity: 'error' }],
  502. durationMs: 0,
  503. };
  504. }
  505. // Read file content and stats
  506. let content: string;
  507. let stats: fs.Stats;
  508. try {
  509. stats = await fsp.stat(fullPath);
  510. content = await fsp.readFile(fullPath, 'utf-8');
  511. } catch (error) {
  512. return {
  513. nodes: [],
  514. edges: [],
  515. unresolvedReferences: [],
  516. errors: [
  517. {
  518. message: `Failed to read file: ${error instanceof Error ? error.message : String(error)}`,
  519. severity: 'error',
  520. },
  521. ],
  522. durationMs: 0,
  523. };
  524. }
  525. return this.indexFileWithContent(relativePath, content, stats);
  526. }
  527. /**
  528. * Index a single file with pre-read content and stats.
  529. * Used by the parallel batch reader to avoid redundant file I/O.
  530. */
  531. async indexFileWithContent(
  532. relativePath: string,
  533. content: string,
  534. stats: fs.Stats
  535. ): Promise<ExtractionResult> {
  536. // Prevent path traversal
  537. const fullPath = validatePathWithinRoot(this.rootDir, relativePath);
  538. if (!fullPath) {
  539. logWarn('Path traversal blocked in indexFileWithContent', { relativePath });
  540. return {
  541. nodes: [],
  542. edges: [],
  543. unresolvedReferences: [],
  544. errors: [{ message: 'Path traversal blocked', severity: 'error' }],
  545. durationMs: 0,
  546. };
  547. }
  548. // Check file size
  549. if (stats.size > this.config.maxFileSize) {
  550. return {
  551. nodes: [],
  552. edges: [],
  553. unresolvedReferences: [],
  554. errors: [
  555. {
  556. message: `File exceeds max size (${stats.size} > ${this.config.maxFileSize})`,
  557. severity: 'warning',
  558. },
  559. ],
  560. durationMs: 0,
  561. };
  562. }
  563. // Detect language
  564. const language = detectLanguage(relativePath);
  565. if (!isLanguageSupported(language)) {
  566. return {
  567. nodes: [],
  568. edges: [],
  569. unresolvedReferences: [],
  570. errors: [],
  571. durationMs: 0,
  572. };
  573. }
  574. // Extract from source
  575. const result = extractFromSource(relativePath, content, language);
  576. // Store in database
  577. if (result.nodes.length > 0 || result.errors.length === 0) {
  578. this.storeExtractionResult(relativePath, content, language, stats, result);
  579. }
  580. return result;
  581. }
  582. /**
  583. * Store extraction result in database
  584. */
  585. private storeExtractionResult(
  586. filePath: string,
  587. content: string,
  588. language: Language,
  589. stats: fs.Stats,
  590. result: ExtractionResult
  591. ): void {
  592. const contentHash = hashContent(content);
  593. // Check if file already exists and hasn't changed
  594. const existingFile = this.queries.getFileByPath(filePath);
  595. if (existingFile && existingFile.contentHash === contentHash) {
  596. return; // No changes
  597. }
  598. // Delete existing data for this file
  599. if (existingFile) {
  600. this.queries.deleteFile(filePath);
  601. }
  602. // Filter out nodes with missing required fields before insertion.
  603. // This prevents FK violations when edges reference nodes that would
  604. // be silently skipped by insertNode() (see issue #42).
  605. const validNodes = result.nodes.filter((n) => n.id && n.kind && n.name && n.filePath && n.language);
  606. // Insert nodes
  607. if (validNodes.length > 0) {
  608. this.queries.insertNodes(validNodes);
  609. }
  610. // Filter edges to only reference nodes that were actually inserted
  611. if (result.edges.length > 0) {
  612. const insertedIds = new Set(validNodes.map((n) => n.id));
  613. const validEdges = result.edges.filter(
  614. (e) => insertedIds.has(e.source) && insertedIds.has(e.target)
  615. );
  616. if (validEdges.length > 0) {
  617. this.queries.insertEdges(validEdges);
  618. }
  619. }
  620. // Insert unresolved references in batch with denormalized filePath/language
  621. if (result.unresolvedReferences.length > 0) {
  622. const insertedIds = new Set(validNodes.map((n) => n.id));
  623. const refsWithContext = result.unresolvedReferences
  624. .filter((ref) => insertedIds.has(ref.fromNodeId))
  625. .map((ref) => ({
  626. ...ref,
  627. filePath: ref.filePath ?? filePath,
  628. language: ref.language ?? language,
  629. }));
  630. if (refsWithContext.length > 0) {
  631. this.queries.insertUnresolvedRefsBatch(refsWithContext);
  632. }
  633. }
  634. // Insert file record
  635. const fileRecord: FileRecord = {
  636. path: filePath,
  637. contentHash,
  638. language,
  639. size: stats.size,
  640. modifiedAt: stats.mtimeMs,
  641. indexedAt: Date.now(),
  642. nodeCount: result.nodes.length,
  643. errors: result.errors.length > 0 ? result.errors : undefined,
  644. };
  645. this.queries.upsertFile(fileRecord);
  646. }
  647. /**
  648. * Sync with current file state.
  649. * Uses git status as a fast path when available, falling back to full scan.
  650. */
  651. async sync(onProgress?: (progress: IndexProgress) => void): Promise<SyncResult> {
  652. await initGrammars(); // Initialize WASM runtime (grammars loaded lazily below)
  653. const startTime = Date.now();
  654. let filesChecked = 0;
  655. let filesAdded = 0;
  656. let filesModified = 0;
  657. let filesRemoved = 0;
  658. let nodesUpdated = 0;
  659. const changedFilePaths: string[] = [];
  660. onProgress?.({
  661. phase: 'scanning',
  662. current: 0,
  663. total: 0,
  664. });
  665. const filesToIndex: string[] = [];
  666. const gitChanges = getGitChangedFiles(this.rootDir, this.config);
  667. if (gitChanges) {
  668. // === Git fast path ===
  669. // Only inspect the files git reports as changed instead of scanning everything.
  670. filesChecked = gitChanges.modified.length + gitChanges.added.length + gitChanges.deleted.length;
  671. // Handle deleted files
  672. for (const filePath of gitChanges.deleted) {
  673. const tracked = this.queries.getFileByPath(filePath);
  674. if (tracked) {
  675. this.queries.deleteFile(filePath);
  676. filesRemoved++;
  677. }
  678. }
  679. // Handle modified files — read + hash only these files
  680. for (const filePath of gitChanges.modified) {
  681. const fullPath = path.join(this.rootDir, filePath);
  682. let content: string;
  683. try {
  684. content = fs.readFileSync(fullPath, 'utf-8');
  685. } catch (error) {
  686. logDebug('Skipping unreadable file during sync', { filePath, error: String(error) });
  687. continue;
  688. }
  689. const contentHash = hashContent(content);
  690. const tracked = this.queries.getFileByPath(filePath);
  691. if (!tracked) {
  692. filesToIndex.push(filePath);
  693. changedFilePaths.push(filePath);
  694. filesAdded++;
  695. } else if (tracked.contentHash !== contentHash) {
  696. filesToIndex.push(filePath);
  697. changedFilePaths.push(filePath);
  698. filesModified++;
  699. }
  700. }
  701. // Handle added (untracked) files
  702. for (const filePath of gitChanges.added) {
  703. filesToIndex.push(filePath);
  704. changedFilePaths.push(filePath);
  705. filesAdded++;
  706. }
  707. } else {
  708. // === Fallback: full scan (non-git project or git failure) ===
  709. const currentFiles = new Set(scanDirectory(this.rootDir, this.config));
  710. filesChecked = currentFiles.size;
  711. // Build Map for O(1) lookups instead of .find() per file
  712. const trackedFiles = this.queries.getAllFiles();
  713. const trackedMap = new Map<string, FileRecord>();
  714. for (const f of trackedFiles) {
  715. trackedMap.set(f.path, f);
  716. }
  717. // Find files to remove (in DB but not on disk)
  718. for (const tracked of trackedFiles) {
  719. if (!currentFiles.has(tracked.path)) {
  720. this.queries.deleteFile(tracked.path);
  721. filesRemoved++;
  722. }
  723. }
  724. // Find files to add or update
  725. for (const filePath of currentFiles) {
  726. const fullPath = path.join(this.rootDir, filePath);
  727. let content: string;
  728. try {
  729. content = fs.readFileSync(fullPath, 'utf-8');
  730. } catch (error) {
  731. logDebug('Skipping unreadable file during sync', { filePath, error: String(error) });
  732. continue;
  733. }
  734. const contentHash = hashContent(content);
  735. const tracked = trackedMap.get(filePath);
  736. if (!tracked) {
  737. filesToIndex.push(filePath);
  738. changedFilePaths.push(filePath);
  739. filesAdded++;
  740. } else if (tracked.contentHash !== contentHash) {
  741. filesToIndex.push(filePath);
  742. changedFilePaths.push(filePath);
  743. filesModified++;
  744. }
  745. }
  746. }
  747. // Load only grammars needed for changed files
  748. if (filesToIndex.length > 0) {
  749. const neededLanguages = [...new Set(filesToIndex.map((f) => detectLanguage(f)))];
  750. await loadGrammarsForLanguages(neededLanguages);
  751. }
  752. // Index changed files
  753. const total = filesToIndex.length;
  754. for (let i = 0; i < filesToIndex.length; i++) {
  755. const filePath = filesToIndex[i]!;
  756. onProgress?.({
  757. phase: 'parsing',
  758. current: i + 1,
  759. total,
  760. currentFile: filePath,
  761. });
  762. const result = await this.indexFile(filePath);
  763. nodesUpdated += result.nodes.length;
  764. }
  765. return {
  766. filesChecked,
  767. filesAdded,
  768. filesModified,
  769. filesRemoved,
  770. nodesUpdated,
  771. durationMs: Date.now() - startTime,
  772. changedFilePaths: changedFilePaths.length > 0 ? changedFilePaths : undefined,
  773. };
  774. }
  775. /**
  776. * Get files that have changed since last index.
  777. * Uses git status as a fast path when available, falling back to full scan.
  778. */
  779. getChangedFiles(): { added: string[]; modified: string[]; removed: string[] } {
  780. const gitChanges = getGitChangedFiles(this.rootDir, this.config);
  781. if (gitChanges) {
  782. // === Git fast path ===
  783. const added: string[] = [];
  784. const modified: string[] = [];
  785. const removed: string[] = [];
  786. // Deleted files — only report if tracked in DB
  787. for (const filePath of gitChanges.deleted) {
  788. const tracked = this.queries.getFileByPath(filePath);
  789. if (tracked) {
  790. removed.push(filePath);
  791. }
  792. }
  793. // Modified files — read + hash only these, compare with DB
  794. for (const filePath of gitChanges.modified) {
  795. const fullPath = path.join(this.rootDir, filePath);
  796. let content: string;
  797. try {
  798. content = fs.readFileSync(fullPath, 'utf-8');
  799. } catch (error) {
  800. logDebug('Skipping unreadable file while detecting changes', { filePath, error: String(error) });
  801. continue;
  802. }
  803. const contentHash = hashContent(content);
  804. const tracked = this.queries.getFileByPath(filePath);
  805. if (!tracked) {
  806. added.push(filePath);
  807. } else if (tracked.contentHash !== contentHash) {
  808. modified.push(filePath);
  809. }
  810. }
  811. // Added (untracked) files
  812. for (const filePath of gitChanges.added) {
  813. added.push(filePath);
  814. }
  815. return { added, modified, removed };
  816. }
  817. // === Fallback: full scan (non-git project or git failure) ===
  818. const currentFiles = new Set(scanDirectory(this.rootDir, this.config));
  819. const trackedFiles = this.queries.getAllFiles();
  820. // Build Map for O(1) lookups
  821. const trackedMap = new Map<string, FileRecord>();
  822. for (const f of trackedFiles) {
  823. trackedMap.set(f.path, f);
  824. }
  825. const added: string[] = [];
  826. const modified: string[] = [];
  827. const removed: string[] = [];
  828. // Find removed files
  829. for (const tracked of trackedFiles) {
  830. if (!currentFiles.has(tracked.path)) {
  831. removed.push(tracked.path);
  832. }
  833. }
  834. // Find added and modified files
  835. for (const filePath of currentFiles) {
  836. const fullPath = path.join(this.rootDir, filePath);
  837. let content: string;
  838. try {
  839. content = fs.readFileSync(fullPath, 'utf-8');
  840. } catch (error) {
  841. logDebug('Skipping unreadable file while detecting changes', { filePath, error: String(error) });
  842. continue;
  843. }
  844. const contentHash = hashContent(content);
  845. const tracked = trackedMap.get(filePath);
  846. if (!tracked) {
  847. added.push(filePath);
  848. } else if (tracked.contentHash !== contentHash) {
  849. modified.push(filePath);
  850. }
  851. }
  852. return { added, modified, removed };
  853. }
  854. }
  855. // Re-export useful types and functions
  856. export { extractFromSource } from './tree-sitter';
  857. export { detectLanguage, isLanguageSupported, isGrammarLoaded, getSupportedLanguages, initGrammars, loadGrammarsForLanguages, loadAllGrammars } from './grammars';