index.ts 29 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008
  1. /**
  2. * Extraction Orchestrator
  3. *
  4. * Coordinates file scanning, parsing, and database storage.
  5. */
  6. import * as fs from 'fs';
  7. import * as fsp from 'fs/promises';
  8. import * as path from 'path';
  9. import * as crypto from 'crypto';
  10. import { execFileSync } from 'child_process';
  11. import {
  12. Language,
  13. FileRecord,
  14. ExtractionResult,
  15. ExtractionError,
  16. CodeGraphConfig,
  17. } from '../types';
  18. import { QueryBuilder } from '../db/queries';
  19. import { extractFromSource } from './tree-sitter';
  20. import { detectLanguage, isLanguageSupported, initGrammars, loadGrammarsForLanguages, resetParser } from './grammars';
  21. import { logDebug, logWarn } from '../errors';
  22. import { validatePathWithinRoot, normalizePath } from '../utils';
  23. import picomatch from 'picomatch';
/**
 * Number of files to read in parallel during indexing.
 *
 * File reads are I/O-bound; batching overlaps I/O wait with CPU parse work.
 * The indexer slices the scanned file list into chunks of this size and reads
 * each chunk concurrently before parsing the chunk's files one at a time.
 */
const FILE_IO_BATCH_SIZE = 10;
/**
 * Reset tree-sitter parser after this many parses per language to reclaim
 * WASM heap memory and prevent "memory access out of bounds" crashes.
 *
 * The counter is kept separately for each detected language.
 */
const PARSER_RESET_INTERVAL = 5000;
/**
 * Progress snapshot handed to the progress callbacks of indexing operations.
 */
export interface IndexProgress {
  /** Stage the operation is currently in. */
  phase: 'scanning' | 'parsing' | 'storing' | 'resolving';
  /** Number of items handled so far in the current phase. */
  current: number;
  /** Number of items expected in the current phase; reported as 0 while scanning, when the total is not yet known. */
  total: number;
  /** Root-relative path of the file currently being handled, when there is one. */
  currentFile?: string;
}
/**
 * Result of an indexing operation
 */
export interface IndexResult {
  /** True when at least one file was indexed or no error-severity problem was recorded; false when the run was aborted. */
  success: boolean;
  /** Files whose extraction produced at least one node. */
  filesIndexed: number;
  /** Files that produced no nodes and no error-severity problems (e.g. unsupported language, oversized file). */
  filesSkipped: number;
  /** Files that could not be read, or that produced no nodes and at least one error-severity problem. */
  filesErrored: number;
  /** Sum of extracted node counts over all indexed files. */
  nodesCreated: number;
  /** Sum of extracted edge counts over all indexed files. */
  edgesCreated: number;
  /** Every problem collected during the run, including warnings. */
  errors: ExtractionError[];
  /** Wall-clock duration of the operation in milliseconds. */
  durationMs: number;
}
/**
 * Result of a sync operation
 */
export interface SyncResult {
  /** Files inspected: the entries git reported as changed, or every scanned file when the full-scan fallback is used. */
  filesChecked: number;
  /** Files queued for indexing because they were treated as new. */
  filesAdded: number;
  /** Files queued for re-indexing because their content hash differs from the stored one. */
  filesModified: number;
  /** Files whose records were deleted from the database. */
  filesRemoved: number;
  /** Sum of node counts returned by extraction for the files that were (re-)indexed. */
  nodesUpdated: number;
  /** Wall-clock duration of the sync in milliseconds. */
  durationMs: number;
  /** Root-relative paths of the files that were queued for indexing; omitted when there were none. */
  changedFilePaths?: string[];
}
  68. /**
  69. * Calculate SHA256 hash of file contents
  70. */
  71. export function hashContent(content: string): string {
  72. return crypto.createHash('sha256').update(content).digest('hex');
  73. }
  74. /**
  75. * Check if a path matches any glob pattern (simplified)
  76. */
  77. function matchesGlob(filePath: string, pattern: string): boolean {
  78. filePath = normalizePath(filePath);
  79. return picomatch.isMatch(filePath, pattern, { dot: true });
  80. }
  81. /**
  82. * Check if a file should be included based on config
  83. */
  84. export function shouldIncludeFile(
  85. filePath: string,
  86. config: CodeGraphConfig
  87. ): boolean {
  88. // Check exclude patterns first
  89. for (const pattern of config.exclude) {
  90. if (matchesGlob(filePath, pattern)) {
  91. return false;
  92. }
  93. }
  94. // Check include patterns
  95. for (const pattern of config.include) {
  96. if (matchesGlob(filePath, pattern)) {
  97. return true;
  98. }
  99. }
  100. return false;
  101. }
  102. /**
  103. * Get all files visible to git (tracked + untracked but not ignored).
  104. * Respects .gitignore at all levels (root, subdirectories).
  105. * Returns null on failure (non-git project) so callers can fall back.
  106. */
  107. function getGitVisibleFiles(rootDir: string): Set<string> | null {
  108. try {
  109. // Check if the project directory is gitignored by a parent repo.
  110. // When rootDir lives inside a parent git repo that ignores it,
  111. // `git ls-files` returns nothing — fall back to filesystem walk.
  112. const gitRoot = execFileSync(
  113. 'git',
  114. ['rev-parse', '--show-toplevel'],
  115. { cwd: rootDir, encoding: 'utf-8', timeout: 5000, stdio: ['pipe', 'pipe', 'pipe'] }
  116. ).trim();
  117. if (path.resolve(gitRoot) !== path.resolve(rootDir)) {
  118. try {
  119. // git check-ignore exits 0 if the path IS ignored, 1 if not
  120. execFileSync(
  121. 'git',
  122. ['check-ignore', '-q', path.resolve(rootDir)],
  123. { cwd: rootDir, encoding: 'utf-8', timeout: 5000, stdio: ['pipe', 'pipe', 'pipe'] }
  124. );
  125. // Directory is gitignored by parent repo — fall back to filesystem walk
  126. return null;
  127. } catch {
  128. // Not ignored — safe to use git ls-files
  129. }
  130. }
  131. // -c = cached (tracked), -o = others (untracked), --exclude-standard = respect .gitignore
  132. const output = execFileSync(
  133. 'git',
  134. ['ls-files', '-co', '--exclude-standard'],
  135. { cwd: rootDir, encoding: 'utf-8', timeout: 30000, maxBuffer: 50 * 1024 * 1024, stdio: ['pipe', 'pipe', 'pipe'] }
  136. );
  137. const files = new Set<string>();
  138. for (const line of output.split('\n')) {
  139. const trimmed = line.trim();
  140. if (trimmed) {
  141. files.add(normalizePath(trimmed));
  142. }
  143. }
  144. return files;
  145. } catch {
  146. return null;
  147. }
  148. }
/**
 * Changed paths reported by `git status`, bucketed by how the index should
 * react to them.
 *
 * The function that produces this shape returns null instead when git is
 * unavailable (non-git project or command failure), signaling the caller to
 * fall back to a full filesystem scan.
 */
interface GitChanges {
  /** Entries with any status other than `??` or a `D` (M, MM, AM, A, ...) — files to re-hash and re-index. */
  modified: string[];
  /** Entries with status `??` — untracked files to index. */
  added: string[];
  /** Entries whose status contains `D` — files to remove from the DB. */
  deleted: string[];
}
  159. /**
  160. * Use `git status` to detect changed files instead of scanning every file.
  161. * Returns null on failure so callers fall back to full scan.
  162. */
  163. function getGitChangedFiles(rootDir: string, config: CodeGraphConfig): GitChanges | null {
  164. try {
  165. const output = execFileSync(
  166. 'git',
  167. ['status', '--porcelain', '--no-renames'],
  168. { cwd: rootDir, encoding: 'utf-8', timeout: 10000, stdio: ['pipe', 'pipe', 'pipe'] }
  169. );
  170. const modified: string[] = [];
  171. const added: string[] = [];
  172. const deleted: string[] = [];
  173. for (const line of output.split('\n')) {
  174. if (line.length < 4) continue; // Minimum: "XY file"
  175. const statusCode = line.substring(0, 2);
  176. const filePath = normalizePath(line.substring(3));
  177. // Skip files that don't match include/exclude config
  178. if (!shouldIncludeFile(filePath, config)) continue;
  179. if (statusCode === '??') {
  180. added.push(filePath);
  181. } else if (statusCode.includes('D')) {
  182. deleted.push(filePath);
  183. } else {
  184. // M, MM, AM, A (staged), etc. — treat as modified
  185. modified.push(filePath);
  186. }
  187. }
  188. return { modified, added, deleted };
  189. } catch {
  190. return null;
  191. }
  192. }
/**
 * Marker file name that indicates a directory (and all children) should be skipped.
 *
 * The filesystem walk checks for a file with this name in each directory it
 * enters and, when present, returns without reading the directory.
 */
const CODEGRAPH_IGNORE_MARKER = '.codegraphignore';
  197. /**
  198. * Recursively scan directory for source files.
  199. *
  200. * In git repos, uses `git ls-files` to get the file list (inherently
  201. * respects .gitignore at all levels), then filters by config include patterns.
  202. * Falls back to filesystem walk for non-git projects.
  203. */
  204. export function scanDirectory(
  205. rootDir: string,
  206. config: CodeGraphConfig,
  207. onProgress?: (current: number, file: string) => void
  208. ): string[] {
  209. // Fast path: use git to get all visible files (respects .gitignore everywhere)
  210. const gitFiles = getGitVisibleFiles(rootDir);
  211. if (gitFiles) {
  212. const files: string[] = [];
  213. let count = 0;
  214. for (const filePath of gitFiles) {
  215. if (shouldIncludeFile(filePath, config)) {
  216. files.push(filePath);
  217. count++;
  218. onProgress?.(count, filePath);
  219. }
  220. }
  221. return files;
  222. }
  223. // Fallback: walk filesystem for non-git projects
  224. return scanDirectoryWalk(rootDir, config, onProgress);
  225. }
  226. /**
  227. * Filesystem walk fallback for non-git projects.
  228. */
  229. function scanDirectoryWalk(
  230. rootDir: string,
  231. config: CodeGraphConfig,
  232. onProgress?: (current: number, file: string) => void
  233. ): string[] {
  234. const files: string[] = [];
  235. let count = 0;
  236. const visitedDirs = new Set<string>();
  237. function walk(dir: string): void {
  238. let realDir: string;
  239. try {
  240. realDir = fs.realpathSync(dir);
  241. } catch {
  242. logDebug('Skipping unresolvable directory', { dir });
  243. return;
  244. }
  245. if (visitedDirs.has(realDir)) {
  246. logDebug('Skipping already-visited directory (symlink cycle)', { dir, realDir });
  247. return;
  248. }
  249. visitedDirs.add(realDir);
  250. // Check for .codegraphignore marker file
  251. const ignoreMarker = path.join(dir, CODEGRAPH_IGNORE_MARKER);
  252. if (fs.existsSync(ignoreMarker)) {
  253. logDebug('Skipping directory due to .codegraphignore marker', { dir });
  254. return;
  255. }
  256. let entries: fs.Dirent[];
  257. try {
  258. entries = fs.readdirSync(dir, { withFileTypes: true });
  259. } catch (error) {
  260. logDebug('Skipping unreadable directory', { dir, error: String(error) });
  261. return;
  262. }
  263. for (const entry of entries) {
  264. const fullPath = path.join(dir, entry.name);
  265. const relativePath = normalizePath(path.relative(rootDir, fullPath));
  266. if (entry.isSymbolicLink()) {
  267. try {
  268. const realTarget = fs.realpathSync(fullPath);
  269. const stat = fs.statSync(realTarget);
  270. if (stat.isDirectory()) {
  271. const dirPattern = relativePath + '/';
  272. let excluded = false;
  273. for (const pattern of config.exclude) {
  274. if (matchesGlob(dirPattern, pattern) || matchesGlob(relativePath, pattern)) {
  275. excluded = true;
  276. break;
  277. }
  278. }
  279. if (!excluded) {
  280. walk(fullPath);
  281. }
  282. } else if (stat.isFile()) {
  283. if (shouldIncludeFile(relativePath, config)) {
  284. files.push(relativePath);
  285. count++;
  286. onProgress?.(count, relativePath);
  287. }
  288. }
  289. } catch {
  290. logDebug('Skipping broken symlink', { path: fullPath });
  291. }
  292. continue;
  293. }
  294. if (entry.isDirectory()) {
  295. const dirPattern = relativePath + '/';
  296. let excluded = false;
  297. for (const pattern of config.exclude) {
  298. if (matchesGlob(dirPattern, pattern) || matchesGlob(relativePath, pattern)) {
  299. excluded = true;
  300. break;
  301. }
  302. }
  303. if (!excluded) {
  304. walk(fullPath);
  305. }
  306. } else if (entry.isFile()) {
  307. if (shouldIncludeFile(relativePath, config)) {
  308. files.push(relativePath);
  309. count++;
  310. onProgress?.(count, relativePath);
  311. }
  312. }
  313. }
  314. }
  315. walk(rootDir);
  316. return files;
  317. }
  318. /**
  319. * Extraction orchestrator
  320. */
  321. export class ExtractionOrchestrator {
  322. private rootDir: string;
  323. private config: CodeGraphConfig;
  324. private queries: QueryBuilder;
  325. constructor(rootDir: string, config: CodeGraphConfig, queries: QueryBuilder) {
  326. this.rootDir = rootDir;
  327. this.config = config;
  328. this.queries = queries;
  329. }
  330. /**
  331. * Index all files in the project
  332. */
  333. async indexAll(
  334. onProgress?: (progress: IndexProgress) => void,
  335. signal?: AbortSignal
  336. ): Promise<IndexResult> {
  337. await initGrammars();
  338. const startTime = Date.now();
  339. const errors: ExtractionError[] = [];
  340. let filesIndexed = 0;
  341. let filesSkipped = 0;
  342. let filesErrored = 0;
  343. let totalNodes = 0;
  344. let totalEdges = 0;
  345. // Phase 1: Scan for files
  346. onProgress?.({
  347. phase: 'scanning',
  348. current: 0,
  349. total: 0,
  350. });
  351. const files = scanDirectory(this.rootDir, this.config, (current, file) => {
  352. onProgress?.({
  353. phase: 'scanning',
  354. current,
  355. total: 0,
  356. currentFile: file,
  357. });
  358. });
  359. if (signal?.aborted) {
  360. return {
  361. success: false,
  362. filesIndexed: 0,
  363. filesSkipped: 0,
  364. filesErrored: 0,
  365. nodesCreated: 0,
  366. edgesCreated: 0,
  367. errors: [{ message: 'Aborted', severity: 'error' }],
  368. durationMs: Date.now() - startTime,
  369. };
  370. }
  371. // Load only the grammars needed for languages actually present in the project.
  372. // This avoids compiling all 16+ WASM grammar modules upfront, which can cause
  373. // V8 WASM Zone OOM on large codebases (see issue #54).
  374. const neededLanguages = [...new Set(files.map((f) => detectLanguage(f)))];
  375. await loadGrammarsForLanguages(neededLanguages);
  376. // Phase 2: Parse files (read in parallel batches, parse/store sequentially)
  377. const total = files.length;
  378. let processed = 0;
  379. const parseCounts = new Map<Language, number>(); // track parses per language for WASM reset
  380. for (let i = 0; i < files.length; i += FILE_IO_BATCH_SIZE) {
  381. if (signal?.aborted) {
  382. return {
  383. success: false,
  384. filesIndexed,
  385. filesSkipped,
  386. filesErrored,
  387. nodesCreated: totalNodes,
  388. edgesCreated: totalEdges,
  389. errors: [{ message: 'Aborted', severity: 'error' }, ...errors],
  390. durationMs: Date.now() - startTime,
  391. };
  392. }
  393. const batch = files.slice(i, i + FILE_IO_BATCH_SIZE);
  394. // Read files in parallel (with path validation before any I/O)
  395. const fileContents = await Promise.all(
  396. batch.map(async (fp) => {
  397. try {
  398. const fullPath = validatePathWithinRoot(this.rootDir, fp);
  399. if (!fullPath) {
  400. logWarn('Path traversal blocked in batch reader', { filePath: fp });
  401. return { filePath: fp, content: null as string | null, stats: null as fs.Stats | null, error: new Error('Path traversal blocked') };
  402. }
  403. const content = await fsp.readFile(fullPath, 'utf-8');
  404. const stats = await fsp.stat(fullPath);
  405. return { filePath: fp, content, stats, error: null as Error | null };
  406. } catch (err) {
  407. return { filePath: fp, content: null as string | null, stats: null as fs.Stats | null, error: err as Error };
  408. }
  409. })
  410. );
  411. // Parse and store sequentially
  412. for (const { filePath, content, stats, error } of fileContents) {
  413. if (signal?.aborted) {
  414. return {
  415. success: false,
  416. filesIndexed,
  417. filesSkipped,
  418. filesErrored,
  419. nodesCreated: totalNodes,
  420. edgesCreated: totalEdges,
  421. errors: [{ message: 'Aborted', severity: 'error' }, ...errors],
  422. durationMs: Date.now() - startTime,
  423. };
  424. }
  425. processed++;
  426. onProgress?.({
  427. phase: 'parsing',
  428. current: processed,
  429. total,
  430. currentFile: filePath,
  431. });
  432. if (error || content === null || stats === null) {
  433. filesErrored++;
  434. errors.push({
  435. message: `Failed to read file: ${error instanceof Error ? error.message : String(error)}`,
  436. filePath,
  437. severity: 'error',
  438. code: 'read_error',
  439. });
  440. continue;
  441. }
  442. const result = await this.indexFileWithContent(filePath, content, stats);
  443. // Periodically reset the parser to reclaim WASM heap memory.
  444. // Without this, tree-sitter's WASM runtime fragments its heap
  445. // across thousands of parses and eventually crashes.
  446. const lang = detectLanguage(filePath);
  447. const count = (parseCounts.get(lang) ?? 0) + 1;
  448. parseCounts.set(lang, count);
  449. if (count % PARSER_RESET_INTERVAL === 0) {
  450. resetParser(lang);
  451. }
  452. if (result.errors.length > 0) {
  453. // Annotate errors with file path if not already set
  454. for (const err of result.errors) {
  455. if (!err.filePath) err.filePath = filePath;
  456. }
  457. errors.push(...result.errors);
  458. }
  459. if (result.nodes.length > 0) {
  460. filesIndexed++;
  461. totalNodes += result.nodes.length;
  462. totalEdges += result.edges.length;
  463. } else if (result.errors.some((e) => e.severity === 'error')) {
  464. filesErrored++;
  465. } else {
  466. filesSkipped++;
  467. }
  468. }
  469. }
  470. // Phase 3: Resolve references
  471. onProgress?.({
  472. phase: 'resolving',
  473. current: 0,
  474. total: 1,
  475. });
  476. // TODO: Implement reference resolution in Phase 3
  477. return {
  478. success: filesIndexed > 0 || errors.filter((e) => e.severity === 'error').length === 0,
  479. filesIndexed,
  480. filesSkipped,
  481. filesErrored,
  482. nodesCreated: totalNodes,
  483. edgesCreated: totalEdges,
  484. errors,
  485. durationMs: Date.now() - startTime,
  486. };
  487. }
  488. /**
  489. * Index specific files
  490. */
  491. async indexFiles(filePaths: string[]): Promise<IndexResult> {
  492. const startTime = Date.now();
  493. const errors: ExtractionError[] = [];
  494. let filesIndexed = 0;
  495. let filesSkipped = 0;
  496. let filesErrored = 0;
  497. let totalNodes = 0;
  498. let totalEdges = 0;
  499. for (const filePath of filePaths) {
  500. const result = await this.indexFile(filePath);
  501. if (result.errors.length > 0) {
  502. errors.push(...result.errors);
  503. }
  504. if (result.nodes.length > 0) {
  505. filesIndexed++;
  506. totalNodes += result.nodes.length;
  507. totalEdges += result.edges.length;
  508. } else if (result.errors.some((e) => e.severity === 'error')) {
  509. filesErrored++;
  510. } else {
  511. filesSkipped++;
  512. }
  513. }
  514. return {
  515. success: filesIndexed > 0 || errors.filter((e) => e.severity === 'error').length === 0,
  516. filesIndexed,
  517. filesSkipped,
  518. filesErrored,
  519. nodesCreated: totalNodes,
  520. edgesCreated: totalEdges,
  521. errors,
  522. durationMs: Date.now() - startTime,
  523. };
  524. }
  525. /**
  526. * Index a single file
  527. */
  528. async indexFile(relativePath: string): Promise<ExtractionResult> {
  529. const fullPath = validatePathWithinRoot(this.rootDir, relativePath);
  530. if (!fullPath) {
  531. return {
  532. nodes: [],
  533. edges: [],
  534. unresolvedReferences: [],
  535. errors: [{ message: `Path traversal blocked: ${relativePath}`, filePath: relativePath, severity: 'error', code: 'path_traversal' }],
  536. durationMs: 0,
  537. };
  538. }
  539. // Read file content and stats
  540. let content: string;
  541. let stats: fs.Stats;
  542. try {
  543. stats = await fsp.stat(fullPath);
  544. content = await fsp.readFile(fullPath, 'utf-8');
  545. } catch (error) {
  546. return {
  547. nodes: [],
  548. edges: [],
  549. unresolvedReferences: [],
  550. errors: [
  551. {
  552. message: `Failed to read file: ${error instanceof Error ? error.message : String(error)}`,
  553. filePath: relativePath,
  554. severity: 'error',
  555. code: 'read_error',
  556. },
  557. ],
  558. durationMs: 0,
  559. };
  560. }
  561. return this.indexFileWithContent(relativePath, content, stats);
  562. }
  563. /**
  564. * Index a single file with pre-read content and stats.
  565. * Used by the parallel batch reader to avoid redundant file I/O.
  566. */
  567. async indexFileWithContent(
  568. relativePath: string,
  569. content: string,
  570. stats: fs.Stats
  571. ): Promise<ExtractionResult> {
  572. // Prevent path traversal
  573. const fullPath = validatePathWithinRoot(this.rootDir, relativePath);
  574. if (!fullPath) {
  575. logWarn('Path traversal blocked in indexFileWithContent', { relativePath });
  576. return {
  577. nodes: [],
  578. edges: [],
  579. unresolvedReferences: [],
  580. errors: [{ message: 'Path traversal blocked', filePath: relativePath, severity: 'error', code: 'path_traversal' }],
  581. durationMs: 0,
  582. };
  583. }
  584. // Check file size
  585. if (stats.size > this.config.maxFileSize) {
  586. return {
  587. nodes: [],
  588. edges: [],
  589. unresolvedReferences: [],
  590. errors: [
  591. {
  592. message: `File exceeds max size (${stats.size} > ${this.config.maxFileSize})`,
  593. filePath: relativePath,
  594. severity: 'warning',
  595. code: 'size_exceeded',
  596. },
  597. ],
  598. durationMs: 0,
  599. };
  600. }
  601. // Detect language
  602. const language = detectLanguage(relativePath);
  603. if (!isLanguageSupported(language)) {
  604. return {
  605. nodes: [],
  606. edges: [],
  607. unresolvedReferences: [],
  608. errors: [],
  609. durationMs: 0,
  610. };
  611. }
  612. // Extract from source
  613. const result = extractFromSource(relativePath, content, language);
  614. // Store in database
  615. if (result.nodes.length > 0 || result.errors.length === 0) {
  616. this.storeExtractionResult(relativePath, content, language, stats, result);
  617. }
  618. return result;
  619. }
  620. /**
  621. * Store extraction result in database
  622. */
  623. private storeExtractionResult(
  624. filePath: string,
  625. content: string,
  626. language: Language,
  627. stats: fs.Stats,
  628. result: ExtractionResult
  629. ): void {
  630. const contentHash = hashContent(content);
  631. // Check if file already exists and hasn't changed
  632. const existingFile = this.queries.getFileByPath(filePath);
  633. if (existingFile && existingFile.contentHash === contentHash) {
  634. return; // No changes
  635. }
  636. // Delete existing data for this file
  637. if (existingFile) {
  638. this.queries.deleteFile(filePath);
  639. }
  640. // Filter out nodes with missing required fields before insertion.
  641. // This prevents FK violations when edges reference nodes that would
  642. // be silently skipped by insertNode() (see issue #42).
  643. const validNodes = result.nodes.filter((n) => n.id && n.kind && n.name && n.filePath && n.language);
  644. // Insert nodes
  645. if (validNodes.length > 0) {
  646. this.queries.insertNodes(validNodes);
  647. }
  648. // Filter edges to only reference nodes that were actually inserted
  649. if (result.edges.length > 0) {
  650. const insertedIds = new Set(validNodes.map((n) => n.id));
  651. const validEdges = result.edges.filter(
  652. (e) => insertedIds.has(e.source) && insertedIds.has(e.target)
  653. );
  654. if (validEdges.length > 0) {
  655. this.queries.insertEdges(validEdges);
  656. }
  657. }
  658. // Insert unresolved references in batch with denormalized filePath/language
  659. if (result.unresolvedReferences.length > 0) {
  660. const insertedIds = new Set(validNodes.map((n) => n.id));
  661. const refsWithContext = result.unresolvedReferences
  662. .filter((ref) => insertedIds.has(ref.fromNodeId))
  663. .map((ref) => ({
  664. ...ref,
  665. filePath: ref.filePath ?? filePath,
  666. language: ref.language ?? language,
  667. }));
  668. if (refsWithContext.length > 0) {
  669. this.queries.insertUnresolvedRefsBatch(refsWithContext);
  670. }
  671. }
  672. // Insert file record
  673. const fileRecord: FileRecord = {
  674. path: filePath,
  675. contentHash,
  676. language,
  677. size: stats.size,
  678. modifiedAt: stats.mtimeMs,
  679. indexedAt: Date.now(),
  680. nodeCount: result.nodes.length,
  681. errors: result.errors.length > 0 ? result.errors : undefined,
  682. };
  683. this.queries.upsertFile(fileRecord);
  684. }
  685. /**
  686. * Sync with current file state.
  687. * Uses git status as a fast path when available, falling back to full scan.
  688. */
  689. async sync(onProgress?: (progress: IndexProgress) => void): Promise<SyncResult> {
  690. await initGrammars(); // Initialize WASM runtime (grammars loaded lazily below)
  691. const startTime = Date.now();
  692. let filesChecked = 0;
  693. let filesAdded = 0;
  694. let filesModified = 0;
  695. let filesRemoved = 0;
  696. let nodesUpdated = 0;
  697. const changedFilePaths: string[] = [];
  698. onProgress?.({
  699. phase: 'scanning',
  700. current: 0,
  701. total: 0,
  702. });
  703. const filesToIndex: string[] = [];
  704. const gitChanges = getGitChangedFiles(this.rootDir, this.config);
  705. if (gitChanges) {
  706. // === Git fast path ===
  707. // Only inspect the files git reports as changed instead of scanning everything.
  708. filesChecked = gitChanges.modified.length + gitChanges.added.length + gitChanges.deleted.length;
  709. // Handle deleted files
  710. for (const filePath of gitChanges.deleted) {
  711. const tracked = this.queries.getFileByPath(filePath);
  712. if (tracked) {
  713. this.queries.deleteFile(filePath);
  714. filesRemoved++;
  715. }
  716. }
  717. // Handle modified files — read + hash only these files
  718. for (const filePath of gitChanges.modified) {
  719. const fullPath = path.join(this.rootDir, filePath);
  720. let content: string;
  721. try {
  722. content = fs.readFileSync(fullPath, 'utf-8');
  723. } catch (error) {
  724. logDebug('Skipping unreadable file during sync', { filePath, error: String(error) });
  725. continue;
  726. }
  727. const contentHash = hashContent(content);
  728. const tracked = this.queries.getFileByPath(filePath);
  729. if (!tracked) {
  730. filesToIndex.push(filePath);
  731. changedFilePaths.push(filePath);
  732. filesAdded++;
  733. } else if (tracked.contentHash !== contentHash) {
  734. filesToIndex.push(filePath);
  735. changedFilePaths.push(filePath);
  736. filesModified++;
  737. }
  738. }
  739. // Handle added (untracked) files
  740. for (const filePath of gitChanges.added) {
  741. filesToIndex.push(filePath);
  742. changedFilePaths.push(filePath);
  743. filesAdded++;
  744. }
  745. } else {
  746. // === Fallback: full scan (non-git project or git failure) ===
  747. const currentFiles = new Set(scanDirectory(this.rootDir, this.config));
  748. filesChecked = currentFiles.size;
  749. // Build Map for O(1) lookups instead of .find() per file
  750. const trackedFiles = this.queries.getAllFiles();
  751. const trackedMap = new Map<string, FileRecord>();
  752. for (const f of trackedFiles) {
  753. trackedMap.set(f.path, f);
  754. }
  755. // Find files to remove (in DB but not on disk)
  756. for (const tracked of trackedFiles) {
  757. if (!currentFiles.has(tracked.path)) {
  758. this.queries.deleteFile(tracked.path);
  759. filesRemoved++;
  760. }
  761. }
  762. // Find files to add or update
  763. for (const filePath of currentFiles) {
  764. const fullPath = path.join(this.rootDir, filePath);
  765. let content: string;
  766. try {
  767. content = fs.readFileSync(fullPath, 'utf-8');
  768. } catch (error) {
  769. logDebug('Skipping unreadable file during sync', { filePath, error: String(error) });
  770. continue;
  771. }
  772. const contentHash = hashContent(content);
  773. const tracked = trackedMap.get(filePath);
  774. if (!tracked) {
  775. filesToIndex.push(filePath);
  776. changedFilePaths.push(filePath);
  777. filesAdded++;
  778. } else if (tracked.contentHash !== contentHash) {
  779. filesToIndex.push(filePath);
  780. changedFilePaths.push(filePath);
  781. filesModified++;
  782. }
  783. }
  784. }
  785. // Load only grammars needed for changed files
  786. if (filesToIndex.length > 0) {
  787. const neededLanguages = [...new Set(filesToIndex.map((f) => detectLanguage(f)))];
  788. await loadGrammarsForLanguages(neededLanguages);
  789. }
  790. // Index changed files
  791. const total = filesToIndex.length;
  792. for (let i = 0; i < filesToIndex.length; i++) {
  793. const filePath = filesToIndex[i]!;
  794. onProgress?.({
  795. phase: 'parsing',
  796. current: i + 1,
  797. total,
  798. currentFile: filePath,
  799. });
  800. const result = await this.indexFile(filePath);
  801. nodesUpdated += result.nodes.length;
  802. }
  803. return {
  804. filesChecked,
  805. filesAdded,
  806. filesModified,
  807. filesRemoved,
  808. nodesUpdated,
  809. durationMs: Date.now() - startTime,
  810. changedFilePaths: changedFilePaths.length > 0 ? changedFilePaths : undefined,
  811. };
  812. }
  813. /**
  814. * Get files that have changed since last index.
  815. * Uses git status as a fast path when available, falling back to full scan.
  816. */
  817. getChangedFiles(): { added: string[]; modified: string[]; removed: string[] } {
  818. const gitChanges = getGitChangedFiles(this.rootDir, this.config);
  819. if (gitChanges) {
  820. // === Git fast path ===
  821. const added: string[] = [];
  822. const modified: string[] = [];
  823. const removed: string[] = [];
  824. // Deleted files — only report if tracked in DB
  825. for (const filePath of gitChanges.deleted) {
  826. const tracked = this.queries.getFileByPath(filePath);
  827. if (tracked) {
  828. removed.push(filePath);
  829. }
  830. }
  831. // Modified files — read + hash only these, compare with DB
  832. for (const filePath of gitChanges.modified) {
  833. const fullPath = path.join(this.rootDir, filePath);
  834. let content: string;
  835. try {
  836. content = fs.readFileSync(fullPath, 'utf-8');
  837. } catch (error) {
  838. logDebug('Skipping unreadable file while detecting changes', { filePath, error: String(error) });
  839. continue;
  840. }
  841. const contentHash = hashContent(content);
  842. const tracked = this.queries.getFileByPath(filePath);
  843. if (!tracked) {
  844. added.push(filePath);
  845. } else if (tracked.contentHash !== contentHash) {
  846. modified.push(filePath);
  847. }
  848. }
  849. // Added (untracked) files
  850. for (const filePath of gitChanges.added) {
  851. added.push(filePath);
  852. }
  853. return { added, modified, removed };
  854. }
  855. // === Fallback: full scan (non-git project or git failure) ===
  856. const currentFiles = new Set(scanDirectory(this.rootDir, this.config));
  857. const trackedFiles = this.queries.getAllFiles();
  858. // Build Map for O(1) lookups
  859. const trackedMap = new Map<string, FileRecord>();
  860. for (const f of trackedFiles) {
  861. trackedMap.set(f.path, f);
  862. }
  863. const added: string[] = [];
  864. const modified: string[] = [];
  865. const removed: string[] = [];
  866. // Find removed files
  867. for (const tracked of trackedFiles) {
  868. if (!currentFiles.has(tracked.path)) {
  869. removed.push(tracked.path);
  870. }
  871. }
  872. // Find added and modified files
  873. for (const filePath of currentFiles) {
  874. const fullPath = path.join(this.rootDir, filePath);
  875. let content: string;
  876. try {
  877. content = fs.readFileSync(fullPath, 'utf-8');
  878. } catch (error) {
  879. logDebug('Skipping unreadable file while detecting changes', { filePath, error: String(error) });
  880. continue;
  881. }
  882. const contentHash = hashContent(content);
  883. const tracked = trackedMap.get(filePath);
  884. if (!tracked) {
  885. added.push(filePath);
  886. } else if (tracked.contentHash !== contentHash) {
  887. modified.push(filePath);
  888. }
  889. }
  890. return { added, modified, removed };
  891. }
  892. }
  893. // Re-export useful types and functions
  894. export { extractFromSource } from './tree-sitter';
  895. export { detectLanguage, isLanguageSupported, isGrammarLoaded, getSupportedLanguages, initGrammars, loadGrammarsForLanguages, loadAllGrammars } from './grammars';