utils.ts 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606
  1. /**
  2. * CodeGraph Utilities
  3. *
  4. * Common utility functions for memory management, concurrency, batching,
  5. * and security validation.
  6. *
  7. * @module utils
  8. *
  9. * @example
  10. * ```typescript
  11. * import { Mutex, processInBatches, MemoryMonitor, validatePathWithinRoot } from 'codegraph';
  12. *
  13. * // Use mutex for concurrent safety
  14. * const mutex = new Mutex();
  15. * await mutex.withLock(async () => {
  16. * await performCriticalOperation();
  17. * });
  18. *
  19. * // Process items in batches to manage memory
  20. * const results = await processInBatches(items, 100, async (item) => {
  21. * return await processItem(item);
  22. * });
  23. *
  24. * // Monitor memory usage
  25. * const monitor = new MemoryMonitor(512, (usage) => {
  26. * console.warn(`Memory usage exceeded 512MB: ${usage / 1024 / 1024}MB`);
  27. * });
  28. * monitor.start();
  29. * ```
  30. */
import * as fs from 'fs';
import * as os from 'os';
import * as path from 'path';
  33. // ============================================================
  34. // SECURITY UTILITIES
  35. // ============================================================
  36. /**
  37. * Sensitive system directories that should never be used as project roots.
  38. * Checked on all platforms; non-applicable paths are harmlessly skipped.
  39. */
  40. const SENSITIVE_PATHS = new Set([
  41. '/', '/etc', '/usr', '/bin', '/sbin', '/var', '/tmp', '/dev', '/proc', '/sys',
  42. '/root', '/boot', '/lib', '/lib64', '/opt',
  43. 'c:\\', 'c:\\windows', 'c:\\windows\\system32',
  44. ]);
  45. /**
  46. * Config "languages" whose nodes are pure key/value DATA lifted from a config
  47. * file (e.g. Spring `application.{yml,properties}`), not source code.
  48. */
  49. export const CONFIG_LEAF_LANGUAGES: ReadonlySet<string> = new Set(['yaml', 'properties']);
  50. /**
  51. * A config-leaf node is a single key lifted out of a pure config/data file —
  52. * `kind: 'constant'` in a {@link CONFIG_LEAF_LANGUAGES} language. Its on-disk
  53. * line is `key = <value>`, and that value is routinely a secret (DB password,
  54. * API key, JDBC URL with embedded creds). CodeGraph must surface the KEY only
  55. * and never read/return the value, or it pushes secrets into agent context
  56. * unbidden — the value isn't needed for resolution, and an agent that genuinely
  57. * needs it can read the file directly. (#383)
  58. */
  59. export function isConfigLeafNode(node: { kind: string; language?: string }): boolean {
  60. return node.kind === 'constant' && !!node.language && CONFIG_LEAF_LANGUAGES.has(node.language);
  61. }
  62. /**
  63. * Whether `child` is `parent` itself or sits underneath it. Case-insensitive on
  64. * Windows — NTFS is case-insensitive, and realpathSync can hand back a different
  65. * case than the lexical root, which would otherwise false-reject a valid file.
  66. */
  67. function isWithinDir(child: string, parent: string): boolean {
  68. let c = child;
  69. let p = parent;
  70. if (process.platform === 'win32') {
  71. c = c.toLowerCase();
  72. p = p.toLowerCase();
  73. }
  74. return c === p || c.startsWith(p + path.sep);
  75. }
  76. /**
  77. * Validate that a file path stays within the project root, resolving symlinks.
  78. *
  79. * Two layers: a cheap lexical check that catches `../` traversal, then a
  80. * realpath check that catches symlink escapes — an in-repo symlink whose
  81. * logical path is inside the root but whose real target points outside it
  82. * (issue #527). A symlink that stays within the root is still allowed, so
  83. * legitimate in-tree symlinks keep working. Both content-serving read sinks
  84. * (codegraph_node `includeCode`, codegraph_explore source) go through here, so
  85. * this is the chokepoint that keeps out-of-root file contents from leaking.
  86. *
  87. * `allowSymlinkEscape` waives **only** the realpath-escape rejection (the
  88. * lexical `../` guard still applies) for the INDEXING read path. The directory
  89. * walk deliberately descends into in-root symlinks whose targets live outside
  90. * the root (e.g. a `game/` symlink in a Dota custom-game tree, #935); discovery
  91. * and the reader must agree, or every file the walk enumerated fails to index.
  92. * Indexing only reads paths it just discovered, into a local index — it never
  93. * serves them to an agent, so this does not widen the #527 leak surface. The
  94. * content-serving sinks must never pass this flag.
  95. *
  96. * @param projectRoot - The project root directory
  97. * @param filePath - The (relative or absolute) file path to validate
  98. * @param options.allowSymlinkEscape - Follow in-root symlinks out of the root
  99. * (indexing read path only); defaults to the strict, leak-safe behavior.
  100. * @returns The resolved absolute path (realpath when it exists), or null if it
  101. * escapes the root
  102. */
  103. export function validatePathWithinRoot(
  104. projectRoot: string,
  105. filePath: string,
  106. options?: { allowSymlinkEscape?: boolean }
  107. ): string | null {
  108. const resolved = path.resolve(projectRoot, filePath);
  109. const normalizedRoot = path.resolve(projectRoot);
  110. // 1. Lexical containment — cheap, catches `../` traversal. Applies even on
  111. // the indexing read path: a crafted `../` escape is still rejected.
  112. if (!isWithinDir(resolved, normalizedRoot)) {
  113. return null;
  114. }
  115. // 2. Symlink-aware containment — resolve symlinks on both sides and re-check,
  116. // so an in-repo symlink whose real target escapes the root is rejected.
  117. // The indexing read path (allowSymlinkEscape) skips only this rejection so
  118. // it stays consistent with the directory walk, which already followed the
  119. // in-root symlink to enumerate these files (#935).
  120. try {
  121. const realRoot = fs.realpathSync(normalizedRoot);
  122. const realResolved = fs.realpathSync(resolved);
  123. if (options?.allowSymlinkEscape) {
  124. return realResolved;
  125. }
  126. return isWithinDir(realResolved, realRoot) ? realResolved : null;
  127. } catch (err) {
  128. // ENOENT: the path doesn't exist yet (a file about to be written, or an
  129. // index entry for a since-deleted file) — no symlink to follow, and the
  130. // lexical check already passed, so allow the lexical path. Any other
  131. // resolution failure (ELOOP, EACCES, …) is treated as unsafe → reject.
  132. if ((err as NodeJS.ErrnoException).code === 'ENOENT') {
  133. return resolved;
  134. }
  135. return null;
  136. }
  137. }
  138. /**
  139. * Validate that a path is a safe project root directory.
  140. *
  141. * Rejects sensitive system directories and ensures the path is
  142. * a real, existing directory. Used at MCP and API entry points
  143. * to prevent arbitrary directory access.
  144. *
  145. * @param dirPath - The path to validate
  146. * @returns An error message if invalid, or null if valid
  147. */
  148. export function validateProjectPath(dirPath: string): string | null {
  149. const resolved = path.resolve(dirPath);
  150. // Block sensitive system directories
  151. if (SENSITIVE_PATHS.has(resolved) || SENSITIVE_PATHS.has(resolved.toLowerCase())) {
  152. return `Refusing to operate on sensitive system directory: ${resolved}`;
  153. }
  154. // Also block common sensitive home subdirectories
  155. const homeDir = require('os').homedir();
  156. const sensitiveHomeDirs = ['.ssh', '.gnupg', '.aws', '.config'];
  157. for (const dir of sensitiveHomeDirs) {
  158. const sensitivePath = path.join(homeDir, dir);
  159. if (resolved === sensitivePath || resolved.startsWith(sensitivePath + path.sep)) {
  160. return `Refusing to operate on sensitive directory: ${resolved}`;
  161. }
  162. }
  163. // Verify it's a real directory
  164. try {
  165. const stats = fs.statSync(resolved);
  166. if (!stats.isDirectory()) {
  167. return `Path is not a directory: ${resolved}`;
  168. }
  169. } catch {
  170. return `Path does not exist or is not accessible: ${resolved}`;
  171. }
  172. return null;
  173. }
  174. /**
  175. * Safely parse JSON with a fallback value.
  176. * Prevents crashes from corrupted database metadata.
  177. */
  178. export function safeJsonParse<T>(value: string, fallback: T): T {
  179. try {
  180. return JSON.parse(value);
  181. } catch {
  182. return fallback;
  183. }
  184. }
  185. /**
  186. * Clamp a numeric value to a range.
  187. * Used to enforce sane limits on MCP tool inputs.
  188. */
  189. export function clamp(value: number, min: number, max: number): number {
  190. return Math.max(min, Math.min(max, value));
  191. }
  192. /**
  193. * Normalize a file path to use forward slashes.
  194. * Fixes Windows backslash paths so glob matching works consistently.
  195. */
  196. export function normalizePath(filePath: string): string {
  197. return filePath.replace(/\\/g, '/');
  198. }
  199. /**
  200. * Cross-process file lock using a lock file with PID tracking.
  201. *
  202. * Prevents multiple processes (e.g., git hooks, CLI, MCP server) from
  203. * writing to the same database simultaneously.
  204. */
  205. export class FileLock {
  206. private lockPath: string;
  207. private held = false;
  208. /** Locks older than this are considered stale regardless of PID status */
  209. private static readonly STALE_TIMEOUT_MS = 2 * 60 * 1000; // 2 minutes
  210. constructor(lockPath: string) {
  211. this.lockPath = lockPath;
  212. }
  213. /**
  214. * Acquire the lock. Throws if the lock is held by another live process.
  215. */
  216. acquire(): void {
  217. // Check for existing lock
  218. if (fs.existsSync(this.lockPath)) {
  219. try {
  220. const content = fs.readFileSync(this.lockPath, 'utf-8').trim();
  221. const pid = parseInt(content, 10);
  222. const stat = fs.statSync(this.lockPath);
  223. const lockAge = Date.now() - stat.mtimeMs;
  224. // Treat locks older than the timeout as stale, regardless of PID
  225. if (lockAge < FileLock.STALE_TIMEOUT_MS && !isNaN(pid) && this.isProcessAlive(pid)) {
  226. throw new Error(
  227. `CodeGraph database is locked by another process (PID ${pid}). ` +
  228. `If this is stale, run 'codegraph unlock' or delete ${this.lockPath}`
  229. );
  230. }
  231. // Stale lock (dead process or timed out) - remove it
  232. fs.unlinkSync(this.lockPath);
  233. } catch (err) {
  234. if (err instanceof Error && err.message.includes('locked by another')) {
  235. throw err;
  236. }
  237. // Other errors reading lock file - try to remove it
  238. try { fs.unlinkSync(this.lockPath); } catch { /* ignore */ }
  239. }
  240. }
  241. // Write our PID to the lock file using exclusive create flag
  242. try {
  243. fs.writeFileSync(this.lockPath, String(process.pid), { flag: 'wx' });
  244. this.held = true;
  245. } catch (err: any) {
  246. if (err.code === 'EEXIST') {
  247. // Race condition: another process grabbed the lock between our check and write
  248. throw new Error(
  249. 'CodeGraph database is locked by another process. ' +
  250. `If this is stale, run 'codegraph unlock' or delete ${this.lockPath}`
  251. );
  252. }
  253. throw err;
  254. }
  255. }
  256. /**
  257. * Release the lock
  258. */
  259. release(): void {
  260. if (!this.held) return;
  261. try {
  262. // Only remove if we still own it (check PID)
  263. const content = fs.readFileSync(this.lockPath, 'utf-8').trim();
  264. if (parseInt(content, 10) === process.pid) {
  265. fs.unlinkSync(this.lockPath);
  266. }
  267. } catch {
  268. // Lock file already gone - that's fine
  269. }
  270. this.held = false;
  271. }
  272. /**
  273. * Execute a function while holding the lock
  274. */
  275. withLock<T>(fn: () => T): T {
  276. this.acquire();
  277. try {
  278. return fn();
  279. } finally {
  280. this.release();
  281. }
  282. }
  283. /**
  284. * Execute an async function while holding the lock
  285. */
  286. async withLockAsync<T>(fn: () => Promise<T>): Promise<T> {
  287. this.acquire();
  288. try {
  289. return await fn();
  290. } finally {
  291. this.release();
  292. }
  293. }
  294. /**
  295. * Check if a process is still running
  296. */
  297. private isProcessAlive(pid: number): boolean {
  298. try {
  299. process.kill(pid, 0);
  300. return true;
  301. } catch {
  302. return false;
  303. }
  304. }
  305. }
  306. /**
  307. * Process items in batches to manage memory
  308. *
  309. * @param items - Array of items to process
  310. * @param batchSize - Number of items per batch
  311. * @param processor - Function to process each item
  312. * @param onBatchComplete - Optional callback after each batch
  313. * @returns Array of results
  314. */
  315. export async function processInBatches<T, R>(
  316. items: T[],
  317. batchSize: number,
  318. processor: (item: T, index: number) => Promise<R>,
  319. onBatchComplete?: (completed: number, total: number) => void
  320. ): Promise<R[]> {
  321. const results: R[] = [];
  322. for (let i = 0; i < items.length; i += batchSize) {
  323. const batch = items.slice(i, Math.min(i + batchSize, items.length));
  324. const batchResults = await Promise.all(
  325. batch.map((item, idx) => processor(item, i + idx))
  326. );
  327. results.push(...batchResults);
  328. if (onBatchComplete) {
  329. onBatchComplete(Math.min(i + batchSize, items.length), items.length);
  330. }
  331. // Allow GC between batches
  332. if (global.gc) {
  333. global.gc();
  334. }
  335. }
  336. return results;
  337. }
  338. /**
  339. * Simple mutex lock for preventing concurrent operations
  340. */
  341. export class Mutex {
  342. private locked = false;
  343. private waitQueue: Array<() => void> = [];
  344. /**
  345. * Acquire the lock
  346. *
  347. * @returns A release function to call when done
  348. */
  349. async acquire(): Promise<() => void> {
  350. while (this.locked) {
  351. await new Promise<void>((resolve) => {
  352. this.waitQueue.push(resolve);
  353. });
  354. }
  355. this.locked = true;
  356. return () => {
  357. this.locked = false;
  358. const next = this.waitQueue.shift();
  359. if (next) {
  360. next();
  361. }
  362. };
  363. }
  364. /**
  365. * Execute a function while holding the lock
  366. */
  367. async withLock<T>(fn: () => Promise<T> | T): Promise<T> {
  368. const release = await this.acquire();
  369. try {
  370. return await fn();
  371. } finally {
  372. release();
  373. }
  374. }
  375. /**
  376. * Check if the lock is currently held
  377. */
  378. isLocked(): boolean {
  379. return this.locked;
  380. }
  381. }
  382. /**
  383. * Chunked file reader for large files
  384. *
  385. * Reads a file in chunks to avoid loading entire file into memory.
  386. */
  387. export async function* readFileInChunks(
  388. filePath: string,
  389. chunkSize: number = 64 * 1024
  390. ): AsyncGenerator<string, void, undefined> {
  391. const fs = await import('fs');
  392. const fd = fs.openSync(filePath, 'r');
  393. const buffer = Buffer.alloc(chunkSize);
  394. try {
  395. let bytesRead: number;
  396. while ((bytesRead = fs.readSync(fd, buffer, 0, chunkSize, null)) > 0) {
  397. yield buffer.toString('utf-8', 0, bytesRead);
  398. }
  399. } finally {
  400. fs.closeSync(fd);
  401. }
  402. }
  403. /**
  404. * Debounce a function
  405. *
  406. * @param fn - Function to debounce
  407. * @param delay - Delay in milliseconds
  408. * @returns Debounced function
  409. */
  410. export function debounce<T extends (...args: unknown[]) => unknown>(
  411. fn: T,
  412. delay: number
  413. ): (...args: Parameters<T>) => void {
  414. let timeoutId: ReturnType<typeof setTimeout> | null = null;
  415. return (...args: Parameters<T>) => {
  416. if (timeoutId) {
  417. clearTimeout(timeoutId);
  418. }
  419. timeoutId = setTimeout(() => {
  420. fn(...args);
  421. timeoutId = null;
  422. }, delay);
  423. };
  424. }
  425. /**
  426. * Throttle a function
  427. *
  428. * @param fn - Function to throttle
  429. * @param limit - Minimum time between calls in milliseconds
  430. * @returns Throttled function
  431. */
  432. export function throttle<T extends (...args: unknown[]) => unknown>(
  433. fn: T,
  434. limit: number
  435. ): (...args: Parameters<T>) => void {
  436. let lastCall = 0;
  437. let timeoutId: ReturnType<typeof setTimeout> | null = null;
  438. return (...args: Parameters<T>) => {
  439. const now = Date.now();
  440. const remaining = limit - (now - lastCall);
  441. if (remaining <= 0) {
  442. if (timeoutId) {
  443. clearTimeout(timeoutId);
  444. timeoutId = null;
  445. }
  446. lastCall = now;
  447. fn(...args);
  448. } else if (!timeoutId) {
  449. timeoutId = setTimeout(() => {
  450. lastCall = Date.now();
  451. timeoutId = null;
  452. fn(...args);
  453. }, remaining);
  454. }
  455. };
  456. }
  457. /**
  458. * Estimate memory usage of an object (rough approximation)
  459. *
  460. * @param obj - Object to measure
  461. * @returns Approximate size in bytes
  462. */
  463. export function estimateSize(obj: unknown): number {
  464. const seen = new WeakSet();
  465. function sizeOf(value: unknown): number {
  466. if (value === null || value === undefined) {
  467. return 0;
  468. }
  469. switch (typeof value) {
  470. case 'boolean':
  471. return 4;
  472. case 'number':
  473. return 8;
  474. case 'string':
  475. return 2 * (value as string).length;
  476. case 'object':
  477. if (seen.has(value as object)) {
  478. return 0;
  479. }
  480. seen.add(value as object);
  481. if (Array.isArray(value)) {
  482. return value.reduce((acc: number, item) => acc + sizeOf(item), 0);
  483. }
  484. return Object.entries(value as object).reduce(
  485. (acc, [key, val]) => acc + sizeOf(key) + sizeOf(val),
  486. 0
  487. );
  488. default:
  489. return 0;
  490. }
  491. }
  492. return sizeOf(obj);
  493. }
  494. /**
  495. * Memory monitor for tracking usage during operations
  496. */
  497. export class MemoryMonitor {
  498. private checkInterval: ReturnType<typeof setInterval> | null = null;
  499. private peakUsage = 0;
  500. private threshold: number;
  501. private onThresholdExceeded?: (usage: number) => void;
  502. constructor(
  503. thresholdMB: number = 500,
  504. onThresholdExceeded?: (usage: number) => void
  505. ) {
  506. this.threshold = thresholdMB * 1024 * 1024;
  507. this.onThresholdExceeded = onThresholdExceeded;
  508. }
  509. /**
  510. * Start monitoring memory usage
  511. */
  512. start(intervalMs: number = 1000): void {
  513. this.stop();
  514. this.peakUsage = 0;
  515. this.checkInterval = setInterval(() => {
  516. const usage = process.memoryUsage().heapUsed;
  517. if (usage > this.peakUsage) {
  518. this.peakUsage = usage;
  519. }
  520. if (usage > this.threshold && this.onThresholdExceeded) {
  521. this.onThresholdExceeded(usage);
  522. }
  523. }, intervalMs);
  524. }
  525. /**
  526. * Stop monitoring
  527. */
  528. stop(): void {
  529. if (this.checkInterval) {
  530. clearInterval(this.checkInterval);
  531. this.checkInterval = null;
  532. }
  533. }
  534. /**
  535. * Get peak memory usage in bytes
  536. */
  537. getPeakUsage(): number {
  538. return this.peakUsage;
  539. }
  540. /**
  541. * Get current memory usage in bytes
  542. */
  543. getCurrentUsage(): number {
  544. return process.memoryUsage().heapUsed;
  545. }
  546. }