| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584 |
- /**
- * CodeGraph Utilities
- *
- * Common utility functions for memory management, concurrency, batching,
- * and security validation.
- *
- * @module utils
- *
- * @example
- * ```typescript
- * import { Mutex, processInBatches, MemoryMonitor, validatePathWithinRoot } from 'codegraph';
- *
- * // Use mutex for concurrent safety
- * const mutex = new Mutex();
- * await mutex.withLock(async () => {
- * await performCriticalOperation();
- * });
- *
- * // Process items in batches to manage memory
- * const results = await processInBatches(items, 100, async (item) => {
- * return await processItem(item);
- * });
- *
- * // Monitor memory usage
- * const monitor = new MemoryMonitor(512, (usage) => {
- * console.warn(`Memory usage exceeded 512MB: ${usage / 1024 / 1024}MB`);
- * });
- * monitor.start();
- * ```
- */
import * as fs from 'fs';
import * as os from 'os';
import * as path from 'path';
import { StringDecoder } from 'string_decoder';
- // ============================================================
- // SECURITY UTILITIES
- // ============================================================
/**
 * System directories that must never be accepted as a project root.
 *
 * POSIX and Windows entries share one set and are consulted on every
 * platform; entries that do not apply to the current OS simply never match.
 * The Windows entries are stored in lowercase.
 */
const SENSITIVE_PATHS = new Set<string>([
  // POSIX system locations
  '/',
  '/etc',
  '/usr',
  '/bin',
  '/sbin',
  '/var',
  '/tmp',
  '/dev',
  '/proc',
  '/sys',
  '/root',
  '/boot',
  '/lib',
  '/lib64',
  '/opt',
  // Windows system locations
  'c:\\',
  'c:\\windows',
  'c:\\windows\\system32',
]);
/**
 * Language identifiers for config "languages": their nodes are plain
 * key/value DATA extracted from a configuration file (for example Spring
 * `application.{yml,properties}`) rather than source code.
 */
export const CONFIG_LEAF_LANGUAGES: ReadonlySet<string> = new Set<string>([
  'yaml',
  'properties',
]);
/**
 * Tell whether a node is a config leaf: a single key lifted from a pure
 * config/data file, i.e. a node of `kind: 'constant'` whose language is one of
 * {@link CONFIG_LEAF_LANGUAGES}.
 *
 * The on-disk line behind such a node is `key = <value>`, and the value is
 * frequently a secret (database password, API key, JDBC URL with embedded
 * credentials). CodeGraph must expose only the KEY and never read or return
 * the value; otherwise secrets are pushed into agent context unprompted. The
 * value is not needed for resolution, and an agent that really needs it can
 * read the file itself. (#383)
 *
 * @param node - Node descriptor; only `kind` and `language` are inspected
 * @returns true when the node is a constant in a config-leaf language
 */
export function isConfigLeafNode(node: { kind: string; language?: string }): boolean {
  if (node.kind !== 'constant') {
    return false;
  }
  const { language } = node;
  // A missing or empty language can never be a config-leaf language.
  if (!language) {
    return false;
  }
  return CONFIG_LEAF_LANGUAGES.has(language);
}
/**
 * Whether `child` is `parent` itself or sits underneath it.
 *
 * The comparison is purely lexical; both arguments are expected to be
 * absolute, already-normalized paths. It is case-insensitive on Windows: NTFS
 * is case-insensitive, and realpathSync can hand back a different case than
 * the lexical root, which would otherwise false-reject a valid file.
 *
 * @param child - Candidate path
 * @param parent - Directory that should contain `child`
 * @returns true when `child` equals `parent` or is nested below it
 */
function isWithinDir(child: string, parent: string): boolean {
  const foldCase = process.platform === 'win32';
  const c = foldCase ? child.toLowerCase() : child;
  const p = foldCase ? parent.toLowerCase() : parent;
  if (c === p) {
    return true;
  }
  // A filesystem or drive root (`/`, `D:\`) already ends with a separator.
  // Appending another one would produce `//` / `D:\\` and reject every child.
  const prefix = p.endsWith(path.sep) ? p : p + path.sep;
  return c.startsWith(prefix);
}
/**
 * Find the deepest entry that exists on disk on the way from `target` up to
 * the filesystem root and return its realpath.
 *
 * @param target - Absolute path whose own realpath lookup failed with ENOENT
 * @returns The realpath of the deepest existing entry, or null when it cannot
 *   be established safely (dangling symlink, permission error, loop, …)
 */
function realpathOfDeepestExistingEntry(target: string): string | null {
  let current = target;
  for (;;) {
    let exists = true;
    try {
      // lstat does not follow a symlink in the final component, so a dangling
      // symlink still counts as "exists" here.
      fs.lstatSync(current);
    } catch (err) {
      if ((err as NodeJS.ErrnoException).code !== 'ENOENT') {
        return null;
      }
      exists = false;
    }
    if (exists) {
      try {
        return fs.realpathSync(current);
      } catch {
        // The entry exists but cannot be resolved (e.g. a dangling symlink):
        // its real target is unverifiable, so treat it as unsafe.
        return null;
      }
    }
    const parent = path.dirname(current);
    if (parent === current) {
      return null;
    }
    current = parent;
  }
}

/**
 * Validate that a file path stays within the project root, resolving symlinks.
 *
 * Two layers: a cheap lexical check that catches `../` traversal, then a
 * realpath check that catches symlink escapes — an in-repo symlink whose
 * logical path is inside the root but whose real target points outside it
 * (issue #527). A symlink that stays within the root is still allowed, so
 * legitimate in-tree symlinks keep working. Both content-serving read sinks
 * (codegraph_node `includeCode`, codegraph_explore source) go through here, so
 * this is the chokepoint that keeps out-of-root file contents from leaking.
 *
 * For a path that does not exist yet, the deepest existing ancestor is
 * resolved instead, so a not-yet-created file underneath a symlinked
 * directory that escapes the root is rejected as well, as is a dangling
 * symlink whose target cannot be verified.
 *
 * @param projectRoot - The project root directory
 * @param filePath - The (relative or absolute) file path to validate
 * @returns The resolved absolute path (realpath when it exists, lexical path
 *   when it does not exist yet), or null if it escapes the root
 */
export function validatePathWithinRoot(projectRoot: string, filePath: string): string | null {
  const normalizedRoot = path.resolve(projectRoot);
  const resolved = path.resolve(projectRoot, filePath);

  // 1. Lexical containment — cheap, catches `../` traversal.
  if (!isWithinDir(resolved, normalizedRoot)) {
    return null;
  }

  // 2. Symlink-aware containment — resolve symlinks on both sides and re-check.
  let realRoot: string;
  try {
    realRoot = fs.realpathSync(normalizedRoot);
  } catch (err) {
    // A root that does not exist has nothing on disk beneath it to follow, so
    // the lexical verdict stands. Any other failure is treated as unsafe.
    return (err as NodeJS.ErrnoException).code === 'ENOENT' ? resolved : null;
  }

  try {
    const realResolved = fs.realpathSync(resolved);
    return isWithinDir(realResolved, realRoot) ? realResolved : null;
  } catch (err) {
    // ENOENT: the path doesn't exist yet (a file about to be written, or an
    // index entry for a since-deleted file). Any other resolution failure
    // (ELOOP, EACCES, …) is treated as unsafe → reject.
    if ((err as NodeJS.ErrnoException).code !== 'ENOENT') {
      return null;
    }
  }

  // 3. Missing path — the part of it that DOES exist must still resolve inside
  //    the root, otherwise a symlinked parent directory could redirect the
  //    eventual file outside it.
  const anchor = realpathOfDeepestExistingEntry(resolved);
  return anchor !== null && isWithinDir(anchor, realRoot) ? resolved : null;
}
/**
 * Validate that a path is a safe project root directory.
 *
 * Rejects sensitive system directories and sensitive per-user directories
 * (`.ssh`, `.gnupg`, `.aws`, `.config` under the home directory, including
 * anything nested inside them), and ensures the path is a real, existing
 * directory. Used at MCP and API entry points to prevent arbitrary directory
 * access.
 *
 * @param dirPath - The path to validate
 * @returns An error message if invalid, or null if valid
 */
export function validateProjectPath(dirPath: string): string | null {
  const resolved = path.resolve(dirPath);

  // Block sensitive system directories (exact match, also tried in lowercase
  // so Windows spellings such as `C:\Windows` are caught).
  if (SENSITIVE_PATHS.has(resolved) || SENSITIVE_PATHS.has(resolved.toLowerCase())) {
    return `Refusing to operate on sensitive system directory: ${resolved}`;
  }

  // Also block common sensitive home subdirectories. Windows file systems are
  // case-insensitive, so compare case-folded paths there; otherwise
  // `...\.SSH` would slip past a check for `...\.ssh`.
  const foldCase = (p: string): string => (process.platform === 'win32' ? p.toLowerCase() : p);
  const candidate = foldCase(resolved);
  const homeDir = os.homedir();
  for (const dir of ['.ssh', '.gnupg', '.aws', '.config']) {
    const sensitivePath = foldCase(path.join(homeDir, dir));
    if (candidate === sensitivePath || candidate.startsWith(sensitivePath + path.sep)) {
      return `Refusing to operate on sensitive directory: ${resolved}`;
    }
  }

  // Verify it's a real directory
  let stats: fs.Stats;
  try {
    stats = fs.statSync(resolved);
  } catch {
    return `Path does not exist or is not accessible: ${resolved}`;
  }
  return stats.isDirectory() ? null : `Path is not a directory: ${resolved}`;
}
/**
 * Parse a JSON string, substituting `fallback` when the text is not valid
 * JSON. Guards against crashes caused by corrupted database metadata.
 *
 * The parsed value is returned as-is; it is not validated against `T`.
 *
 * @param value - JSON text to parse
 * @param fallback - Value returned when parsing throws
 * @returns The parsed value, or `fallback` on a parse error
 */
export function safeJsonParse<T>(value: string, fallback: T): T {
  let parsed: T;
  try {
    parsed = JSON.parse(value);
  } catch {
    return fallback;
  }
  return parsed;
}
/**
 * Clamp a numeric value to the inclusive range `[min, max]`.
 * Used to enforce sane limits on MCP tool inputs.
 *
 * `NaN` (for example the result of parsing a non-numeric input) is mapped to
 * `min`; otherwise it would pass straight through the `Math.min`/`Math.max`
 * pair and escape the limits this function exists to enforce.
 *
 * @param value - Value to clamp
 * @param min - Lower bound
 * @param max - Upper bound
 * @returns `value` limited to the range, or `min` when `value` is NaN
 */
export function clamp(value: number, min: number, max: number): number {
  if (Number.isNaN(value)) {
    return min;
  }
  return Math.max(min, Math.min(max, value));
}
/**
 * Convert every backslash in a file path to a forward slash.
 * Windows-style paths are rewritten so glob matching behaves the same on all
 * platforms.
 *
 * @param filePath - Path that may contain backslashes
 * @returns The same path with `/` as the only separator
 */
export function normalizePath(filePath: string): string {
  return filePath.split('\\').join('/');
}
/**
 * Cross-process file lock using a lock file with PID tracking.
 *
 * Prevents multiple processes (e.g., git hooks, CLI, MCP server) from
 * writing to the same database simultaneously. The lock file contains the
 * PID of the owning process; a lock is considered stale when its owner is no
 * longer running or the file is older than {@link FileLock.STALE_TIMEOUT_MS}.
 */
export class FileLock {
  private readonly lockPath: string;
  private held = false;

  /** Locks older than this are considered stale regardless of PID status */
  private static readonly STALE_TIMEOUT_MS = 2 * 60 * 1000; // 2 minutes

  /**
   * @param lockPath - Location of the lock file
   */
  constructor(lockPath: string) {
    this.lockPath = lockPath;
  }

  /**
   * Acquire the lock.
   *
   * @throws Error when the lock is held by another live process, or when
   *   another process creates the lock file between the stale check and the
   *   exclusive create
   */
  acquire(): void {
    if (fs.existsSync(this.lockPath)) {
      let liveOwnerPid: number | null = null;
      try {
        const content = fs.readFileSync(this.lockPath, 'utf-8').trim();
        const pid = parseInt(content, 10);
        const lockAge = Date.now() - fs.statSync(this.lockPath).mtimeMs;
        // Treat locks older than the timeout as stale, regardless of PID
        if (lockAge < FileLock.STALE_TIMEOUT_MS && this.isProcessAlive(pid)) {
          liveOwnerPid = pid;
        }
      } catch {
        // The lock file could not be read or stat'ed (for example it vanished
        // in the meantime) — fall through and treat it as stale.
      }

      if (liveOwnerPid !== null) {
        throw new Error(
          `CodeGraph database is locked by another process (PID ${liveOwnerPid}). ` +
            `If this is stale, run 'codegraph unlock' or delete ${this.lockPath}`
        );
      }

      // Stale lock (dead process, timed out, or unreadable) - remove it.
      // Failure is ignored: the exclusive create below decides the outcome.
      try {
        fs.unlinkSync(this.lockPath);
      } catch {
        /* ignore */
      }
    }

    // Write our PID to the lock file using exclusive create flag
    try {
      fs.writeFileSync(this.lockPath, String(process.pid), { flag: 'wx' });
      this.held = true;
    } catch (err: unknown) {
      if ((err as NodeJS.ErrnoException).code === 'EEXIST') {
        // Race condition: another process grabbed the lock between our check and write
        throw new Error(
          'CodeGraph database is locked by another process. ' +
            `If this is stale, run 'codegraph unlock' or delete ${this.lockPath}`
        );
      }
      throw err;
    }
  }

  /**
   * Release the lock. Does nothing unless this instance acquired it; the lock
   * file is only removed when it still contains this process's PID.
   */
  release(): void {
    if (!this.held) return;
    try {
      // Only remove if we still own it (check PID)
      const content = fs.readFileSync(this.lockPath, 'utf-8').trim();
      if (parseInt(content, 10) === process.pid) {
        fs.unlinkSync(this.lockPath);
      }
    } catch {
      // Lock file already gone - that's fine
    }
    this.held = false;
  }

  /**
   * Execute a synchronous function while holding the lock. The lock is
   * released when `fn` returns or throws.
   *
   * @param fn - Work to run under the lock
   * @returns Whatever `fn` returns
   */
  withLock<T>(fn: () => T): T {
    this.acquire();
    try {
      return fn();
    } finally {
      this.release();
    }
  }

  /**
   * Execute an async function while holding the lock. The lock is released
   * once the promise returned by `fn` settles.
   *
   * @param fn - Async work to run under the lock
   * @returns The value `fn` resolves to
   */
  async withLockAsync<T>(fn: () => Promise<T>): Promise<T> {
    this.acquire();
    try {
      return await fn();
    } finally {
      this.release();
    }
  }

  /**
   * Check if a process is still running.
   *
   * @param pid - PID read from the lock file (may be NaN or otherwise invalid)
   * @returns true when a process with that PID exists
   */
  private isProcessAlive(pid: number): boolean {
    // PID 0 and negative values address process groups rather than a single
    // process, so they can never identify a lock owner.
    if (!Number.isInteger(pid) || pid <= 0) {
      return false;
    }
    try {
      // Signal 0 performs the existence/permission check without signalling.
      process.kill(pid, 0);
      return true;
    } catch (err: unknown) {
      // EPERM: the process exists but belongs to another user — still alive.
      return (err as NodeJS.ErrnoException).code === 'EPERM';
    }
  }
}
/**
 * Process items in batches to manage memory.
 *
 * Items within one batch are processed concurrently via `Promise.all`;
 * batches run one after another. Results keep the order of `items`. If any
 * processor call rejects, the returned promise rejects with that error.
 *
 * @param items - Array of items to process
 * @param batchSize - Number of items per batch; must be at least 1
 * @param processor - Function to process each item; receives the item and its
 *   index in `items`
 * @param onBatchComplete - Optional callback after each batch, called with the
 *   number of items completed so far and the total
 * @returns Array of results
 * @throws RangeError (as a rejection) when `batchSize` is not a number >= 1 —
 *   a zero step would never advance the loop
 */
export async function processInBatches<T, R>(
  items: T[],
  batchSize: number,
  processor: (item: T, index: number) => Promise<R>,
  onBatchComplete?: (completed: number, total: number) => void
): Promise<R[]> {
  // Written as a negated comparison so that NaN is rejected as well.
  if (!(batchSize >= 1)) {
    throw new RangeError(`batchSize must be a number >= 1, got ${batchSize}`);
  }

  const total = items.length;
  const results: R[] = [];

  for (let start = 0; start < total; start += batchSize) {
    const end = Math.min(start + batchSize, total);
    const batchResults = await Promise.all(
      items.slice(start, end).map((item, offset) => processor(item, start + offset))
    );

    // Append one by one: spreading a very large batch into push() can exceed
    // the engine's argument-count limit.
    for (const result of batchResults) {
      results.push(result);
    }

    onBatchComplete?.(end, total);

    // Allow GC between batches (only available when node runs with --expose-gc)
    if (global.gc) {
      global.gc();
    }
  }

  return results;
}
/**
 * Simple in-process mutex for serializing async operations.
 *
 * Waiters are served strictly in arrival order: releasing the lock hands
 * ownership directly to the longest-waiting caller, so a caller arriving at
 * the moment of release cannot jump the queue.
 */
export class Mutex {
  private locked = false;
  private waitQueue: Array<() => void> = [];

  /**
   * Acquire the lock, waiting for the current holder if necessary.
   *
   * @returns A release function to call when done. Calling it more than once
   *   is harmless — only the first call releases.
   */
  async acquire(): Promise<() => void> {
    if (this.locked) {
      // Wait for a direct hand-off. `locked` stays true across the hand-off,
      // so there is nothing to set once we are woken up.
      await new Promise<void>((resolve) => {
        this.waitQueue.push(resolve);
      });
    } else {
      this.locked = true;
    }

    let released = false;
    return () => {
      // Guard against double release, which would otherwise unlock the mutex
      // while a later holder is still inside its critical section.
      if (released) {
        return;
      }
      released = true;

      const next = this.waitQueue.shift();
      if (next) {
        next(); // ownership passes to the next waiter; the mutex stays locked
      } else {
        this.locked = false;
      }
    };
  }

  /**
   * Execute a function while holding the lock. The lock is released when the
   * function returns, throws, or its promise settles.
   *
   * @param fn - Sync or async work to run exclusively
   * @returns The value produced by `fn`
   */
  async withLock<T>(fn: () => Promise<T> | T): Promise<T> {
    const release = await this.acquire();
    try {
      return await fn();
    } finally {
      release();
    }
  }

  /**
   * Check if the lock is currently held
   */
  isLocked(): boolean {
    return this.locked;
  }
}
/**
 * Chunked file reader for large files.
 *
 * Reads a file in fixed-size byte chunks and yields the decoded UTF-8 text,
 * so the whole file never has to be held in memory. A multi-byte character
 * that straddles a chunk boundary is carried over to the next chunk instead
 * of being decoded in two broken halves, so yielded strings concatenate to
 * exactly the file's text. Empty strings are never yielded.
 *
 * @param filePath - File to read
 * @param chunkSize - Read buffer size in bytes (default 64 KiB); must be a
 *   positive integer
 * @returns Async generator of decoded text chunks
 * @throws RangeError when `chunkSize` is not a positive integer (raised on the
 *   first iteration, before the file is opened)
 */
export async function* readFileInChunks(
  filePath: string,
  chunkSize: number = 64 * 1024
): AsyncGenerator<string, void, undefined> {
  // A zero-length buffer would make readSync report 0 bytes immediately and
  // the generator would silently yield nothing.
  if (!Number.isInteger(chunkSize) || chunkSize <= 0) {
    throw new RangeError(`chunkSize must be a positive integer, got ${chunkSize}`);
  }

  const fd = fs.openSync(filePath, 'r');
  try {
    const buffer = Buffer.alloc(chunkSize);
    // StringDecoder buffers an incomplete trailing byte sequence until the
    // rest of the character arrives with the next write().
    const decoder = new StringDecoder('utf8');

    let bytesRead: number;
    while ((bytesRead = fs.readSync(fd, buffer, 0, chunkSize, null)) > 0) {
      const text = decoder.write(buffer.subarray(0, bytesRead));
      if (text.length > 0) {
        yield text;
      }
    }

    // Flush whatever is left (only non-empty if the file ends mid-character).
    const tail = decoder.end();
    if (tail.length > 0) {
      yield tail;
    }
  } finally {
    fs.closeSync(fd);
  }
}
/**
 * Wrap a function so that it only runs once calls have stopped for `delay`
 * milliseconds. Every call cancels the pending run and schedules a new one
 * with the most recent arguments.
 *
 * @param fn - Function to debounce
 * @param delay - Delay in milliseconds
 * @returns Debounced function
 */
export function debounce<T extends (...args: unknown[]) => unknown>(
  fn: T,
  delay: number
): (...args: Parameters<T>) => void {
  let pending: ReturnType<typeof setTimeout> | null = null;

  // Runs the wrapped function, then marks the timer slot as free again.
  const fire = (callArgs: Parameters<T>): void => {
    fn(...callArgs);
    pending = null;
  };

  return (...args: Parameters<T>) => {
    if (pending) {
      clearTimeout(pending);
    }
    pending = setTimeout(() => fire(args), delay);
  };
}
/**
 * Wrap a function so that it runs at most once per `limit` milliseconds.
 *
 * A call made when the window has elapsed runs immediately. A call made
 * inside the window schedules one trailing run for the end of the window;
 * further calls while that trailing run is pending are dropped, so the
 * trailing run uses the arguments of the call that scheduled it.
 *
 * @param fn - Function to throttle
 * @param limit - Minimum time between calls in milliseconds
 * @returns Throttled function
 */
export function throttle<T extends (...args: unknown[]) => unknown>(
  fn: T,
  limit: number
): (...args: Parameters<T>) => void {
  let lastCall = 0;
  let trailing: ReturnType<typeof setTimeout> | null = null;

  return (...args: Parameters<T>) => {
    const now = Date.now();
    const remaining = limit - (now - lastCall);

    if (remaining <= 0) {
      // Window elapsed: drop any pending trailing run and execute right away.
      if (trailing) {
        clearTimeout(trailing);
        trailing = null;
      }
      lastCall = now;
      fn(...args);
      return;
    }

    // Still inside the window and a trailing run is already queued.
    if (trailing) {
      return;
    }

    trailing = setTimeout(() => {
      lastCall = Date.now();
      trailing = null;
      fn(...args);
    }, remaining);
  };
}
/**
 * Estimate memory usage of a value (rough approximation).
 *
 * Counting rules: booleans 4 bytes, numbers 8 bytes, strings 2 bytes per
 * UTF-16 code unit. Arrays, Maps and Sets count the sum of their elements
 * (keys and values for Maps). ArrayBuffers and typed arrays / Buffers /
 * DataViews count their `byteLength`. Other objects count their own
 * enumerable string keys plus values. `null`, `undefined`, functions, symbols
 * and bigints count as 0. Each object is counted once, so shared references
 * and cycles are safe.
 *
 * @param obj - Value to measure
 * @returns Approximate size in bytes
 */
export function estimateSize(obj: unknown): number {
  const seen = new WeakSet<object>();

  function sizeOf(value: unknown): number {
    switch (typeof value) {
      case 'boolean':
        return 4;
      case 'number':
        return 8;
      case 'string':
        return 2 * (value as string).length;
      case 'object': {
        if (value === null) {
          return 0;
        }
        const ref = value as object;
        if (seen.has(ref)) {
          return 0;
        }
        seen.add(ref);

        // Binary data: use the byte length directly. Walking a typed array
        // through Object.entries would visit every index as a string key,
        // which is both very slow and a gross over-estimate.
        if (ArrayBuffer.isView(ref)) {
          return ref.byteLength;
        }
        if (ref instanceof ArrayBuffer) {
          return ref.byteLength;
        }

        if (Array.isArray(ref)) {
          let total = 0;
          for (const item of ref) {
            total += sizeOf(item);
          }
          return total;
        }

        // Map/Set contents are not enumerable properties, so Object.entries
        // would report them as empty.
        if (ref instanceof Map) {
          let total = 0;
          for (const [key, val] of ref) {
            total += sizeOf(key) + sizeOf(val);
          }
          return total;
        }
        if (ref instanceof Set) {
          let total = 0;
          for (const item of ref) {
            total += sizeOf(item);
          }
          return total;
        }

        let total = 0;
        for (const [key, val] of Object.entries(ref)) {
          total += sizeOf(key) + sizeOf(val);
        }
        return total;
      }
      default:
        return 0;
    }
  }

  return sizeOf(obj);
}
/**
 * Memory monitor for tracking heap usage (`process.memoryUsage().heapUsed`)
 * during operations.
 *
 * While running, usage is sampled on a timer; the highest sample is kept as
 * the peak, and the optional callback is invoked on every timer tick whose
 * sample exceeds the threshold. Usage is additionally sampled when
 * monitoring starts, when it stops, and whenever the peak is queried while
 * running, so operations shorter than the sampling interval still report a
 * meaningful peak instead of 0.
 */
export class MemoryMonitor {
  private checkInterval: ReturnType<typeof setInterval> | null = null;
  private peakUsage = 0;
  private readonly threshold: number;
  private readonly onThresholdExceeded?: (usage: number) => void;

  /**
   * @param thresholdMB - Heap usage threshold in megabytes (default 500)
   * @param onThresholdExceeded - Called with the usage in bytes on each timer
   *   tick where usage is above the threshold
   */
  constructor(
    thresholdMB: number = 500,
    onThresholdExceeded?: (usage: number) => void
  ) {
    this.threshold = thresholdMB * 1024 * 1024;
    this.onThresholdExceeded = onThresholdExceeded;
  }

  /**
   * Start monitoring memory usage. Restarts (and resets the peak) if already
   * running.
   *
   * @param intervalMs - Sampling interval in milliseconds (default 1000)
   */
  start(intervalMs: number = 1000): void {
    this.stop();
    this.peakUsage = 0;
    // Baseline sample; the threshold callback is only fired from timer ticks.
    this.recordSample();
    this.checkInterval = setInterval(() => {
      const usage = this.recordSample();
      if (usage > this.threshold && this.onThresholdExceeded) {
        this.onThresholdExceeded(usage);
      }
    }, intervalMs);
  }

  /**
   * Stop monitoring. Takes one last sample if monitoring was running.
   */
  stop(): void {
    if (this.checkInterval) {
      this.recordSample();
      clearInterval(this.checkInterval);
      this.checkInterval = null;
    }
  }

  /**
   * Get peak memory usage in bytes. Returns 0 if monitoring was never
   * started.
   */
  getPeakUsage(): number {
    if (this.checkInterval) {
      // Include the current moment so the peak is never behind the timer.
      this.recordSample();
    }
    return this.peakUsage;
  }

  /**
   * Get current memory usage in bytes
   */
  getCurrentUsage(): number {
    return process.memoryUsage().heapUsed;
  }

  /**
   * Read the current heap usage and fold it into the recorded peak.
   *
   * @returns The sampled usage in bytes
   */
  private recordSample(): number {
    const usage = process.memoryUsage().heapUsed;
    if (usage > this.peakUsage) {
      this.peakUsage = usage;
    }
    return usage;
  }
}
|