index.ts 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479
  1. /**
  2. * CodeGraph MCP Server
  3. *
  4. * Model Context Protocol server that exposes CodeGraph functionality
  5. * as tools for AI assistants like Claude.
  6. *
  7. * @module mcp
  8. *
  9. * @example
  10. * ```typescript
  11. * import { MCPServer } from 'codegraph';
  12. *
  13. * const server = new MCPServer('/path/to/project');
  14. * await server.start();
  15. * ```
  16. *
  17. * Runtime modes (decided in {@link MCPServer.start}):
  18. *
  19. * - **Direct** — one process serves one MCP client over stdio. The pre-#411
  20. * behavior; used when the user opts out (`CODEGRAPH_NO_DAEMON=1`), no
  21. * `.codegraph/` is reachable, or the daemon machinery fails for any reason.
  22. * - **Proxy** — what an MCP host actually talks to when sharing is on: a thin
  23. * stdio↔socket pipe to the shared daemon. The proxy carries the #277 PPID
  24. * watchdog, so a SIGKILL'd host reaps its proxy promptly. See {@link ./proxy.ts}.
  25. * - **Daemon** — a *detached* background process (its own session/process
  26. * group) that serves N proxies over a Unix-domain socket / named pipe,
  27. * sharing one CodeGraph + watcher + SQLite handle. Spawned on demand; never a
  28. * child of any host, so it survives individual sessions and is reaped by
  29. * client-refcount + idle timeout. See {@link ./daemon.ts} and issue #411.
  30. *
  31. * The detached-daemon + always-proxy split is the fix for the review finding
  32. * that the original in-process daemon (a) was the first host's child, so closing
  33. * that terminal severed every other client, and (b) disabled the PPID watchdog,
  34. * regressing #277 (orphaned daemons on host SIGKILL).
  35. */
  36. import * as fs from 'fs';
  37. import * as path from 'path';
  38. import { spawn, StdioOptions } from 'child_process';
  39. import { findNearestCodeGraphRoot, getCodeGraphDir } from '../directory';
  40. import { StdioTransport } from './transport';
  41. import { MCPEngine } from './engine';
  42. import { MCPSession } from './session';
  43. import {
  44. Daemon,
  45. clearStaleDaemonLock,
  46. isProcessAlive,
  47. tryAcquireDaemonLock,
  48. } from './daemon';
  49. import { connectWithHello, runLocalHandshakeProxy } from './proxy';
  50. import { getDaemonSocketPath } from './daemon-paths';
  51. import { getTelemetry } from '../telemetry';
  52. import { supervisionLostReason } from './ppid-watchdog';
  53. import { installMainThreadWatchdog, WatchdogHandle } from './liveness-watchdog';
  54. import { treatStdinFailureAsShutdown } from './stdin-teardown';
  55. import { HOST_PPID_ENV } from '../extraction/wasm-runtime-flags';
  56. /**
  57. * How often to poll `process.ppid` to detect parent process death (see #277).
  58. * 5s is a deliberate trade-off: the failure mode being guarded against is rare
  59. * (parent SIGKILL'd), and longer poll = less wakeup overhead while idle.
  60. */
  61. const DEFAULT_PPID_POLL_MS = 5000;
  62. /**
  63. * Env var that marks a process as the *detached daemon* itself (set by
  64. * {@link spawnDetachedDaemon} when it re-invokes the CLI). Without it a
  65. * `serve --mcp` invocation is a launcher that connects-or-spawns; with it, the
  66. * process IS the daemon and must never try to spawn another (infinite spawn).
  67. */
  68. const DAEMON_INTERNAL_ENV = 'CODEGRAPH_DAEMON_INTERNAL';
  69. /**
  70. * Retries for the detached daemon arbitrating the O_EXCL lock against a racing
  71. * sibling. Tiny — the lock resolves on the first round in practice; the retries
  72. * only cover clearing a genuinely stale (dead-pid) lockfile.
  73. */
  74. const TAKEOVER_MAX_RETRIES = 5;
  75. const TAKEOVER_RETRY_DELAY_MS = 100;
  76. /**
  77. * How long a launcher waits for a freshly-spawned daemon to bind its socket
  78. * before giving up and running in-process. The daemon binds the socket *before*
  79. * the (backgrounded) engine/grammar warm-up, so this only needs to cover node
  80. * process startup. 240 × 25ms = 6s of headroom for a cold/slow box; on the
  81. * common path the socket appears within a few rounds.
  82. */
  83. // Poll finely (25ms) so the proxy attaches the instant the freshly-spawned
  84. // daemon binds, instead of waiting up to a coarse 100ms after — shaves the
  85. // cold-start handshake (the window the headless agent races). Same ~6s total
  86. // give-up budget (240 × 25ms), just finer granularity; socket-connect probes
  87. // are cheap. Paired with deferring the CodeGraph load (engine.ts) off the bind
  88. // path, this narrows the "No such tool available" race window.
  89. const DAEMON_CONNECT_MAX_RETRIES = 240;
  90. const DAEMON_CONNECT_RETRY_DELAY_MS = 25;
  91. /**
  92. * Resolve the PPID watchdog poll interval from an env override. A value of
  93. * `0` disables the watchdog entirely (escape hatch for embedded scenarios
  94. * where the parent legitimately re-parents the server on purpose). Anything
  95. * non-numeric or negative falls back to the default.
  96. */
  97. function parsePpidPollMs(raw: string | undefined): number {
  98. if (raw === undefined || raw === '') return DEFAULT_PPID_POLL_MS;
  99. const parsed = Number(raw);
  100. if (!Number.isFinite(parsed)) return DEFAULT_PPID_POLL_MS;
  101. if (parsed < 0) return DEFAULT_PPID_POLL_MS;
  102. return Math.floor(parsed);
  103. }
  104. /**
  105. * Parse the host PID propagated across the `--liftoff-only` re-exec
  106. * ({@link HOST_PPID_ENV}). Returns a positive integer PID, or null when
  107. * unset/invalid — the direct-launch path, where the watchdog falls back to
  108. * `process.ppid` divergence. PIDs of 0/1 are rejected (0 = unknown, 1 = init,
  109. * i.e. already orphaned), so the watchdog doesn't latch onto init.
  110. */
  111. function parseHostPpid(raw: string | undefined): number | null {
  112. if (raw === undefined || raw === '') return null;
  113. const parsed = Number(raw);
  114. if (!Number.isInteger(parsed) || parsed <= 1) return null;
  115. return parsed;
  116. }
  117. /** Whether `CODEGRAPH_NO_DAEMON` was set to a truthy value. */
  118. function daemonOptOutSet(): boolean {
  119. const raw = process.env.CODEGRAPH_NO_DAEMON;
  120. if (!raw) return false;
  121. return raw !== '0' && raw.toLowerCase() !== 'false';
  122. }
  123. /** Whether this process was spawned to BE the detached daemon. */
  124. function daemonInternalSet(): boolean {
  125. const raw = process.env[DAEMON_INTERNAL_ENV];
  126. return !!raw && raw !== '0' && raw.toLowerCase() !== 'false';
  127. }
  128. /**
  129. * Resolve the project root the daemon machinery should key on. Returns
  130. * `null` when no `.codegraph/` is reachable from the candidate path — in
  131. * that case the caller must run in direct mode, since the daemon lockfile
  132. * and socket both live under `.codegraph/`.
  133. *
  134. * The result is canonicalized with `realpathSync` so every client converges on
  135. * the same socket/lock path regardless of how it expressed the path: a client
  136. * launched with cwd under a symlink (e.g. macOS `/var` → `/private/var`, where
  137. * spawned `process.cwd()` is already realpath'd) and one that passed a
  138. * symlinked `rootUri` would otherwise hash to different sockets and silently
  139. * fail to share the daemon.
  140. */
  141. function resolveDaemonRoot(explicitPath: string | null): string | null {
  142. const candidate = explicitPath ?? process.cwd();
  143. const root = findNearestCodeGraphRoot(candidate);
  144. if (!root) return null;
  145. try { return fs.realpathSync(root); } catch { return root; }
  146. }
  147. /**
  148. * Spawn the shared daemon as a fully detached background process: its own
  149. * session/process group (so a SIGHUP/SIGINT to the launcher's terminal can't
  150. * reach it) with stdio decoupled from the launcher (logs to
  151. * `.codegraph/daemon.log`). Re-invokes the *same* CLI faithfully across dev and
  152. * bundled launches by reusing `process.argv[0]` (the right node), the current
  153. * `process.execArgv` (carries `--liftoff-only`, so the daemon never re-execs)
  154. * and `process.argv[1]` (this script). The spawned process self-arbitrates the
  155. * O_EXCL lock, so racing launchers may each spawn one — losers exit and every
  156. * launcher proxies through the single winner.
  157. */
  158. function spawnDetachedDaemon(root: string): void {
  159. const scriptPath = process.argv[1];
  160. if (!scriptPath) {
  161. // No resolvable CLI entry point to re-invoke — let the caller fall back to
  162. // direct mode rather than spawn something broken.
  163. throw new Error('cannot resolve CLI script path to spawn the daemon');
  164. }
  165. let logFd: number | null = null;
  166. let stdio: StdioOptions = 'ignore';
  167. try {
  168. logFd = fs.openSync(path.join(getCodeGraphDir(root), 'daemon.log'), 'a');
  169. stdio = ['ignore', logFd, logFd];
  170. } catch {
  171. stdio = 'ignore'; // no log file — discard daemon output rather than fail
  172. }
  173. try {
  174. const child = spawn(
  175. process.execPath,
  176. [...process.execArgv, scriptPath, 'serve', '--mcp', '--path', root],
  177. {
  178. detached: true,
  179. stdio,
  180. windowsHide: true,
  181. env: { ...process.env, [DAEMON_INTERNAL_ENV]: '1' },
  182. },
  183. );
  184. child.unref();
  185. } finally {
  186. // The child holds its own dup of the log fd now; the launcher doesn't need it.
  187. if (logFd !== null) {
  188. try { fs.closeSync(logFd); } catch { /* ignore */ }
  189. }
  190. }
  191. }
  192. /**
  193. * MCP Server for CodeGraph
  194. *
  195. * Implements the Model Context Protocol to expose CodeGraph
  196. * functionality as tools that can be called by AI assistants.
  197. *
  198. * Backwards-compatible constructor and `start()` signature with the
  199. * pre-issue-#411 implementation: callers continue to do
  200. * `new MCPServer(path).start()`. Internally we now pick from direct / proxy /
  201. * daemon at start time.
  202. */
  203. export class MCPServer {
  204. private projectPath: string | null;
  205. // Direct-mode-only state. In daemon mode the per-connection sessions live
  206. // inside the Daemon class; in proxy mode there is no session at all.
  207. private session: MCPSession | null = null;
  208. private engine: MCPEngine | null = null;
  209. private daemon: Daemon | null = null;
  210. private ppidWatchdog: ReturnType<typeof setInterval> | null = null;
  211. // Worker-thread liveness watchdog (#850). Long-lived modes only; SIGKILLs the
  212. // process if the main thread wedges in a non-yielding sync loop.
  213. private livenessWatchdog: WatchdogHandle | null = null;
  214. // PPID watchdog baseline — captured at construction so we always have a
  215. // baseline, even if start() runs after a fork-style reparent.
  216. private originalPpid: number = process.ppid;
  217. private hostPpid: number | null = parseHostPpid(process.env[HOST_PPID_ENV]);
  218. // Idempotency guard for stop().
  219. private stopped = false;
  220. private mode: 'unstarted' | 'direct' | 'proxy' | 'daemon' = 'unstarted';
  221. constructor(projectPath?: string) {
  222. this.projectPath = projectPath || null;
  223. }
  224. /**
  225. * Start the MCP server.
  226. *
  227. * Decision order:
  228. * 1. `CODEGRAPH_NO_DAEMON=1` → direct mode (unchanged pre-#411 behavior).
  229. * 2. `CODEGRAPH_DAEMON_INTERNAL=1` → we ARE the detached daemon; listen.
  230. * 3. No `.codegraph/` reachable → direct mode (the daemon's lockfile and
  231. * socket both live under `.codegraph/`).
  232. * 4. Otherwise connect to (or spawn) the shared daemon and proxy to it.
  233. *
  234. * On any unexpected failure in step 4 we transparently fall back to direct
  235. * mode — a misbehaving daemon must never block a session from starting.
  236. */
  237. async start(): Promise<void> {
  238. // Long-lived process (direct / proxy / daemon alike): flush buffered
  239. // telemetry opportunistically. Fire-and-forget + unref'd — adds nothing
  240. // to the handshake path and never keeps the process alive.
  241. getTelemetry().startInterval();
  242. // The detached daemon process itself. Checked before the opt-out so the
  243. // daemon honors the same env it was spawned with (it never sets NO_DAEMON).
  244. if (daemonInternalSet()) {
  245. return this.startDaemonProcess();
  246. }
  247. // Direct mode if the user opted out. Setting the env var is sufficient to
  248. // get the pre-#411 single-process behavior.
  249. if (daemonOptOutSet()) {
  250. return this.startDirect('CODEGRAPH_NO_DAEMON set');
  251. }
  252. const root = resolveDaemonRoot(this.projectPath);
  253. if (!root) {
  254. // No initialized project found — daemon mode has nowhere to put its
  255. // socket. The fresh-checkout / outside-project case; behave as before.
  256. return this.startDirect('no .codegraph/ root found');
  257. }
  258. try {
  259. // Answer the MCP handshake LOCALLY (instant tool registration — no waiting
  260. // ~600ms for the daemon to spawn+bind, which produced the cold-start race)
  261. // and forward tool CALLS to the shared daemon, connected in the background.
  262. // Runs until the host disconnects; the proxy installs its own watchdog and
  263. // falls back to an in-process engine if the daemon never comes up.
  264. this.mode = 'proxy';
  265. await this.runProxyWithLocalHandshake(root);
  266. return;
  267. } catch (err) {
  268. // Belt-and-braces: a throw during proxy SETUP (before the client was served)
  269. // is still safe to recover from with a direct-mode session.
  270. const msg = err instanceof Error ? err.message : String(err);
  271. process.stderr.write(`[CodeGraph MCP] Proxy path failed (${msg}); falling back to direct mode.\n`);
  272. return this.startDirect('proxy path threw');
  273. }
  274. }
  275. /**
  276. * Stop the server. In daemon mode this triggers graceful shutdown of every
  277. * connected session; in direct mode it mirrors the pre-#411 behavior (close
  278. * cg, exit). Proxy mode never routes through here — the proxy exits itself.
  279. */
  280. stop(): void {
  281. if (this.stopped) return;
  282. this.stopped = true;
  283. if (this.ppidWatchdog) {
  284. clearInterval(this.ppidWatchdog);
  285. this.ppidWatchdog = null;
  286. }
  287. if (this.livenessWatchdog) {
  288. this.livenessWatchdog.stop();
  289. this.livenessWatchdog = null;
  290. }
  291. if (this.daemon) {
  292. void this.daemon.stop('stop()');
  293. // Daemon.stop calls process.exit; nothing else to do.
  294. return;
  295. }
  296. if (this.session) {
  297. this.session.stop();
  298. this.session = null;
  299. }
  300. if (this.engine) {
  301. this.engine.stop();
  302. this.engine = null;
  303. }
  304. process.exit(0);
  305. }
  306. /** Single-process stdio MCP session — the pre-issue-#411 code path. */
  307. private async startDirect(reason: string): Promise<void> {
  308. if (reason && process.env.CODEGRAPH_MCP_DEBUG) {
  309. process.stderr.write(`[CodeGraph MCP] Direct mode: ${reason}.\n`);
  310. }
  311. this.engine = new MCPEngine();
  312. const transport = new StdioTransport();
  313. this.session = new MCPSession(transport, this.engine, {
  314. explicitProjectPath: this.projectPath,
  315. });
  316. if (this.projectPath) {
  317. // Background init so the initialize response stays fast (#172).
  318. void this.engine.ensureInitialized(this.projectPath);
  319. }
  320. this.session.start();
  321. // Detect parent-process death — same logic as pre-refactor. When stdin
  322. // closes we go through StdioTransport's `process.exit(0)` already, but
  323. // SIGKILL of the parent doesn't reliably close stdin on Linux (#277).
  324. // Also treat a stdin `'error'` (a socket-backed stdin can fail with
  325. // ECONNRESET/hangup instead of a clean close) as shutdown, and destroy the
  326. // stream so a hung fd can't busy-spin the event loop (#799).
  327. treatStdinFailureAsShutdown(() => this.stop());
  328. this.mode = 'direct';
  329. this.installSignalHandlers();
  330. this.installPpidWatchdog();
  331. this.livenessWatchdog = installMainThreadWatchdog();
  332. }
  333. /**
  334. * Run as the detached shared daemon (process spawned with
  335. * `CODEGRAPH_DAEMON_INTERNAL=1`). Arbitrate the O_EXCL lock, then either
  336. * become the daemon (bind the socket, serve forever) or — if a live daemon
  337. * already holds the lock — exit so we don't leak a redundant process.
  338. *
  339. * No PPID watchdog and no stdin handlers: the daemon is detached on purpose
  340. * and reaps itself via client-refcount + idle timeout (see {@link Daemon}).
  341. */
  342. private async startDaemonProcess(): Promise<void> {
  343. const root = resolveDaemonRoot(this.projectPath) ?? this.projectPath ?? process.cwd();
  344. for (let attempt = 0; attempt < TAKEOVER_MAX_RETRIES; attempt++) {
  345. const lock = tryAcquireDaemonLock(root);
  346. if (lock.kind === 'acquired') {
  347. const daemon = new Daemon(root);
  348. await daemon.start();
  349. this.daemon = daemon;
  350. this.mode = 'daemon';
  351. // The detached daemon has no PPID watchdog or stdin lifeline, so a
  352. // wedged main thread would pin a core forever (#850). The liveness
  353. // watchdog is its only recovery path.
  354. this.livenessWatchdog = installMainThreadWatchdog();
  355. return; // the net.Server keeps the process alive
  356. }
  357. // Taken. If the holder is alive, another daemon already serves (or is
  358. // binding) — we're redundant; exit cleanly so the launcher proxies to it.
  359. const existing = lock.existing;
  360. if (existing && existing.pid > 0 && isProcessAlive(existing.pid)) {
  361. process.stderr.write(
  362. `[CodeGraph daemon] Another daemon (pid ${existing.pid}) already holds the lock; exiting.\n`
  363. );
  364. process.exit(0);
  365. }
  366. // Holder is dead (or the record is unreadable) — clear it (pid-verified,
  367. // so we never delete a live daemon's lock) and retry the acquire.
  368. clearStaleDaemonLock(lock.pidPath, existing?.pid);
  369. await sleep(TAKEOVER_RETRY_DELAY_MS);
  370. }
  371. process.stderr.write('[CodeGraph daemon] Could not acquire the daemon lock; exiting.\n');
  372. process.exit(0);
  373. }
  374. /**
  375. * Proxy mode (the common case). Serve the MCP handshake LOCALLY for instant
  376. * tool registration, forwarding tool calls to the shared daemon — which is
  377. * connected in the background (probed, then spawned + polled if absent) so the
  378. * handshake never waits ~600ms on it. Runs until the host disconnects; the
  379. * proxy falls back to an in-process engine if the daemon never binds, so this
  380. * never wedges a session.
  381. */
  382. private async runProxyWithLocalHandshake(root: string): Promise<void> {
  383. const socketPath = getDaemonSocketPath(root);
  384. const getDaemonSocket = async () => {
  385. // Fast path: a daemon may already be listening.
  386. const probe = await connectWithHello(socketPath);
  387. if (probe === 'version-mismatch') return null; // definitive — serve in-process, don't poll for 6s
  388. if (probe) return probe;
  389. // None reachable — spawn one (detached) and poll for its bind.
  390. spawnDetachedDaemon(root);
  391. for (let attempt = 0; attempt < DAEMON_CONNECT_MAX_RETRIES; attempt++) {
  392. await sleep(DAEMON_CONNECT_RETRY_DELAY_MS);
  393. const s = await connectWithHello(socketPath);
  394. if (s === 'version-mismatch') return null;
  395. if (s) return s;
  396. }
  397. return null; // never bound — the proxy serves this session in-process
  398. };
  399. await runLocalHandshakeProxy({ getDaemonSocket, makeEngine: () => new MCPEngine(), root });
  400. }
  401. /** Standard SIGINT/SIGTERM handlers that route to our `stop()` (direct mode). */
  402. private installSignalHandlers(): void {
  403. process.on('SIGINT', () => this.stop());
  404. process.on('SIGTERM', () => this.stop());
  405. }
  406. /**
  407. * PPID watchdog (#277) — direct mode only. Daemon mode is detached on purpose
  408. * and reaps via idle timeout; proxy mode installs its own watchdog inside
  409. * {@link runProxy}. So this only ever runs for an in-process direct session.
  410. */
  411. private installPpidWatchdog(): void {
  412. if (this.mode !== 'direct') return;
  413. const pollMs = parsePpidPollMs(process.env.CODEGRAPH_PPID_POLL_MS);
  414. if (pollMs <= 0) return;
  415. this.ppidWatchdog = setInterval(() => {
  416. const reason = supervisionLostReason({
  417. originalPpid: this.originalPpid,
  418. currentPpid: process.ppid,
  419. hostPpid: this.hostPpid,
  420. isAlive: isProcessAlive,
  421. });
  422. if (reason) {
  423. process.stderr.write(
  424. `[CodeGraph MCP] Parent process exited (${reason}); shutting down.\n`
  425. );
  426. this.stop();
  427. }
  428. }, pollMs);
  429. this.ppidWatchdog.unref();
  430. }
  431. }
  432. function sleep(ms: number): Promise<void> {
  433. // Deliberately NOT unref'd. During the daemon connect/takeover retry loop we
  434. // may be between processes — no socket bound yet, no transport, no listener
  435. // pinning the event loop. An unref'd timer would let Node drain the loop and
  436. // exit silently before we get a chance to try again.
  437. return new Promise((resolve) => { setTimeout(resolve, ms); });
  438. }
  439. // Export for use in CLI
  440. export { StdioTransport } from './transport';
  441. export { tools, ToolHandler } from './tools';
  442. // Surface a few daemon-mode bits for tests + diagnostics.
  443. export { Daemon } from './daemon';
  444. export { CodeGraphPackageVersion } from './version';