index.ts 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457
  1. /**
  2. * CodeGraph MCP Server
  3. *
  4. * Model Context Protocol server that exposes CodeGraph functionality
  5. * as tools for AI assistants like Claude.
  6. *
  7. * @module mcp
  8. *
  9. * @example
  10. * ```typescript
  11. * import { MCPServer } from 'codegraph';
  12. *
  13. * const server = new MCPServer('/path/to/project');
  14. * await server.start();
  15. * ```
  16. *
  17. * Runtime modes (decided in {@link MCPServer.start}):
  18. *
  19. * - **Direct** — one process serves one MCP client over stdio. The pre-#411
  20. * behavior; used when the user opts out (`CODEGRAPH_NO_DAEMON=1`), no
  21. * `.codegraph/` is reachable, or the daemon machinery fails for any reason.
  22. * - **Proxy** — what an MCP host actually talks to when sharing is on: a thin
  23. * stdio↔socket pipe to the shared daemon. The proxy carries the #277 PPID
  24. * watchdog, so a SIGKILL'd host reaps its proxy promptly. See {@link ./proxy.ts}.
  25. * - **Daemon** — a *detached* background process (its own session/process
  26. * group) that serves N proxies over a Unix-domain socket / named pipe,
  27. * sharing one CodeGraph + watcher + SQLite handle. Spawned on demand; never a
  28. * child of any host, so it survives individual sessions and is reaped by
  29. * client-refcount + idle timeout. See {@link ./daemon.ts} and issue #411.
  30. *
  31. * The detached-daemon + always-proxy split is the fix for the review finding
  32. * that the original in-process daemon (a) was the first host's child, so closing
  33. * that terminal severed every other client, and (b) disabled the PPID watchdog,
  34. * regressing #277 (orphaned daemons on host SIGKILL).
  35. */
  36. import * as fs from 'fs';
  37. import * as path from 'path';
  38. import { spawn, StdioOptions } from 'child_process';
  39. import { findNearestCodeGraphRoot, getCodeGraphDir } from '../directory';
  40. import { StdioTransport } from './transport';
  41. import { MCPEngine } from './engine';
  42. import { MCPSession } from './session';
  43. import {
  44. Daemon,
  45. clearStaleDaemonLock,
  46. isProcessAlive,
  47. tryAcquireDaemonLock,
  48. } from './daemon';
  49. import { connectWithHello, runLocalHandshakeProxy } from './proxy';
  50. import { getDaemonSocketPath } from './daemon-paths';
  51. import { supervisionLostReason } from './ppid-watchdog';
  52. import { HOST_PPID_ENV } from '../extraction/wasm-runtime-flags';
  53. /**
  54. * How often to poll `process.ppid` to detect parent process death (see #277).
  55. * 5s is a deliberate trade-off: the failure mode being guarded against is rare
  56. * (parent SIGKILL'd), and longer poll = less wakeup overhead while idle.
  57. */
  58. const DEFAULT_PPID_POLL_MS = 5000;
  59. /**
  60. * Env var that marks a process as the *detached daemon* itself (set by
  61. * {@link spawnDetachedDaemon} when it re-invokes the CLI). Without it a
  62. * `serve --mcp` invocation is a launcher that connects-or-spawns; with it, the
  63. * process IS the daemon and must never try to spawn another (infinite spawn).
  64. */
  65. const DAEMON_INTERNAL_ENV = 'CODEGRAPH_DAEMON_INTERNAL';
  66. /**
  67. * Retries for the detached daemon arbitrating the O_EXCL lock against a racing
  68. * sibling. Tiny — the lock resolves on the first round in practice; the retries
  69. * only cover clearing a genuinely stale (dead-pid) lockfile.
  70. */
  71. const TAKEOVER_MAX_RETRIES = 5;
  72. const TAKEOVER_RETRY_DELAY_MS = 100;
/**
 * How long a launcher waits for a freshly-spawned daemon to bind its socket
 * before giving up and serving the session in-process. The daemon binds the
 * socket *before* the (backgrounded) engine/grammar warm-up, so this only needs
 * to cover node process startup: 240 × 25ms = ~6s of headroom for a cold/slow
 * box; on the common path the socket appears within a few rounds.
 *
 * Polling is deliberately fine-grained (25ms rather than a coarse 100ms) so the
 * proxy attaches the instant the freshly-spawned daemon binds — this shaves the
 * cold-start handshake (the window the headless agent races) while keeping the
 * same ~6s total give-up budget; socket-connect probes are cheap. Paired with
 * deferring the CodeGraph load (engine.ts) off the bind path, this narrows the
 * "No such tool available" race window.
 */
  86. const DAEMON_CONNECT_MAX_RETRIES = 240;
  87. const DAEMON_CONNECT_RETRY_DELAY_MS = 25;
  88. /**
  89. * Resolve the PPID watchdog poll interval from an env override. A value of
  90. * `0` disables the watchdog entirely (escape hatch for embedded scenarios
  91. * where the parent legitimately re-parents the server on purpose). Anything
  92. * non-numeric or negative falls back to the default.
  93. */
  94. function parsePpidPollMs(raw: string | undefined): number {
  95. if (raw === undefined || raw === '') return DEFAULT_PPID_POLL_MS;
  96. const parsed = Number(raw);
  97. if (!Number.isFinite(parsed)) return DEFAULT_PPID_POLL_MS;
  98. if (parsed < 0) return DEFAULT_PPID_POLL_MS;
  99. return Math.floor(parsed);
  100. }
  101. /**
  102. * Parse the host PID propagated across the `--liftoff-only` re-exec
  103. * ({@link HOST_PPID_ENV}). Returns a positive integer PID, or null when
  104. * unset/invalid — the direct-launch path, where the watchdog falls back to
  105. * `process.ppid` divergence. PIDs of 0/1 are rejected (0 = unknown, 1 = init,
  106. * i.e. already orphaned), so the watchdog doesn't latch onto init.
  107. */
  108. function parseHostPpid(raw: string | undefined): number | null {
  109. if (raw === undefined || raw === '') return null;
  110. const parsed = Number(raw);
  111. if (!Number.isInteger(parsed) || parsed <= 1) return null;
  112. return parsed;
  113. }
  114. /** Whether `CODEGRAPH_NO_DAEMON` was set to a truthy value. */
  115. function daemonOptOutSet(): boolean {
  116. const raw = process.env.CODEGRAPH_NO_DAEMON;
  117. if (!raw) return false;
  118. return raw !== '0' && raw.toLowerCase() !== 'false';
  119. }
  120. /** Whether this process was spawned to BE the detached daemon. */
  121. function daemonInternalSet(): boolean {
  122. const raw = process.env[DAEMON_INTERNAL_ENV];
  123. return !!raw && raw !== '0' && raw.toLowerCase() !== 'false';
  124. }
  125. /**
  126. * Resolve the project root the daemon machinery should key on. Returns
  127. * `null` when no `.codegraph/` is reachable from the candidate path — in
  128. * that case the caller must run in direct mode, since the daemon lockfile
  129. * and socket both live under `.codegraph/`.
  130. *
  131. * The result is canonicalized with `realpathSync` so every client converges on
  132. * the same socket/lock path regardless of how it expressed the path: a client
  133. * launched with cwd under a symlink (e.g. macOS `/var` → `/private/var`, where
  134. * spawned `process.cwd()` is already realpath'd) and one that passed a
  135. * symlinked `rootUri` would otherwise hash to different sockets and silently
  136. * fail to share the daemon.
  137. */
  138. function resolveDaemonRoot(explicitPath: string | null): string | null {
  139. const candidate = explicitPath ?? process.cwd();
  140. const root = findNearestCodeGraphRoot(candidate);
  141. if (!root) return null;
  142. try { return fs.realpathSync(root); } catch { return root; }
  143. }
  144. /**
  145. * Spawn the shared daemon as a fully detached background process: its own
  146. * session/process group (so a SIGHUP/SIGINT to the launcher's terminal can't
  147. * reach it) with stdio decoupled from the launcher (logs to
  148. * `.codegraph/daemon.log`). Re-invokes the *same* CLI faithfully across dev and
  149. * bundled launches by reusing `process.argv[0]` (the right node), the current
  150. * `process.execArgv` (carries `--liftoff-only`, so the daemon never re-execs)
  151. * and `process.argv[1]` (this script). The spawned process self-arbitrates the
  152. * O_EXCL lock, so racing launchers may each spawn one — losers exit and every
  153. * launcher proxies through the single winner.
  154. */
  155. function spawnDetachedDaemon(root: string): void {
  156. const scriptPath = process.argv[1];
  157. if (!scriptPath) {
  158. // No resolvable CLI entry point to re-invoke — let the caller fall back to
  159. // direct mode rather than spawn something broken.
  160. throw new Error('cannot resolve CLI script path to spawn the daemon');
  161. }
  162. let logFd: number | null = null;
  163. let stdio: StdioOptions = 'ignore';
  164. try {
  165. logFd = fs.openSync(path.join(getCodeGraphDir(root), 'daemon.log'), 'a');
  166. stdio = ['ignore', logFd, logFd];
  167. } catch {
  168. stdio = 'ignore'; // no log file — discard daemon output rather than fail
  169. }
  170. try {
  171. const child = spawn(
  172. process.execPath,
  173. [...process.execArgv, scriptPath, 'serve', '--mcp', '--path', root],
  174. {
  175. detached: true,
  176. stdio,
  177. windowsHide: true,
  178. env: { ...process.env, [DAEMON_INTERNAL_ENV]: '1' },
  179. },
  180. );
  181. child.unref();
  182. } finally {
  183. // The child holds its own dup of the log fd now; the launcher doesn't need it.
  184. if (logFd !== null) {
  185. try { fs.closeSync(logFd); } catch { /* ignore */ }
  186. }
  187. }
  188. }
  189. /**
  190. * MCP Server for CodeGraph
  191. *
  192. * Implements the Model Context Protocol to expose CodeGraph
  193. * functionality as tools that can be called by AI assistants.
  194. *
  195. * Backwards-compatible constructor and `start()` signature with the
  196. * pre-issue-#411 implementation: callers continue to do
  197. * `new MCPServer(path).start()`. Internally we now pick from direct / proxy /
  198. * daemon at start time.
  199. */
  200. export class MCPServer {
  201. private projectPath: string | null;
  202. // Direct-mode-only state. In daemon mode the per-connection sessions live
  203. // inside the Daemon class; in proxy mode there is no session at all.
  204. private session: MCPSession | null = null;
  205. private engine: MCPEngine | null = null;
  206. private daemon: Daemon | null = null;
  207. private ppidWatchdog: ReturnType<typeof setInterval> | null = null;
  208. // PPID watchdog baseline — captured at construction so we always have a
  209. // baseline, even if start() runs after a fork-style reparent.
  210. private originalPpid: number = process.ppid;
  211. private hostPpid: number | null = parseHostPpid(process.env[HOST_PPID_ENV]);
  212. // Idempotency guard for stop().
  213. private stopped = false;
  214. private mode: 'unstarted' | 'direct' | 'proxy' | 'daemon' = 'unstarted';
  215. constructor(projectPath?: string) {
  216. this.projectPath = projectPath || null;
  217. }
  218. /**
  219. * Start the MCP server.
  220. *
  221. * Decision order:
  222. * 1. `CODEGRAPH_NO_DAEMON=1` → direct mode (unchanged pre-#411 behavior).
  223. * 2. `CODEGRAPH_DAEMON_INTERNAL=1` → we ARE the detached daemon; listen.
  224. * 3. No `.codegraph/` reachable → direct mode (the daemon's lockfile and
  225. * socket both live under `.codegraph/`).
  226. * 4. Otherwise connect to (or spawn) the shared daemon and proxy to it.
  227. *
  228. * On any unexpected failure in step 4 we transparently fall back to direct
  229. * mode — a misbehaving daemon must never block a session from starting.
  230. */
  231. async start(): Promise<void> {
  232. // The detached daemon process itself. Checked before the opt-out so the
  233. // daemon honors the same env it was spawned with (it never sets NO_DAEMON).
  234. if (daemonInternalSet()) {
  235. return this.startDaemonProcess();
  236. }
  237. // Direct mode if the user opted out. Setting the env var is sufficient to
  238. // get the pre-#411 single-process behavior.
  239. if (daemonOptOutSet()) {
  240. return this.startDirect('CODEGRAPH_NO_DAEMON set');
  241. }
  242. const root = resolveDaemonRoot(this.projectPath);
  243. if (!root) {
  244. // No initialized project found — daemon mode has nowhere to put its
  245. // socket. The fresh-checkout / outside-project case; behave as before.
  246. return this.startDirect('no .codegraph/ root found');
  247. }
  248. try {
  249. // Answer the MCP handshake LOCALLY (instant tool registration — no waiting
  250. // ~600ms for the daemon to spawn+bind, which produced the cold-start race)
  251. // and forward tool CALLS to the shared daemon, connected in the background.
  252. // Runs until the host disconnects; the proxy installs its own watchdog and
  253. // falls back to an in-process engine if the daemon never comes up.
  254. this.mode = 'proxy';
  255. await this.runProxyWithLocalHandshake(root);
  256. return;
  257. } catch (err) {
  258. // Belt-and-braces: a throw during proxy SETUP (before the client was served)
  259. // is still safe to recover from with a direct-mode session.
  260. const msg = err instanceof Error ? err.message : String(err);
  261. process.stderr.write(`[CodeGraph MCP] Proxy path failed (${msg}); falling back to direct mode.\n`);
  262. return this.startDirect('proxy path threw');
  263. }
  264. }
  265. /**
  266. * Stop the server. In daemon mode this triggers graceful shutdown of every
  267. * connected session; in direct mode it mirrors the pre-#411 behavior (close
  268. * cg, exit). Proxy mode never routes through here — the proxy exits itself.
  269. */
  270. stop(): void {
  271. if (this.stopped) return;
  272. this.stopped = true;
  273. if (this.ppidWatchdog) {
  274. clearInterval(this.ppidWatchdog);
  275. this.ppidWatchdog = null;
  276. }
  277. if (this.daemon) {
  278. void this.daemon.stop('stop()');
  279. // Daemon.stop calls process.exit; nothing else to do.
  280. return;
  281. }
  282. if (this.session) {
  283. this.session.stop();
  284. this.session = null;
  285. }
  286. if (this.engine) {
  287. this.engine.stop();
  288. this.engine = null;
  289. }
  290. process.exit(0);
  291. }
  292. /** Single-process stdio MCP session — the pre-issue-#411 code path. */
  293. private async startDirect(reason: string): Promise<void> {
  294. if (reason && process.env.CODEGRAPH_MCP_DEBUG) {
  295. process.stderr.write(`[CodeGraph MCP] Direct mode: ${reason}.\n`);
  296. }
  297. this.engine = new MCPEngine();
  298. const transport = new StdioTransport();
  299. this.session = new MCPSession(transport, this.engine, {
  300. explicitProjectPath: this.projectPath,
  301. });
  302. if (this.projectPath) {
  303. // Background init so the initialize response stays fast (#172).
  304. void this.engine.ensureInitialized(this.projectPath);
  305. }
  306. this.session.start();
  307. // Detect parent-process death — same logic as pre-refactor. When stdin
  308. // closes we go through StdioTransport's `process.exit(0)` already, but
  309. // SIGKILL of the parent doesn't reliably close stdin on Linux (#277).
  310. process.stdin.on('end', () => this.stop());
  311. process.stdin.on('close', () => this.stop());
  312. this.mode = 'direct';
  313. this.installSignalHandlers();
  314. this.installPpidWatchdog();
  315. }
  316. /**
  317. * Run as the detached shared daemon (process spawned with
  318. * `CODEGRAPH_DAEMON_INTERNAL=1`). Arbitrate the O_EXCL lock, then either
  319. * become the daemon (bind the socket, serve forever) or — if a live daemon
  320. * already holds the lock — exit so we don't leak a redundant process.
  321. *
  322. * No PPID watchdog and no stdin handlers: the daemon is detached on purpose
  323. * and reaps itself via client-refcount + idle timeout (see {@link Daemon}).
  324. */
  325. private async startDaemonProcess(): Promise<void> {
  326. const root = resolveDaemonRoot(this.projectPath) ?? this.projectPath ?? process.cwd();
  327. for (let attempt = 0; attempt < TAKEOVER_MAX_RETRIES; attempt++) {
  328. const lock = tryAcquireDaemonLock(root);
  329. if (lock.kind === 'acquired') {
  330. const daemon = new Daemon(root);
  331. await daemon.start();
  332. this.daemon = daemon;
  333. this.mode = 'daemon';
  334. return; // the net.Server keeps the process alive
  335. }
  336. // Taken. If the holder is alive, another daemon already serves (or is
  337. // binding) — we're redundant; exit cleanly so the launcher proxies to it.
  338. const existing = lock.existing;
  339. if (existing && existing.pid > 0 && isProcessAlive(existing.pid)) {
  340. process.stderr.write(
  341. `[CodeGraph daemon] Another daemon (pid ${existing.pid}) already holds the lock; exiting.\n`
  342. );
  343. process.exit(0);
  344. }
  345. // Holder is dead (or the record is unreadable) — clear it (pid-verified,
  346. // so we never delete a live daemon's lock) and retry the acquire.
  347. clearStaleDaemonLock(lock.pidPath, existing?.pid);
  348. await sleep(TAKEOVER_RETRY_DELAY_MS);
  349. }
  350. process.stderr.write('[CodeGraph daemon] Could not acquire the daemon lock; exiting.\n');
  351. process.exit(0);
  352. }
  353. /**
  354. * Proxy mode (the common case). Serve the MCP handshake LOCALLY for instant
  355. * tool registration, forwarding tool calls to the shared daemon — which is
  356. * connected in the background (probed, then spawned + polled if absent) so the
  357. * handshake never waits ~600ms on it. Runs until the host disconnects; the
  358. * proxy falls back to an in-process engine if the daemon never binds, so this
  359. * never wedges a session.
  360. */
  361. private async runProxyWithLocalHandshake(root: string): Promise<void> {
  362. const socketPath = getDaemonSocketPath(root);
  363. const getDaemonSocket = async () => {
  364. // Fast path: a daemon may already be listening.
  365. const probe = await connectWithHello(socketPath);
  366. if (probe === 'version-mismatch') return null; // definitive — serve in-process, don't poll for 6s
  367. if (probe) return probe;
  368. // None reachable — spawn one (detached) and poll for its bind.
  369. spawnDetachedDaemon(root);
  370. for (let attempt = 0; attempt < DAEMON_CONNECT_MAX_RETRIES; attempt++) {
  371. await sleep(DAEMON_CONNECT_RETRY_DELAY_MS);
  372. const s = await connectWithHello(socketPath);
  373. if (s === 'version-mismatch') return null;
  374. if (s) return s;
  375. }
  376. return null; // never bound — the proxy serves this session in-process
  377. };
  378. await runLocalHandshakeProxy({ getDaemonSocket, makeEngine: () => new MCPEngine(), root });
  379. }
  380. /** Standard SIGINT/SIGTERM handlers that route to our `stop()` (direct mode). */
  381. private installSignalHandlers(): void {
  382. process.on('SIGINT', () => this.stop());
  383. process.on('SIGTERM', () => this.stop());
  384. }
  385. /**
  386. * PPID watchdog (#277) — direct mode only. Daemon mode is detached on purpose
  387. * and reaps via idle timeout; proxy mode installs its own watchdog inside
  388. * {@link runProxy}. So this only ever runs for an in-process direct session.
  389. */
  390. private installPpidWatchdog(): void {
  391. if (this.mode !== 'direct') return;
  392. const pollMs = parsePpidPollMs(process.env.CODEGRAPH_PPID_POLL_MS);
  393. if (pollMs <= 0) return;
  394. this.ppidWatchdog = setInterval(() => {
  395. const reason = supervisionLostReason({
  396. originalPpid: this.originalPpid,
  397. currentPpid: process.ppid,
  398. hostPpid: this.hostPpid,
  399. isAlive: isProcessAlive,
  400. });
  401. if (reason) {
  402. process.stderr.write(
  403. `[CodeGraph MCP] Parent process exited (${reason}); shutting down.\n`
  404. );
  405. this.stop();
  406. }
  407. }, pollMs);
  408. this.ppidWatchdog.unref();
  409. }
  410. }
  411. function sleep(ms: number): Promise<void> {
  412. // Deliberately NOT unref'd. During the daemon connect/takeover retry loop we
  413. // may be between processes — no socket bound yet, no transport, no listener
  414. // pinning the event loop. An unref'd timer would let Node drain the loop and
  415. // exit silently before we get a chance to try again.
  416. return new Promise((resolve) => { setTimeout(resolve, ms); });
  417. }
  418. // Export for use in CLI
  419. export { StdioTransport } from './transport';
  420. export { tools, ToolHandler } from './tools';
  421. // Surface a few daemon-mode bits for tests + diagnostics.
  422. export { Daemon } from './daemon';
  423. export { CodeGraphPackageVersion } from './version';