daemon.ts 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397
  1. /**
  2. * Shared MCP daemon — issue #411.
  3. *
  4. * One detached `codegraph serve --mcp` daemon process per project root,
  5. * accepting N concurrent MCP clients over a Unix-domain socket (or named pipe
  6. * on Windows). Each incoming connection gets its own {@link MCPSession}; all
  7. * sessions share a single {@link MCPEngine}, which means a single file watcher
  8. * (one inotify set), a single SQLite connection (one WAL writer), and a single
  9. * tree-sitter warm-up — paid once, amortized across every agent talking to the
  10. * project.
  11. *
  12. * Lifecycle (see also `./index.ts` and `./proxy.ts`):
  13. * - The daemon is spawned **detached** (its own session/process group, stdio
  14. * decoupled) by the first launcher that finds no daemon running. It is NOT
  15. * a child of any MCP host, so closing one terminal / Ctrl-C'ing one session
  16. * can't take it down and sever the others. That's why this process has no
  17. * PPID watchdog: it deliberately outlives every individual client.
  18. * - Every MCP host talks to the daemon through a thin `proxy` process (the
  19. * thing the host actually spawned). The proxy keeps the #277 PPID watchdog,
  20. * so a SIGKILL'd host still reaps its proxy promptly; the proxy's socket
  21. * close then decrements the daemon's refcount.
  22. * - When the last client disconnects the daemon lingers for
  23. * `CODEGRAPH_DAEMON_IDLE_TIMEOUT_MS` (default 300s) so back-to-back agent
  24. * runs in the same project don't repay startup, then exits cleanly. This is
  25. * what keeps a single-agent session from leaking a daemon forever (#277).
  26. *
  27. * What this file owns:
  28. * - Listening on the daemon socket and spawning per-connection sessions.
  29. * - The handshake "hello" line that lets a proxy verify it found a
  30. * same-version daemon before piping any JSON-RPC through it.
  31. * - The lockfile (`.codegraph/daemon.pid`) competing daemons arbitrate
  32. * against — atomic `O_EXCL` create with the full record written in the same
  33. * breath (no empty-file window) + cleanup on exit.
  34. * - Reference counting + idle timeout.
  35. * - Graceful shutdown on SIGTERM/SIGINT and idle exit.
  36. *
  37. * What this file does NOT own:
  38. * - The proxy side (`./proxy.ts`).
  39. * - The decision of *whether* to run as daemon at all — that's `MCPServer`.
  40. * - The MCP protocol state machine — that's `./session.ts`.
  41. */
  42. import * as fs from 'fs';
  43. import * as net from 'net';
  44. import * as path from 'path';
  45. import { MCPEngine } from './engine';
  46. import { MCPSession } from './session';
  47. import { SocketTransport } from './transport';
  48. import {
  49. DaemonLockInfo,
  50. decodeLockInfo,
  51. encodeLockInfo,
  52. getDaemonPidPath,
  53. getDaemonSocketPath,
  54. } from './daemon-paths';
  55. import { CodeGraphPackageVersion } from './version';
  /** How long (in ms) the daemon lingers after its last client disconnects, unless overridden. */
  const DEFAULT_IDLE_TIMEOUT_MS = 5 * 60 * 1000;

  /** Upper bound (in bytes) on a single hello line, so a malicious peer cannot force an unbounded read. */
  const MAX_HELLO_LINE_BYTES = 4 * 1024;
  /**
   * Shape of the single hello line the daemon writes on every new connection,
   * before any JSON-RPC traffic.
   *
   * It carries the package's own semver so that a 0.9.x proxy never pipes through
   * a 0.10.x daemon (or vice-versa): on a mismatch the proxy falls back to direct
   * mode rather than risk subtle wire incompatibilities.
   */
  export interface DaemonHello {
    /** Package version of the daemon; must match the proxy's own version. */
    codegraph: string;
    /** Daemon pid. Informational only, handy for `ps` debugging. */
    pid: number;
    /** Socket path, echoed back so the proxy can log it. */
    socketPath: string;
    /** Hello-format revision; bump if the shape of this record changes. */
    protocol: 1;
  }
  /** What `Daemon.start()` resolves with once the socket is listening. */
  export interface DaemonStartResult {
    /** Path of the socket the daemon is listening on; never null after a successful start. */
    socketPath: string;
    /** Lock record describing this daemon (pid, version, socket path, start time). */
    lock: DaemonLockInfo;
  }
  /**
   * The shared MCP daemon for a single project root.
   *
   * Owns the listening socket, the one shared {@link MCPEngine}, the set of live
   * {@link MCPSession}s (one per accepted connection) and the idle timer, and
   * removes the pidfile on shutdown. `start()` resolves once the socket is
   * listening; the process then runs until `stop()` is called, a SIGINT/SIGTERM
   * arrives, or the idle timeout elapses with no clients connected.
   *
   * Race-safe: callers must first call `tryAcquireDaemonLock(projectRoot)` and
   * only construct a Daemon if they got the lock (`kind: 'acquired'`). That
   * atomic, exclusive pidfile creation is the only synchronization between
   * competing daemons.
   */
  export class Daemon {
    private server: net.Server | null = null;
    private readonly clients = new Set<MCPSession>();
    private idleTimer: NodeJS.Timeout | null = null;
    private readonly idleTimeoutMs: number;
    private readonly engine: MCPEngine;
    private stopping = false;
    private readonly socketPath: string;
    private readonly pidPath: string;

    /**
     * @param projectRoot Project the daemon serves; determines socket and pidfile paths.
     * @param opts.idleTimeoutMs Idle linger in ms. When omitted, the value is resolved
     *   from `CODEGRAPH_DAEMON_IDLE_TIMEOUT_MS` / the built-in default. `0` (or less)
     *   disables idle exit.
     */
    constructor(
      private readonly projectRoot: string,
      opts: { idleTimeoutMs?: number } = {},
    ) {
      this.socketPath = getDaemonSocketPath(projectRoot);
      this.pidPath = getDaemonPidPath(projectRoot);
      this.idleTimeoutMs = opts.idleTimeoutMs ?? resolveIdleTimeoutMs();
      this.engine = new MCPEngine();
      this.engine.setProjectPathHint(projectRoot);
    }

    /**
     * Bind the socket, kick off engine init, and register signal handlers. The
     * lockfile body was already written atomically by `tryAcquireDaemonLock`, so
     * nothing is written here.
     *
     * @returns The socket path plus the lock record for this daemon, once the
     *   server is listening.
     * @throws Rejects with the `listen` error if the socket cannot be bound.
     */
    async start(): Promise<DaemonStartResult> {
      // Engine init is deliberately backgrounded — see #172. The first session
      // to land waits on `ensureInitialized` either way, and unloaded sessions
      // (cross-project tool calls only) shouldn't pay any open cost.
      void this.engine.ensureInitialized(this.projectRoot);

      // Stale socket file (left over from a SIGKILL'd previous daemon) will
      // wedge `listen` with EADDRINUSE. We arrived here holding the lockfile,
      // which means there's no live daemon, so it's safe to clear.
      if (process.platform !== 'win32') {
        try { fs.unlinkSync(this.socketPath); } catch { /* not-exists is fine */ }
      }

      await new Promise<void>((resolve, reject) => {
        const server = net.createServer((socket) => this.handleConnection(socket));
        const onStartupError = (err: Error): void => reject(err);
        server.once('error', onStartupError);
        server.listen(this.socketPath, () => {
          // Startup succeeded: the reject hook is now meaningless (the promise is
          // about to settle). Swap it for a persistent logger so a later server
          // 'error' is neither silently eaten nor left without any listener.
          server.removeListener('error', onStartupError);
          server.on('error', (err: Error) => {
            process.stderr.write(`[CodeGraph daemon] Server error: ${err.message}\n`);
          });
          // POSIX: tighten permissions to user-only — the socket lives under
          // `.codegraph/`, which is git-ignored but may be on a shared FS.
          if (process.platform !== 'win32') {
            try { fs.chmodSync(this.socketPath, 0o600); } catch { /* best-effort */ }
          }
          this.server = server;
          resolve();
        });
      });

      const lock = this.readOwnLock();
      process.stderr.write(
        `[CodeGraph daemon] Listening on ${this.socketPath} (pid ${process.pid}, v${CodeGraphPackageVersion}). Idle timeout ${this.idleTimeoutMs}ms.\n`
      );

      // No clients yet: arm the idle timer immediately so a daemon that nobody
      // ever connects to (e.g. spawned then abandoned because the launcher died)
      // doesn't pin resources forever.
      this.armIdleTimer();

      // `stop()` is async; `void` marks the promise as intentionally not awaited.
      process.on('SIGINT', () => { void this.stop('SIGINT'); });
      process.on('SIGTERM', () => { void this.stop('SIGTERM'); });

      return { socketPath: this.socketPath, lock };
    }

    /** Currently-connected client count. Exposed for tests / status output. */
    getClientCount(): number {
      return this.clients.size;
    }

    /** The socket path the daemon is (or will be) listening on. */
    getSocketPath(): string {
      return this.socketPath;
    }

    /**
     * Graceful shutdown: stop every session, close the server, stop the engine,
     * remove the pidfile and socket file, then terminate the process with exit
     * code 0. Idempotent — a second call while shutdown is in progress returns
     * immediately.
     *
     * @param reason Free-form label included in the shutdown log line.
     */
    async stop(reason: string = 'stop'): Promise<void> {
      if (this.stopping) return;
      this.stopping = true;
      this.disarmIdleTimer();

      process.stderr.write(`[CodeGraph daemon] Shutting down (${reason}; clients=${this.clients.size}).\n`);

      // Iterate over a snapshot: stopping a session may trigger dropClient().
      for (const session of [...this.clients]) {
        try { session.stop(); } catch { /* best-effort */ }
      }
      this.clients.clear();

      const server = this.server;
      this.server = null;
      if (server) {
        await new Promise<void>((resolve) => server.close(() => resolve()));
      }

      // An engine failure must not prevent pidfile/socket cleanup below.
      try {
        this.engine.stop();
      } catch (err: unknown) {
        const detail = err instanceof Error ? err.message : String(err);
        process.stderr.write(`[CodeGraph daemon] Engine stop failed: ${detail}\n`);
      }

      this.cleanupLockfile();
      if (process.platform !== 'win32') {
        try { fs.unlinkSync(this.socketPath); } catch { /* may already be gone */ }
      }
      process.exit(0);
    }

    /**
     * Lock record for this daemon. Prefers what is actually on disk (so
     * `startedAt` matches the pidfile written at acquire time) and falls back to
     * a freshly built record when the pidfile is missing, unreadable, or names
     * another pid.
     */
    private readOwnLock(): DaemonLockInfo {
      try {
        const onDisk = decodeLockInfo(fs.readFileSync(this.pidPath, 'utf8'));
        if (onDisk && onDisk.pid === process.pid) return onDisk;
      } catch { /* fall through to the synthesized record */ }
      return {
        pid: process.pid,
        version: CodeGraphPackageVersion,
        socketPath: this.socketPath,
        startedAt: Date.now(),
      };
    }

    /** Greets a new connection with the hello line and attaches a session to it. */
    private handleConnection(socket: net.Socket): void {
      // A socket 'error' with no listener would throw and take the shared daemon
      // down for every client. Whether SocketTransport attaches its own listener
      // is not visible from this file, so guarantee one here; the socket's
      // 'close' event still follows an error.
      socket.on('error', () => { /* reported via close */ });

      // Hello first so the proxy can verify versions before piping any
      // application bytes. The proxy reads exactly one line, then forwards.
      const hello: DaemonHello = {
        codegraph: CodeGraphPackageVersion,
        pid: process.pid,
        socketPath: this.socketPath,
        protocol: 1,
      };
      socket.write(JSON.stringify(hello) + '\n');

      const transport = new SocketTransport(socket);
      const session = new MCPSession(transport, this.engine, {
        explicitProjectPath: this.projectRoot,
      });
      transport.onClose(() => this.dropClient(session));
      this.clients.add(session);
      this.disarmIdleTimer();
      session.start();
    }

    /** Forgets a session; arms the idle timer when it was the last one. */
    private dropClient(session: MCPSession): void {
      if (!this.clients.delete(session)) return;
      if (this.clients.size === 0) this.armIdleTimer();
    }

    /** Starts the idle countdown unless one is running, we are stopping, or idle exit is disabled. */
    private armIdleTimer(): void {
      if (this.idleTimer || this.stopping) return;
      if (this.idleTimeoutMs <= 0) return; // 0 = never idle-exit
      this.idleTimer = setTimeout(() => {
        this.idleTimer = null;
        // Last-second sanity check: never exit with clients attached. Do not
        // re-arm here either — a timer running while clients are connected would
        // make dropClient()'s arm a no-op and cut the idle window short after the
        // last disconnect. dropClient() starts a full-length timer instead.
        if (this.clients.size > 0) return;
        void this.stop('idle timeout');
      }, this.idleTimeoutMs);
      // Don't keep the event loop alive just for this — the net.Server keeps the
      // loop alive while listening, so the timer still fires; once we stop() the
      // loop should drain naturally.
      this.idleTimer.unref?.();
    }

    /** Cancels a pending idle countdown, if any. */
    private disarmIdleTimer(): void {
      if (!this.idleTimer) return;
      clearTimeout(this.idleTimer);
      this.idleTimer = null;
    }

    /** Removes the pidfile, but only if it still names this process. */
    private cleanupLockfile(): void {
      try {
        if (!fs.existsSync(this.pidPath)) return;
        // Only remove if it still belongs to us — another daemon may have
        // already taken over while we were shutting down (extremely rare).
        const info = decodeLockInfo(fs.readFileSync(this.pidPath, 'utf8'));
        if (info && info.pid === process.pid) {
          fs.unlinkSync(this.pidPath);
        }
      } catch { /* best-effort; we're exiting anyway */ }
    }
  }
  /** We created the pidfile: the caller is the daemon-elect and `info` is the record we wrote. */
  type AcquiredDaemonLock = { kind: 'acquired'; pidPath: string; info: DaemonLockInfo };

  /**
   * The pidfile already existed. `existing` is its decoded record, or `null`
   * when the file could not be read or decoded.
   */
  type TakenDaemonLock = { kind: 'taken'; existing: DaemonLockInfo | null; pidPath: string };

  /**
   * Outcome of `tryAcquireDaemonLock`. On `acquired` the caller becomes the
   * daemon; on `taken` the caller should connect to the existing daemon as a
   * proxy, or — if the holder is dead — clear the lock and retry.
   */
  export type AcquireResult = AcquiredDaemonLock | TakenDaemonLock;
  /**
   * Atomically create the daemon pidfile with its full record already in place.
   * Returns either an `acquired` result (the caller is the daemon-elect and may
   * construct a {@link Daemon}) or a `taken` result.
   *
   * must-fix 1 (issue #411 review): the lockfile must appear in ONE atomic step,
   * already complete — never empty, even momentarily. An earlier approach
   * (`O_EXCL` create followed by a separate write) left a window where the file
   * existed but was empty; a concurrent candidate could read that empty file,
   * decode it as `null`, and unlink the winner's lock, yielding two daemons.
   *
   * The complete record is therefore written to a private temp file and then
   * hard-linked into place: `link()` is atomic AND exclusive (EEXIST if the
   * target exists), so the pidfile becomes visible in one step already holding a
   * full record. Whoever links first wins; everyone else gets EEXIST and reads a
   * complete file.
   *
   * The temp file is always created fresh (`wx` after removing any leftover). A
   * leftover from a run that died between `link` and `unlink` is a hard link to
   * a pidfile — the same inode — so rewriting it in place after pid reuse would
   * truncate and mutate that lock, reintroducing the empty-file window.
   *
   * @param projectRoot Project whose `.codegraph/` directory holds the pidfile.
   * @returns `acquired` with the written record, or `taken` with whatever record
   *   could be decoded from the existing pidfile (`null` if none).
   * @throws Any filesystem error other than the pidfile already existing
   *   (e.g. the directory is not writable, or hard links are unsupported).
   */
  export function tryAcquireDaemonLock(projectRoot: string): AcquireResult {
    const pidPath = getDaemonPidPath(projectRoot);
    // Make sure the .codegraph/ directory exists — the daemon may be the first
    // thing to touch it on a fresh-clone-but-already-initialized checkout.
    fs.mkdirSync(path.dirname(pidPath), { recursive: true });

    const info: DaemonLockInfo = {
      pid: process.pid,
      version: CodeGraphPackageVersion,
      socketPath: getDaemonSocketPath(projectRoot),
      startedAt: Date.now(),
    };

    // Temp name is pid-scoped so racing candidates never collide on it.
    const tmp = `${pidPath}.${process.pid}.tmp`;
    let acquired = false;
    try {
      // Drop any leftover first: unlinking removes only this name, never the
      // inode a pidfile may still share with it.
      try { fs.unlinkSync(tmp); } catch { /* no leftover — the normal case */ }
      // 'wx' = exclusive create, so the record always lands in a brand-new inode.
      fs.writeFileSync(tmp, encodeLockInfo(info), { mode: 0o600, flag: 'wx' });
      try {
        fs.linkSync(tmp, pidPath); // atomic + exclusive
        acquired = true;
      } catch (err: unknown) {
        if ((err as NodeJS.ErrnoException).code !== 'EEXIST') throw err;
      }
    } finally {
      try { fs.unlinkSync(tmp); } catch { /* temp already gone */ }
    }

    if (acquired) return { kind: 'acquired', pidPath, info };

    // Taken. Because the pidfile was link'd atomically it always holds a complete
    // record — `existing` is null only for a genuinely corrupt leftover, never a
    // mid-write race.
    let existing: DaemonLockInfo | null = null;
    try {
      existing = decodeLockInfo(fs.readFileSync(pidPath, 'utf8'));
    } catch { /* unreadable lockfile — treat as malformed */ }
    return { kind: 'taken', existing, pidPath };
  }
  /**
   * Compare-and-delete for a stale pidfile. The file is re-read right before the
   * unlink so a lock that a live daemon (re)acquired in the meantime is never
   * removed.
   *
   * must-fix 1 (issue #411 review): an unconditional unlink let a racing
   * candidate delete a healthy daemon's lock. The clear is therefore refused when
   * the decoded record names a pid other than `expectedDeadPid`, or names any
   * live pid. A pidfile that cannot be decoded is removed.
   *
   * @param pidPath Path of the pidfile to clear.
   * @param expectedDeadPid The pid the caller believed was dead; when given, the
   *   file is left alone if it now names a different pid.
   * @returns `true` when the stale lock is gone (or was already gone), `false`
   *   when it was left in place or could not be removed.
   */
  export function clearStaleDaemonLock(pidPath: string, expectedDeadPid?: number): boolean {
    try {
      const record = decodeLockInfo(fs.readFileSync(pidPath, 'utf8'));
      if (record) {
        // A different pid took over since the caller looked — not ours to clear.
        const takenOver = expectedDeadPid !== undefined && record.pid !== expectedDeadPid;
        // Never clear a lock whose holder is still running.
        if (takenOver || (record.pid > 0 && isProcessAlive(record.pid))) {
          return false;
        }
      }
      fs.unlinkSync(pidPath);
      return true;
    } catch (err: unknown) {
      // A missing file counts as success (already gone); anything else is a failure.
      return (err as NodeJS.ErrnoException).code === 'ENOENT';
    }
  }
  /**
   * Probe whether `pid` is currently alive by sending it signal 0.
   *
   * EPERM is treated as alive on every platform (the process exists, it's just
   * not ours to signal) so a live daemon is never mistaken for a dead one.
   *
   * Only positive integer pids can name a single process. `0` and negative
   * values address process groups in `kill`, so probing them would "succeed" and
   * report a bogus pid as alive; they are rejected up front, as are non-integers.
   *
   * @param pid Process id to probe.
   * @returns `true` if a process with that pid exists, `false` otherwise
   *   (including for any pid that is not a positive integer).
   */
  export function isProcessAlive(pid: number): boolean {
    if (!Number.isInteger(pid) || pid <= 0) return false;
    try {
      process.kill(pid, 0);
      return true;
    } catch (err: unknown) {
      // EPERM: exists, just not ours to signal. Anything else (ESRCH, …): gone.
      return (err as NodeJS.ErrnoException).code === 'EPERM';
    }
  }
  /**
   * Resolve the idle-exit timeout from `CODEGRAPH_DAEMON_IDLE_TIMEOUT_MS`.
   *
   * - Unset, empty, whitespace-only, non-numeric, non-finite or negative values
   *   yield `fallbackMs`. (Whitespace matters: `Number('  ')` is `0`, which
   *   would otherwise silently mean "never idle-exit".)
   * - An explicit `0` is returned as `0`, which disables idle exit.
   * - Any other value is floored to whole milliseconds, but never below 1, so a
   *   positive sub-millisecond value cannot collapse into the "never" sentinel.
   *
   * @param env Environment to read; defaults to `process.env`.
   * @param fallbackMs Value used when the variable is absent or invalid;
   *   defaults to `DEFAULT_IDLE_TIMEOUT_MS`.
   * @returns The timeout in whole milliseconds (`0` = never idle-exit).
   */
  function resolveIdleTimeoutMs(
    env: Record<string, string | undefined> = process.env,
    fallbackMs: number = DEFAULT_IDLE_TIMEOUT_MS,
  ): number {
    const raw = env.CODEGRAPH_DAEMON_IDLE_TIMEOUT_MS?.trim();
    if (raw === undefined || raw === '') return fallbackMs;
    const parsed = Number(raw);
    if (!Number.isFinite(parsed) || parsed < 0) return fallbackMs;
    if (parsed === 0) return 0;
    return Math.max(1, Math.floor(parsed));
  }
  362. /** Exported for test stubs that need to bound the hello-line read. */
  363. export { MAX_HELLO_LINE_BYTES };