daemon.ts 35 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818
  1. /**
  2. * Shared MCP daemon — issue #411.
  3. *
  4. * One detached `codegraph serve --mcp` daemon process per project root,
  5. * accepting N concurrent MCP clients over a Unix-domain socket (or named pipe
  6. * on Windows). Each incoming connection gets its own {@link MCPSession}; all
  7. * sessions share a single {@link MCPEngine}, which means a single file watcher
  8. * (one inotify set), a single SQLite connection (one WAL writer), and a single
  9. * tree-sitter warm-up — paid once, amortized across every agent talking to the
  10. * project.
  11. *
  12. * Lifecycle (see also `./index.ts` and `./proxy.ts`):
  13. * - The daemon is spawned **detached** (its own session/process group, stdio
  14. * decoupled) by the first launcher that finds no daemon running. It is NOT
  15. * a child of any MCP host, so closing one terminal / Ctrl-C'ing one session
  16. * can't take it down and sever the others. That's why this process has no
  17. * PPID watchdog: it deliberately outlives every individual client.
  18. * - Every MCP host talks to the daemon through a thin `proxy` process (the
  19. * thing the host actually spawned). The proxy keeps the #277 PPID watchdog,
  20. * so a SIGKILL'd host still reaps its proxy promptly; the proxy's socket
  21. * close then decrements the daemon's refcount.
  22. * - When the last client disconnects the daemon lingers for
  23. * `CODEGRAPH_DAEMON_IDLE_TIMEOUT_MS` (default 300s) so back-to-back agent
  24. * runs in the same project don't repay startup, then exits cleanly. This is
  25. * what keeps a single-agent session from leaking a daemon forever (#277).
  26. *
  27. * What this file owns:
  28. * - Listening on the daemon socket and spawning per-connection sessions.
  29. * - The handshake "hello" line that lets a proxy verify it found a
  30. * same-version daemon before piping any JSON-RPC through it.
  31. * - The lockfile (`.codegraph/daemon.pid`) competing daemons arbitrate
  32. * against — atomic `O_EXCL` create with the full record written in the same
  33. * breath (no empty-file window) + cleanup on exit.
  34. * - Reference counting + idle timeout.
  35. * - Graceful shutdown on SIGTERM/SIGINT and idle exit.
  36. *
  37. * What this file does NOT own:
  38. * - The proxy side (`./proxy.ts`).
  39. * - The decision of *whether* to run as daemon at all — that's `MCPServer`.
  40. * - The MCP protocol state machine — that's `./session.ts`.
  41. */
  42. import * as fs from 'fs';
  43. import * as net from 'net';
  44. import * as path from 'path';
  45. import { MCPEngine } from './engine';
  46. import { MCPSession } from './session';
  47. import { SocketTransport } from './transport';
  48. import {
  49. DaemonLockInfo,
  50. decodeLockInfo,
  51. encodeLockInfo,
  52. getDaemonPidPath,
  53. getDaemonSocketCandidates,
  54. getDaemonSocketPath,
  55. } from './daemon-paths';
  56. import { CodeGraphPackageVersion } from './version';
  57. import { registerDaemon, deregisterDaemon } from './daemon-registry';
  /**
   * How long the daemon lingers after its last client disconnects before it
   * exits (five minutes).
   */
  const DEFAULT_IDLE_TIMEOUT_MS = 5 * 60 * 1_000;

  /**
   * Upper bound on how long the daemon stays up while clients are connected but
   * nothing is arriving from them (thirty minutes). Backstop for #692: when a
   * client's socket-close is never delivered (a Windows named-pipe hazard) the
   * client stays counted forever, and the normal idle timer — which only arms
   * at zero clients — never fires. Such a phantom client sends no traffic, so
   * bounding on inactivity still reaps the daemon. Deliberately generous so a
   * real session that is merely quiet for a while is not reaped mid-use.
   */
  const DEFAULT_MAX_IDLE_MS = 30 * 60 * 1_000;
  /**
   * Grace window (ms) for the Windows-only shutdown backstop. On Windows,
   * calling `process.exit()` while a recursive `fs.watch` handle is still
   * tearing down aborts the process with a libuv `UV_HANDLE_CLOSING` assertion
   * (`0xC0000409`) — reproducible whenever the watched tree contains a nested
   * repo (submodule / embedded clone), since that is what keeps a watch active
   * at shutdown. The remedy is to let the event loop drain so libuv finishes
   * closing those handles and the process exits naturally; the backstop timer
   * only force-exits when some unexpected handle keeps the loop alive past this
   * window. Kept short so shutdown stays snappy in that fallback.
   */
  const DAEMON_SHUTDOWN_BACKSTOP_MS = 2 * 1_000;

  /**
   * Finish daemon shutdown in the way that is safe for `platform`.
   *
   * - Anything but `win32`: call `exit(0)` right away and return `null`.
   * - `win32`: do NOT exit while watchers may still be closing. Record a
   *   success exit code, let the loop drain, and schedule an unref'd backstop
   *   that calls `exit(0)` only if the loop is still alive after
   *   `DAEMON_SHUTDOWN_BACKSTOP_MS`.
   *
   * The platform and the exit function are injected so both branches can be
   * unit-tested on any OS.
   *
   * @param platform Platform identifier to branch on.
   * @param exit Callback used to terminate the process.
   * @returns The backstop timer on Windows (so callers/tests can clear it),
   *   otherwise `null`.
   */
  export function finalizeDaemonExit(
    platform: NodeJS.Platform,
    exit: (code: number) => void,
  ): NodeJS.Timeout | null {
    if (platform !== 'win32') {
      exit(0);
      return null;
    }

    process.exitCode = 0;
    const forceExit = (): void => exit(0);
    const backstopTimer = setTimeout(forceExit, DAEMON_SHUTDOWN_BACKSTOP_MS);
    // Unref'd: a natural drain exits before the timer fires; it only fires when
    // some other handle is keeping the loop running, which is when it is needed.
    backstopTimer.unref?.();
    return backstopTimer;
  }
  /** Interval between sweeps of connected clients for a dead peer process (#692). */
  const DEFAULT_CLIENT_SWEEP_MS = 30 * 1_000;

  /** Time allowed for the optional client-hello to arrive before the daemon proceeds without it. */
  const CLIENT_HELLO_TIMEOUT_MS = 3 * 1_000;

  /** Size cap (bytes) on the hello line / parse window, bounding what a malicious peer can make us buffer. */
  const MAX_HELLO_LINE_BYTES = 4 * 1024;
  /**
   * Shape of the one-shot hello line the daemon writes on every new connection.
   * It carries the package's own semver so that a 0.9.x proxy never pipes
   * through a 0.10.x daemon (or the reverse): on a mismatch the proxy falls
   * back to direct mode instead of risking subtle wire incompatibilities.
   */
  export interface DaemonHello {
    /** Package version; must match the proxy's own version. */
    codegraph: string;
    /** Daemon pid — informational, for `ps` debugging. */
    pid: number;
    /** Echoed back so the proxy can log it. */
    socketPath: string;
    /** Bump if the hello shape changes. */
    protocol: 1;
  }
  /**
   * Optional reverse-handshake line. A proxy sends it right after verifying the
   * daemon hello, reporting its own pids so the daemon can reap the client when
   * its process dies WITHOUT the socket ever signalling close (the Windows
   * named-pipe hazard behind #692). It is optional and fail-safe: a connection
   * that never sends it (a legacy/direct client) simply relies on the
   * socket-close lifecycle. The `codegraph_client` marker is what distinguishes
   * this line from the client's first JSON-RPC message.
   */
  export interface DaemonClientHello {
    /** Marker identifying this line as a client-hello. */
    codegraph_client: 1;
    /** The proxy process's own pid. */
    pid: number;
    /** The MCP host pid (past any launcher shim), if known. */
    hostPid: number | null;
  }
  /** What `Daemon.start()` resolves with once the socket is listening. */
  export interface DaemonStartResult {
    /** Path of the socket the daemon bound; always non-null for a successfully-started daemon. */
    socketPath: string;
    /** The lockfile contents, as written. */
    lock: DaemonLockInfo;
  }
  /**
   * Run as the shared daemon for `projectRoot`. `start()` resolves once the
   * socket is listening. The Daemon owns the socket, the engine, and the
   * lockfile until `stop()` is called or it exits on idle/signal.
   *
   * Race-safe: callers must first call `tryAcquireDaemonLock(projectRoot)` and
   * only construct a Daemon if they got the lock (`kind: 'acquired'`). The
   * exclusive create inside the acquire helper — which also writes the full
   * record before returning — is the only synchronization between competing
   * daemons.
   */
  export class Daemon {
    private server: net.Server | null = null;
    private clients = new Set<MCPSession>();
    /** Per-client peer pids from the optional client-hello, for the liveness sweep. */
    private clientPeers = new Map<MCPSession, { pid: number | null; hostPid: number | null }>();
    /**
     * Sockets that were accepted but are still waiting on the optional
     * client-hello, i.e. not yet sessions. Tracked so `stop()` can close them.
     */
    private pendingSockets = new Set<net.Socket>();
    private idleTimer: NodeJS.Timeout | null = null;
    private idleTimeoutMs: number;
    private maxIdleMs: number;
    /** Timestamp of the last inbound byte or accepted client; feeds the inactivity backstop. */
    private lastActivityAt = Date.now();
    private maxIdleTimer: NodeJS.Timeout | null = null;
    private clientSweepTimer: NodeJS.Timeout | null = null;
    private engine: MCPEngine;
    private stopping = false;
    private socketPath: string;
    private pidPath: string;

    /**
     * @param projectRoot Project the daemon serves; used to derive the socket
     *   and pidfile paths and passed to the engine as its project path hint.
     * @param opts Optional overrides for the idle linger (`idleTimeoutMs`) and
     *   the inactivity backstop (`maxIdleMs`); each falls back to its resolver.
     */
    constructor(
      private projectRoot: string,
      opts: { idleTimeoutMs?: number; maxIdleMs?: number } = {},
    ) {
      this.socketPath = getDaemonSocketPath(projectRoot);
      this.pidPath = getDaemonPidPath(projectRoot);
      this.idleTimeoutMs = opts.idleTimeoutMs ?? resolveIdleTimeoutMs();
      this.maxIdleMs = opts.maxIdleMs ?? resolveMaxIdleMs();
      // Daemon mode serves many concurrent clients on one event loop, so off-load
      // read-tool dispatch to a worker pool — otherwise concurrent explores
      // serialize and starve the MCP transport (clients time out). Direct mode
      // (one stdio client) leaves the pool off; `CODEGRAPH_QUERY_POOL_SIZE=0`
      // disables it here too.
      this.engine = new MCPEngine({ queryPool: true });
      this.engine.setProjectPathHint(projectRoot);
    }

    /**
     * Bind the socket, kick off engine init, and register signal handlers. The
     * lockfile body was already written by `tryAcquireDaemonLock`, so it is only
     * rewritten here when the socket had to relocate. The promise resolves once
     * the server is listening — the daemon then sticks around until
     * idle/shutdown.
     *
     * @returns The bound socket path and the lock record describing this daemon.
     * @throws Whatever `bindFirstUsableSocket` rejects with when no candidate
     *   could be bound; the lockfile and candidate sockets are released first.
     */
    async start(): Promise<DaemonStartResult> {
      // Engine init is deliberately backgrounded — see #172. The first session
      // to land waits on `ensureInitialized` either way, and unloaded sessions
      // (cross-project tool calls only) shouldn't pay any open cost.
      void this.engine.ensureInitialized(this.projectRoot);

      // Walk the ordered socket candidates and bind the first that works. The
      // in-project path comes first; the deterministic tmpdir path is the fallback
      // for a filesystem that can't host an AF_UNIX node at all (ExFAT/FAT external
      // volumes, some network mounts, WSL2 DrvFs → ENOTSUP/EACCES; #997, #974). The
      // `listen` closure clears a stale socket (left by a SIGKILL'd previous daemon)
      // before each attempt — safe because we hold the lockfile, so no live daemon
      // owns it; without it `listen` would wedge on EADDRINUSE.
      const candidates = getDaemonSocketCandidates(this.projectRoot);
      const listen = (socketPath: string): Promise<net.Server> =>
        new Promise<net.Server>((resolve, reject) => {
          if (process.platform !== 'win32') {
            try { fs.unlinkSync(socketPath); } catch { /* not-exists is fine */ }
          }
          const server = net.createServer((socket) => this.handleConnection(socket));
          server.once('error', reject);
          server.listen(socketPath, () => {
            // POSIX: tighten permissions to user-only — the socket lives under
            // `.codegraph/` (git-ignored, maybe a shared FS) or tmpdir.
            if (process.platform !== 'win32') {
              try { fs.chmodSync(socketPath, 0o600); } catch { /* best-effort */ }
            }
            resolve(server);
          });
        });

      let bound: { server: net.Server; socketPath: string };
      try {
        bound = await bindFirstUsableSocket(candidates, listen, {
          onRelocate: (from, to, code) =>
            process.stderr.write(
              `[CodeGraph daemon] Socket ${from} unusable (${code}); relocating to ${to}.\n`
            ),
        });
      } catch (err) {
        // Every candidate failed (the last one, or a non-relocatable error like a
        // racing EADDRINUSE). We already hold the lockfile `tryAcquireDaemonLock`
        // wrote; release it and any partial sockets so the NEXT launcher doesn't
        // spin respawning us on a stale lock pointing at our now-dying pid. Then
        // re-throw so the caller (the bin's try/catch) exits this detached daemon
        // cleanly and every launcher falls back to direct mode (#974).
        this.cleanupLockfile();
        if (process.platform !== 'win32') {
          for (const candidate of candidates) {
            try { fs.unlinkSync(candidate); } catch { /* may not exist */ }
          }
        }
        throw err;
      }
      this.server = bound.server;
      // Adopt the path we ACTUALLY bound — it may be a tmpdir fallback past an
      // unusable in-project location. Everything downstream (lockfile, registry,
      // chmod, cleanup, status) keys off this real path, not the preferred guess.
      this.socketPath = bound.socketPath;

      const lock: DaemonLockInfo = {
        pid: process.pid,
        version: CodeGraphPackageVersion,
        socketPath: this.socketPath,
        startedAt: Date.now(),
      };
      // `tryAcquireDaemonLock` wrote the pidfile with the PREFERRED path (candidate
      // 0) before we knew which one would bind. If we relocated, rewrite it so the
      // per-project record is honest. Atomic temp+rename; safe because we hold the
      // lock and we're alive — `clearStaleDaemonLock` pid-verifies, so no racing
      // candidate clears or clobbers a live daemon's lock.
      if (this.socketPath !== candidates[0]) {
        const tmpPid = `${this.pidPath}.${process.pid}.relocate`;
        try {
          fs.writeFileSync(tmpPid, encodeLockInfo(lock), { mode: 0o600 });
          fs.renameSync(tmpPid, this.pidPath);
        } catch {
          // Best-effort; the registry record below carries the real path. Don't
          // leave the temp file behind when the write or rename failed.
          try { fs.unlinkSync(tmpPid); } catch { /* never created, or already renamed */ }
        }
      }

      // Drop a discovery record so `codegraph list` / `stop --all` can find us.
      // Best-effort; a missing record only means list's liveness prune covers it.
      registerDaemon({ root: this.projectRoot, ...lock });
      process.stderr.write(
        `[CodeGraph daemon] Listening on ${this.socketPath} (pid ${process.pid}, v${CodeGraphPackageVersion}). Idle timeout ${this.idleTimeoutMs}ms.\n`
      );

      // No clients yet: arm the idle timer immediately so a daemon that nobody
      // ever connects to (e.g. spawned then abandoned because the launcher died)
      // doesn't pin resources forever.
      this.armIdleTimer();
      this.startLivenessTimers();

      // `void`: the handler's return value is ignored by the emitter, so mark the
      // promise as intentionally not awaited (same as the idle-timeout path).
      process.on('SIGINT', () => { void this.stop('SIGINT'); });
      process.on('SIGTERM', () => { void this.stop('SIGTERM'); });
      return { socketPath: this.socketPath, lock };
    }

    /** Currently-connected client count. Exposed for tests / status output. */
    getClientCount(): number {
      return this.clients.size;
    }

    /** The socket path the daemon is (or will be) listening on. */
    getSocketPath(): string {
      return this.socketPath;
    }

    /**
     * Graceful shutdown: stop timers, close all sessions and half-open
     * connections, close the server and the engine, and clean up the lock,
     * registry record and socket node. Idempotent — a second call returns
     * immediately.
     *
     * @param reason Short label written to the shutdown log line.
     */
    async stop(reason: string = 'stop'): Promise<void> {
      if (this.stopping) return;
      this.stopping = true;

      if (this.idleTimer) {
        clearTimeout(this.idleTimer);
        this.idleTimer = null;
      }
      if (this.maxIdleTimer) {
        clearInterval(this.maxIdleTimer);
        this.maxIdleTimer = null;
      }
      if (this.clientSweepTimer) {
        clearInterval(this.clientSweepTimer);
        this.clientSweepTimer = null;
      }

      process.stderr.write(`[CodeGraph daemon] Shutting down (${reason}; clients=${this.clients.size}).\n`);
      for (const session of [...this.clients]) {
        try { session.stop(); } catch { /* best-effort */ }
      }
      this.clients.clear();
      this.clientPeers.clear();

      // Connections still waiting on their client-hello are not sessions yet, so
      // the loop above never touched them. `server.close()` only calls back once
      // every connection has ended, so close them here or shutdown can stall.
      for (const socket of [...this.pendingSockets]) {
        try { socket.destroy(); } catch { /* best-effort */ }
      }
      this.pendingSockets.clear();

      if (this.server) {
        const server = this.server;
        this.server = null;
        await new Promise<void>((resolve) => server.close(() => resolve()));
      }

      // A failing engine teardown must not skip the lock/registry/socket cleanup
      // or the final exit below — otherwise a half-stopped daemon lingers.
      try {
        this.engine.stop();
      } catch (err: unknown) {
        const detail = err instanceof Error ? err.message : String(err);
        process.stderr.write(`[CodeGraph daemon] Engine stop failed: ${detail}\n`);
      }
      this.cleanupLockfile();
      deregisterDaemon(this.projectRoot);
      if (process.platform !== 'win32') {
        try { fs.unlinkSync(this.socketPath); } catch { /* may already be gone */ }
      }
      // POSIX exits here; Windows drains first (engine.stop() above began closing
      // the file watcher, and exiting mid-teardown aborts the process). See
      // finalizeDaemonExit / DAEMON_SHUTDOWN_BACKSTOP_MS.
      finalizeDaemonExit(process.platform, (code) => process.exit(code));
    }

    /**
     * Per-connection entry point: send the daemon hello, wait for the optional
     * client-hello, then wrap the socket in a transport + session and register
     * it as a client.
     */
    private handleConnection(socket: net.Socket): void {
      // Hello first so the proxy can verify versions before piping any
      // application bytes. The proxy reads exactly one line, then forwards.
      const hello: DaemonHello = {
        codegraph: CodeGraphPackageVersion,
        pid: process.pid,
        socketPath: this.socketPath,
        protocol: 1,
      };
      socket.write(JSON.stringify(hello) + '\n');

      // Read the optional client-hello (proxy → daemon) to learn the client's
      // peer pids, then hand the socket to the session. Fail-safe: any problem —
      // timeout, a non-hello first line, an early close — yields null pids and we
      // fall back to the socket-close lifecycle exactly as before (#692).
      this.pendingSockets.add(socket);
      void readClientHello(socket)
        .then((peers) => {
          this.pendingSockets.delete(socket);
          // Shutdown began, or the peer went away, while we were waiting for the
          // hello: registering a session now would leave a client nothing ever
          // stops (stop() already ran its loop) or a phantom that may never
          // signal close. Drop the connection instead.
          if (this.stopping || socket.destroyed) {
            socket.destroy();
            return;
          }
          const transport = new SocketTransport(socket);
          const session = new MCPSession(transport, this.engine, {
            explicitProjectPath: this.projectRoot,
          });
          transport.onClose(() => this.dropClient(session));
          this.clients.add(session);
          this.clientPeers.set(session, peers);
          // A fresh connection counts as activity: without this, a client that
          // attaches after a long quiet period could be reaped by the inactivity
          // backstop before it sends its first byte.
          this.lastActivityAt = Date.now();
          this.disarmIdleTimer();
          session.start();
          // Observe inbound bytes purely to feed the inactivity backstop — a second
          // 'data' listener that reads nothing, added AFTER the transport's so the
          // unshifted client-hello tail reaches the transport intact.
          socket.on('data', () => { this.lastActivityAt = Date.now(); });
        })
        .catch((err: unknown) => {
          // One bad connection must not surface as an unhandled rejection in the
          // shared daemon; drop just this socket.
          this.pendingSockets.delete(socket);
          const detail = err instanceof Error ? err.message : String(err);
          process.stderr.write(`[CodeGraph daemon] Dropping connection (session setup failed: ${detail}).\n`);
          try { socket.destroy(); } catch { /* best-effort */ }
        });
    }

    /** Forget a client; when it was the last one, start the idle countdown. */
    private dropClient(session: MCPSession): void {
      if (!this.clients.delete(session)) return;
      this.clientPeers.delete(session);
      if (this.clients.size === 0) this.armIdleTimer();
    }

    /** Arm the zero-client idle timer unless it is already armed, disabled, or we are stopping. */
    private armIdleTimer(): void {
      if (this.idleTimer || this.stopping) return;
      if (this.idleTimeoutMs <= 0) return; // 0 = never idle-exit
      this.idleTimer = setTimeout(() => {
        this.idleTimer = null;
        // Last-second sanity check: if a connection landed between the timer
        // firing and now, don't exit. (setImmediate-ordering is the only way
        // this races; cheap to defend against.)
        if (this.clients.size > 0) {
          this.armIdleTimer();
          return;
        }
        void this.stop('idle timeout');
      }, this.idleTimeoutMs);
      // Don't keep the event loop alive just for this — the net.Server keeps the
      // loop alive while listening, so the timer still fires; once we stop() the
      // loop should drain naturally.
      this.idleTimer.unref?.();
    }

    /** Cancel the idle timer, if armed. */
    private disarmIdleTimer(): void {
      if (!this.idleTimer) return;
      clearTimeout(this.idleTimer);
      this.idleTimer = null;
    }

    /**
     * Defense-in-depth against a daemon that outlives its clients (#692), for the
     * cases the refcount + idle timer miss because a socket close never arrives:
     * - **Inactivity backstop:** exit if no inbound traffic for `maxIdleMs` while
     *   clients are still (nominally) connected. A phantom client sends nothing,
     *   so it can't pin the daemon past this window.
     * - **Liveness sweep:** drop any client whose peer process has died (per the
     *   client-hello pids), which re-arms the idle timer once the last real
     *   client is gone. Catches a dead peer within one sweep instead of waiting
     *   out the whole backstop.
     * Both timers are unref'd — the listening server keeps the loop alive, and
     * neither should hold it open on its own.
     */
    private startLivenessTimers(): void {
      if (this.maxIdleMs > 0) {
        // Check at most once a minute, or once per window when it is shorter.
        const tick = Math.min(this.maxIdleMs, 60_000);
        this.maxIdleTimer = setInterval(() => {
          if (this.stopping || this.clients.size === 0) return; // idle timer owns the no-client case
          if (Date.now() - this.lastActivityAt >= this.maxIdleMs) {
            void this.stop('inactivity backstop');
          }
        }, tick);
        this.maxIdleTimer.unref?.();
      }
      const sweepMs = resolveClientSweepMs();
      if (sweepMs > 0) {
        this.clientSweepTimer = setInterval(() => this.reapDeadClients(isProcessAlive), sweepMs);
        this.clientSweepTimer.unref?.();
      }
    }

    /**
     * Drop every connected client whose peer process is gone. Clients with
     * unknown pids (no client-hello) are skipped — they rely on the
     * socket-close path.
     *
     * @param isAlive Liveness probe, injected for testing.
     * @returns The number of clients reaped.
     */
    reapDeadClients(isAlive: (pid: number) => boolean): number {
      if (this.clients.size === 0) return 0;
      let reaped = 0;
      for (const session of [...this.clients]) {
        const peers = this.clientPeers.get(session);
        if (!peers || !peerIsDead(peers, isAlive)) continue;
        process.stderr.write(
          `[CodeGraph daemon] Reaping client with dead peer (pid ${peers.pid}); clients=${this.clients.size - 1}.\n`
        );
        try { session.stop(); } catch { /* best-effort */ }
        this.dropClient(session);
        reaped++;
      }
      return reaped;
    }

    /** Remove the pidfile, but only while it still names this process. */
    private cleanupLockfile(): void {
      try {
        if (fs.existsSync(this.pidPath)) {
          // Only remove if it still belongs to us — another daemon may have
          // already taken over while we were shutting down (extremely rare).
          const raw = fs.readFileSync(this.pidPath, 'utf8');
          const info = decodeLockInfo(raw);
          if (info && info.pid === process.pid) {
            fs.unlinkSync(this.pidPath);
          }
        }
      } catch { /* best-effort; we're exiting anyway */ }
    }
  }
  /**
   * Outcome of `tryAcquireDaemonLock`.
   *
   * - `acquired`: we created the lockfile, so the caller becomes the daemon.
   * - `taken`: the lockfile already existed. The caller should connect to the
   *   existing daemon as a proxy or, if the holder is dead, clear the lock and
   *   retry. `existing` is `null` when the file could not be read or decoded.
   */
  export type AcquireResult =
    | {
        kind: 'acquired';
        pidPath: string;
        info: DaemonLockInfo;
      }
    | {
        kind: 'taken';
        existing: DaemonLockInfo | null;
        pidPath: string;
      };
  /**
   * Try to become the daemon for `projectRoot` by creating its pidfile with the
   * full lock record already inside. Yields `acquired` (the caller is the
   * daemon-elect and may construct a {@link Daemon}) or `taken`.
   *
   * must-fix 1 (issue #411 review): the lockfile must appear in ONE atomic step,
   * already complete — never empty, even momentarily. An earlier version did an
   * `O_EXCL` create followed by a separate write, which left a tiny window where
   * the file existed but was empty; under concurrent daemon startup a third
   * candidate could read that empty file, decode it as `null`, and unlink the
   * winner's lock → two daemons (two watchers, two writers). The file watcher's
   * extra startup time made concurrent daemons overlap enough to reproduce it
   * reliably.
   *
   * Primary path: write the complete record to a private, pid-scoped temp file,
   * then hard-link it into place. `link()` is atomic AND exclusive (EEXIST if
   * the target exists), so the pidfile becomes visible in one step with a full
   * record. Whoever links first wins; everyone else gets EEXIST and reads a
   * complete file.
   *
   * Fallback path (#997): filesystems without hard links (ExFAT/FAT external
   * volumes, some network mounts) fail `link()` with an OS-specific errno. Any
   * non-EEXIST link failure therefore falls back to
   * `acquireLockViaExclusiveOpen`: still exclusive ("first writer wins"), but
   * the record is written through the fd in a second step, so the empty-file
   * window is reopened — only on these filesystems, and only for the moment
   * between create and write. Worst case is two daemons briefly, which beats
   * the daemon never starting at all.
   *
   * @param projectRoot Project whose daemon lock is being contended.
   * @returns The acquisition outcome; see {@link AcquireResult}.
   * @throws If the `.codegraph/` directory or the temp file cannot be written,
   *   or the fallback create fails for a reason other than EEXIST.
   */
  export function tryAcquireDaemonLock(projectRoot: string): AcquireResult {
    const pidPath = getDaemonPidPath(projectRoot);

    // The daemon may be the first thing to touch `.codegraph/` on a
    // fresh-clone-but-already-initialized checkout, so make sure it exists.
    fs.mkdirSync(path.dirname(pidPath), { recursive: true });

    const info: DaemonLockInfo = {
      pid: process.pid,
      version: CodeGraphPackageVersion,
      socketPath: getDaemonSocketPath(projectRoot),
      startedAt: Date.now(),
    };

    // Pid-scoped staging name: racing candidates can never collide on it.
    const stagingPath = `${pidPath}.${process.pid}.tmp`;

    // Publish the staged record under the real name. true = we hold the lock,
    // false = somebody else already does.
    const publish = (): boolean => {
      try {
        fs.linkSync(stagingPath, pidPath); // atomic + exclusive (race-free; see must-fix 1)
        return true;
      } catch (linkErr: unknown) {
        const code = (linkErr as NodeJS.ErrnoException).code;
        // EEXIST: lost the race — another candidate already holds it.
        if (code === 'EEXIST') return false;
        // Any other failure is nearly always "this filesystem has no hard links",
        // which surfaces as a DIFFERENT errno on every OS: ENOTSUP on macOS, EPERM
        // on Linux, EISDIR on Windows (#997). Rather than enumerate them, rely on
        // the fact that the staging write already proved the directory writable
        // and substitute an O_EXCL create. If THAT fails too it is a genuine
        // error and propagates; EEXIST there means taken.
        return acquireLockViaExclusiveOpen(pidPath, info);
      }
    };

    let won = false;
    try {
      fs.writeFileSync(stagingPath, encodeLockInfo(info), { mode: 0o600 });
      won = publish();
    } finally {
      try { fs.unlinkSync(stagingPath); } catch { /* temp already gone */ }
    }

    if (won) {
      return { kind: 'acquired', pidPath, info };
    }

    // Taken. A pidfile that was hard-linked into place always holds a complete
    // record; `existing` ends up null for a corrupt or unreadable leftover (or,
    // on the exclusive-open fallback only, a holder caught between create and
    // write).
    let existing: DaemonLockInfo | null;
    try {
      existing = decodeLockInfo(fs.readFileSync(pidPath, 'utf8'));
    } catch {
      existing = null; // unreadable lockfile — treat as malformed
    }
    return { kind: 'taken', existing, pidPath };
  }
  /**
   * Exclusive-create the pidfile (O_CREAT|O_EXCL via the `wx` flag) and write the
   * full record through the same fd — the hard-link-free fallback used by
   * {@link tryAcquireDaemonLock} on filesystems without `link()`. Still
   * exclusive, so "first writer wins" holds exactly as on the link path; the
   * only difference is the brief empty-file window between create and write.
   * Exported for testing.
   *
   * If the record cannot be written after the file was created, the partial
   * file is removed again (best-effort) before the error propagates, so a failed
   * attempt does not leave an empty lockfile behind for the next candidate.
   *
   * @param pidPath Path of the pidfile to create.
   * @param info Lock record to store in it.
   * @returns `true` if we created the file (lock acquired), `false` on EEXIST
   *   (another candidate holds it).
   * @throws Any other open, write, or close error.
   */
  export function acquireLockViaExclusiveOpen(pidPath: string, info: DaemonLockInfo): boolean {
    let fd: number;
    try {
      fd = fs.openSync(pidPath, 'wx', 0o600); // O_CREAT | O_EXCL | O_WRONLY
    } catch (err: unknown) {
      if ((err as NodeJS.ErrnoException).code === 'EEXIST') return false;
      throw err;
    }

    try {
      // writeFileSync on an fd keeps writing until the whole payload is out; a
      // bare writeSync may return after a short write and leave a truncated record.
      fs.writeFileSync(fd, encodeLockInfo(info));
    } catch (err: unknown) {
      // We created the file but could not fill it. Remove it — but only if the
      // path still names the file we created (same device + inode), so we never
      // delete a lock a racing candidate put there after clearing our empty one.
      try {
        const ours = fs.fstatSync(fd);
        const onDisk = fs.statSync(pidPath);
        if (ours.dev === onDisk.dev && ours.ino === onDisk.ino) fs.unlinkSync(pidPath);
      } catch { /* best-effort cleanup */ }
      try { fs.closeSync(fd); } catch { /* the write error is the one to report */ }
      throw err;
    }

    fs.closeSync(fd);
    return true;
  }
  /**
   * Compare-and-delete for a stale pidfile: the file is re-read immediately
   * before unlinking so a lock that a live daemon (re)acquired in the meantime
   * is never deleted.
   *
   * must-fix 1 (issue #411 review): the original unconditionally unlinked, which
   * let a racing candidate delete a healthy daemon's lock. With
   * `expectedDeadPid` (the pid the caller believed was dead) the clear bails if
   * the file now holds a different pid, and it always bails if the recorded pid
   * is alive. A file that does not decode to a record is removed.
   *
   * @param pidPath Pidfile to clear.
   * @param expectedDeadPid Pid the caller observed as dead, if any.
   * @returns `true` when the stale lock is gone (or was already gone), `false`
   *   when it was left in place or could not be removed.
   */
  export function clearStaleDaemonLock(pidPath: string, expectedDeadPid?: number): boolean {
    try {
      const current = decodeLockInfo(fs.readFileSync(pidPath, 'utf8'));
      if (current) {
        // A different pid took over since the caller looked — not ours to clear.
        const takenOver = expectedDeadPid !== undefined && current.pid !== expectedDeadPid;
        if (takenOver) return false;
        // Never clear the lock of a holder that is actually alive.
        const holderAlive = current.pid > 0 && isProcessAlive(current.pid);
        if (holderAlive) return false;
      }
      fs.unlinkSync(pidPath);
      return true;
    } catch (err: unknown) {
      // ENOENT means it was already gone, which counts as cleared; anything
      // else means we could not clear it.
      return (err as NodeJS.ErrnoException).code === 'ENOENT';
    }
  }
  /**
   * Signal-0 liveness probe for `pid`.
   *
   * EPERM counts as alive on every platform: the process exists, it just is not
   * ours to signal. Erring in that direction means a live daemon is never
   * mistaken for a dead one and robbed of its lock.
   *
   * @param pid - Process id to probe.
   * @returns true when the probe succeeds or fails with EPERM; false for any
   *   other failure.
   */
  export function isProcessAlive(pid: number): boolean {
    try {
      process.kill(pid, 0);
    } catch (probeErr: unknown) {
      // Only "exists but not signalable" still means alive.
      return (probeErr as NodeJS.ErrnoException).code === 'EPERM';
    }
    return true;
  }
  /**
   * The single `listen()` failure that must NOT trigger relocation. EADDRINUSE
   * says the path is genuinely taken — by a racing daemon that legitimately owns
   * it, or by a leftover node we could not clear (the #974 planted-dir case) —
   * so moving on would abandon a path another daemon owns; the caller releases
   * its lock and falls back to direct mode instead. Every OTHER bind error only
   * means "this path did not work", almost always because the filesystem cannot
   * host an AF_UNIX node at all (ExFAT/FAT, network mounts, WSL2 DrvFs), and
   * each OS reports that with a different errno (ENOTSUP on macOS, EPERM on
   * Linux; #997). Listing the "unsupported" codes would be whack-a-mole, so the
   * policy is relocate-on-anything-but-conflict: if the deterministic tmpdir
   * fallback fails too, that error propagates from the last candidate.
   * (ENAMETOOLONG never gets here — the candidate list already sends over-long
   * paths straight to tmpdir.)
   */
  const SOCKET_BIND_CONFLICT_CODE = 'EADDRINUSE';
  /**
   * Walk an ordered list of socket paths and bind the first one that works,
   * skipping any path whose bind fails for a non-conflict reason (see {@link
   * SOCKET_BIND_CONFLICT_CODE}). The real `net.Server.listen` (and stale-socket
   * clear) is injected as `listen` so the relocation policy can be unit-tested
   * without an actual unsupported filesystem. Exported for testing.
   *
   * @param candidates - Socket paths to try, in priority order.
   * @param listen - Binds one path and resolves with the listening server.
   * @param opts - `onRelocate(from, to, code)` is invoked each time a candidate
   *   is skipped; `code` is the empty string when the error carried none.
   * @returns The server together with the path that was actually bound.
   * @throws The bind error when it is EADDRINUSE or came from the LAST
   *   candidate — the caller releases the lockfile and falls back to direct
   *   mode (#974). An empty candidate list throws a plain `Error`.
   */
  export async function bindFirstUsableSocket(
    candidates: string[],
    listen: (socketPath: string) => Promise<net.Server>,
    opts: { onRelocate?: (from: string, to: string, code: string) => void } = {},
  ): Promise<{ server: net.Server; socketPath: string }> {
    let lastFailure: unknown;
    let index = 0;
    while (index < candidates.length) {
      const socketPath = candidates[index]!; // index < length, so always defined
      const hasFallback = index + 1 < candidates.length;
      try {
        return { server: await listen(socketPath), socketPath };
      } catch (bindErr) {
        lastFailure = bindErr;
        const code = (bindErr as NodeJS.ErrnoException).code;
        // A conflict, or running out of candidates, ends the search.
        if (!hasFallback || code === SOCKET_BIND_CONFLICT_CODE) throw bindErr;
        opts.onRelocate?.(socketPath, candidates[index + 1]!, code ?? ''); // hasFallback ⇒ index+1 in range
      }
      index += 1;
    }
    // Only reachable with an empty candidate list — a programmer error.
    throw lastFailure ?? new Error('no socket candidates to bind');
  }
  /**
   * Read a non-negative millisecond setting from the environment variable
   * `name`, falling back to `fallback` when the variable is unset, blank, not a
   * finite number, or negative. Fractional values are floored.
   *
   * Blank means empty OR whitespace-only: `Number('')` and `Number('   ')` are
   * both 0, so without the trim a stray-space value would be read as an explicit
   * 0 instead of "not configured".
   */
  function resolveNonNegativeMsFromEnv(name: string, fallback: number): number {
    const raw = process.env[name];
    if (raw === undefined || raw.trim() === '') return fallback;
    const parsed = Number(raw);
    if (!Number.isFinite(parsed) || parsed < 0) return fallback;
    return Math.floor(parsed);
  }
  /** Idle timeout in ms, from `CODEGRAPH_DAEMON_IDLE_TIMEOUT_MS` (default `DEFAULT_IDLE_TIMEOUT_MS`). */
  function resolveIdleTimeoutMs(): number {
    return resolveNonNegativeMsFromEnv('CODEGRAPH_DAEMON_IDLE_TIMEOUT_MS', DEFAULT_IDLE_TIMEOUT_MS);
  }
  /**
   * Max-idle backstop in ms, from `CODEGRAPH_DAEMON_MAX_IDLE_MS` (default
   * `DEFAULT_MAX_IDLE_MS`). An explicit 0 disables the backstop.
   */
  function resolveMaxIdleMs(): number {
    return resolveNonNegativeMsFromEnv('CODEGRAPH_DAEMON_MAX_IDLE_MS', DEFAULT_MAX_IDLE_MS);
  }
  /**
   * Client sweep interval in ms, from `CODEGRAPH_DAEMON_CLIENT_SWEEP_MS` (default
   * `DEFAULT_CLIENT_SWEEP_MS`). An explicit 0 disables the sweep.
   */
  function resolveClientSweepMs(): number {
    return resolveNonNegativeMsFromEnv('CODEGRAPH_DAEMON_CLIENT_SWEEP_MS', DEFAULT_CLIENT_SWEEP_MS);
  }
  /**
   * Parse one client-hello line. Returns the peer pids if `line` is a well-formed
   * client-hello — a JSON object carrying the `codegraph_client: 1` marker and a
   * usable `pid` — or null otherwise, in which case the caller treats the bytes
   * as ordinary JSON-RPC.
   *
   * A pid is usable only when it is a positive integer within the 32-bit signed
   * range. The pids returned here feed signal-0 liveness probes, where 0 or a
   * negative value addresses a process group rather than a process, and a
   * fractional or out-of-range value makes `process.kill` throw — either way the
   * probe's verdict would say nothing about the real peer.
   *
   * @param line - One line of text, without the trailing newline.
   * @returns `{ pid, hostPid }` for a client-hello (`hostPid` is null when absent
   *   or not a usable pid); null when the line is not a client-hello.
   */
  export function parseClientHelloLine(
    line: string,
  ): { pid: number; hostPid: number | null } | null {
    let parsed: unknown;
    try {
      parsed = JSON.parse(line);
    } catch {
      return null; // not JSON at all
    }
    if (!parsed || typeof parsed !== 'object') return null;
    const o = parsed as Record<string, unknown>;
    if (o.codegraph_client !== 1 || !isUsablePeerPid(o.pid)) return null;
    // An unusable hostPid degrades to "host unknown" instead of voiding the hello.
    return { pid: o.pid, hostPid: isUsablePeerPid(o.hostPid) ? o.hostPid : null };
  }
  /** True when `value` is a positive integer that fits the 32-bit signed pid range. */
  function isUsablePeerPid(value: unknown): value is number {
    return (
      typeof value === 'number' && Number.isInteger(value) && value > 0 && value <= 0x7fffffff
    );
  }
  /**
   * Decide whether a client's peer should be considered dead: either its proxy
   * process is gone, or its known host process is gone. A client with no known
   * pid (it never sent a client-hello) is never "dead" by this test — such
   * clients rely on the socket-close path. Exported for testing.
   *
   * @param peers - Proxy pid and host pid; null marks an unknown value.
   * @param isAlive - Liveness probe, consulted for the proxy pid first and for
   *   the host pid only when the proxy is alive and a host pid is known.
   * @returns true when a known pid fails the probe; false otherwise.
   */
  export function peerIsDead(
    peers: { pid: number | null; hostPid: number | null },
    isAlive: (pid: number) => boolean,
  ): boolean {
    const { pid, hostPid } = peers;
    if (pid === null) return false;
    return !isAlive(pid) || (hostPid !== null && !isAlive(hostPid));
  }
  /**
   * Read the optional client-hello line a proxy sends after the daemon hello.
   * Always resolves (never rejects) — fail-safe by design, since every connection
   * funnels through here.
   *
   * Outcomes:
   * - First line is a client-hello: resolves with its pids; any bytes after the
   *   newline are unshifted back onto the socket.
   * - First line is something else, or grows past `MAX_HELLO_LINE_BYTES` without
   *   a newline: resolves with null pids and unshifts everything read so far, so
   *   the transport parses it as the client's first JSON-RPC message(s).
   * - `CLIENT_HELLO_TIMEOUT_MS` elapses first: resolves with null pids and
   *   unshifts whatever partial line was buffered, so a slow client's first
   *   bytes are not lost.
   * - Socket `error` / `close`: resolves with null pids; nothing is put back.
   *
   * Data is accumulated as Buffers and split on the newline byte so a UTF-8
   * sequence straddling a chunk boundary in the unshifted tail is never
   * corrupted.
   *
   * @param socket - The freshly accepted client connection.
   * @returns The peer pids, or `{ pid: null, hostPid: null }` when no hello came.
   */
  function readClientHello(
    socket: net.Socket,
  ): Promise<{ pid: number | null; hostPid: number | null }> {
    return new Promise((resolve) => {
      let chunks: Buffer[] = [];
      let total = 0;
      let settled = false;
      // Single exit point: detach every listener, stop the timer, optionally hand
      // bytes back to the stream, then resolve. Later calls are no-ops.
      const finish = (
        peers: { pid: number | null; hostPid: number | null },
        putBack?: Buffer,
      ) => {
        if (settled) return;
        settled = true;
        socket.removeListener('data', onData);
        socket.removeListener('error', onEnd);
        socket.removeListener('close', onEnd);
        clearTimeout(timer);
        if (putBack && putBack.length > 0 && !socket.destroyed) {
          try { socket.unshift(putBack); } catch { /* stream already gone */ }
        }
        resolve(peers);
      };
      const onData = (chunk: Buffer | string) => {
        const buf = typeof chunk === 'string' ? Buffer.from(chunk, 'utf8') : chunk;
        chunks.push(buf);
        total += buf.length;
        const all = chunks.length === 1 ? buf : Buffer.concat(chunks, total);
        const nl = all.indexOf(0x0a); // '\n'
        if (nl === -1) {
          // No newline yet. If it's already too long to be a hello, it isn't one —
          // hand the bytes back as data; otherwise keep accumulating.
          if (total > MAX_HELLO_LINE_BYTES) finish({ pid: null, hostPid: null }, all);
          else chunks = [all];
          return;
        }
        const peers = parseClientHelloLine(all.subarray(0, nl).toString('utf8'));
        if (peers) {
          // Consume the hello line itself; only the bytes after it go back.
          const tail = all.subarray(nl + 1);
          finish(peers, tail.length > 0 ? tail : undefined);
        } else {
          // First line is not a client-hello (legacy/direct client) — hand the
          // whole buffer back so the transport sees the message verbatim.
          finish({ pid: null, hostPid: null }, all);
        }
      };
      const onEnd = () => finish({ pid: null, hostPid: null });
      // On timeout the partial line read so far belongs to the client's first
      // message; put it back instead of silently discarding it.
      const onTimeout = () =>
        finish(
          { pid: null, hostPid: null },
          total > 0 ? Buffer.concat(chunks, total) : undefined,
        );
      const timer = setTimeout(onTimeout, CLIENT_HELLO_TIMEOUT_MS);
      timer.unref?.(); // never keep the process alive just for this wait
      socket.on('data', onData);
      socket.on('error', onEnd);
      socket.on('close', onEnd);
    });
  }
  764. /** Exported for test stubs that need to bound the hello-line read. */
  765. export { MAX_HELLO_LINE_BYTES };