1
0

mcp-ppid-watchdog.test.ts 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173
  1. /**
  2. * PPID watchdog regression test (#277).
  3. *
  4. * On Linux, when an MCP host (Claude Code, opencode, …) is SIGKILL'd by the
  5. * OOM killer / a force-quit / a container teardown, the kernel does NOT
  6. * propagate the death to its `codegraph serve --mcp` child. The child gets
  7. * reparented to init/systemd, its stdin stays half-open in some
  8. * configurations, and the existing `stdin.on('end' | 'close')` handlers
  9. * never fire — the server lingers indefinitely, holding inotify watches,
  10. * file descriptors, and the SQLite WAL.
  11. *
  12. * `src/mcp/index.ts` polls `process.ppid` and shuts down the moment it
  13. * diverges from the value observed at startup. This test stands up a
 * four-process tree (vitest → wrapper → {stdin-holder, codegraph}) and
  15. * SIGKILL's the wrapper. The stdin-holder is a long-lived sibling whose
  16. * `stdout` pipe is dup'd into codegraph's `stdin`. After the wrapper dies
  17. * the pipe stays open (stdin-holder still owns the write-end), so the
  18. * existing stdin close handlers do **not** fire — the only thing that can
  19. * terminate codegraph then is the PPID watchdog.
  20. *
  21. * Windows is excluded — `process.kill(pid, 'SIGKILL')` does not actually
  22. * deliver SIGKILL there, and the per-OS reparenting semantics the watchdog
  23. * relies on are POSIX-specific.
  24. */
  25. import { describe, it, expect, afterEach } from 'vitest';
  26. import { spawn, ChildProcessWithoutNullStreams } from 'child_process';
  27. import * as fs from 'fs';
  28. import * as os from 'os';
  29. import * as path from 'path';
// Absolute path to the built codegraph CLI entry point, resolved relative to
// this test file's directory. The wrapper script below spawns it with
// `serve --mcp`.
const BIN = path.resolve(__dirname, '../dist/bin/codegraph.js');
  31. function isAlive(pid: number): boolean {
  32. try {
  33. process.kill(pid, 0);
  34. return true;
  35. } catch {
  36. return false;
  37. }
  38. }
  39. function waitForExit(pid: number, timeoutMs: number): Promise<boolean> {
  40. return new Promise((resolve) => {
  41. const start = Date.now();
  42. const tick = () => {
  43. if (!isAlive(pid)) return resolve(true);
  44. if (Date.now() - start > timeoutMs) return resolve(false);
  45. setTimeout(tick, 100);
  46. };
  47. tick();
  48. });
  49. }
  50. describe.skipIf(process.platform === 'win32')('MCP PPID watchdog (#277)', () => {
  51. let wrapper: ChildProcessWithoutNullStreams | null = null;
  52. let childPid: number | null = null;
  53. let stdinHolderPid: number | null = null;
  54. afterEach(() => {
  55. if (wrapper && !wrapper.killed) {
  56. try { wrapper.kill('SIGKILL'); } catch { /* already gone */ }
  57. }
  58. // Belt and suspenders — don't leak processes if an assertion failed.
  59. for (const pid of [childPid, stdinHolderPid]) {
  60. if (pid !== null && isAlive(pid)) {
  61. try { process.kill(pid, 'SIGKILL'); } catch { /* already gone */ }
  62. }
  63. }
  64. wrapper = null;
  65. childPid = null;
  66. stdinHolderPid = null;
  67. });
  68. it("shuts down when its parent is SIGKILL'd and stdin stays open", async () => {
  69. // The wrapper:
  70. // 1. Spawns a "stdin-holder" — a tiny long-lived node process whose
  71. // `stdout` pipe is dup'd into codegraph's `stdin`. As long as the
  72. // stdin-holder is alive (it is — it's an orphan after the wrapper
  73. // dies), codegraph's stdin never sees EOF.
  74. // 2. Spawns codegraph with that pipe as fd 0 and its stderr redirected
  75. // to a tmp file that survives the wrapper, then reports both PIDs.
  76. // 3. Idles until SIGKILL'd from the test.
  77. //
  78. // CODEGRAPH_PPID_POLL_MS=200 keeps the watchdog responsive in test; the
  79. // production default is 5000ms.
  80. const stderrLog = path.join(
  81. fs.mkdtempSync(path.join(os.tmpdir(), 'cg-ppid-watchdog-')),
  82. 'codegraph.stderr.log',
  83. );
  84. // The wrapper waits 800ms before reporting the PIDs so the codegraph
  85. // child has time to finish its async start() (dynamic import + transport
  86. // setup + watchdog registration). Otherwise the test races: it
  87. // SIGKILL's the wrapper before the watchdog interval is installed, and
  88. // nothing terminates codegraph.
  89. const wrapperSrc = `
  90. const { spawn } = require('child_process');
  91. const fs = require('fs');
  92. const stderrFd = fs.openSync(${JSON.stringify(stderrLog)}, 'a');
  93. const stdinHolder = spawn(process.execPath, ['-e', 'setInterval(() => {}, 60000)'], {
  94. stdio: ['ignore', 'pipe', 'ignore'],
  95. detached: true,
  96. });
  97. stdinHolder.unref();
  98. const child = spawn(process.execPath, [${JSON.stringify(BIN)}, 'serve', '--mcp'], {
  99. stdio: [stdinHolder.stdout, 'ignore', stderrFd],
  100. // Pin to direct (in-process) mode: this test targets the in-process
  101. // server's PPID watchdog (#277). The detached-daemon/proxy watchdog is
  102. // covered separately in mcp-daemon.test.ts ("daemon survives the first
  103. // client dying"). Without this the spawned process becomes a proxy and
  104. // also spawns a detached daemon that would outlive the test.
  105. env: { ...process.env, CODEGRAPH_PPID_POLL_MS: '200', CODEGRAPH_NO_DAEMON: '1' },
  106. detached: true,
  107. });
  108. child.unref();
  109. setTimeout(() => {
  110. process.stdout.write(JSON.stringify({ pid: child.pid, stdinHolderPid: stdinHolder.pid }) + '\\n');
  111. }, 800);
  112. setInterval(() => {}, 60000);
  113. `;
  114. wrapper = spawn(process.execPath, ['-e', wrapperSrc], {
  115. stdio: ['pipe', 'pipe', 'pipe'],
  116. }) as ChildProcessWithoutNullStreams;
  117. const pids = await new Promise<{ pid: number; stdinHolderPid: number }>((resolve, reject) => {
  118. let buf = '';
  119. const timer = setTimeout(
  120. () => reject(new Error('wrapper did not report PIDs in time')),
  121. 10000,
  122. );
  123. wrapper!.stdout.on('data', (chunk: Buffer) => {
  124. buf += chunk.toString('utf8');
  125. const m = buf.match(/\{"pid":(\d+),"stdinHolderPid":(\d+)\}/);
  126. if (m) {
  127. clearTimeout(timer);
  128. resolve({ pid: parseInt(m[1], 10), stdinHolderPid: parseInt(m[2], 10) });
  129. }
  130. });
  131. wrapper!.on('exit', () => {
  132. clearTimeout(timer);
  133. reject(new Error('wrapper exited before reporting PIDs'));
  134. });
  135. });
  136. childPid = pids.pid;
  137. stdinHolderPid = pids.stdinHolderPid;
  138. expect(isAlive(childPid)).toBe(true);
  139. expect(isAlive(stdinHolderPid)).toBe(true);
  140. // SIGKILL the wrapper — no cleanup runs, just like a real OOM kill.
  141. // codegraph and the stdin-holder both get reparented to init/systemd.
  142. // Crucially, the pipe between them stays open, so codegraph's stdin
  143. // doesn't close: only the watchdog can take it down.
  144. wrapper.kill('SIGKILL');
  145. // Watchdog runs every 200ms in this test → 5s gives ~25 polls of headroom.
  146. const exited = await waitForExit(childPid, 5000);
  147. const stderrContent = fs.existsSync(stderrLog) ? fs.readFileSync(stderrLog, 'utf-8') : '<no stderr captured>';
  148. expect(
  149. exited,
  150. `codegraph child (pid=${childPid}) did not exit within 5s after wrapper was SIGKILL'd.\nstderr:\n${stderrContent}`,
  151. ).toBe(true);
  152. // The watchdog announces itself before tearing down — assert that the
  153. // shutdown came from the parent-death path, not from any other signal.
  154. expect(stderrContent).toMatch(/Parent process exited.*shutting down/);
  155. // The stdin-holder is now an orphan — kill it explicitly so it doesn't
  156. // outlive the test. It's still tracked in `stdinHolderPid` for the
  157. // afterEach safety net, but we tidy up proactively here too.
  158. if (isAlive(stdinHolderPid)) {
  159. try { process.kill(stdinHolderPid, 'SIGKILL'); } catch { /* race */ }
  160. }
  161. }, 20000);
  162. });