Просмотр исходного кода

fix(mcp): reap serve --mcp child when parent is SIGKILL'd (#286)

Add a PPID watchdog to the MCP server so a `codegraph serve --mcp` child terminates when its host (Claude Code, opencode, …) is force-killed — OOM killer, `kill -9`, container teardown — and the stdin close handlers don't fire. The child would otherwise linger indefinitely, holding inotify watches, file descriptors, and the SQLite WAL.

Also propagates the host PID across the `--liftoff-only` re-exec (CODEGRAPH_HOST_PPID) so the watchdog reaps the orphan on the from-source path too, not just the bundled launcher. Poll interval is CODEGRAPH_PPID_POLL_MS (default 5000ms, 0 disables).

Resolves #277.

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Infinity_Block 1 месяц назад
Родитель
Сommit
fb45959af7
4 измененных файлов с 291 добавлено и 1 удалено
  1. 12 0
      CHANGELOG.md
  2. 168 0
      __tests__/mcp-ppid-watchdog.test.ts
  3. 14 1
      src/extraction/wasm-runtime-flags.ts
  4. 97 0
      src/mcp/index.ts

+ 12 - 0
CHANGELOG.md

@@ -9,6 +9,18 @@ and adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ## [0.9.4] - 2026-05-22
 
+### Fixed
+- **Orphaned `codegraph serve --mcp` processes after a parent SIGKILL.** When
+  the MCP host (Claude Code, opencode, …) was force-killed — OOM killer, a
+  `kill -9`, a container teardown — the child kept running indefinitely on
+  Linux, holding inotify watches, file descriptors, and the SQLite WAL. The
+  kernel doesn't propagate parent death to children, and the stdin
+  `end`/`close` handlers we relied on don't always fire. The MCP server now
+  polls `process.ppid` and shuts down the moment it changes from the value
+  observed at startup; the poll interval is `CODEGRAPH_PPID_POLL_MS` (default
+  `5000`, `0` disables). Resolves
+  [#277](https://github.com/colbymchenry/codegraph/issues/277).
+
 ### Added
 - **Release archives now ship with a `SHA256SUMS` file**, and the npm launcher
   verifies the bundle it downloads against it — a mismatch aborts before

+ 168 - 0
__tests__/mcp-ppid-watchdog.test.ts

@@ -0,0 +1,168 @@
+/**
+ * PPID watchdog regression test (#277).
+ *
+ * On Linux, when an MCP host (Claude Code, opencode, …) is SIGKILL'd by the
+ * OOM killer / a force-quit / a container teardown, the kernel does NOT
+ * propagate the death to its `codegraph serve --mcp` child. The child gets
+ * reparented to init/systemd, its stdin stays half-open in some
+ * configurations, and the existing `stdin.on('end' | 'close')` handlers
+ * never fire — the server lingers indefinitely, holding inotify watches,
+ * file descriptors, and the SQLite WAL.
+ *
+ * `src/mcp/index.ts` polls `process.ppid` and shuts down the moment it
+ * diverges from the value observed at startup. This test stands up a
+ * four-tier process tree (vitest → wrapper → {stdin-holder, codegraph}) and
+ * SIGKILL's the wrapper. The stdin-holder is a long-lived sibling whose
+ * `stdout` pipe is dup'd into codegraph's `stdin`. After the wrapper dies
+ * the pipe stays open (stdin-holder still owns the write-end), so the
+ * existing stdin close handlers do **not** fire — the only thing that can
+ * terminate codegraph then is the PPID watchdog.
+ *
+ * Windows is excluded — `process.kill(pid, 'SIGKILL')` does not actually
+ * deliver SIGKILL there, and the per-OS reparenting semantics the watchdog
+ * relies on are POSIX-specific.
+ */
+import { describe, it, expect, afterEach } from 'vitest';
+import { spawn, ChildProcessWithoutNullStreams } from 'child_process';
+import * as fs from 'fs';
+import * as os from 'os';
+import * as path from 'path';
+
+const BIN = path.resolve(__dirname, '../dist/bin/codegraph.js');
+
+function isAlive(pid: number): boolean {
+  try {
+    process.kill(pid, 0);
+    return true;
+  } catch {
+    return false;
+  }
+}
+
+function waitForExit(pid: number, timeoutMs: number): Promise<boolean> {
+  return new Promise((resolve) => {
+    const start = Date.now();
+    const tick = () => {
+      if (!isAlive(pid)) return resolve(true);
+      if (Date.now() - start > timeoutMs) return resolve(false);
+      setTimeout(tick, 100);
+    };
+    tick();
+  });
+}
+
+describe.skipIf(process.platform === 'win32')('MCP PPID watchdog (#277)', () => {
+  let wrapper: ChildProcessWithoutNullStreams | null = null;
+  let childPid: number | null = null;
+  let stdinHolderPid: number | null = null;
+
+  afterEach(() => {
+    if (wrapper && !wrapper.killed) {
+      try { wrapper.kill('SIGKILL'); } catch { /* already gone */ }
+    }
+    // Belt and suspenders — don't leak processes if an assertion failed.
+    for (const pid of [childPid, stdinHolderPid]) {
+      if (pid !== null && isAlive(pid)) {
+        try { process.kill(pid, 'SIGKILL'); } catch { /* already gone */ }
+      }
+    }
+    wrapper = null;
+    childPid = null;
+    stdinHolderPid = null;
+  });
+
+  it("shuts down when its parent is SIGKILL'd and stdin stays open", async () => {
+    // The wrapper:
+    //   1. Spawns a "stdin-holder" — a tiny long-lived node process whose
+    //      `stdout` pipe is dup'd into codegraph's `stdin`. As long as the
+    //      stdin-holder is alive (it is — it's an orphan after the wrapper
+    //      dies), codegraph's stdin never sees EOF.
+    //   2. Spawns codegraph with that pipe as fd 0 and its stderr redirected
+    //      to a tmp file that survives the wrapper, then reports both PIDs.
+    //   3. Idles until SIGKILL'd from the test.
+    //
+    // CODEGRAPH_PPID_POLL_MS=200 keeps the watchdog responsive in test; the
+    // production default is 5000ms.
+    const stderrLog = path.join(
+      fs.mkdtempSync(path.join(os.tmpdir(), 'cg-ppid-watchdog-')),
+      'codegraph.stderr.log',
+    );
+    // The wrapper waits 800ms before reporting the PIDs so the codegraph
+    // child has time to finish its async start() (dynamic import + transport
+    // setup + watchdog registration). Otherwise the test races: it
+    // SIGKILL's the wrapper before the watchdog interval is installed, and
+    // nothing terminates codegraph.
+    const wrapperSrc = `
+      const { spawn } = require('child_process');
+      const fs = require('fs');
+      const stderrFd = fs.openSync(${JSON.stringify(stderrLog)}, 'a');
+      const stdinHolder = spawn(process.execPath, ['-e', 'setInterval(() => {}, 60000)'], {
+        stdio: ['ignore', 'pipe', 'ignore'],
+        detached: true,
+      });
+      stdinHolder.unref();
+      const child = spawn(process.execPath, [${JSON.stringify(BIN)}, 'serve', '--mcp'], {
+        stdio: [stdinHolder.stdout, 'ignore', stderrFd],
+        env: { ...process.env, CODEGRAPH_PPID_POLL_MS: '200' },
+        detached: true,
+      });
+      child.unref();
+      setTimeout(() => {
+        process.stdout.write(JSON.stringify({ pid: child.pid, stdinHolderPid: stdinHolder.pid }) + '\\n');
+      }, 800);
+      setInterval(() => {}, 60000);
+    `;
+    wrapper = spawn(process.execPath, ['-e', wrapperSrc], {
+      stdio: ['pipe', 'pipe', 'pipe'],
+    }) as ChildProcessWithoutNullStreams;
+
+    const pids = await new Promise<{ pid: number; stdinHolderPid: number }>((resolve, reject) => {
+      let buf = '';
+      const timer = setTimeout(
+        () => reject(new Error('wrapper did not report PIDs in time')),
+        10000,
+      );
+      wrapper!.stdout.on('data', (chunk: Buffer) => {
+        buf += chunk.toString('utf8');
+        const m = buf.match(/\{"pid":(\d+),"stdinHolderPid":(\d+)\}/);
+        if (m) {
+          clearTimeout(timer);
+          resolve({ pid: parseInt(m[1], 10), stdinHolderPid: parseInt(m[2], 10) });
+        }
+      });
+      wrapper!.on('exit', () => {
+        clearTimeout(timer);
+        reject(new Error('wrapper exited before reporting PIDs'));
+      });
+    });
+    childPid = pids.pid;
+    stdinHolderPid = pids.stdinHolderPid;
+
+    expect(isAlive(childPid)).toBe(true);
+    expect(isAlive(stdinHolderPid)).toBe(true);
+
+    // SIGKILL the wrapper — no cleanup runs, just like a real OOM kill.
+    // codegraph and the stdin-holder both get reparented to init/systemd.
+    // Crucially, the pipe between them stays open, so codegraph's stdin
+    // doesn't close: only the watchdog can take it down.
+    wrapper.kill('SIGKILL');
+
+    // Watchdog runs every 200ms in this test → 5s gives ~25 polls of headroom.
+    const exited = await waitForExit(childPid, 5000);
+    const stderrContent = fs.existsSync(stderrLog) ? fs.readFileSync(stderrLog, 'utf-8') : '<no stderr captured>';
+    expect(
+      exited,
+      `codegraph child (pid=${childPid}) did not exit within 5s after wrapper was SIGKILL'd.\nstderr:\n${stderrContent}`,
+    ).toBe(true);
+    // The watchdog announces itself before tearing down — assert that the
+    // shutdown came from the parent-death path, not from any other signal.
+    expect(stderrContent).toMatch(/Parent process exited.*shutting down/);
+
+    // The stdin-holder is now an orphan — kill it explicitly so it doesn't
+    // outlive the test. It's still tracked in `stdinHolderPid` for the
+    // afterEach safety net, but we tidy up proactively here too.
+    if (isAlive(stdinHolderPid)) {
+      try { process.kill(stdinHolderPid, 'SIGKILL'); } catch { /* race */ }
+    }
+  }, 20000);
+});

+ 14 - 1
src/extraction/wasm-runtime-flags.ts

@@ -46,6 +46,19 @@ export const WASM_RUNTIME_FLAGS: readonly string[] = ['--liftoff-only'];
  */
 const RELAUNCH_GUARD_ENV = 'CODEGRAPH_WASM_RELAUNCHED';
 
+/**
+ * Env var carrying the *host* PID (the relauncher's own parent) across the
+ * re-exec. Without `--liftoff-only` the CLI re-execs itself once, inserting an
+ * intermediate process between the MCP host and the server. That intermediate
+ * stays alive (blocked in spawnSync) even after the host is killed, so the
+ * server's PPID watchdog can't detect the host's death by watching its own
+ * `process.ppid`. Passing the host PID through lets the watchdog poll it
+ * directly. Unset on the no-re-exec path (bundled launcher / flag already
+ * present), where the server is already a direct child of the host. See
+ * src/mcp/index.ts (#277).
+ */
+export const HOST_PPID_ENV = 'CODEGRAPH_HOST_PPID';
+
 /** True when every required WASM runtime flag is already present in `execArgv`. */
 export function processHasWasmRuntimeFlags(
   execArgv: readonly string[] = process.execArgv
@@ -84,7 +97,7 @@ export function relaunchWithWasmRuntimeFlagsIfNeeded(scriptPath: string): void {
   const argv = buildRelaunchArgv(scriptPath, process.argv.slice(2));
   const result = spawnSync(process.execPath, argv, {
     stdio: 'inherit',
-    env: { ...process.env, [RELAUNCH_GUARD_ENV]: '1' },
+    env: { ...process.env, [RELAUNCH_GUARD_ENV]: '1', [HOST_PPID_ENV]: String(process.ppid) },
   });
 
   if (result.error) {

+ 97 - 0
src/mcp/index.ts

@@ -21,6 +21,7 @@ import { watchDisabledReason } from '../sync';
 import { StdioTransport, JsonRpcRequest, JsonRpcNotification, ErrorCodes } from './transport';
 import { tools, ToolHandler } from './tools';
 import { SERVER_INSTRUCTIONS } from './server-instructions';
+import { HOST_PPID_ENV } from '../extraction/wasm-runtime-flags';
 
 /**
  * Convert a file:// URI to a filesystem path.
@@ -60,6 +61,51 @@ const PROTOCOL_VERSION = '2024-11-05';
  */
 const ROOTS_LIST_TIMEOUT_MS = 5000;
 
+/**
+ * How often to poll `process.ppid` to detect parent process death (see #277).
+ * 5s is a deliberate trade-off: the failure mode being guarded against is rare
+ * (parent SIGKILL'd), and longer poll = less wakeup overhead while idle.
+ */
+const DEFAULT_PPID_POLL_MS = 5000;
+
+/**
+ * Resolve the PPID watchdog poll interval from an env override. A value of
+ * `0` disables the watchdog entirely (escape hatch for embedded scenarios
+ * where the parent legitimately re-parents the server on purpose). Anything
+ * non-numeric or negative falls back to the default.
+ */
+function parsePpidPollMs(raw: string | undefined): number {
+  if (raw === undefined || raw === '') return DEFAULT_PPID_POLL_MS;
+  const parsed = Number(raw);
+  if (!Number.isFinite(parsed)) return DEFAULT_PPID_POLL_MS;
+  if (parsed < 0) return DEFAULT_PPID_POLL_MS;
+  return Math.floor(parsed);
+}
+
+/**
+ * Parse the host PID propagated across the `--liftoff-only` re-exec
+ * ({@link HOST_PPID_ENV}). Returns a positive integer PID, or null when
+ * unset/invalid — the direct-launch path, where the watchdog falls back to
+ * `process.ppid` divergence. PIDs of 0/1 are rejected (0 = unknown, 1 = init,
+ * i.e. already orphaned), so the watchdog doesn't latch onto init.
+ */
+function parseHostPpid(raw: string | undefined): number | null {
+  if (raw === undefined || raw === '') return null;
+  const parsed = Number(raw);
+  if (!Number.isInteger(parsed) || parsed <= 1) return null;
+  return parsed;
+}
+
+/** True if a process with `pid` currently exists (signal-0 probe). */
+function isProcessAlive(pid: number): boolean {
+  try {
+    process.kill(pid, 0);
+    return true;
+  } catch {
+    return false;
+  }
+}
+
 /**
  * Extract the first usable filesystem path from a `roots/list` result.
  * Shape per MCP spec: `{ roots: [{ uri: "file:///path", name?: string }] }`.
@@ -95,6 +141,19 @@ export class MCPServer {
   // Guards the one-shot deferred resolution (roots/list or cwd) so we don't
   // re-issue roots/list on every tool call.
   private rootsAttempted = false;
+  // PPID watchdog — see start(). Captured at construction so we always have a
+  // baseline, even if start() runs after a fork-style reparent.
+  private originalPpid: number = process.ppid;
+  // The MCP host's PID, propagated across the `--liftoff-only` re-exec (see
+  // HOST_PPID_ENV). When set, the watchdog polls it directly: the re-exec
+  // inserts an intermediate process whose *death* — not just our reparenting —
+  // is what we'd otherwise miss. null on the direct (bundled) launch path.
+  private hostPpid: number | null = parseHostPpid(process.env[HOST_PPID_ENV]);
+  private ppidWatchdog: ReturnType<typeof setInterval> | null = null;
+  // Idempotency guard for stop(). Without it, the watchdog can race with the
+  // stdin `end`/`close` handlers (or SIGTERM/SIGINT) and double-close cg and
+  // the transport before process.exit() lands.
+  private stopped = false;
 
   constructor(projectPath?: string) {
     this.projectPath = projectPath || null;
@@ -122,6 +181,38 @@ export class MCPServer {
     // Detect this and shut down gracefully to prevent orphaned processes.
     process.stdin.on('end', () => this.stop());
     process.stdin.on('close', () => this.stop());
+
+    // PPID watchdog (#277). Linux doesn't propagate parent death to children,
+    // so when the MCP host (Claude Code, opencode, …) is SIGKILL'd by the OOM
+    // killer / a force-quit / a container teardown, the child is reparented to
+    // init/systemd and the stdin `end`/`close` events don't always fire. The
+    // server would then linger indefinitely, holding inotify watches, file
+    // descriptors, and the SQLite WAL. Poll `process.ppid` and shut down the
+    // moment it changes from what we observed at startup. Cross-platform:
+    // reparenting changes ppid on Linux *and* macOS; on Windows the value can
+    // also drop to 0 once the parent is gone. When the CLI re-execs itself for
+    // `--liftoff-only`, an intermediate process sits between us and the host and
+    // outlives it, so our own ppid wouldn't change — in that case we poll the
+    // host PID (propagated via HOST_PPID_ENV) for liveness instead. The watchdog
+    // is `.unref()`'d so it never holds the event loop open on its own.
+    const pollMs = parsePpidPollMs(process.env.CODEGRAPH_PPID_POLL_MS);
+    if (pollMs > 0) {
+      this.ppidWatchdog = setInterval(() => {
+        const current = process.ppid;
+        const ppidChanged = current !== this.originalPpid;
+        const hostGone = this.hostPpid !== null && !isProcessAlive(this.hostPpid);
+        if (ppidChanged || hostGone) {
+          const reason = ppidChanged
+            ? `ppid ${this.originalPpid} -> ${current}`
+            : `host pid ${this.hostPpid} exited`;
+          process.stderr.write(
+            `[CodeGraph MCP] Parent process exited (${reason}); shutting down.\n`
+          );
+          this.stop();
+        }
+      }, pollMs);
+      this.ppidWatchdog.unref();
+    }
   }
 
   /**
@@ -283,6 +374,12 @@ export class MCPServer {
    * Stop the server
    */
   stop(): void {
+    if (this.stopped) return;
+    this.stopped = true;
+    if (this.ppidWatchdog) {
+      clearInterval(this.ppidWatchdog);
+      this.ppidWatchdog = null;
+    }
     // Close all cached cross-project connections first
     this.toolHandler.closeAll();
     // Close the main CodeGraph instance