Ver Fonte

fix(mcp): make the liveness watchdog a separate process, not a worker thread (#858)

The worker-thread watchdog from #856 didn't work in the real daemon — caught by
live-testing against a real serve --mcp. V8 isolates coordinate on global
safepoints, so a main thread wedged in a tight non-allocating loop (#850's
SourcePositionTableIterator::Advance) strands the watchdog worker before it can
SIGKILL.

A separate child process shares no isolate/heap with the parent, so the wedge
can't touch it; it kills via the kernel. Parent heartbeats to the child's stdin;
silence past the timeout -> SIGKILL; parent exit closes the pipe -> child exits.
Validated live (real daemon SIGKILLed in ~timeout); regression test covers the
non-allocating-wedge-under-heap-pressure case. API/install points/CHANGELOG
unchanged; the broken worker version was never released.
Colby Mchenry há 1 semana atrás
pai
commit
1702dfc544
2 ficheiros alterados com 119 adições e 151 exclusões
  1. 20 47
      __tests__/liveness-watchdog.test.ts
  2. 99 104
      src/mcp/liveness-watchdog.ts

+ 20 - 47
__tests__/liveness-watchdog.test.ts

@@ -3,48 +3,12 @@ import { spawn } from 'child_process';
 import * as fs from 'fs';
 import * as path from 'path';
 import {
-  stepHeartbeat,
   parseWatchdogTimeoutMs,
   deriveCheckIntervalMs,
   installMainThreadWatchdog,
   DEFAULT_WATCHDOG_TIMEOUT_MS,
 } from '../src/mcp/liveness-watchdog';
 
-describe('stepHeartbeat (wedge-detection reducer)', () => {
-  it('resets the stale count when the counter advances', () => {
-    const r = stepHeartbeat({ lastCounter: 5, staleChecks: 3 }, 6, 4);
-    expect(r.wedged).toBe(false);
-    expect(r.next).toEqual({ lastCounter: 6, staleChecks: 0 });
-  });
-
-  it('accumulates stale checks while the counter is frozen', () => {
-    let s = { lastCounter: 9, staleChecks: 0 };
-    for (let i = 1; i < 4; i++) {
-      const r = stepHeartbeat(s, 9, 4);
-      expect(r.wedged).toBe(false);
-      expect(r.next.staleChecks).toBe(i);
-      s = r.next;
-    }
-  });
-
-  it('reports wedged once the stale count reaches the threshold', () => {
-    const r = stepHeartbeat({ lastCounter: 9, staleChecks: 3 }, 9, 4);
-    expect(r.wedged).toBe(true);
-  });
-
-  it('a single late heartbeat rescues the process (sleep/clock-jump safety)', () => {
-    // 3 stale checks, then progress (as if the main thread resumed after a
-    // system sleep) — must NOT be considered wedged.
-    let s = { lastCounter: 1, staleChecks: 0 };
-    s = stepHeartbeat(s, 1, 4).next; // stale 1
-    s = stepHeartbeat(s, 1, 4).next; // stale 2
-    s = stepHeartbeat(s, 1, 4).next; // stale 3
-    const resumed = stepHeartbeat(s, 2, 4); // counter advanced
-    expect(resumed.wedged).toBe(false);
-    expect(resumed.next.staleChecks).toBe(0);
-  });
-});
-
 describe('config parsing', () => {
   it('parseWatchdogTimeoutMs falls back for missing/invalid input', () => {
     expect(parseWatchdogTimeoutMs(undefined)).toBe(DEFAULT_WATCHDOG_TIMEOUT_MS);
@@ -62,7 +26,7 @@ describe('config parsing', () => {
 });
 
 describe('installMainThreadWatchdog opt-out', () => {
-  it('returns null (no worker) when CODEGRAPH_NO_WATCHDOG is set', () => {
+  it('returns null (spawns nothing) when CODEGRAPH_NO_WATCHDOG is set', () => {
     const prev = process.env.CODEGRAPH_NO_WATCHDOG;
     process.env.CODEGRAPH_NO_WATCHDOG = '1';
     try {
@@ -75,11 +39,13 @@ describe('installMainThreadWatchdog opt-out', () => {
 });
 
 /**
- * End-to-end: spawn a real process, install the real worker, and prove it kills
- * a wedged main thread (and ONLY a wedged one). Drives the built module the same
- * way mcp-ppid-watchdog.test.ts drives the built CLI.
+ * End-to-end: spawn a real process, install the real watchdog (which spawns a
+ * separate watchdog child), and prove it kills a wedged main thread — including
+ * the case a worker thread could NOT (a non-allocating loop under heap pressure,
+ * which strands a same-process worker on V8's global safepoint, #850). Drives
+ * the built module the way mcp-ppid-watchdog.test.ts drives the built CLI.
  */
-describe('liveness watchdog (spawned, real worker)', () => {
+describe('liveness watchdog (spawned, real watchdog process)', () => {
   const MODULE = path.resolve(__dirname, '../dist/mcp/liveness-watchdog.js');
 
   beforeAll(() => {
@@ -117,7 +83,18 @@ describe('liveness watchdog (spawned, real worker)', () => {
   it('SIGKILLs a process whose main thread wedges in a sync loop', async () => {
     const { signal } = await runChild(
       { CODEGRAPH_WATCHDOG_TIMEOUT_MS: '500' },
-      'setTimeout(() => { while (true) {} }, 150);', // wedge the event loop forever
+      'setTimeout(() => { while (true) {} }, 150);',
+      8000
+    );
+    expect(signal).toBe('SIGKILL');
+  }, 12000);
+
+  it('SIGKILLs a non-allocating wedge under heap pressure (the case worker threads stalled on)', async () => {
+    const { signal } = await runChild(
+      { CODEGRAPH_WATCHDOG_TIMEOUT_MS: '500' },
+      // ~40MB retained so a GC is likely, then a tight NON-allocating loop — the
+      // exact shape that deadlocks a same-process worker on the global safepoint.
+      'const k=[]; for (let i=0;i<40;i++) k.push(Buffer.alloc(1024*1024,i)); global.__k=k; setTimeout(() => { while (true) {} }, 150);',
       8000
     );
     expect(signal).toBe('SIGKILL');
@@ -126,7 +103,6 @@ describe('liveness watchdog (spawned, real worker)', () => {
   it('does NOT kill a healthy process that keeps its event loop turning', async () => {
     const { code, signal } = await runChild(
       { CODEGRAPH_WATCHDOG_TIMEOUT_MS: '500' },
-      // Stay responsive for 1.5s (3× the timeout), then exit cleanly with 7.
       'const iv = setInterval(() => {}, 50); setTimeout(() => { clearInterval(iv); process.exit(7); }, 1500);',
       8000
     );
@@ -137,12 +113,9 @@ describe('liveness watchdog (spawned, real worker)', () => {
   it('does NOT kill a wedged process when CODEGRAPH_NO_WATCHDOG=1', async () => {
     const { signal } = await runChild(
       { CODEGRAPH_WATCHDOG_TIMEOUT_MS: '500', CODEGRAPH_NO_WATCHDOG: '1' },
-      // Wedge briefly, but the test's hard timeout reaps it (the watchdog must not).
       'setTimeout(() => { const end = Date.now() + 1500; while (Date.now() < end) {} process.exit(3); }, 150);',
       8000
     );
-    // Killed by neither the watchdog (disabled) nor the hard timeout — it ran
-    // its bounded busy-loop and exited 3 on its own.
-    expect(signal).toBeNull();
+    expect(signal).toBeNull(); // the watchdog is off, so nothing kills it
   }, 12000);
 });

+ 99 - 104
src/mcp/liveness-watchdog.ts

@@ -2,70 +2,45 @@
  * Main-thread liveness watchdog — belt-and-suspenders for #850.
  *
  * The #850 fix removes the one *known* trigger (the uncaught-exception handler
- * no longer formats a raw Error's `.stack` — the step that could enter a
- * non-terminating V8 source-position loop). But ANY synchronous, non-yielding
+ * no longer formats a raw Error's `.stack`). But ANY synchronous, non-yielding
  * loop on the main thread — a future V8 stack-format pathology, a runaway
  * regex, an accidental `while (true)` — wedges the event loop, and from JS you
  * cannot interrupt it: timers, signal handlers, and the PPID watchdog all run
  * *on* that blocked loop, so the process pins a core forever with no
  * self-recovery (the exact unrecoverable state #850 reported).
  *
- * The only observer still running when the main thread is wedged is another
- * THREAD. This installs a tiny worker thread that watches a heartbeat the main
- * thread bumps through shared memory. If the heartbeat stops advancing across
- * enough consecutive checks (~`timeoutMs` of real time), the worker concludes
- * the main thread is wedged and SIGKILLs the process — the one signal a wedged
- * event loop can't swallow — so a fresh daemon starts on the next connection
- * instead of a zombie pinning a core.
+ * **Why a separate PROCESS, not a worker thread.** A worker thread was the
+ * obvious first choice and it works in a toy process — but it was validated to
+ * FAIL in the real daemon (#850 live test). V8 isolates in one process
+ * coordinate on global safepoints, so when one thread requests a GC every other
+ * thread must reach a safepoint before it can proceed. A main thread wedged in
+ * a tight, non-allocating loop never reaches one, which strands the watchdog
+ * worker on its very next allocation/safepoint check — and the #850 hot loop
+ * (`SourcePositionTableIterator::Advance`, a non-allocating C++ table walk) is
+ * exactly that shape. A child process shares no isolate and no heap with the
+ * parent, so the wedge cannot touch it; it kills via the kernel, which honours
+ * SIGKILL regardless of what the parent's threads are doing.
  *
- * **Why count checks, not elapsed wall-clock.** A laptop that sleeps freezes
- * both threads; on wake `Date.now()` has jumped hours but the heartbeat sat
- * still — a wall-clock delta would false-positive and kill a perfectly healthy
- * daemon. Counting *consecutive worker iterations* with no progress is immune:
- * a healthy main thread resumes and bumps the heartbeat within one interval of
- * waking, resetting the count; only a thread that never resumes keeps it
- * climbing. {@link stepHeartbeat} is the pure reducer behind both the worker
- * and the unit tests.
+ * **How.** The parent writes a heartbeat byte to the child's stdin every
+ * `checkMs` from a timer — firing at all means the event loop is turning. The
+ * child resets a kill-timer on each byte; if none arrives for `timeoutMs` it
+ * `SIGKILL`s the parent so a fresh daemon starts on the next connection. When
+ * the parent exits normally the pipe closes and the child exits too (no
+ * orphan).
  *
- * **Why it won't fire on real work.** Heavy parsing runs in the parse worker
- * (off this thread) and indexing shells out to a child process, so the daemon's
- * main thread only ever does fast, bounded work (socket handling + sub-second
- * SQLite reads). The default timeout is therefore vastly larger than any
- * legitimate main-thread block yet vastly smaller than "forever". Opt out with
- * `CODEGRAPH_NO_WATCHDOG=1`; tune with `CODEGRAPH_WATCHDOG_TIMEOUT_MS`.
+ * **Won't fire on real work.** Heavy parsing runs in the parse worker
+ * (off-thread) and indexing shells out to a child process, so the daemon's main
+ * thread only ever does fast, bounded work. The default timeout is ~300× the
+ * 5h #850 wedge shorter, yet far longer than any legitimate main-thread block.
+ * Opt out with `CODEGRAPH_NO_WATCHDOG=1`; tune with `CODEGRAPH_WATCHDOG_TIMEOUT_MS`.
  */
-import { Worker } from 'worker_threads';
+import * as fs from 'fs';
+import * as os from 'os';
+import { spawn, ChildProcess } from 'child_process';
 
 /** Default: 60s — ~300× shorter than the 5h #850 wedge, far longer than any real main-thread block. */
 export const DEFAULT_WATCHDOG_TIMEOUT_MS = 60_000;
 
-export interface HeartbeatState {
-  /** Last heartbeat counter the worker observed. */
-  lastCounter: number;
-  /** Consecutive checks the counter has NOT advanced. */
-  staleChecks: number;
-}
-
-/**
- * Pure reducer for one worker check. `maxStaleChecks` consecutive no-progress
- * checks → wedged. Counting iterations (not wall-clock) is what makes this
- * robust to clock jumps / system sleep.
- */
-export function stepHeartbeat(
-  state: HeartbeatState,
-  counter: number,
-  maxStaleChecks: number
-): { next: HeartbeatState; wedged: boolean } {
-  if (counter !== state.lastCounter) {
-    return { next: { lastCounter: counter, staleChecks: 0 }, wedged: false };
-  }
-  const staleChecks = state.staleChecks + 1;
-  return {
-    next: { lastCounter: counter, staleChecks },
-    wedged: staleChecks >= maxStaleChecks,
-  };
-}
-
 /** `true` for `1/true/yes/on` (case-insensitive); `false` otherwise. */
 function isEnvTruthy(raw: string | undefined): boolean {
   if (!raw) return false;
@@ -82,86 +57,105 @@ export function parseWatchdogTimeoutMs(
   return Number.isFinite(n) && n > 0 ? n : fallback;
 }
 
-/** Derive a heartbeat/check cadence that fires several times inside the timeout window. */
+/** Derive a heartbeat cadence that emits several beats inside the timeout window. */
 export function deriveCheckIntervalMs(timeoutMs: number): number {
   return Math.min(2000, Math.max(50, Math.round(timeoutMs / 5)));
 }
 
+/** Arming/teardown diagnostics, gated on the existing MCP debug switch. */
+function debug(msg: string): void {
+  if (process.env.CODEGRAPH_MCP_DEBUG) {
+    try { fs.writeSync(2, `[CodeGraph watchdog] ${msg}\n`); } catch { /* ignore */ }
+  }
+}
+
 export interface WatchdogHandle {
-  /** Stop heartbeating and terminate the worker. Idempotent. */
+  /** Stop heartbeating and shut the watchdog child down. Idempotent. */
   stop(): void;
 }
 
 /**
- * The worker body, run via `new Worker(src, { eval: true })`. Inlined as a
- * string (not a shipped `.js`) so there is no dist-vs-src path to resolve — it
- * runs identically under `tsx` in tests and under the bundle in production.
- * Mirrors {@link stepHeartbeat}; keep the two in sync (the unit test pins the
- * algorithm, the integration test pins this exact body end-to-end).
+ * The watchdog child body, run via `node -e`. Inlined as a string (not a
+ * shipped `.js`) so there is no dist-vs-src path to resolve — it runs
+ * identically under `tsx` in tests and under the bundle in production. Reads its
+ * target pid + timeout from argv; an MSG built once at startup (the child is
+ * never wedged, so allocation here is fine).
  */
-const WORKER_SOURCE = `
-const { workerData } = require('worker_threads');
+const CHILD_SOURCE = `
 const fs = require('fs');
-const beat = new Int32Array(workerData.sab);
-const { checkMs, maxStaleChecks } = workerData;
-let lastCounter = Atomics.load(beat, 0);
-let staleChecks = 0;
-const timer = setInterval(() => {
-  const counter = Atomics.load(beat, 0);
-  if (counter !== lastCounter) { lastCounter = counter; staleChecks = 0; return; }
-  if (++staleChecks < maxStaleChecks) return;
-  clearInterval(timer);
-  const secs = Math.round((staleChecks * checkMs) / 1000);
-  try {
-    fs.writeSync(2, '[CodeGraph] Main thread unresponsive for ~' + secs + 's — killing the wedged process so a fresh one can start (#850). Disable with CODEGRAPH_NO_WATCHDOG=1.\\n');
-  } catch (e) { /* stderr gone */ }
-  try { process.kill(process.pid, 'SIGKILL'); } catch (e) { /* nothing left to try */ }
-}, checkMs);
+const parentPid = Number(process.argv[1]);
+const timeoutMs = Number(process.argv[2]);
+const secs = Math.round(timeoutMs / 1000);
+const MSG = Buffer.from('[CodeGraph] Main thread unresponsive for ~' + secs + 's — killing the wedged process so a fresh one can start (#850). Disable with CODEGRAPH_NO_WATCHDOG=1.\\n');
+function kill() {
+  try { fs.writeSync(2, MSG); } catch (e) {}
+  try { process.kill(parentPid, 'SIGKILL'); } catch (e) {}
+  process.exit(0);
+}
+let timer = setTimeout(kill, timeoutMs);
+process.stdin.on('data', () => { clearTimeout(timer); timer = setTimeout(kill, timeoutMs); });
+process.stdin.on('end', () => process.exit(0));   // parent closed the pipe (exited) -> no orphan
+process.stdin.on('error', () => process.exit(0)); // pipe broke -> parent gone
+process.stdin.resume();
 `;
 
 /**
  * Install the main-thread liveness watchdog for a long-lived process. Returns a
- * handle to stop it, or `null` when disabled or when the worker can't be
- * spawned (degraded, never throws — a missing watchdog must never keep a
- * process from starting).
+ * handle to stop it, or `null` when disabled or when the child can't be spawned
+ * (degraded, never throws — a missing watchdog must never keep a process from
+ * starting).
  */
 export function installMainThreadWatchdog(): WatchdogHandle | null {
   if (isEnvTruthy(process.env.CODEGRAPH_NO_WATCHDOG)) return null;
 
   const timeoutMs = parseWatchdogTimeoutMs(process.env.CODEGRAPH_WATCHDOG_TIMEOUT_MS);
   const checkMs = deriveCheckIntervalMs(timeoutMs);
-  const maxStaleChecks = Math.max(1, Math.ceil(timeoutMs / checkMs));
 
-  // Single Int32 counter in shared memory. The main thread bumps it each tick;
-  // the worker reads it. Atomics make the write visible across threads.
-  const sab = new SharedArrayBuffer(Int32Array.BYTES_PER_ELEMENT);
-  const beat = new Int32Array(sab);
+  let child: ChildProcess;
+  try {
+    // No execArgv inheritance (unlike Worker), so the child carries none of our
+    // V8 flags — it runs no WASM and needs none. stderr inherits the parent's
+    // fd 2 so the kill notice lands wherever the parent logs (daemon.log).
+    child = spawn(
+      process.execPath,
+      ['-e', CHILD_SOURCE, String(process.pid), String(timeoutMs)],
+      {
+        stdio: ['pipe', 'ignore', 'inherit'],
+        windowsHide: true,
+        // The watchdog touches no files; keep its cwd off the project/temp dir
+        // so it can't hold one open (Windows EPERM-on-cleanup, mirrors the
+        // parse-worker quirk).
+        cwd: os.tmpdir(),
+      }
+    );
+  } catch (err) {
+    debug(`spawn failed: ${err instanceof Error ? err.message : String(err)}`);
+    return null;
+  }
 
-  // The heartbeat: firing at all means the event loop is turning. unref'd so it
-  // never keeps the process alive on its own (the server's socket does that).
+  const stdin = child.stdin;
+  if (!stdin) {
+    debug('child has no stdin pipe; not arming');
+    try { child.kill(); } catch { /* ignore */ }
+    return null;
+  }
+  // Writing after the child exits surfaces EPIPE on the stream — swallow it so
+  // it can't escalate to the global handler (which now exits, #850).
+  stdin.on('error', () => { /* child gone; heartbeat writes are best-effort */ });
+  child.on('error', (err) => debug(`child error: ${err.message}`));
+
+  // Heartbeat: a byte per tick. When the main thread wedges, these stop and the
+  // child's timeout fires. unref'd so it never keeps the process alive itself.
   const heartbeat = setInterval(() => {
-    Atomics.add(beat, 0, 1);
+    try { stdin.write('\n'); } catch { /* child gone */ }
   }, checkMs);
   heartbeat.unref();
 
-  let worker: Worker;
-  try {
-    worker = new Worker(WORKER_SOURCE, {
-      eval: true,
-      workerData: { sab, checkMs, maxStaleChecks },
-    });
-  } catch {
-    // Worker threads unavailable — fall back to no watchdog rather than refuse
-    // to start. Degraded (a future wedge wouldn't self-kill) but not broken.
-    clearInterval(heartbeat);
-    return null;
-  }
+  // Neither the child nor its pipe should keep the parent alive past its work.
+  child.unref();
+  try { (stdin as unknown as { unref?: () => void }).unref?.(); } catch { /* ignore */ }
 
-  // A watchdog-worker error must never escalate to the global handler (which now
-  // exits, #850): swallow it and run degraded.
-  worker.on('error', () => { /* watchdog gone; nothing safe to do here */ });
-  // Don't let the watchdog keep the process alive past its real work.
-  worker.unref();
+  debug(`armed (child pid ${child.pid ?? '?'}): timeoutMs=${timeoutMs} checkMs=${checkMs}`);
 
   let stopped = false;
   return {
@@ -169,7 +163,8 @@ export function installMainThreadWatchdog(): WatchdogHandle | null {
       if (stopped) return;
       stopped = true;
       clearInterval(heartbeat);
-      void worker.terminate();
+      try { stdin.end(); } catch { /* ignore */ } // EOF -> child exits cleanly
+      try { child.kill(); } catch { /* ignore */ } // belt-and-suspenders
     },
   };
 }