há 1 semana atrás · 1702dfc544
--- a/__tests__/liveness-watchdog.test.ts
+++ b/__tests__/liveness-watchdog.test.ts
@@ -3,48 +3,12 @@ import { spawn } from 'child_process';
 
				 import * as fs from 'fs';
			
 
				 import * as path from 'path';
			
 
				 import {
			
 
				-  stepHeartbeat,
			
 
				   parseWatchdogTimeoutMs,
			
 
				   deriveCheckIntervalMs,
			
 
				   installMainThreadWatchdog,
			
 
				   DEFAULT_WATCHDOG_TIMEOUT_MS,
			
 
				 } from '../src/mcp/liveness-watchdog';
			
 
				 
			
 
				-describe('stepHeartbeat (wedge-detection reducer)', () => {
			
 
				-  it('resets the stale count when the counter advances', () => {
			
 
				-    const r = stepHeartbeat({ lastCounter: 5, staleChecks: 3 }, 6, 4);
			
 
				-    expect(r.wedged).toBe(false);
			
 
				-    expect(r.next).toEqual({ lastCounter: 6, staleChecks: 0 });
			
 
				-  });
			
 
				-
			
 
				-  it('accumulates stale checks while the counter is frozen', () => {
			
 
				-    let s = { lastCounter: 9, staleChecks: 0 };
			
 
				-    for (let i = 1; i < 4; i++) {
			
 
				-      const r = stepHeartbeat(s, 9, 4);
			
 
				-      expect(r.wedged).toBe(false);
			
 
				-      expect(r.next.staleChecks).toBe(i);
			
 
				-      s = r.next;
			
 
				-    }
			
 
				-  });
			
 
				-
			
 
				-  it('reports wedged once the stale count reaches the threshold', () => {
			
 
				-    const r = stepHeartbeat({ lastCounter: 9, staleChecks: 3 }, 9, 4);
			
 
				-    expect(r.wedged).toBe(true);
			
 
				-  });
			
 
				-
			
 
				-  it('a single late heartbeat rescues the process (sleep/clock-jump safety)', () => {
			
 
				-    // 3 stale checks, then progress (as if the main thread resumed after a
			
 
				-    // system sleep) — must NOT be considered wedged.
			
 
				-    let s = { lastCounter: 1, staleChecks: 0 };
			
 
				-    s = stepHeartbeat(s, 1, 4).next; // stale 1
			
 
				-    s = stepHeartbeat(s, 1, 4).next; // stale 2
			
 
				-    s = stepHeartbeat(s, 1, 4).next; // stale 3
			
 
				-    const resumed = stepHeartbeat(s, 2, 4); // counter advanced
			
 
				-    expect(resumed.wedged).toBe(false);
			
 
				-    expect(resumed.next.staleChecks).toBe(0);
			
 
				-  });
			
 
				-});
			
 
				-
			
 
				 describe('config parsing', () => {
			
 
				   it('parseWatchdogTimeoutMs falls back for missing/invalid input', () => {
			
 
				     expect(parseWatchdogTimeoutMs(undefined)).toBe(DEFAULT_WATCHDOG_TIMEOUT_MS);
			
@@ -62,7 +26,7 @@ describe('config parsing', () => {
 
				 });
			
 
				 
			
 
				 describe('installMainThreadWatchdog opt-out', () => {
			
 
				-  it('returns null (no worker) when CODEGRAPH_NO_WATCHDOG is set', () => {
			
 
				+  it('returns null (spawns nothing) when CODEGRAPH_NO_WATCHDOG is set', () => {
			
 
				     const prev = process.env.CODEGRAPH_NO_WATCHDOG;
			
 
				     process.env.CODEGRAPH_NO_WATCHDOG = '1';
			
 
				     try {
			
@@ -75,11 +39,13 @@ describe('installMainThreadWatchdog opt-out', () => {
 
				 });
			
 
				 
			
 
				 /**
			
 
				- * End-to-end: spawn a real process, install the real worker, and prove it kills
			
 
				- * a wedged main thread (and ONLY a wedged one). Drives the built module the same
			
 
				- * way mcp-ppid-watchdog.test.ts drives the built CLI.
			
 
				+ * End-to-end: spawn a real process, install the real watchdog (which spawns a
			
 
				+ * separate watchdog child), and prove it kills a wedged main thread — including
			
 
				+ * the case a worker thread could NOT (a non-allocating loop under heap pressure,
			
 
				+ * which strands a same-process worker on V8's global safepoint, #850). Drives
			
 
				+ * the built module the way mcp-ppid-watchdog.test.ts drives the built CLI.
			
 
				  */
			
 
				-describe('liveness watchdog (spawned, real worker)', () => {
			
 
				+describe('liveness watchdog (spawned, real watchdog process)', () => {
			
 
				   const MODULE = path.resolve(__dirname, '../dist/mcp/liveness-watchdog.js');
			
 
				 
			
 
				   beforeAll(() => {
			
@@ -117,7 +83,18 @@ describe('liveness watchdog (spawned, real worker)', () => {
 
				   it('SIGKILLs a process whose main thread wedges in a sync loop', async () => {
			
 
				     const { signal } = await runChild(
			
 
				       { CODEGRAPH_WATCHDOG_TIMEOUT_MS: '500' },
			
 
				-      'setTimeout(() => { while (true) {} }, 150);', // wedge the event loop forever
			
 
				+      'setTimeout(() => { while (true) {} }, 150);',
			
 
				+      8000
			
 
				+    );
			
 
				+    expect(signal).toBe('SIGKILL');
			
 
				+  }, 12000);
			
 
				+
			
 
				+  it('SIGKILLs a non-allocating wedge under heap pressure (the case worker threads stalled on)', async () => {
			
 
				+    const { signal } = await runChild(
			
 
				+      { CODEGRAPH_WATCHDOG_TIMEOUT_MS: '500' },
			
 
				+      // ~40MB retained so a GC is likely, then a tight NON-allocating loop — the
			
 
				+      // exact shape that deadlocks a same-process worker on the global safepoint.
			
 
				+      'const k=[]; for (let i=0;i<40;i++) k.push(Buffer.alloc(1024*1024,i)); global.__k=k; setTimeout(() => { while (true) {} }, 150);',
			
 
				       8000
			
 
				     );
			
 
				     expect(signal).toBe('SIGKILL');
			
@@ -126,7 +103,6 @@ describe('liveness watchdog (spawned, real worker)', () => {
 
				   it('does NOT kill a healthy process that keeps its event loop turning', async () => {
			
 
				     const { code, signal } = await runChild(
			
 
				       { CODEGRAPH_WATCHDOG_TIMEOUT_MS: '500' },
			
 
				-      // Stay responsive for 1.5s (3× the timeout), then exit cleanly with 7.
			
 
				       'const iv = setInterval(() => {}, 50); setTimeout(() => { clearInterval(iv); process.exit(7); }, 1500);',
			
 
				       8000
			
 
				     );
			
@@ -137,12 +113,9 @@ describe('liveness watchdog (spawned, real worker)', () => {
 
				   it('does NOT kill a wedged process when CODEGRAPH_NO_WATCHDOG=1', async () => {
			
 
				     const { signal } = await runChild(
			
 
				       { CODEGRAPH_WATCHDOG_TIMEOUT_MS: '500', CODEGRAPH_NO_WATCHDOG: '1' },
			
 
				-      // Wedge briefly, but the test's hard timeout reaps it (the watchdog must not).
			
 
				       'setTimeout(() => { const end = Date.now() + 1500; while (Date.now() < end) {} process.exit(3); }, 150);',
			
 
				       8000
			
 
				     );
			
 
				-    // Killed by neither the watchdog (disabled) nor the hard timeout — it ran
			
 
				-    // its bounded busy-loop and exited 3 on its own.
			
 
				-    expect(signal).toBeNull();
			
 
				+    expect(signal).toBeNull(); // the watchdog is off, so nothing kills it
			
 
				   }, 12000);
			
 
				 });
			
--- a/src/mcp/liveness-watchdog.ts
+++ b/src/mcp/liveness-watchdog.ts
@@ -2,70 +2,45 @@
 
				  * Main-thread liveness watchdog — belt-and-suspenders for #850.
			
 
				  *
			
 
				  * The #850 fix removes the one *known* trigger (the uncaught-exception handler
			
 
				- * no longer formats a raw Error's `.stack` — the step that could enter a
			
 
				- * non-terminating V8 source-position loop). But ANY synchronous, non-yielding
			
 
				+ * no longer formats a raw Error's `.stack`). But ANY synchronous, non-yielding
			
 
				  * loop on the main thread — a future V8 stack-format pathology, a runaway
			
 
				  * regex, an accidental `while (true)` — wedges the event loop, and from JS you
			
 
				  * cannot interrupt it: timers, signal handlers, and the PPID watchdog all run
			
 
				  * *on* that blocked loop, so the process pins a core forever with no
			
 
				  * self-recovery (the exact unrecoverable state #850 reported).
			
 
				  *
			
 
				- * The only observer still running when the main thread is wedged is another
			
 
				- * THREAD. This installs a tiny worker thread that watches a heartbeat the main
			
 
				- * thread bumps through shared memory. If the heartbeat stops advancing across
			
 
				- * enough consecutive checks (~`timeoutMs` of real time), the worker concludes
			
 
				- * the main thread is wedged and SIGKILLs the process — the one signal a wedged
			
 
				- * event loop can't swallow — so a fresh daemon starts on the next connection
			
 
				- * instead of a zombie pinning a core.
			
 
				+ * **Why a separate PROCESS, not a worker thread.** A worker thread was the
			
 
				+ * obvious first choice and it works in a toy process — but it was validated to
			
 
				+ * FAIL in the real daemon (#850 live test). V8 isolates in one process
			
 
				+ * coordinate on global safepoints, so when one thread requests a GC every other
			
 
				+ * thread must reach a safepoint before it can proceed. A main thread wedged in
			
 
				+ * a tight, non-allocating loop never reaches one, which strands the watchdog
			
 
				+ * worker on its very next allocation/safepoint check — and the #850 hot loop
			
 
				+ * (`SourcePositionTableIterator::Advance`, a non-allocating C++ table walk) is
			
 
				+ * exactly that shape. A child process shares no isolate and no heap with the
			
 
				+ * parent, so the wedge cannot touch it; it kills via the kernel, which honours
			
 
				+ * SIGKILL regardless of what the parent's threads are doing.
			
 
				  *
			
 
				- * **Why count checks, not elapsed wall-clock.** A laptop that sleeps freezes
			
 
				- * both threads; on wake `Date.now()` has jumped hours but the heartbeat sat
			
 
				- * still — a wall-clock delta would false-positive and kill a perfectly healthy
			
 
				- * daemon. Counting *consecutive worker iterations* with no progress is immune:
			
 
				- * a healthy main thread resumes and bumps the heartbeat within one interval of
			
 
				- * waking, resetting the count; only a thread that never resumes keeps it
			
 
				- * climbing. {@link stepHeartbeat} is the pure reducer behind both the worker
			
 
				- * and the unit tests.
			
 
				+ * **How.** The parent writes a heartbeat byte to the child's stdin every
			
 
				+ * `checkMs` from a timer — firing at all means the event loop is turning. The
			
 
				+ * child resets a kill-timer on each byte; if none arrives for `timeoutMs` it
			
 
				+ * `SIGKILL`s the parent so a fresh daemon starts on the next connection. When
			
 
				+ * the parent exits normally the pipe closes and the child exits too (no
			
 
				+ * orphan).
			
 
				  *
			
 
				- * **Why it won't fire on real work.** Heavy parsing runs in the parse worker
			
 
				- * (off this thread) and indexing shells out to a child process, so the daemon's
			
 
				- * main thread only ever does fast, bounded work (socket handling + sub-second
			
 
				- * SQLite reads). The default timeout is therefore vastly larger than any
			
 
				- * legitimate main-thread block yet vastly smaller than "forever". Opt out with
			
 
				- * `CODEGRAPH_NO_WATCHDOG=1`; tune with `CODEGRAPH_WATCHDOG_TIMEOUT_MS`.
			
 
				+ * **Won't fire on real work.** Heavy parsing runs in the parse worker
			
 
				+ * (off-thread) and indexing shells out to a child process, so the daemon's main
			
 
				+ * thread only ever does fast, bounded work. The default timeout is ~300× the
			
 
				+ * 5h #850 wedge shorter, yet far longer than any legitimate main-thread block.
			
 
				+ * Opt out with `CODEGRAPH_NO_WATCHDOG=1`; tune with `CODEGRAPH_WATCHDOG_TIMEOUT_MS`.
			
 
				  */
			
 
				-import { Worker } from 'worker_threads';
			
 
				+import * as fs from 'fs';
			
 
				+import * as os from 'os';
			
 
				+import { spawn, ChildProcess } from 'child_process';
			
 
				 
			
 
				 /** Default: 60s — ~300× shorter than the 5h #850 wedge, far longer than any real main-thread block. */
			
 
				 export const DEFAULT_WATCHDOG_TIMEOUT_MS = 60_000;
			
 
				 
			
 
				-export interface HeartbeatState {
			
 
				-  /** Last heartbeat counter the worker observed. */
			
 
				-  lastCounter: number;
			
 
				-  /** Consecutive checks the counter has NOT advanced. */
			
 
				-  staleChecks: number;
			
 
				-}
			
 
				-
			
 
				-/**
			
 
				- * Pure reducer for one worker check. `maxStaleChecks` consecutive no-progress
			
 
				- * checks → wedged. Counting iterations (not wall-clock) is what makes this
			
 
				- * robust to clock jumps / system sleep.
			
 
				- */
			
 
				-export function stepHeartbeat(
			
 
				-  state: HeartbeatState,
			
 
				-  counter: number,
			
 
				-  maxStaleChecks: number
			
 
				-): { next: HeartbeatState; wedged: boolean } {
			
 
				-  if (counter !== state.lastCounter) {
			
 
				-    return { next: { lastCounter: counter, staleChecks: 0 }, wedged: false };
			
 
				-  }
			
 
				-  const staleChecks = state.staleChecks + 1;
			
 
				-  return {
			
 
				-    next: { lastCounter: counter, staleChecks },
			
 
				-    wedged: staleChecks >= maxStaleChecks,
			
 
				-  };
			
 
				-}
			
 
				-
			
 
				 /** `true` for `1/true/yes/on` (case-insensitive); `false` otherwise. */
			
 
				 function isEnvTruthy(raw: string | undefined): boolean {
			
 
				   if (!raw) return false;
			
@@ -82,86 +57,105 @@ export function parseWatchdogTimeoutMs(
 
				   return Number.isFinite(n) && n > 0 ? n : fallback;
			
 
				 }
			
 
				 
			
 
				-/** Derive a heartbeat/check cadence that fires several times inside the timeout window. */
			
 
				+/** Derive a heartbeat cadence that emits several beats inside the timeout window. */
			
 
				 export function deriveCheckIntervalMs(timeoutMs: number): number {
			
 
				   return Math.min(2000, Math.max(50, Math.round(timeoutMs / 5)));
			
 
				 }
			
 
				 
			
 
				+/** Arming/teardown diagnostics, gated on the existing MCP debug switch. */
			
 
				+function debug(msg: string): void {
			
 
				+  if (process.env.CODEGRAPH_MCP_DEBUG) {
			
 
				+    try { fs.writeSync(2, `[CodeGraph watchdog] ${msg}\n`); } catch { /* ignore */ }
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				 export interface WatchdogHandle {
			
 
				-  /** Stop heartbeating and terminate the worker. Idempotent. */
			
 
				+  /** Stop heartbeating and shut the watchdog child down. Idempotent. */
			
 
				   stop(): void;
			
 
				 }
			
 
				 
			
 
				 /**
			
 
				- * The worker body, run via `new Worker(src, { eval: true })`. Inlined as a
			
 
				- * string (not a shipped `.js`) so there is no dist-vs-src path to resolve — it
			
 
				- * runs identically under `tsx` in tests and under the bundle in production.
			
 
				- * Mirrors {@link stepHeartbeat}; keep the two in sync (the unit test pins the
			
 
				- * algorithm, the integration test pins this exact body end-to-end).
			
 
				+ * The watchdog child body, run via `node -e`. Inlined as a string (not a
			
 
				+ * shipped `.js`) so there is no dist-vs-src path to resolve — it runs
			
 
				+ * identically under `tsx` in tests and under the bundle in production. Reads its
			
 
				+ * target pid + timeout from argv; an MSG built once at startup (the child is
			
 
				+ * never wedged, so allocation here is fine).
			
 
				  */
			
 
				-const WORKER_SOURCE = `
			
 
				-const { workerData } = require('worker_threads');
			
 
				+const CHILD_SOURCE = `
			
 
				 const fs = require('fs');
			
 
				-const beat = new Int32Array(workerData.sab);
			
 
				-const { checkMs, maxStaleChecks } = workerData;
			
 
				-let lastCounter = Atomics.load(beat, 0);
			
 
				-let staleChecks = 0;
			
 
				-const timer = setInterval(() => {
			
 
				-  const counter = Atomics.load(beat, 0);
			
 
				-  if (counter !== lastCounter) { lastCounter = counter; staleChecks = 0; return; }
			
 
				-  if (++staleChecks < maxStaleChecks) return;
			
 
				-  clearInterval(timer);
			
 
				-  const secs = Math.round((staleChecks * checkMs) / 1000);
			
 
				-  try {
			
 
				-    fs.writeSync(2, '[CodeGraph] Main thread unresponsive for ~' + secs + 's — killing the wedged process so a fresh one can start (#850). Disable with CODEGRAPH_NO_WATCHDOG=1.\\n');
			
 
				-  } catch (e) { /* stderr gone */ }
			
 
				-  try { process.kill(process.pid, 'SIGKILL'); } catch (e) { /* nothing left to try */ }
			
 
				-}, checkMs);
			
 
				+const parentPid = Number(process.argv[1]);
			
 
				+const timeoutMs = Number(process.argv[2]);
			
 
				+const secs = Math.round(timeoutMs / 1000);
			
 
				+const MSG = Buffer.from('[CodeGraph] Main thread unresponsive for ~' + secs + 's — killing the wedged process so a fresh one can start (#850). Disable with CODEGRAPH_NO_WATCHDOG=1.\\n');
			
 
				+function kill() {
			
 
				+  try { fs.writeSync(2, MSG); } catch (e) {}
			
 
				+  try { process.kill(parentPid, 'SIGKILL'); } catch (e) {}
			
 
				+  process.exit(0);
			
 
				+}
			
 
				+let timer = setTimeout(kill, timeoutMs);
			
 
				+process.stdin.on('data', () => { clearTimeout(timer); timer = setTimeout(kill, timeoutMs); });
			
 
				+process.stdin.on('end', () => process.exit(0));   // parent closed the pipe (exited) -> no orphan
			
 
				+process.stdin.on('error', () => process.exit(0)); // pipe broke -> parent gone
			
 
				+process.stdin.resume();
			
 
				 `;
			
 
				 
			
 
				 /**
			
 
				  * Install the main-thread liveness watchdog for a long-lived process. Returns a
			
 
				- * handle to stop it, or `null` when disabled or when the worker can't be
			
 
				- * spawned (degraded, never throws — a missing watchdog must never keep a
			
 
				- * process from starting).
			
 
				+ * handle to stop it, or `null` when disabled or when the child can't be spawned
			
 
				+ * (degraded, never throws — a missing watchdog must never keep a process from
			
 
				+ * starting).
			
 
				  */
			
 
				 export function installMainThreadWatchdog(): WatchdogHandle | null {
			
 
				   if (isEnvTruthy(process.env.CODEGRAPH_NO_WATCHDOG)) return null;
			
 
				 
			
 
				   const timeoutMs = parseWatchdogTimeoutMs(process.env.CODEGRAPH_WATCHDOG_TIMEOUT_MS);
			
 
				   const checkMs = deriveCheckIntervalMs(timeoutMs);
			
 
				-  const maxStaleChecks = Math.max(1, Math.ceil(timeoutMs / checkMs));
			
 
				 
			
 
				-  // Single Int32 counter in shared memory. The main thread bumps it each tick;
			
 
				-  // the worker reads it. Atomics make the write visible across threads.
			
 
				-  const sab = new SharedArrayBuffer(Int32Array.BYTES_PER_ELEMENT);
			
 
				-  const beat = new Int32Array(sab);
			
 
				+  let child: ChildProcess;
			
 
				+  try {
			
 
				+    // No execArgv inheritance (unlike Worker), so the child carries none of our
			
 
				+    // V8 flags — it runs no WASM and needs none. stderr inherits the parent's
			
 
				+    // fd 2 so the kill notice lands wherever the parent logs (daemon.log).
			
 
				+    child = spawn(
			
 
				+      process.execPath,
			
 
				+      ['-e', CHILD_SOURCE, String(process.pid), String(timeoutMs)],
			
 
				+      {
			
 
				+        stdio: ['pipe', 'ignore', 'inherit'],
			
 
				+        windowsHide: true,
			
 
				+        // The watchdog touches no files; keep its cwd off the project/temp dir
			
 
				+        // so it can't hold one open (Windows EPERM-on-cleanup, mirrors the
			
 
				+        // parse-worker quirk).
			
 
				+        cwd: os.tmpdir(),
			
 
				+      }
			
 
				+    );
			
 
				+  } catch (err) {
			
 
				+    debug(`spawn failed: ${err instanceof Error ? err.message : String(err)}`);
			
 
				+    return null;
			
 
				+  }
			
 
				 
			
 
				-  // The heartbeat: firing at all means the event loop is turning. unref'd so it
			
 
				-  // never keeps the process alive on its own (the server's socket does that).
			
 
				+  const stdin = child.stdin;
			
 
				+  if (!stdin) {
			
 
				+    debug('child has no stdin pipe; not arming');
			
 
				+    try { child.kill(); } catch { /* ignore */ }
			
 
				+    return null;
			
 
				+  }
			
 
				+  // Writing after the child exits surfaces EPIPE on the stream — swallow it so
			
 
				+  // it can't escalate to the global handler (which now exits, #850).
			
 
				+  stdin.on('error', () => { /* child gone; heartbeat writes are best-effort */ });
			
 
				+  child.on('error', (err) => debug(`child error: ${err.message}`));
			
 
				+
			
 
				+  // Heartbeat: a byte per tick. When the main thread wedges, these stop and the
			
 
				+  // child's timeout fires. unref'd so it never keeps the process alive itself.
			
 
				   const heartbeat = setInterval(() => {
			
 
				-    Atomics.add(beat, 0, 1);
			
 
				+    try { stdin.write('\n'); } catch { /* child gone */ }
			
 
				   }, checkMs);
			
 
				   heartbeat.unref();
			
 
				 
			
 
				-  let worker: Worker;
			
 
				-  try {
			
 
				-    worker = new Worker(WORKER_SOURCE, {
			
 
				-      eval: true,
			
 
				-      workerData: { sab, checkMs, maxStaleChecks },
			
 
				-    });
			
 
				-  } catch {
			
 
				-    // Worker threads unavailable — fall back to no watchdog rather than refuse
			
 
				-    // to start. Degraded (a future wedge wouldn't self-kill) but not broken.
			
 
				-    clearInterval(heartbeat);
			
 
				-    return null;
			
 
				-  }
			
 
				+  // Neither the child nor its pipe should keep the parent alive past its work.
			
 
				+  child.unref();
			
 
				+  try { (stdin as unknown as { unref?: () => void }).unref?.(); } catch { /* ignore */ }
			
 
				 
			
 
				-  // A watchdog-worker error must never escalate to the global handler (which now
			
 
				-  // exits, #850): swallow it and run degraded.
			
 
				-  worker.on('error', () => { /* watchdog gone; nothing safe to do here */ });
			
 
				-  // Don't let the watchdog keep the process alive past its real work.
			
 
				-  worker.unref();
			
 
				+  debug(`armed (child pid ${child.pid ?? '?'}): timeoutMs=${timeoutMs} checkMs=${checkMs}`);
			
 
				 
			
 
				   let stopped = false;
			
 
				   return {
			
@@ -169,7 +163,8 @@ export function installMainThreadWatchdog(): WatchdogHandle | null {
 
				       if (stopped) return;
			
 
				       stopped = true;
			
 
				       clearInterval(heartbeat);
			
 
				-      void worker.terminate();
			
 
				+      try { stdin.end(); } catch { /* ignore */ } // EOF -> child exits cleanly
			
 
				+      try { child.kill(); } catch { /* ignore */ } // belt-and-suspenders
			
 
				     },
			
 
				   };
			
 
				 }