Bladeren bron

fix(mcp): drain the event loop on Windows daemon shutdown instead of aborting mid-watcher-close (#1041)

On Windows, calling process.exit() while a recursive fs.watch handle is still
tearing down aborts the daemon with a libuv UV_HANDLE_CLOSING assertion
(0xC0000409) — reproducible whenever the indexed tree contains a nested repo
(submodule / embedded clone), since that's what keeps a watch active at shutdown.
A small exit delay doesn't help; only letting the loop drain is clean (verified
on a real Windows VM: close()+exit() and close()+setTimeout(exit) both abort,
while letting the loop drain exits 0).

finalizeDaemonExit() now exits immediately on POSIX (unchanged) but on Windows
marks success (exitCode=0) and lets the loop drain to a natural exit, with an
unref'd backstop that force-exits only if a stray handle would otherwise hang
shutdown. The daemon's own timers are already unref'd and its PPID watchdog lives
in the proxy, so nothing keeps the loop alive past the closing watch handles —
natural drain is fast. Pure + platform-injected so both branches unit-test off-Windows.

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Colby Mchenry 13 uur geleden
bovenliggende
commit
cfa293539f
2 gewijzigde bestanden met toevoegingen van 85 en 3 verwijderingen
  1. 42 2
      __tests__/daemon-bind-failure.test.ts
  2. 43 1
      src/mcp/daemon.ts

+ 42 - 2
__tests__/daemon-bind-failure.test.ts

@@ -13,11 +13,11 @@
  * so it survives and `listen()` fails with EADDRINUSE.
  */
 
-import { afterEach, describe, expect, it } from 'vitest';
+import { afterEach, describe, expect, it, vi } from 'vitest';
 import * as fs from 'fs';
 import * as os from 'os';
 import * as path from 'path';
-import { Daemon, tryAcquireDaemonLock } from '../src/mcp/daemon';
+import { Daemon, tryAcquireDaemonLock, finalizeDaemonExit } from '../src/mcp/daemon';
 import { getDaemonPidPath, getDaemonSocketPath } from '../src/mcp/daemon-paths';
 
 const tmpRoots: string[] = [];
@@ -53,3 +53,43 @@ describe('Daemon.start() bind failure (#974)', () => {
     expect(fs.existsSync(pidPath)).toBe(false);
   });
 });
+
+/**
+ * Windows shutdown must not force `process.exit()` while the recursive file
+ * watcher is still tearing down — that aborts the daemon with a libuv
+ * `UV_HANDLE_CLOSING` assertion (0xC0000409), reproducible when the indexed tree
+ * contains a nested repo. `finalizeDaemonExit` drains on Windows and exits
+ * immediately elsewhere; both branches are exercised here by injecting the
+ * platform + exit fn (so it runs on any host).
+ */
+describe('finalizeDaemonExit — Windows drains instead of aborting mid-watcher-close', () => {
+  for (const platform of ['linux', 'darwin'] as const) {
+    it(`exits immediately on ${platform}`, () => {
+      const exit = vi.fn();
+      const backstop = finalizeDaemonExit(platform, exit);
+      expect(exit).toHaveBeenCalledTimes(1);
+      expect(exit).toHaveBeenCalledWith(0);
+      expect(backstop).toBeNull();
+    });
+  }
+
+  it('on win32 defers exit (lets the loop drain), then force-exits via an unref\'d backstop', () => {
+    vi.useFakeTimers();
+    const prevExitCode = process.exitCode;
+    const exit = vi.fn();
+    try {
+      const backstop = finalizeDaemonExit('win32', exit);
+      // No synchronous exit — the process must drain its closing watch handles first.
+      expect(exit).not.toHaveBeenCalled();
+      expect(backstop).not.toBeNull();
+      // Success code is set so a natural drain exits 0.
+      expect(process.exitCode).toBe(0);
+      // If a stray handle keeps the loop alive, the backstop still forces exit.
+      vi.advanceTimersByTime(2_000);
+      expect(exit).toHaveBeenCalledWith(0);
+    } finally {
+      vi.useRealTimers();
+      process.exitCode = prevExitCode; // don't leak a 0 exit code into the runner
+    }
+  });
+});

+ 43 - 1
src/mcp/daemon.ts

@@ -70,6 +70,45 @@ const DEFAULT_IDLE_TIMEOUT_MS = 300_000;
  */
 const DEFAULT_MAX_IDLE_MS = 1_800_000; // 30 min
 
+/**
+ * Windows-only shutdown backstop. On Windows, calling `process.exit()` while a
+ * recursive `fs.watch` handle is still tearing down aborts the process with a
+ * libuv `UV_HANDLE_CLOSING` assertion (`0xC0000409`) — reproducible whenever the
+ * watched tree contains a nested repo (submodule / embedded clone), since that's
+ * what keeps a watch active at shutdown. The fix is to let the event loop drain
+ * so libuv finishes closing those handles, then exit naturally; this timer only
+ * force-exits if some unexpected handle keeps the loop alive past the grace
+ * window. Kept short so shutdown stays snappy in that fallback. See
+ * `finalizeDaemonExit`.
+ */
+const DAEMON_SHUTDOWN_BACKSTOP_MS = 2_000;
+
+/**
+ * Finalize daemon shutdown. On POSIX, exit immediately — it's clean and fast.
+ * On Windows, do NOT force an exit while watchers may still be closing (that
+ * trips the libuv assertion above); instead mark success and let the loop drain
+ * to a natural exit, with an UNREF'd backstop that force-exits only if a stray
+ * handle would otherwise hang shutdown. Pure and platform-injected so both
+ * branches are unit-testable off-Windows. Returns the backstop timer (Windows)
+ * so callers/tests can clear it.
+ */
+export function finalizeDaemonExit(
+  platform: NodeJS.Platform,
+  exit: (code: number) => void,
+): NodeJS.Timeout | null {
+  if (platform === 'win32') {
+    process.exitCode = 0;
+    const backstop = setTimeout(() => exit(0), DAEMON_SHUTDOWN_BACKSTOP_MS);
+    // Unref so it never keeps the loop alive: a natural drain (watchers closed,
+    // nothing else pending) exits before it fires; it only fires when some other
+    // handle is keeping the loop running, which is exactly when we need it.
+    backstop.unref?.();
+    return backstop;
+  }
+  exit(0);
+  return null;
+}
+
 /** How often the daemon sweeps connected clients for a dead peer process (#692). */
 const DEFAULT_CLIENT_SWEEP_MS = 30_000;
 
@@ -306,7 +345,10 @@ export class Daemon {
     if (process.platform !== 'win32') {
       try { fs.unlinkSync(this.socketPath); } catch { /* may already be gone */ }
     }
-    process.exit(0);
+    // POSIX exits here; Windows drains first (engine.stop() above began closing
+    // the file watcher, and exiting mid-teardown aborts the process). See
+    // finalizeDaemonExit / DAEMON_SHUTDOWN_BACKSTOP_MS.
+    finalizeDaemonExit(process.platform, (code) => process.exit(code));
   }
 
   private handleConnection(socket: net.Socket): void {