1
0

liveness-watchdog.test.ts 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133
  1. import { describe, it, expect, beforeAll } from 'vitest';
  2. import { spawn } from 'child_process';
  3. import * as fs from 'fs';
  4. import * as path from 'path';
  5. import {
  6. parseWatchdogTimeoutMs,
  7. deriveCheckIntervalMs,
  8. installMainThreadWatchdog,
  9. DEFAULT_WATCHDOG_TIMEOUT_MS,
  10. } from '../src/mcp/liveness-watchdog';
  /**
   * Unit checks for the two pure configuration helpers: timeout parsing and
   * check-interval derivation.
   */
  describe('config parsing', () => {
    it('parseWatchdogTimeoutMs falls back for missing/invalid input', () => {
      // Each of these inputs is expected to come back as the default timeout.
      const fallbackInputs: Array<string | undefined> = [undefined, 'not-a-number', '0', '-5'];
      fallbackInputs.forEach((raw) => {
        expect(parseWatchdogTimeoutMs(raw)).toBe(DEFAULT_WATCHDOG_TIMEOUT_MS);
      });

      // A positive numeric string is expected to come back as its numeric value.
      expect(parseWatchdogTimeoutMs('1500')).toBe(1500);
    });

    it('deriveCheckIntervalMs stays within [50, 2000] and scales with the timeout', () => {
      // [timeout in ms, expected check interval in ms]
      const expectations: Array<[number, number]> = [
        [60_000, 2000], // clamped high
        [500, 100], // 500/5
        [10, 50], // clamped low
      ];
      for (const [timeoutMs, intervalMs] of expectations) {
        expect(deriveCheckIntervalMs(timeoutMs)).toBe(intervalMs);
      }
    });
  });
  /**
   * In-process check of the opt-out switch: with CODEGRAPH_NO_WATCHDOG set,
   * installMainThreadWatchdog() must return null.
   */
  describe('installMainThreadWatchdog opt-out', () => {
    it('returns null (spawns nothing) when CODEGRAPH_NO_WATCHDOG is set', () => {
      const OPT_OUT_KEY = 'CODEGRAPH_NO_WATCHDOG';
      // Remember the ambient value so the test leaves process.env as it found it.
      const saved = process.env[OPT_OUT_KEY];
      process.env[OPT_OUT_KEY] = '1';
      try {
        const handle = installMainThreadWatchdog();
        expect(handle).toBeNull();
      } finally {
        if (saved !== undefined) {
          process.env[OPT_OUT_KEY] = saved;
        } else {
          // It was unset before the test; unset it again rather than leaving a value behind.
          delete process.env[OPT_OUT_KEY];
        }
      }
    });
  });
  /**
   * End-to-end: spawn a real process, install the real watchdog (which spawns a
   * separate watchdog child), and prove it kills a wedged main thread — including
   * the case a worker thread could NOT (a non-allocating loop under heap pressure,
   * which strands a same-process worker on V8's global safepoint, #850). Drives
   * the built module the way mcp-ppid-watchdog.test.ts drives the built CLI.
   */
  describe('liveness watchdog (spawned, real watchdog process)', () => {
    /**
     * Outcome of one spawned child. `signal: 'TIMEOUT'` is synthetic: it means
     * the test's own hard timeout had to kill the child.
     */
    type ChildResult = { code: number | null; signal: NodeJS.Signals | 'TIMEOUT' | null };

    const MODULE = path.resolve(__dirname, '../dist/mcp/liveness-watchdog.js');

    // Upper bound on how long a child may live before the test kills it itself.
    const CHILD_HARD_TIMEOUT_MS = 8000;
    // Per-test timeout; kept above the child's hard timeout so runChild() always settles first.
    const TEST_TIMEOUT_MS = 12000;

    beforeAll(() => {
      if (!fs.existsSync(MODULE)) {
        throw new Error(`Build the project first: ${MODULE} is missing (run npm run build).`);
      }
    });

    /**
     * Runs `node -e` on a script that installs the watchdog from the built
     * module and then executes `body`.
     *
     * @param env Variables layered on top of the inherited environment.
     * @param body JavaScript source appended after the watchdog is installed.
     * @param hardTimeoutMs After this long the child is SIGKILLed by the test
     *   and the result carries the synthetic signal 'TIMEOUT'.
     * @returns The child's exit code and signal, or the synthetic timeout result.
     *   Rejects if the child process emits 'error' (e.g. it could not be spawned).
     */
    function runChild(
      env: Record<string, string>,
      body: string,
      hardTimeoutMs: number
    ): Promise<ChildResult> {
      const src = `
        const { installMainThreadWatchdog } = require(${JSON.stringify(MODULE)});
        installMainThreadWatchdog();
        ${body}
      `;

      // An opt-out inherited from the developer's/CI shell would silently
      // disable the watchdog in the "must be killed" tests. Drop the inherited
      // value; a test that wants the opt-out passes it explicitly via `env`.
      const childEnv: NodeJS.ProcessEnv = { ...process.env };
      delete childEnv.CODEGRAPH_NO_WATCHDOG;
      Object.assign(childEnv, env);

      const child = spawn(process.execPath, ['-e', src], {
        env: childEnv,
        stdio: ['ignore', 'ignore', 'ignore'],
      });

      return new Promise<ChildResult>((resolve, reject) => {
        const timer = setTimeout(() => {
          child.kill('SIGKILL');
          resolve({ code: null, signal: 'TIMEOUT' });
        }, hardTimeoutMs);

        // Without a listener, a spawn failure is an unhandled 'error' event
        // rather than a failed test; surface it as a rejection instead.
        child.on('error', (err) => {
          clearTimeout(timer);
          reject(err);
        });

        child.on('exit', (code, signal) => {
          clearTimeout(timer);
          resolve({ code, signal });
        });
      });
    }

    // Assert the watchdog terminated the process. POSIX surfaces the external
    // SIGKILL as signal 'SIGKILL'; Windows has no real signals, so the watchdog's
    // `process.kill(pid, 'SIGKILL')` maps to TerminateProcess and an observer sees
    // signal=null with a non-zero exit code. Either is a kill; the synthetic
    // 'TIMEOUT' (the watchdog never fired) is the failure we're guarding against.
    function expectKilled(r: ChildResult): void {
      const killedBySignal = r.signal === 'SIGKILL';
      const terminatedWithoutSignal = r.signal === null && r.code !== null && r.code !== 0;
      // The message carries the observed outcome so a failure is diagnosable.
      expect(
        killedBySignal || terminatedWithoutSignal,
        `expected the child to be killed, got code=${String(r.code)} signal=${String(r.signal)}`
      ).toBe(true);
    }

    it('SIGKILLs a process whose main thread wedges in a sync loop', async () => {
      const r = await runChild(
        { CODEGRAPH_WATCHDOG_TIMEOUT_MS: '500' },
        'setTimeout(() => { while (true) {} }, 150);',
        CHILD_HARD_TIMEOUT_MS
      );
      expectKilled(r);
    }, TEST_TIMEOUT_MS);

    it('SIGKILLs a non-allocating wedge under heap pressure (the case worker threads stalled on)', async () => {
      const r = await runChild(
        { CODEGRAPH_WATCHDOG_TIMEOUT_MS: '500' },
        // ~40MB retained so a GC is likely, then a tight NON-allocating loop — the
        // exact shape that deadlocks a same-process worker on the global safepoint.
        'const k=[]; for (let i=0;i<40;i++) k.push(Buffer.alloc(1024*1024,i)); global.__k=k; setTimeout(() => { while (true) {} }, 150);',
        CHILD_HARD_TIMEOUT_MS
      );
      expectKilled(r);
    }, TEST_TIMEOUT_MS);

    it('does NOT kill a healthy process that keeps its event loop turning', async () => {
      const { code, signal } = await runChild(
        { CODEGRAPH_WATCHDOG_TIMEOUT_MS: '500' },
        'const iv = setInterval(() => {}, 50); setTimeout(() => { clearInterval(iv); process.exit(7); }, 1500);',
        CHILD_HARD_TIMEOUT_MS
      );
      expect(signal).toBeNull(); // never signalled
      expect(code).toBe(7); // exited on its own terms
    }, TEST_TIMEOUT_MS);

    it('does NOT kill a wedged process when CODEGRAPH_NO_WATCHDOG=1', async () => {
      const { code, signal } = await runChild(
        { CODEGRAPH_WATCHDOG_TIMEOUT_MS: '500', CODEGRAPH_NO_WATCHDOG: '1' },
        'setTimeout(() => { const end = Date.now() + 1500; while (Date.now() < end) {} process.exit(3); }, 150);',
        CHILD_HARD_TIMEOUT_MS
      );
      // It exits with its OWN code 3 — proving nothing killed it. (Checking only
      // signal=null is insufficient on Windows, where a kill also reports null.)
      expect(signal).toBeNull();
      expect(code).toBe(3);
    }, TEST_TIMEOUT_MS);
  });