liveness-watchdog.test.ts 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148
  1. import { describe, it, expect, beforeAll } from 'vitest';
  2. import { spawn } from 'child_process';
  3. import * as fs from 'fs';
  4. import * as path from 'path';
  5. import {
  6. stepHeartbeat,
  7. parseWatchdogTimeoutMs,
  8. deriveCheckIntervalMs,
  9. installMainThreadWatchdog,
  10. DEFAULT_WATCHDOG_TIMEOUT_MS,
  11. } from '../src/mcp/liveness-watchdog';
  // Unit tests for the pure heartbeat reducer: given the previous state, the
  // currently observed counter and a stale-check limit, it yields the next
  // state plus a `wedged` verdict.
  describe('stepHeartbeat (wedge-detection reducer)', () => {
    // Third argument passed to stepHeartbeat in every case below; per the test
    // titles this is the stale-check threshold.
    const STALE_LIMIT = 4;

    it('resets the stale count when the counter advances', () => {
      const before = { lastCounter: 5, staleChecks: 3 };
      const { wedged, next } = stepHeartbeat(before, 6, STALE_LIMIT);
      expect(wedged).toBe(false);
      expect(next).toEqual({ lastCounter: 6, staleChecks: 0 });
    });

    it('accumulates stale checks while the counter is frozen', () => {
      let state = { lastCounter: 9, staleChecks: 0 };
      // Three consecutive observations of the same counter value: each one
      // bumps the stale count by one and stays below the limit.
      for (const expectedStale of [1, 2, 3]) {
        const { wedged, next } = stepHeartbeat(state, 9, STALE_LIMIT);
        expect(wedged).toBe(false);
        expect(next.staleChecks).toBe(expectedStale);
        state = next;
      }
    });

    it('reports wedged once the stale count reaches the threshold', () => {
      const almostWedged = { lastCounter: 9, staleChecks: 3 };
      const { wedged } = stepHeartbeat(almostWedged, 9, STALE_LIMIT);
      expect(wedged).toBe(true);
    });

    it('a single late heartbeat rescues the process (sleep/clock-jump safety)', () => {
      // Three stale observations followed by progress (as if the main thread
      // resumed after a system sleep) must NOT be reported as wedged.
      let state = { lastCounter: 1, staleChecks: 0 };
      for (let stale = 0; stale < 3; stale += 1) {
        state = stepHeartbeat(state, 1, STALE_LIMIT).next;
      }

      // The counter finally advances from 1 to 2.
      const { wedged, next } = stepHeartbeat(state, 2, STALE_LIMIT);
      expect(wedged).toBe(false);
      expect(next.staleChecks).toBe(0);
    });
  });
  // Tests for the helpers that turn the raw timeout setting into numbers the
  // watchdog uses.
  describe('config parsing', () => {
    it('parseWatchdogTimeoutMs falls back for missing/invalid input', () => {
      // Missing, non-numeric, zero and negative inputs all yield the default.
      const rejectedInputs = [undefined, 'not-a-number', '0', '-5'];
      for (const raw of rejectedInputs) {
        expect(parseWatchdogTimeoutMs(raw)).toBe(DEFAULT_WATCHDOG_TIMEOUT_MS);
      }

      // A positive integer string is taken at face value.
      expect(parseWatchdogTimeoutMs('1500')).toBe(1500);
    });

    it('deriveCheckIntervalMs stays within [50, 2000] and scales with the timeout', () => {
      // [timeout in ms, expected check interval in ms]
      const cases: Array<[number, number]> = [
        [60_000, 2000], // clamped high
        [500, 100], // 500/5
        [10, 50], // clamped low
      ];
      for (const [timeoutMs, expectedIntervalMs] of cases) {
        expect(deriveCheckIntervalMs(timeoutMs)).toBe(expectedIntervalMs);
      }
    });
  });
  // In-process check of the opt-out switch: with the environment variable set,
  // installing the watchdog must hand back null.
  describe('installMainThreadWatchdog opt-out', () => {
    it('returns null (no worker) when CODEGRAPH_NO_WATCHDOG is set', () => {
      const OPT_OUT = 'CODEGRAPH_NO_WATCHDOG';
      const saved = process.env[OPT_OUT];
      process.env[OPT_OUT] = '1';
      try {
        const handle = installMainThreadWatchdog();
        expect(handle).toBeNull();
      } finally {
        // Put the environment back exactly as it was, including "unset".
        if (saved !== undefined) {
          process.env[OPT_OUT] = saved;
        } else {
          delete process.env[OPT_OUT];
        }
      }
    });
  });
  /**
   * End-to-end: spawn a real process, install the real worker, and prove it kills
   * a wedged main thread (and ONLY a wedged one). Drives the built module the same
   * way mcp-ppid-watchdog.test.ts drives the built CLI.
   */
  describe('liveness watchdog (spawned, real worker)', () => {
    const MODULE = path.resolve(__dirname, '../dist/mcp/liveness-watchdog.js');

    // These tests load the compiled output, so fail loudly if it is absent.
    beforeAll(() => {
      if (!fs.existsSync(MODULE)) {
        throw new Error(`Build the project first: ${MODULE} is missing (run npm run build).`);
      }
    });

    /**
     * Runs `body` in a fresh Node process after installing the watchdog from the
     * built module, and reports how that process ended.
     *
     * @param env - Extra environment variables layered over the current ones.
     * @param body - JavaScript source executed after the watchdog is installed.
     * @param hardTimeoutMs - Upper bound on the child's lifetime; when it elapses
     *   the harness SIGKILLs the child itself.
     * @returns The child's exit code and terminating signal. `signal` is the
     *   sentinel `'TIMEOUT'` when the harness (not the watchdog) had to kill it,
     *   so callers can tell the two apart.
     * @throws Rejects with the spawn error if the child process cannot be started.
     */
    function runChild(
      env: Record<string, string>,
      body: string,
      hardTimeoutMs: number
    ): Promise<{ code: number | null; signal: NodeJS.Signals | 'TIMEOUT' | null }> {
      const src = `
        const { installMainThreadWatchdog } = require(${JSON.stringify(MODULE)});
        installMainThreadWatchdog();
        ${body}
      `;
      const child = spawn(process.execPath, ['-e', src], {
        env: { ...process.env, ...env },
        stdio: ['ignore', 'ignore', 'ignore'],
      });
      return new Promise((resolve, reject) => {
        const timer = setTimeout(() => {
          child.kill('SIGKILL');
          // Resolve first with the sentinel; the later 'exit' resolve is a no-op.
          resolve({ code: null, signal: 'TIMEOUT' });
        }, hardTimeoutMs);
        // Without a listener a failed spawn would surface as an unhandled
        // emitter error; reject so the test fails with the real cause.
        child.once('error', (err) => {
          clearTimeout(timer);
          reject(err);
        });
        child.on('exit', (code, signal) => {
          clearTimeout(timer);
          resolve({ code, signal });
        });
      });
    }

    it('SIGKILLs a process whose main thread wedges in a sync loop', async () => {
      const { signal } = await runChild(
        { CODEGRAPH_WATCHDOG_TIMEOUT_MS: '500' },
        'setTimeout(() => { while (true) {} }, 150);', // wedge the event loop forever
        8000
      );
      // 'SIGKILL' (not the 'TIMEOUT' sentinel) means the harness did not do it.
      expect(signal).toBe('SIGKILL');
    }, 12000);

    it('does NOT kill a healthy process that keeps its event loop turning', async () => {
      const { code, signal } = await runChild(
        { CODEGRAPH_WATCHDOG_TIMEOUT_MS: '500' },
        // Stay responsive for 1.5s (3× the timeout), then exit cleanly with 7.
        'const iv = setInterval(() => {}, 50); setTimeout(() => { clearInterval(iv); process.exit(7); }, 1500);',
        8000
      );
      expect(signal).toBeNull(); // never signalled
      expect(code).toBe(7); // exited on its own terms
    }, 12000);

    it('does NOT kill a wedged process when CODEGRAPH_NO_WATCHDOG=1', async () => {
      const { code, signal } = await runChild(
        { CODEGRAPH_WATCHDOG_TIMEOUT_MS: '500', CODEGRAPH_NO_WATCHDOG: '1' },
        // Block the main thread for 1.5s (3× the configured timeout), then exit
        // with 3. The busy-loop is bounded, so the child ends by itself well
        // before the hard timeout.
        'setTimeout(() => { const end = Date.now() + 1500; while (Date.now() < end) {} process.exit(3); }, 150);',
        8000
      );
      // Killed by neither the watchdog (disabled) nor the hard timeout — it ran
      // its bounded busy-loop and exited 3 on its own.
      expect(signal).toBeNull();
      // A null signal alone would also match a child that crashed at startup;
      // the exit code proves the script actually ran through its busy-loop.
      expect(code).toBe(3);
    }, 12000);
  });