Переглянути джерело

feat(cli): codegraph stop / list to manage background daemons (#861)

Adds first-class daemon control (the #845 pain point: no clean way to stop a
runaway daemon). `codegraph stop [path]` stops the current/given project's
daemon (SIGTERM -> SIGKILL fallback, sweeps artifacts); `stop --all` stops every
daemon; `list`/`ps` shows running daemons (--json for scripts).

Discovery via a small self-healing registry: each daemon records its root under
~/.codegraph/daemons/ on start, removes it on graceful shutdown; readers prune
dead pids. Cross-platform by construction (files + process.kill). Validated live
on macOS, Linux (Docker), and Windows (VM): registry unit tests 6/6 and
real-daemon stop/list checks 6/6 on each.
Colby Mchenry 1 тиждень тому
батько
коміт
0f825649a1
5 змінених файлів з 375 додано та 1 видалено
  1. 1 0
      CHANGELOG.md
  2. 103 0
      __tests__/daemon-registry.test.ts
  3. 70 1
      src/bin/codegraph.ts
  4. 195 0
      src/mcp/daemon-registry.ts
  5. 6 0
      src/mcp/daemon.ts

+ 1 - 0
CHANGELOG.md

@@ -11,6 +11,7 @@ and adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ### New Features
 
+- New `codegraph list` and `codegraph stop` commands for managing the background daemon. `codegraph list` (alias `ps`) shows every running CodeGraph daemon — project, pid, version, uptime — with `--json` for scripting. `codegraph stop` stops the daemon for the current project (or `codegraph stop <path>`, or `codegraph stop --all` to stop every daemon on the machine). Previously the only way to shut a daemon down was to hunt for its pid and `kill` it by hand. (#845)
 - The CodeGraph MCP server now self-heals if its main thread ever locks up. A lightweight watchdog notices when the process has stopped responding and stops it so a fresh one starts on your next request — it can no longer sit pinned at 100% CPU with no way to recover. Tune the detection window with `CODEGRAPH_WATCHDOG_TIMEOUT_MS`, or turn it off entirely with `CODEGRAPH_NO_WATCHDOG=1`. (#850)
 
 ### Fixes

+ 103 - 0
__tests__/daemon-registry.test.ts

@@ -0,0 +1,103 @@
+import { describe, it, expect, beforeEach, afterEach } from 'vitest';
+import { spawn } from 'child_process';
+import * as fs from 'fs';
+import * as os from 'os';
+import * as path from 'path';
+import {
+  getRegistryDir,
+  isProcessAlive,
+  registerDaemon,
+  deregisterDaemon,
+  listDaemons,
+  type DaemonRecord,
+} from '../src/mcp/daemon-registry';
+
+/**
+ * Yields the pid of a process that has already exited: starts a throwaway Node
+ * child that exits immediately, waits for its 'exit' event, then pauses briefly.
+ */
+async function deadPid(): Promise<number> {
+  const proc = spawn(process.execPath, ['-e', 'process.exit(0)']);
+  const exitedPid = proc.pid!;
+  await new Promise<void>((resolve) => {
+    proc.on('exit', () => resolve());
+  });
+  // Short grace period so the OS has finished reaping the child.
+  await new Promise((resolve) => setTimeout(resolve, 50));
+  return exitedPid;
+}
+
+/**
+ * Test fixture: builds a DaemonRecord for `root`/`pid` with a fixed version and
+ * a socket path under `root`. `startedAt` defaults to the current time.
+ */
+function rec(root: string, pid: number, startedAt = Date.now()): DaemonRecord {
+  const socketPath = `${root}/.codegraph/daemon.sock`;
+  const record: DaemonRecord = { root, pid, version: '1.0.0', socketPath, startedAt };
+  return record;
+}
+
+describe('daemon-registry', () => {
+  let tmpHome: string;
+  let prevHome: string | undefined;
+  let prevUserProfile: string | undefined;
+
+  /** Restore one env var to its saved value, removing it when it was unset. */
+  const restoreEnv = (key: 'HOME' | 'USERPROFILE', saved: string | undefined): void => {
+    if (saved === undefined) {
+      delete process.env[key];
+    } else {
+      process.env[key] = saved;
+    }
+  };
+
+  /** Names of the `.json` record files currently in the registry directory. */
+  const recordFiles = (): string[] =>
+    fs.readdirSync(getRegistryDir()).filter((name) => name.endsWith('.json'));
+
+  beforeEach(() => {
+    tmpHome = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-reg-home-'));
+    prevHome = process.env.HOME;
+    prevUserProfile = process.env.USERPROFILE;
+    // Redirect os.homedir(): HOME covers POSIX, USERPROFILE covers Windows.
+    process.env.HOME = tmpHome;
+    process.env.USERPROFILE = tmpHome;
+    // Guard: if the registry did not resolve under the temp home, the tests
+    // would write into the real ~/.codegraph.
+    expect(getRegistryDir().startsWith(tmpHome)).toBe(true);
+  });
+
+  afterEach(() => {
+    restoreEnv('HOME', prevHome);
+    restoreEnv('USERPROFILE', prevUserProfile);
+    try {
+      fs.rmSync(tmpHome, { recursive: true, force: true });
+    } catch {
+      /* ignore */
+    }
+  });
+
+  describe('isProcessAlive', () => {
+    it('is true for our own process and false for junk/dead pids', async () => {
+      expect(isProcessAlive(process.pid)).toBe(true);
+      expect(isProcessAlive(0)).toBe(false);
+      expect(isProcessAlive(-1)).toBe(false);
+      expect(isProcessAlive(NaN)).toBe(false);
+      const exited = await deadPid();
+      expect(isProcessAlive(exited)).toBe(false);
+    });
+  });
+
+  it('listDaemons returns [] when nothing is registered (no dir yet)', () => {
+    const listed = listDaemons();
+    expect(listed).toEqual([]);
+  });
+
+  it('register → list shows a live daemon; deregister removes it', () => {
+    registerDaemon(rec('/proj/a', process.pid));
+    const running = listDaemons();
+    expect(running).toHaveLength(1);
+    const [first] = running;
+    expect(first.root).toBe('/proj/a');
+    expect(first.pid).toBe(process.pid);
+
+    deregisterDaemon('/proj/a');
+    expect(listDaemons()).toEqual([]);
+  });
+
+  it('prunes records whose process is dead', async () => {
+    const exited = await deadPid();
+    registerDaemon(rec('/proj/dead', exited));
+    registerDaemon(rec('/proj/live', process.pid));
+
+    const running = listDaemons();
+    expect(running).toHaveLength(1);
+    expect(running[0].root).toBe('/proj/live');
+
+    // Listing removed the dead daemon's record file as a side effect.
+    expect(recordFiles()).toHaveLength(1);
+  });
+
+  it('peeking with prune:false leaves dead records on disk', async () => {
+    const exited = await deadPid();
+    registerDaemon(rec('/proj/dead', exited));
+    // The dead daemon is still filtered out of the returned list...
+    expect(listDaemons({ prune: false })).toEqual([]);
+    // ...but its record file stays on disk for the caller to inspect.
+    expect(recordFiles()).toHaveLength(1);
+  });
+
+  it('lists multiple live daemons newest-first', () => {
+    registerDaemon(rec('/proj/old', process.pid, 1000));
+    registerDaemon(rec('/proj/new', process.pid, 2000));
+    const roots = listDaemons().map((daemon) => daemon.root);
+    expect(roots).toEqual(['/proj/new', '/proj/old']);
+  });
+});

+ 70 - 1
src/bin/codegraph.ts

@@ -26,7 +26,7 @@
 import { Command } from 'commander';
 import * as path from 'path';
 import * as fs from 'fs';
-import { getCodeGraphDir, isInitialized, unsafeIndexRootReason } from '../directory';
+import { getCodeGraphDir, isInitialized, unsafeIndexRootReason, findNearestCodeGraphRoot } from '../directory';
 import { detectWorktreeIndexMismatch, worktreeMismatchWarning } from '../sync/worktree';
 import { createShimmerProgress } from '../ui/shimmer-progress';
 import { getGlyphs } from '../ui/glyphs';
@@ -1268,6 +1268,75 @@ function printFileTree(
   renderNode(root, '', true, 0);
 }
 
+/**
+ * codegraph stop — shut down the background daemon for one project, or for
+ * every registered daemon with --all.
+ */
+program
+  .command('stop [path]')
+  .description('Stop the background CodeGraph daemon for a project (defaults to the current one)')
+  .option('-a, --all', 'Stop every running CodeGraph daemon on this machine')
+  .action(async (pathArg: string | undefined, options: { all?: boolean }) => {
+    const registry = await import('../mcp/daemon-registry');
+    // The 'kill' outcome is called out in the success message as a forced stop.
+    const forcedNote = (outcome: string): string => (outcome === 'kill' ? ', forced' : '');
+    try {
+      if (options.all) {
+        const results = await registry.stopAllDaemons();
+        const stopped = results.filter(({ outcome }) => outcome === 'term' || outcome === 'kill');
+        if (stopped.length === 0) {
+          info('No running CodeGraph daemons.');
+          return;
+        }
+        stopped.forEach((entry) => {
+          success(`Stopped daemon (pid ${entry.pid}${forcedNote(entry.outcome)}) — ${entry.root}`);
+        });
+        return;
+      }
+
+      const startDir = path.resolve(pathArg || process.cwd());
+      const found = findNearestCodeGraphRoot(startDir);
+      if (!found) {
+        error('No CodeGraph project found here. Run inside a project, pass a path, or use --all.');
+        process.exit(1);
+      }
+      let root = found;
+      try {
+        root = fs.realpathSync(found);
+      } catch {
+        /* fall back to the un-realpath'd root */
+      }
+
+      const { pid, outcome } = await registry.stopDaemonAt(root);
+      if (outcome !== 'no-daemon' && outcome !== 'not-running') {
+        success(`Stopped daemon (pid ${pid}${forcedNote(outcome)}) for ${root}.`);
+      } else {
+        info(`No daemon running for ${root}.`);
+      }
+    } catch (err) {
+      const reason = err instanceof Error ? err.message : String(err);
+      error(`Failed to stop daemon: ${reason}`);
+      process.exit(1);
+    }
+  });
+
+/**
+ * codegraph list (alias: ps) — print the daemons the registry reports as
+ * running, either as JSON or as one line per daemon.
+ */
+program
+  .command('list')
+  .alias('ps')
+  .description('List running CodeGraph background daemons')
+  .option('--json', 'Output as JSON')
+  .action(async (options: { json?: boolean }) => {
+    const registry = await import('../mcp/daemon-registry');
+    const running = registry.listDaemons();
+
+    if (options.json) {
+      // Machine-readable mode: the raw records, nothing else.
+      process.stdout.write(`${JSON.stringify(running, null, 2)}\n`);
+      return;
+    }
+    if (running.length === 0) {
+      info('No CodeGraph daemons running.');
+      return;
+    }
+    running.forEach((daemon) => {
+      const uptime = formatDuration(Date.now() - daemon.startedAt);
+      console.log(`pid ${daemon.pid}  v${daemon.version}  up ${uptime}  ${daemon.root}`);
+    });
+    info('Stop one with "codegraph stop <path>", or all with "codegraph stop --all".');
+  });
+
 /**
  * codegraph serve
  */

+ 195 - 0
src/mcp/daemon-registry.ts

@@ -0,0 +1,195 @@
+/**
+ * Global daemon registry + stop/list control — the discovery layer behind
+ * `codegraph list` and `codegraph stop [--all]`.
+ *
+ * Every per-project daemon already writes an authoritative lockfile at
+ * `<root>/.codegraph/daemon.pid`. That's enough to stop ONE daemon you can name,
+ * but there's no central place to find them ALL — which `list` and `stop --all`
+ * need. So each daemon also drops a tiny record under `~/.codegraph/daemons/` on
+ * start and removes it on graceful shutdown.
+ *
+ * The registry is a DISCOVERY index, never a source of truth: the live pid is.
+ * A SIGKILL'd daemon can't remove its own record, so readers prune any record
+ * whose pid is dead (`isProcessAlive`). Every write/read is best-effort — a
+ * registry hiccup must never break the daemon or a command; worst case `list`
+ * momentarily misses or over-lists one, which the next liveness prune corrects.
+ *
+ * Cross-platform by construction: only files + `process.kill(pid, signal)`,
+ * which behave consistently on macOS/Linux (real signals) and Windows (mapped to
+ * TerminateProcess). Validated live on all three.
+ */
+import * as fs from 'fs';
+import * as os from 'os';
+import * as path from 'path';
+import * as crypto from 'crypto';
+import { getDaemonPidPath, getDaemonSocketPath, decodeLockInfo } from './daemon-paths';
+
+/** One registry entry describing a daemon, as written by `registerDaemon`. */
+export interface DaemonRecord {
+  /** Realpath'd project root the daemon serves. */
+  root: string;
+  /** Process id of the daemon; `listDaemons` probes it for liveness. */
+  pid: number;
+  /** Daemon version string (shown as `v<version>` by `codegraph list`). */
+  version: string;
+  /** Path of the daemon's socket. */
+  socketPath: string;
+  /** Epoch ms when the daemon bound its socket. */
+  startedAt: number;
+}
+
+/**
+ * Directory holding the per-daemon registry records: `<home>/.codegraph/daemons`.
+ * It is global (derived from `os.homedir()`), and this function does not consult
+ * the `CODEGRAPH_DIR` env var.
+ */
+export function getRegistryDir(): string {
+  const home = os.homedir();
+  return path.join(home, '.codegraph', 'daemons');
+}
+
+/**
+ * Registry file for `root`: `<registry dir>/<h>.json`, where `<h>` is the first
+ * 16 hex characters of the SHA-256 of the resolved root path.
+ */
+function recordPath(root: string): string {
+  const absoluteRoot = path.resolve(root);
+  const digest = crypto.createHash('sha256').update(absoluteRoot).digest('hex');
+  const fileName = `${digest.slice(0, 16)}.json`;
+  return path.join(getRegistryDir(), fileName);
+}
+
+/**
+ * Reports whether `pid` refers to a live process.
+ *
+ * Non-integer or non-positive pids are rejected up front. Otherwise the pid is
+ * probed with `process.kill(pid, 0)`, which delivers no signal: success means
+ * alive, an EPERM failure means alive but not ours to signal, and any other
+ * failure (e.g. ESRCH) means not alive.
+ */
+export function isProcessAlive(pid: number): boolean {
+  const plausible = Number.isInteger(pid) && pid > 0;
+  if (!plausible) return false;
+  try {
+    process.kill(pid, 0);
+  } catch (err) {
+    const { code } = err as NodeJS.ErrnoException;
+    return code === 'EPERM';
+  }
+  return true;
+}
+
+/**
+ * Best-effort: record this daemon so `list`/`stop --all` can find it.
+ *
+ * The record is written to a staging file and then renamed over the final
+ * path, so a concurrent `listDaemons()` never sees a half-written `.json` file
+ * (which it would fail to parse and prune). The staging name deliberately does
+ * not end in `.json`, so readers ignore it. If the staged write or the rename
+ * fails, the record is written in place instead. Any remaining failure is
+ * swallowed: a registry problem must not break the daemon.
+ *
+ * @param rec Record to store; the file name is derived from `rec.root`.
+ */
+export function registerDaemon(rec: DaemonRecord): void {
+  try {
+    const target = recordPath(rec.root);
+    const staging = `${target}.${process.pid}.tmp`;
+    const payload = JSON.stringify(rec, null, 2) + '\n';
+    fs.mkdirSync(getRegistryDir(), { recursive: true });
+    try {
+      fs.writeFileSync(staging, payload, { mode: 0o600 });
+      fs.renameSync(staging, target);
+    } catch {
+      // Staged write or rename failed: drop the staging file and fall back to
+      // the plain in-place write.
+      try { fs.unlinkSync(staging); } catch { /* never created */ }
+      fs.writeFileSync(target, payload, { mode: 0o600 });
+    }
+  } catch {
+    /* best-effort — list's liveness prune tolerates a missing record */
+  }
+}
+
+/**
+ * Best-effort removal of the registry record for `root` (called on graceful
+ * shutdown and after a stop). Any failure, including a missing record, is ignored.
+ */
+export function deregisterDaemon(root: string): void {
+  try {
+    const file = recordPath(root);
+    fs.unlinkSync(file);
+  } catch {
+    /* already gone */
+  }
+}
+
+/**
+ * Read and parse one registry file. Returns null when the file is unreadable,
+ * is not JSON, or lacks the two fields everything else depends on: a numeric
+ * `pid` and a string `root`.
+ */
+function readDaemonRecord(file: string): DaemonRecord | null {
+  try {
+    const parsed: unknown = JSON.parse(fs.readFileSync(file, 'utf8'));
+    if (typeof parsed !== 'object' || parsed === null) return null;
+    const candidate = parsed as Partial<DaemonRecord>;
+    if (typeof candidate.pid !== 'number' || typeof candidate.root !== 'string') return null;
+    return candidate as DaemonRecord;
+  } catch {
+    return null;
+  }
+}
+
+/**
+ * All registered daemons whose process is still alive, newest first. Dead/garbage
+ * records are deleted as a side effect (self-healing) unless `prune` is false.
+ *
+ * @param opts.prune Delete record files that are invalid or whose pid is dead
+ *   (default true). With false they are only filtered out of the result.
+ * @returns Live records sorted by `startedAt` descending; `[]` when the
+ *   registry directory cannot be read (e.g. it does not exist yet).
+ */
+export function listDaemons(opts: { prune?: boolean } = {}): DaemonRecord[] {
+  const prune = opts.prune ?? true;
+  const dir = getRegistryDir();
+  let files: string[];
+  try {
+    files = fs.readdirSync(dir).filter((f) => f.endsWith('.json'));
+  } catch {
+    return []; // no registry dir yet
+  }
+
+  const live: DaemonRecord[] = [];
+  for (const file of files) {
+    const full = path.join(dir, file);
+    const rec = readDaemonRecord(full);
+    if (rec !== null && isProcessAlive(rec.pid)) {
+      live.push(rec);
+    } else if (prune) {
+      try { fs.unlinkSync(full); } catch { /* ignore */ }
+    }
+  }
+  // A record without a usable startedAt sorts as oldest, rather than making the
+  // comparator return NaN (which leaves the resulting order unspecified).
+  const startedOf = (r: DaemonRecord): number => (Number.isFinite(r.startedAt) ? r.startedAt : 0);
+  return live.sort((a, b) => startedOf(b) - startedOf(a));
+}
+
+/**
+ * Sweep what a stopped daemon may have left behind for `root`: its pid
+ * lockfile, its socket file (skipped on win32) and its registry record. Each
+ * removal is independent and ignores failures.
+ */
+function cleanupDaemonArtifacts(root: string): void {
+  // Path resolution happens inside the try so a failing resolver is ignored too.
+  const removeQuietly = (resolvePath: () => string): void => {
+    try {
+      fs.unlinkSync(resolvePath());
+    } catch {
+      /* gone */
+    }
+  };
+
+  removeQuietly(() => getDaemonPidPath(root));
+  // POSIX sockets are real files; Windows named pipes vanish with the process.
+  if (process.platform !== 'win32') {
+    removeQuietly(() => getDaemonSocketPath(root));
+  }
+  deregisterDaemon(root);
+}
+
+/** Resolves after `ms` milliseconds. */
+const sleep = (ms: number): Promise<void> =>
+  new Promise<void>((resolve) => {
+    setTimeout(resolve, ms);
+  });
+
+/**
+ * Polls `pid` every 100 ms until it is no longer alive or `timeoutMs` has
+ * elapsed. Returns true if the process was observed dead; a final probe is made
+ * once the deadline passes.
+ */
+async function waitForDeath(pid: number, timeoutMs: number): Promise<boolean> {
+  const giveUpAt = Date.now() + timeoutMs;
+  for (let now = Date.now(); now < giveUpAt; now = Date.now()) {
+    if (!isProcessAlive(pid)) return true;
+    await sleep(100);
+  }
+  return !isProcessAlive(pid);
+}
+
+/** What `stopDaemonAt` reports for one project root. */
+export interface StopResult {
+  /** The project root that was passed to `stopDaemonAt`. */
+  root: string;
+  /** Pid that was targeted, or null when neither the lockfile nor the registry yielded one. */
+  pid: number | null;
+  /** 'term' graceful, 'kill' force, 'not-running' stale lock, 'no-daemon' none found. */
+  outcome: 'term' | 'kill' | 'not-running' | 'no-daemon';
+}
+
+/**
+ * Send `signal` to `pid` on behalf of a stop request for `root`.
+ *
+ * ESRCH is ignored: the process exited between the liveness check and the
+ * signal. Any other failure (e.g. EPERM for a process we may not signal) is
+ * raised as an Error, because in that case the daemon was NOT stopped.
+ */
+function signalDaemon(pid: number, signal: NodeJS.Signals, root: string): void {
+  try {
+    process.kill(pid, signal);
+  } catch (err) {
+    const code = (err as NodeJS.ErrnoException | undefined)?.code;
+    if (code === 'ESRCH') return; // raced to exit
+    throw new Error(`cannot send ${signal} to daemon pid ${pid} for ${root}: ${code ?? String(err)}`);
+  }
+}
+
+/**
+ * Stop the daemon serving `root`: SIGTERM, wait, then SIGKILL if it won't go,
+ * then sweep its artifacts. `root` must be realpath'd (match how the daemon
+ * keys its socket/lockfile). Resolves the pid from the authoritative lockfile,
+ * falling back to the registry.
+ *
+ * @returns 'no-daemon' when no pid could be found, 'not-running' when the pid
+ *   is already dead (artifacts are swept in both cases), 'term' when SIGTERM
+ *   sufficed, 'kill' when SIGKILL had to be sent.
+ * @throws Error when a signal cannot be delivered for any reason other than
+ *   the process having already exited (e.g. EPERM). The daemon's lockfile,
+ *   socket and registry record are left untouched in that case, since the
+ *   daemon is still running and still owns them.
+ */
+export async function stopDaemonAt(root: string): Promise<StopResult> {
+  let pid: number | null = null;
+  try {
+    const info = decodeLockInfo(fs.readFileSync(getDaemonPidPath(root), 'utf8'));
+    pid = info?.pid ?? null;
+  } catch {
+    /* no lockfile */
+  }
+  if (pid == null) {
+    // Peek without pruning: this lookup must not delete records as a side effect.
+    const rec = listDaemons({ prune: false }).find(
+      (r) => path.resolve(r.root) === path.resolve(root)
+    );
+    pid = rec?.pid ?? null;
+  }
+
+  if (pid == null) {
+    cleanupDaemonArtifacts(root);
+    return { root, pid: null, outcome: 'no-daemon' };
+  }
+  if (!isProcessAlive(pid)) {
+    cleanupDaemonArtifacts(root);
+    return { root, pid, outcome: 'not-running' };
+  }
+
+  // POSIX: SIGTERM runs the daemon's graceful shutdown. Windows: TerminateProcess
+  // (no graceful path), so we always sweep artifacts ourselves below.
+  signalDaemon(pid, 'SIGTERM', root);
+  let outcome: StopResult['outcome'] = 'term';
+  if (!(await waitForDeath(pid, 3000))) {
+    signalDaemon(pid, 'SIGKILL', root);
+    // SIGKILL was delivered, so give the OS a moment to tear the process down;
+    // the result is reported as a forced stop either way.
+    await waitForDeath(pid, 2000);
+    outcome = 'kill';
+  }
+  cleanupDaemonArtifacts(root);
+  return { root, pid, outcome };
+}
+
+/**
+ * Stops each daemon that `listDaemons()` currently reports as live, one after
+ * another, and returns the per-daemon results in that same order.
+ */
+export async function stopAllDaemons(): Promise<StopResult[]> {
+  const outcomes: StopResult[] = [];
+  for (const { root } of listDaemons()) {
+    const result = await stopDaemonAt(root);
+    outcomes.push(result);
+  }
+  return outcomes;
+}

+ 6 - 0
src/mcp/daemon.ts

@@ -54,6 +54,7 @@ import {
   getDaemonSocketPath,
 } from './daemon-paths';
 import { CodeGraphPackageVersion } from './version';
+import { registerDaemon, deregisterDaemon } from './daemon-registry';
 
 /** Default idle linger after the last client disconnects. */
 const DEFAULT_IDLE_TIMEOUT_MS = 300_000;
@@ -191,6 +192,10 @@ export class Daemon {
       startedAt: Date.now(),
     };
 
+    // Drop a discovery record so `codegraph list` / `stop --all` can find us.
+    // Best-effort; a missing record only means list's liveness prune covers it.
+    registerDaemon({ root: this.projectRoot, ...lock });
+
     process.stderr.write(
       `[CodeGraph daemon] Listening on ${this.socketPath} (pid ${process.pid}, v${CodeGraphPackageVersion}). Idle timeout ${this.idleTimeoutMs}ms.\n`
     );
@@ -244,6 +249,7 @@ export class Daemon {
     }
     this.engine.stop();
     this.cleanupLockfile();
+    deregisterDaemon(this.projectRoot);
     if (process.platform !== 'win32') {
       try { fs.unlinkSync(this.socketPath); } catch { /* may already be gone */ }
     }