|
@@ -51,6 +51,7 @@ import {
|
|
|
decodeLockInfo,
|
|
decodeLockInfo,
|
|
|
encodeLockInfo,
|
|
encodeLockInfo,
|
|
|
getDaemonPidPath,
|
|
getDaemonPidPath,
|
|
|
|
|
+ getDaemonSocketCandidates,
|
|
|
getDaemonSocketPath,
|
|
getDaemonSocketPath,
|
|
|
} from './daemon-paths';
|
|
} from './daemon-paths';
|
|
|
import { CodeGraphPackageVersion } from './version';
|
|
import { CodeGraphPackageVersion } from './version';
|
|
@@ -169,39 +170,60 @@ export class Daemon {
|
|
|
// (cross-project tool calls only) shouldn't pay any open cost.
|
|
// (cross-project tool calls only) shouldn't pay any open cost.
|
|
|
void this.engine.ensureInitialized(this.projectRoot);
|
|
void this.engine.ensureInitialized(this.projectRoot);
|
|
|
|
|
|
|
|
- // Stale socket file (left over from a SIGKILL'd previous daemon) will
|
|
|
|
|
- // wedge `listen` with EADDRINUSE. We arrived here holding the lockfile,
|
|
|
|
|
- // which means there's no live daemon, so it's safe to clear.
|
|
|
|
|
- if (process.platform !== 'win32') {
|
|
|
|
|
- try { fs.unlinkSync(this.socketPath); } catch { /* not-exists is fine */ }
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- await new Promise<void>((resolve, reject) => {
|
|
|
|
|
- const server = net.createServer((socket) => this.handleConnection(socket));
|
|
|
|
|
- server.once('error', (err) => reject(err));
|
|
|
|
|
- server.listen(this.socketPath, () => {
|
|
|
|
|
- // POSIX: tighten permissions to user-only — the socket lives under
|
|
|
|
|
- // `.codegraph/`, which is git-ignored but may be on a shared FS.
|
|
|
|
|
|
|
+ // Walk the ordered socket candidates and bind the first that works. The
|
|
|
|
|
+ // in-project path comes first; the deterministic tmpdir path is the fallback
|
|
|
|
|
+ // for a filesystem that can't host an AF_UNIX node at all (ExFAT/FAT external
|
|
|
|
|
+ // volumes, some network mounts, WSL2 DrvFs → ENOTSUP/EACCES; #997, #974). The
|
|
|
|
|
+ // `listen` closure clears a stale socket (left by a SIGKILL'd previous daemon)
|
|
|
|
|
+ // before each attempt — safe because we hold the lockfile, so no live daemon
|
|
|
|
|
+ // owns it; without it `listen` would wedge on EADDRINUSE.
|
|
|
|
|
+ const candidates = getDaemonSocketCandidates(this.projectRoot);
|
|
|
|
|
+ const listen = (socketPath: string): Promise<net.Server> =>
|
|
|
|
|
+ new Promise<net.Server>((resolve, reject) => {
|
|
|
if (process.platform !== 'win32') {
|
|
if (process.platform !== 'win32') {
|
|
|
- try { fs.chmodSync(this.socketPath, 0o600); } catch { /* best-effort */ }
|
|
|
|
|
|
|
+ try { fs.unlinkSync(socketPath); } catch { /* not-exists is fine */ }
|
|
|
}
|
|
}
|
|
|
- this.server = server;
|
|
|
|
|
- resolve();
|
|
|
|
|
|
|
+ const server = net.createServer((socket) => this.handleConnection(socket));
|
|
|
|
|
+ server.once('error', reject);
|
|
|
|
|
+ server.listen(socketPath, () => {
|
|
|
|
|
+ // POSIX: tighten permissions to user-only — the socket lives under
|
|
|
|
|
+ // `.codegraph/` (git-ignored, maybe a shared FS) or tmpdir.
|
|
|
|
|
+ if (process.platform !== 'win32') {
|
|
|
|
|
+ try { fs.chmodSync(socketPath, 0o600); } catch { /* best-effort */ }
|
|
|
|
|
+ }
|
|
|
|
|
+ resolve(server);
|
|
|
|
|
+ });
|
|
|
|
|
+ });
|
|
|
|
|
+
|
|
|
|
|
+ let bound: { server: net.Server; socketPath: string };
|
|
|
|
|
+ try {
|
|
|
|
|
+ bound = await bindFirstUsableSocket(candidates, listen, {
|
|
|
|
|
+ onRelocate: (from, to, code) =>
|
|
|
|
|
+ process.stderr.write(
|
|
|
|
|
+ `[CodeGraph daemon] Socket ${from} unusable (${code}); relocating to ${to}.\n`
|
|
|
|
|
+ ),
|
|
|
});
|
|
});
|
|
|
- }).catch((err) => {
|
|
|
|
|
- // Bind failed — e.g. AF_UNIX is unsupported/unreliable on this filesystem
|
|
|
|
|
- // (the WSL2 DrvFs hazard behind #974), or a stale socket we couldn't clear.
|
|
|
|
|
- // We already hold the lockfile that `tryAcquireDaemonLock` wrote; release it
|
|
|
|
|
- // and any partial socket so the NEXT launcher doesn't spin respawning us on
|
|
|
|
|
- // a stale lock that points at our now-dying pid. Then re-throw so the caller
|
|
|
|
|
- // (the bin's try/catch) exits this detached daemon cleanly and every
|
|
|
|
|
- // launcher falls back to direct mode.
|
|
|
|
|
|
|
+ } catch (err) {
|
|
|
|
|
+ // Every candidate failed (the last one, or a non-relocatable error like a
|
|
|
|
|
+ // racing EADDRINUSE). We already hold the lockfile `tryAcquireDaemonLock`
|
|
|
|
|
+ // wrote; release it and any partial sockets so the NEXT launcher doesn't
|
|
|
|
|
+ // spin respawning us on a stale lock pointing at our now-dying pid. Then
|
|
|
|
|
+ // re-throw so the caller (the bin's try/catch) exits this detached daemon
|
|
|
|
|
+ // cleanly and every launcher falls back to direct mode (#974).
|
|
|
this.cleanupLockfile();
|
|
this.cleanupLockfile();
|
|
|
if (process.platform !== 'win32') {
|
|
if (process.platform !== 'win32') {
|
|
|
- try { fs.unlinkSync(this.socketPath); } catch { /* may not exist */ }
|
|
|
|
|
|
|
+ for (const candidate of candidates) {
|
|
|
|
|
+ try { fs.unlinkSync(candidate); } catch { /* may not exist */ }
|
|
|
|
|
+ }
|
|
|
}
|
|
}
|
|
|
throw err;
|
|
throw err;
|
|
|
- });
|
|
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ this.server = bound.server;
|
|
|
|
|
+ // Adopt the path we ACTUALLY bound — it may be a tmpdir fallback past an
|
|
|
|
|
+ // unusable in-project location. Everything downstream (lockfile, registry,
|
|
|
|
|
+ // chmod, cleanup, status) keys off this real path, not the preferred guess.
|
|
|
|
|
+ this.socketPath = bound.socketPath;
|
|
|
|
|
|
|
|
const lock: DaemonLockInfo = {
|
|
const lock: DaemonLockInfo = {
|
|
|
pid: process.pid,
|
|
pid: process.pid,
|
|
@@ -210,6 +232,19 @@ export class Daemon {
|
|
|
startedAt: Date.now(),
|
|
startedAt: Date.now(),
|
|
|
};
|
|
};
|
|
|
|
|
|
|
|
|
|
+ // `tryAcquireDaemonLock` wrote the pidfile with the PREFERRED path (candidate
|
|
|
|
|
+ // 0) before we knew which one would bind. If we relocated, rewrite it so the
|
|
|
|
|
+ // per-project record is honest. Atomic temp+rename; safe because we hold the
|
|
|
|
|
+ // lock and we're alive — `clearStaleDaemonLock` pid-verifies, so no racing
|
|
|
|
|
+ // candidate clears or clobbers a live daemon's lock.
|
|
|
|
|
+ if (this.socketPath !== candidates[0]) {
|
|
|
|
|
+ try {
|
|
|
|
|
+ const tmpPid = `${this.pidPath}.${process.pid}.relocate`;
|
|
|
|
|
+ fs.writeFileSync(tmpPid, encodeLockInfo(lock), { mode: 0o600 });
|
|
|
|
|
+ fs.renameSync(tmpPid, this.pidPath);
|
|
|
|
|
+ } catch { /* best-effort; the registry record below carries the real path */ }
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
// Drop a discovery record so `codegraph list` / `stop --all` can find us.
|
|
// Drop a discovery record so `codegraph list` / `stop --all` can find us.
|
|
|
// Best-effort; a missing record only means list's liveness prune covers it.
|
|
// Best-effort; a missing record only means list's liveness prune covers it.
|
|
|
registerDaemon({ root: this.projectRoot, ...lock });
|
|
registerDaemon({ root: this.projectRoot, ...lock });
|
|
@@ -433,6 +468,17 @@ export type AcquireResult =
|
|
|
* the pidfile becomes visible in one step already containing a full record.
|
|
* the pidfile becomes visible in one step already containing a full record.
|
|
|
* Whoever links first wins; everyone else gets EEXIST and reads a complete file.
|
|
* Whoever links first wins; everyone else gets EEXIST and reads a complete file.
|
|
|
* There is no empty-file window at all.
|
|
* There is no empty-file window at all.
|
|
|
|
|
+ *
|
|
|
|
|
+ * Filesystems without hard links (#997): ExFAT/FAT external volumes and some
|
|
|
|
|
+ * network mounts can't `link()` at all — it throws ENOTSUP/EPERM, which would
|
|
|
|
|
+ * otherwise kill the daemon before it ever reaches the socket bind. There we
|
|
|
|
|
+ * fall back to an O_EXCL create (`acquireLockViaExclusiveOpen`): still exclusive
|
|
|
|
|
+ * ("first writer wins"), but the full record is written through the fd in a
|
|
|
|
|
+ * second step, so the empty-file window the link approach removed is reopened —
|
|
|
|
|
+ * only on these filesystems, only for the microseconds between create and write
|
|
|
|
|
+ * (far narrower than the original bug, which the file watcher's startup latency
|
|
|
|
|
+ * widened). The race's worst case is two daemons briefly; on a single external
|
|
|
|
|
+ * drive that's strictly better than the daemon never starting at all.
|
|
|
*/
|
|
*/
|
|
|
export function tryAcquireDaemonLock(projectRoot: string): AcquireResult {
|
|
export function tryAcquireDaemonLock(projectRoot: string): AcquireResult {
|
|
|
const pidPath = getDaemonPidPath(projectRoot);
|
|
const pidPath = getDaemonPidPath(projectRoot);
|
|
@@ -453,10 +499,21 @@ export function tryAcquireDaemonLock(projectRoot: string): AcquireResult {
|
|
|
try {
|
|
try {
|
|
|
fs.writeFileSync(tmp, encodeLockInfo(info), { mode: 0o600 });
|
|
fs.writeFileSync(tmp, encodeLockInfo(info), { mode: 0o600 });
|
|
|
try {
|
|
try {
|
|
|
- fs.linkSync(tmp, pidPath); // atomic + exclusive
|
|
|
|
|
|
|
+ fs.linkSync(tmp, pidPath); // atomic + exclusive (race-free; see must-fix 1)
|
|
|
acquired = true;
|
|
acquired = true;
|
|
|
} catch (err: unknown) {
|
|
} catch (err: unknown) {
|
|
|
- if ((err as NodeJS.ErrnoException).code !== 'EEXIST') throw err;
|
|
|
|
|
|
|
+ if ((err as NodeJS.ErrnoException).code === 'EEXIST') {
|
|
|
|
|
+ // Lost the race — another candidate already holds it. Fall through to read.
|
|
|
|
|
+ } else {
|
|
|
|
|
+ // link() failed for a non-conflict reason — nearly always "this filesystem
|
|
|
|
|
+ // has no hard links" (ExFAT/FAT external volumes, some network mounts),
|
|
|
|
|
+ // which surfaces as a DIFFERENT errno on every OS: ENOTSUP on macOS, EPERM
|
|
|
|
|
+ // on Linux, EISDIR on Windows (#997). Enumerating them is whack-a-mole and
|
|
|
|
|
+ // unnecessary: the `tmp` write above already proved this directory is
|
|
|
|
|
+ // writable, so an O_EXCL create is a valid atomic+exclusive substitute. If
|
|
|
|
|
+ // IT fails too, that's a genuine error and propagates. EEXIST ⇒ taken.
|
|
|
|
|
+ acquired = acquireLockViaExclusiveOpen(pidPath, info);
|
|
|
|
|
+ }
|
|
|
}
|
|
}
|
|
|
} finally {
|
|
} finally {
|
|
|
try { fs.unlinkSync(tmp); } catch { /* temp already gone */ }
|
|
try { fs.unlinkSync(tmp); } catch { /* temp already gone */ }
|
|
@@ -474,6 +531,31 @@ export function tryAcquireDaemonLock(projectRoot: string): AcquireResult {
|
|
|
return { kind: 'taken', existing, pidPath };
|
|
return { kind: 'taken', existing, pidPath };
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
 * Exclusive-create the pidfile (O_CREAT|O_EXCL via the `wx` flag) and write the
 * full lock record through the same fd — the hard-link-free fallback used by
 * {@link tryAcquireDaemonLock} on filesystems without `link()`.
 *
 * Still exclusive, so "first writer wins" holds exactly as on the link path; the
 * only difference is the brief empty-file window between create and write. To
 * keep that window as small as possible the record is encoded BEFORE the file is
 * created, and a failed write never leaves a half-written lock behind.
 *
 * Exported for testing.
 *
 * @param pidPath - Path of the pidfile to create.
 * @param info - Lock record to store in the pidfile.
 * @returns `true` if we created the file (lock acquired); `false` if it already
 *   existed (EEXIST — another candidate holds it).
 * @throws Any encode error, any non-EEXIST open error, or any write/close error.
 *   In every throwing case no pidfile created by this call is left on disk
 *   (removal is best-effort).
 */
export function acquireLockViaExclusiveOpen(pidPath: string, info: DaemonLockInfo): boolean {
  // Encode first: if this throws we have not touched the filesystem, so no empty
  // lockfile can be left behind to block other launchers.
  const record = encodeLockInfo(info);

  let fd: number;
  try {
    fd = fs.openSync(pidPath, 'wx', 0o600); // O_CREAT | O_EXCL | O_WRONLY
  } catch (err: unknown) {
    if ((err as NodeJS.ErrnoException).code === 'EEXIST') return false;
    throw err;
  }

  try {
    try {
      // writeFileSync on an fd retries until the whole record is written;
      // a bare writeSync may write only part of it and report success.
      fs.writeFileSync(fd, record);
    } finally {
      fs.closeSync(fd);
    }
  } catch (err: unknown) {
    // We created this file exclusively, so it is ours to remove. Leaving an
    // empty/partial record would look like a held lock to everyone else.
    try { fs.unlinkSync(pidPath); } catch { /* best-effort cleanup */ }
    throw err;
  }
  return true;
}
|
|
|
|
|
+
|
|
|
/**
|
|
/**
|
|
|
* Remove a stale pidfile, but only if it still names a dead process. Re-reads
|
|
* Remove a stale pidfile, but only if it still names a dead process. Re-reads
|
|
|
* the file immediately before unlinking so we never delete a lock that a live
|
|
* the file immediately before unlinking so we never delete a lock that a live
|
|
@@ -520,6 +602,58 @@ export function isProcessAlive(pid: number): boolean {
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
 * The one `listen()` error we must NOT relocate past. EADDRINUSE means the path
 * is genuinely occupied — a racing daemon that legitimately owns it, or a
 * leftover node we couldn't clear (the #974 planted-dir case) — so relocating
 * would abandon a path another daemon owns; the caller instead releases its lock
 * and falls back to direct mode. EVERY OTHER bind error just means "this path
 * didn't work," almost always a filesystem that can't host an AF_UNIX node at all
 * (ExFAT/FAT, network mounts, WSL2 DrvFs), which reports a DIFFERENT errno per OS
 * (ENOTSUP macOS, EPERM Linux; #997). Enumerating the "unsupported" codes is
 * whack-a-mole, so we relocate on anything-but-conflict instead: if the fallback
 * candidate ALSO fails, that error propagates from the last candidate.
 * (ENAMETOOLONG is not expected here — the candidate list is meant to route
 * over-long paths straight to tmpdir.)
 */
const SOCKET_BIND_CONFLICT_CODE = 'EADDRINUSE';

/**
 * Extract a string errno-style `code` from an arbitrary rejection value.
 * Returns `undefined` for non-objects (including `null`/`undefined`) and for
 * objects whose `code` is missing or not a string.
 */
function socketBindErrorCode(err: unknown): string | undefined {
  if (typeof err !== 'object' || err === null) return undefined;
  const code = (err as { code?: unknown }).code;
  return typeof code === 'string' ? code : undefined;
}

/**
 * Bind the first usable socket from an ordered candidate list, relocating past
 * any path that fails to bind for a non-conflict reason (see
 * {@link SOCKET_BIND_CONFLICT_CODE}). The injected `listen` does the real
 * `net.Server.listen` (and stale-socket clear); it is abstracted so the
 * relocation policy is unit-testable without a real unsupported filesystem.
 *
 * Exported for testing.
 *
 * @param candidates - Socket paths to try, in preference order.
 * @param listen - Attempts to bind one path; resolves with the listening server
 *   or rejects with the bind error.
 * @param opts - `onRelocate(from, to, code)` is invoked each time a candidate is
 *   skipped in favour of the next one (`code` is `''` when the error carried no
 *   string code). It is a diagnostic hook only: an exception thrown by it is
 *   ignored and never changes the bind outcome.
 * @returns The server plus the path that was actually bound.
 * @throws The bind error, unchanged, when it is EADDRINUSE or when it came from
 *   the LAST candidate — the caller releases the lockfile and falls back to
 *   direct mode (#974). Throws `Error('no socket candidates to bind')` for an
 *   empty list (a programmer error).
 */
export async function bindFirstUsableSocket(
  candidates: readonly string[],
  listen: (socketPath: string) => Promise<net.Server>,
  opts: { onRelocate?: (from: string, to: string, code: string) => void } = {},
): Promise<{ server: net.Server; socketPath: string }> {
  if (candidates.length === 0) {
    throw new Error('no socket candidates to bind');
  }

  for (const [index, socketPath] of candidates.entries()) {
    try {
      const server = await listen(socketPath);
      return { server, socketPath };
    } catch (err: unknown) {
      const next = candidates[index + 1];
      const code = socketBindErrorCode(err);
      // Out of fallbacks, or the path is genuinely occupied: surface the error.
      if (next === undefined || code === SOCKET_BIND_CONFLICT_CODE) throw err;
      // Reporting is best-effort; a failing observer must not abort relocation.
      try {
        opts.onRelocate?.(socketPath, next, code ?? '');
      } catch { /* ignore observer failure and keep relocating */ }
    }
  }

  // Unreachable: the last candidate always returns or throws above. Kept so
  // every code path ends in a return/throw for the type checker.
  throw new Error('no socket candidates to bind');
}
|
|
|
|
|
+
|
|
|
function resolveIdleTimeoutMs(): number {
|
|
function resolveIdleTimeoutMs(): number {
|
|
|
const raw = process.env.CODEGRAPH_DAEMON_IDLE_TIMEOUT_MS;
|
|
const raw = process.env.CODEGRAPH_DAEMON_IDLE_TIMEOUT_MS;
|
|
|
if (raw === undefined || raw === '') return DEFAULT_IDLE_TIMEOUT_MS;
|
|
if (raw === undefined || raw === '') return DEFAULT_IDLE_TIMEOUT_MS;
|