Просмотр исходного кода

fix(watcher): degrade cleanly on watch exhaustion and prolonged lock contention (#891)

The live file watcher could stay "alive" after it had stopped being
trustworthy. EMFILE/ENFILE watch-resource exhaustion only logged (and was
silently tolerated on the Linux per-directory path), and prolonged
LockUnavailableError retried forever at the normal debounce cadence — both
left auto-sync dead while the index silently drifted stale. Especially bad
for long-running MCP/daemon sessions.

Add a one-way degrade(): on watch-resource exhaustion (any watch strategy)
or on lock contention past a bounded exponential-backoff budget, log once,
fire a new onDegraded callback, and stop. start() now returns false
consistently when the per-directory path degrades at startup — it previously
returned true on Linux, so the MCP server reported the watcher "active" when
it had degraded. Wire onDegraded into the MCP server so callers are actually
told, and expose isDegraded()/getDegradedReason().

Builds on the approach in #877 by @thismilktea. Validated on macOS
(recursive), Linux (per-directory, Docker) and Windows (recursive) — 30/30
watcher + watch-policy tests on each.

Closes #876

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Colby Mchenry, 1 неделю назад
Родитель
Коммит
cea4d086f9
Изменено файлов: 4; добавлено строк: 351, удалено строк: 13
  1. 4 0
      CHANGELOG.md
  2. 157 0
      __tests__/watcher.test.ts
  3. 8 0
      src/mcp/engine.ts
  4. 182 13
      src/sync/watcher.ts

+ 4 - 0
CHANGELOG.md

@@ -9,6 +9,10 @@ and adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 
 ## [Unreleased]
 ## [Unreleased]
 
 
+### Fixes
+
+- The file watcher that auto-syncs the graph now fails cleanly when live watching can no longer be trusted, instead of looking healthy while the index quietly goes stale. If the operating system runs out of file-watch resources, or another process holds the write lock far longer than a normal save, CodeGraph now disables auto-sync once — with a single clear message telling you to run `codegraph sync` (or rely on the git sync hooks) to refresh — rather than retrying forever or repeating the same error on a loop. This mostly matters for long-running MCP/daemon sessions, which could otherwise keep serving stale results while appearing to work. Thanks @thismilktea. (#876)
+
 
 
 ## [1.0.1] - 2026-06-13
 ## [1.0.1] - 2026-06-13
 
 

+ 157 - 0
__tests__/watcher.test.ts

@@ -18,6 +18,7 @@
  */
  */
 
 
 import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest';
 import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest';
+import { EventEmitter } from 'events';
 import * as fs from 'fs';
 import * as fs from 'fs';
 import * as path from 'path';
 import * as path from 'path';
 import * as os from 'os';
 import * as os from 'os';
@@ -25,6 +26,7 @@ import {
   FileWatcher,
   FileWatcher,
   LockUnavailableError,
   LockUnavailableError,
   __emitWatchEventForTests,
   __emitWatchEventForTests,
+  __setFsWatchForTests,
   type WatchOptions,
   type WatchOptions,
 } from '../src/sync/watcher';
 } from '../src/sync/watcher';
 import CodeGraph from '../src/index';
 import CodeGraph from '../src/index';
@@ -69,6 +71,8 @@ describe('FileWatcher', () => {
   });
   });
 
 
   afterEach(() => {
   afterEach(() => {
+    __setFsWatchForTests(null); // reset the injected fs.watch seam
+    vi.restoreAllMocks();
     if (fs.existsSync(testDir)) {
     if (fs.existsSync(testDir)) {
       fs.rmSync(testDir, { recursive: true, force: true });
       fs.rmSync(testDir, { recursive: true, force: true });
     }
     }
@@ -110,6 +114,159 @@ describe('FileWatcher', () => {
     });
     });
   });
   });
 
 
+  describe('watch-resource exhaustion (#876)', () => {
+    // The first two tests exercise the REAL fs.watch path (not inert) with an
+    // injected watch that throws / emits EMFILE, covering whichever strategy the
+    // host platform uses — recursive on macOS/Windows, per-directory on Linux.
+    // Each of them uses its OWN EMPTY temp dir so exactly one watch is installed
+    // and the close-count is deterministic; the last test uses `newWatcher`.
+    const mkEmptyDir = () => fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-exhaust-')); // removed in each caller's `finally`
+
+    it('fails to start and degrades when fs.watch setup exhausts watch resources', () => {
+      const dir = mkEmptyDir();
+      const onDegraded = vi.fn();
+      const warnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {});
+      __setFsWatchForTests(() => { // every watch attempt throws EMFILE synchronously
+        const err = new Error('too many open files') as NodeJS.ErrnoException;
+        err.code = 'EMFILE';
+        throw err;
+      });
+      const watcher = new FileWatcher(
+        dir,
+        vi.fn().mockResolvedValue({ filesChanged: 0, durationMs: 0 }),
+        { debounceMs: 100, onDegraded }
+      );
+
+      try {
+        // Both watch strategies must report startup exhaustion identically.
+        expect(watcher.start()).toBe(false);
+        expect(watcher.isActive()).toBe(false);
+        expect(watcher.isDegraded()).toBe(true);
+        expect(watcher.getDegradedReason()).toContain('auto-sync disabled');
+        expect(onDegraded).toHaveBeenCalledTimes(1);
+        expect(onDegraded).toHaveBeenCalledWith(expect.stringContaining('auto-sync disabled'));
+        const disableWarnings = warnSpy.mock.calls.filter( // assumes logWarn reaches console.warn with the message as first argument — TODO confirm
+          (c) => typeof c[0] === 'string' && c[0].includes('File watcher disabled')
+        );
+        expect(disableWarnings).toHaveLength(1); // the disable warning is logged exactly once
+      } finally {
+        fs.rmSync(dir, { recursive: true, force: true });
+      }
+    });
+
+    it('degrades exactly once when the live watcher emits EMFILE at runtime', () => {
+      const dir = mkEmptyDir();
+      const onDegraded = vi.fn();
+      const warnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {});
+      const emitter = new EventEmitter(); // stands in for the FSWatcher's event source
+      let closed = 0; // counts close() calls on the fake watcher
+      const fakeWatcher = {
+        on: (event: string, handler: (...a: unknown[]) => void) => {
+          emitter.on(event, handler);
+          return fakeWatcher;
+        },
+        close: () => {
+          closed += 1;
+        },
+      } as unknown as fs.FSWatcher;
+      __setFsWatchForTests(() => fakeWatcher); // every watch call returns this same fake
+      const watcher = new FileWatcher(
+        dir,
+        vi.fn().mockResolvedValue({ filesChanged: 0, durationMs: 0 }),
+        { debounceMs: 100, onDegraded }
+      );
+
+      try {
+        expect(watcher.start()).toBe(true);
+        expect(watcher.isActive()).toBe(true);
+
+        const err = new Error('too many open files') as NodeJS.ErrnoException;
+        err.code = 'EMFILE';
+        emitter.emit('error', err);
+        emitter.emit('error', err); // a second burst must NOT degrade / close again
+
+        expect(watcher.isActive()).toBe(false);
+        expect(watcher.isDegraded()).toBe(true);
+        expect(onDegraded).toHaveBeenCalledTimes(1);
+        expect(closed).toBe(1); // one watch was installed (empty dir) and it was closed once
+        const disableWarnings = warnSpy.mock.calls.filter(
+          (c) => typeof c[0] === 'string' && c[0].includes('File watcher disabled')
+        );
+        expect(disableWarnings).toHaveLength(1);
+      } finally {
+        fs.rmSync(dir, { recursive: true, force: true });
+      }
+    });
+
+    it('reports isDegraded false / null reason while healthy', () => {
+      const watcher = newWatcher(vi.fn().mockResolvedValue({ filesChanged: 0, durationMs: 0 })); // helper defined outside this hunk
+      watcher.start();
+      expect(watcher.isDegraded()).toBe(false);
+      expect(watcher.getDegradedReason()).toBeNull();
+      watcher.stop();
+    });
+  });
+
+  describe('lock contention degradation (#876)', () => {
+    it('disables auto-sync after prolonged lock contention, with bounded retries', async () => {
+      const syncFn = vi.fn().mockRejectedValue(new LockUnavailableError()); // the lock is never released
+      const onSyncComplete = vi.fn();
+      const onSyncError = vi.fn();
+      const onDegraded = vi.fn();
+      const warnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {});
+      const watcher = newWatcher(syncFn, {
+        debounceMs: 25,
+        onSyncComplete,
+        onSyncError,
+        onDegraded,
+      });
+      watcher.start();
+      await watcher.waitUntilReady();
+      __emitWatchEventForTests(testDir, 'src/long-lock.ts');
+
+      // With debounceMs = 25 the five backoff retries wait 25, 50, 100, 200 and 400 ms; the 6th consecutive lock failure degrades.
+      await waitFor(() => !watcher.isActive(), 8000, 20); // presumably (predicate, timeoutMs, pollMs) — helper is defined outside this hunk
+
+      expect(syncFn.mock.calls.length).toBeGreaterThanOrEqual(6); // MAX_LOCK_RETRIES + 1
+      expect(watcher.isDegraded()).toBe(true);
+      expect(onDegraded).toHaveBeenCalledTimes(1);
+      expect(onDegraded).toHaveBeenCalledWith(expect.stringContaining('auto-sync disabled'));
+      // A held lock is neither a sync error nor a completion.
+      expect(onSyncError).not.toHaveBeenCalled();
+      expect(onSyncComplete).not.toHaveBeenCalled();
+      // Degrade stops the watcher, which clears pending state.
+      expect(watcher.getPendingFiles()).toEqual([]);
+      const disableWarnings = warnSpy.mock.calls.filter(
+        (c) => typeof c[0] === 'string' && c[0].includes('File watcher disabled')
+      );
+      expect(disableWarnings).toHaveLength(1);
+    });
+
+    it('does NOT degrade on brief contention — backoff resets after a clean sync', async () => {
+      const syncFn = vi
+        .fn()
+        .mockRejectedValueOnce(new LockUnavailableError())
+        .mockRejectedValueOnce(new LockUnavailableError())
+        .mockRejectedValueOnce(new LockUnavailableError())
+        .mockResolvedValue({ filesChanged: 1, durationMs: 5 }); // 4th attempt succeeds, inside the 5-retry budget
+      const onDegraded = vi.fn();
+      const onSyncComplete = vi.fn();
+      const watcher = newWatcher(syncFn, { debounceMs: 25, onDegraded, onSyncComplete });
+      watcher.start();
+      await watcher.waitUntilReady();
+      __emitWatchEventForTests(testDir, 'src/brief-lock.ts');
+
+      await waitFor(() => onSyncComplete.mock.calls.length > 0, 4000, 20);
+
+      expect(onDegraded).not.toHaveBeenCalled();
+      expect(watcher.isDegraded()).toBe(false);
+      expect(watcher.isActive()).toBe(true);
+      expect(watcher.getPendingFiles().some((p) => p.path === 'src/brief-lock.ts')).toBe(false); // the edit was consumed by the clean sync
+
+      watcher.stop();
+    });
+  });
+
   describe('debounced sync', () => {
   describe('debounced sync', () => {
     it('should trigger sync after file change', async () => {
     it('should trigger sync after file change', async () => {
       const syncFn = vi.fn().mockResolvedValue({ filesChanged: 1, durationMs: 10 });
       const syncFn = vi.fn().mockResolvedValue({ filesChanged: 1, durationMs: 10 });

+ 8 - 0
src/mcp/engine.ts

@@ -216,6 +216,14 @@ export class MCPEngine {
       onSyncError: (err) => {
       onSyncError: (err) => {
         process.stderr.write(`[CodeGraph MCP] Auto-sync error: ${err.message}\n`);
         process.stderr.write(`[CodeGraph MCP] Auto-sync error: ${err.message}\n`);
       },
       },
+      onDegraded: (reason) => {
+        // Live watching gave up permanently (watch-resource exhaustion or a
+        // write lock held past the retry budget). Say so loudly and ONCE — the
+        // graph will no longer auto-update, so a long-running MCP session must
+        // not keep assuming it's fresh. The reason already names the remedy
+        // (`codegraph sync` / git sync hooks).
+        process.stderr.write(`[CodeGraph MCP] File watcher degraded — ${reason}\n`);
+      },
     });
     });
 
 
     this.watcherStarted = true;
     this.watcherStarted = true;

+ 182 - 13
src/sync/watcher.ts

@@ -39,6 +39,34 @@ import { normalizePath } from '../utils';
 import { isCodeGraphDataDir } from '../directory';
 import { isCodeGraphDataDir } from '../directory';
 import { watchDisabledReason } from './watch-policy';
 import { watchDisabledReason } from './watch-policy';
 
 
+/**
+ * Number of consecutive lock-contention retries the watcher tolerates before
+ * it gives up and degrades auto-sync. Brief contention (another writer for a
+ * few cycles) stays under this; a long-lived external writer crosses it.
+ *
+ * The flush path degrades when `lockRetryCount > MAX_LOCK_RETRIES`, i.e. on the
+ * (MAX_LOCK_RETRIES + 1)-th consecutive LockUnavailableError. Retry n waits
+ * `debounceMs * 2^(n-1)` (capped by MAX_LOCK_RETRY_DELAY_MS); with the default
+ * 2000 ms debounce that is 2 s, 4 s, 8 s, 16 s, then 30 s (32 s capped).
+ */
+const MAX_LOCK_RETRIES = 5;
+/** Cap (in milliseconds) on the exponential lock-retry backoff so it never sleeps absurdly long. */
+const MAX_LOCK_RETRY_DELAY_MS = 30_000;
+
+/**
+ * Actionable degrade message shared verbatim by every watch-exhaustion path:
+ * startup failure in start(), the recursive watcher's 'error' handler, and the
+ * per-directory watchTree setup and 'error' handler.
+ */
+const EXHAUSTION_REASON =
+  'OS watch/file limit exhausted; auto-sync disabled. Run `codegraph sync` ' +
+  '(or install git sync hooks) to refresh the graph after changes.';
+
+/**
+ * True when an error is OS watch/file-descriptor exhaustion (EMFILE/ENFILE).
+ * Prefers the structured `err.code`; falls back to message matching ONLY when
+ * no code is present (some platforms surface a bare Error from `fs.watch`).
+ *
+ * @param err - Any thrown or emitted value. Values with neither a `code` nor a
+ *   `message` property (including null/undefined) yield false.
+ * @returns true for EMFILE/ENFILE (by code, or by message when no code is
+ *   set); false for every other error.
+ *
+ * NOTE(review): on Linux, exceeding `fs.inotify.max_user_watches` is normally
+ * reported by `fs.watch` as ENOSPC ("System limit for number of file watchers
+ * reached"), not EMFILE/ENFILE. As written that error returns false, so the
+ * per-directory path would skip the directory quietly instead of degrading —
+ * confirm, and consider treating `ENOSPC` as exhaustion here as well.
+ */
+function isWatchResourceExhaustion(err: unknown): boolean {
+  const e = err as NodeJS.ErrnoException | undefined;
+  if (e?.code === 'EMFILE' || e?.code === 'ENFILE') return true;
+  if (!e?.code && e?.message) { // message fallback applies only when no code is set
+    return /EMFILE|ENFILE|too many open files/i.test(e.message);
+  }
+  return false;
+}
+
 /**
 /**
  * Native recursive `fs.watch` is only reliable on macOS and Windows; on Linux
  * Native recursive `fs.watch` is only reliable on macOS and Windows; on Linux
  * (and AIX) it throws `ERR_FEATURE_UNAVAILABLE_ON_PLATFORM`. We branch on this
  * (and AIX) it throws `ERR_FEATURE_UNAVAILABLE_ON_PLATFORM`. We branch on this
@@ -48,6 +76,20 @@ function supportsRecursiveWatch(): boolean {
   return process.platform === 'darwin' || process.platform === 'win32';
   return process.platform === 'darwin' || process.platform === 'win32';
 }
 }
 
 
+/**
+ * Indirection over `fs.watch` so tests can inject a fake that throws or emits
+ * `EMFILE`/`ENFILE` deterministically (real watch-resource exhaustion can't be
+ * provoked reliably, and `fs.watch` is a non-configurable property so it can't
+ * be spied). Production always uses the real `fs.watch`.
+ *
+ * `watchImpl` is module-level state: both watch strategies (recursive and
+ * per-directory) call it, so an injected fake affects every FileWatcher in the
+ * module until it is reset.
+ */
+type WatchFn = typeof fs.watch;
+let watchImpl: WatchFn = fs.watch;
+
+/**
+ * @internal Test-only seam to inject a fake fs.watch implementation.
+ *
+ * @param fn - Replacement for `fs.watch`, or `null` to restore the real one.
+ */
+export function __setFsWatchForTests(fn: WatchFn | null): void {
+  watchImpl = fn ?? fs.watch;
+}
+
 /**
 /**
  * Upper bound on simultaneously-watched directories on the Linux per-directory
  * Upper bound on simultaneously-watched directories on the Linux per-directory
  * path. Each is one inotify watch; the kernel's `fs.inotify.max_user_watches`
  * path. Each is one inotify watch; the kernel's `fs.inotify.max_user_watches`
@@ -98,6 +140,15 @@ export interface WatchOptions {
    */
    */
   onSyncError?: (error: Error) => void;
   onSyncError?: (error: Error) => void;
 
 
+  /**
+   * Callback fired ONCE when live watching degrades permanently and auto-sync
+   * is disabled — OS watch-resource exhaustion (EMFILE/ENFILE), or a write lock
+   * held past the retry budget. The string is an actionable, human-readable
+   * reason. Lets a host (MCP server, daemon, CLI) tell the user that the index
+   * will no longer auto-update instead of silently serving stale results.
+   */
+  onDegraded?: (reason: string) => void;
+
   /**
   /**
    * Test-only. When true, `start()` installs NO OS-level fs.watch — the
    * Test-only. When true, `start()` installs NO OS-level fs.watch — the
    * watcher is "inert" and only the {@link __emitWatchEventForTests} /
    * watcher is "inert" and only the {@link __emitWatchEventForTests} /
@@ -164,6 +215,14 @@ export class FileWatcher {
   private dirWatchers = new Map<string, fs.FSWatcher>();
   private dirWatchers = new Map<string, fs.FSWatcher>();
   /** Set once the per-directory watch cap is hit, so we log only once. */
   /** Set once the per-directory watch cap is hit, so we log only once. */
   private dirCapWarned = false;
   private dirCapWarned = false;
+  /**
+   * One-way latch: the reason live watching was permanently disabled at runtime
+   * (watch-resource exhaustion, or lock contention past the retry budget), or
+   * null while healthy. Set by {@link degrade}; cleared only by a fresh start().
+   */
+  private degradedReason: string | null = null;
+  /** Consecutive lock-contention retries for watcher-triggered syncs. */
+  private lockRetryCount = 0;
   /** Test-only inert mode: started, but with no OS watcher installed. */
   /** Test-only inert mode: started, but with no OS watcher installed. */
   private inert = false;
   private inert = false;
   private debounceTimer: ReturnType<typeof setTimeout> | null = null;
   private debounceTimer: ReturnType<typeof setTimeout> | null = null;
@@ -211,6 +270,7 @@ export class FileWatcher {
   private readonly syncFn: () => Promise<{ filesChanged: number; durationMs: number }>;
   private readonly syncFn: () => Promise<{ filesChanged: number; durationMs: number }>;
   private readonly onSyncComplete?: WatchOptions['onSyncComplete'];
   private readonly onSyncComplete?: WatchOptions['onSyncComplete'];
   private readonly onSyncError?: WatchOptions['onSyncError'];
   private readonly onSyncError?: WatchOptions['onSyncError'];
+  private readonly onDegraded?: WatchOptions['onDegraded'];
   private readonly inertForTests: boolean;
   private readonly inertForTests: boolean;
 
 
   constructor(
   constructor(
@@ -223,6 +283,7 @@ export class FileWatcher {
     this.debounceMs = options.debounceMs ?? 2000;
     this.debounceMs = options.debounceMs ?? 2000;
     this.onSyncComplete = options.onSyncComplete;
     this.onSyncComplete = options.onSyncComplete;
     this.onSyncError = options.onSyncError;
     this.onSyncError = options.onSyncError;
+    this.onDegraded = options.onDegraded;
     this.inertForTests = options.inertForTests ?? false;
     this.inertForTests = options.inertForTests ?? false;
   }
   }
 
 
@@ -233,6 +294,8 @@ export class FileWatcher {
   start(): boolean {
   start(): boolean {
     if (this.recursiveWatcher || this.dirWatchers.size > 0 || this.inert) return true; // Already watching
     if (this.recursiveWatcher || this.dirWatchers.size > 0 || this.inert) return true; // Already watching
     this.stopped = false;
     this.stopped = false;
+    this.degradedReason = null;
+    this.lockRetryCount = 0;
 
 
     // Some environments make filesystem watching unusable — most notably
     // Some environments make filesystem watching unusable — most notably
     // WSL2 /mnt/ drives, where the underlying fs.watch calls block long
     // WSL2 /mnt/ drives, where the underlying fs.watch calls block long
@@ -257,6 +320,12 @@ export class FileWatcher {
         this.startPerDirectory();
         this.startPerDirectory();
       }
       }
 
 
+      // The per-directory (Linux) path catches watch-resource exhaustion inside
+      // watchTree and degrades synchronously rather than throwing, so it never
+      // reaches the catch below. Surface that as a failed start here so both
+      // strategies report exhaustion identically (start() === false).
+      if (this.degradedReason) return false;
+
       // No async crawl to wait on: as soon as the watch set is installed we
       // No async crawl to wait on: as soon as the watch set is installed we
       // have a clean baseline (pendingFiles is only populated by post-start
       // have a clean baseline (pendingFiles is only populated by post-start
       // events). Clear defensively and flip ready.
       // events). Clear defensively and flip ready.
@@ -274,9 +343,16 @@ export class FileWatcher {
       });
       });
       return true;
       return true;
     } catch (err) {
     } catch (err) {
-      // Watcher setup failed (e.g., permission denied, missing directory).
-      logWarn('Could not start file watcher', { error: String(err) });
-      this.stop();
+      // Watcher setup failed. Watch-resource exhaustion (EMFILE/ENFILE on the
+      // recursive path) is terminal — degrade cleanly with one actionable
+      // warning instead of leaving a half-broken watcher. Everything else
+      // (permission denied, missing directory) keeps the prior quiet-stop.
+      if (isWatchResourceExhaustion(err)) {
+        this.degrade(EXHAUSTION_REASON, { error: String(err) });
+      } else {
+        logWarn('Could not start file watcher', { error: String(err) });
+        this.stop();
+      }
       return false;
       return false;
     }
     }
   }
   }
@@ -287,7 +363,7 @@ export class FileWatcher {
    * it maps straight to a project-relative path.
    * it maps straight to a project-relative path.
    */
    */
   private startRecursive(): void {
   private startRecursive(): void {
-    this.recursiveWatcher = fs.watch(
+    this.recursiveWatcher = watchImpl(
       this.projectRoot,
       this.projectRoot,
       { recursive: true, persistent: true },
       { recursive: true, persistent: true },
       (_event, filename) => {
       (_event, filename) => {
@@ -296,6 +372,10 @@ export class FileWatcher {
       }
       }
     );
     );
     this.recursiveWatcher.on('error', (err: unknown) => {
     this.recursiveWatcher.on('error', (err: unknown) => {
+      if (isWatchResourceExhaustion(err)) {
+        this.degrade(EXHAUSTION_REASON, { error: String(err) });
+        return;
+      }
       logWarn('File watcher error', { error: String(err) });
       logWarn('File watcher error', { error: String(err) });
     });
     });
   }
   }
@@ -319,6 +399,10 @@ export class FileWatcher {
    * sync owns the baseline).
    * sync owns the baseline).
    */
    */
   private watchTree(dir: string, markExisting: boolean): void {
   private watchTree(dir: string, markExisting: boolean): void {
+    // A degrade() mid-walk (exhaustion on an earlier directory) calls stop(),
+    // which sets `stopped`; bail so the recursion unwinds without adding more
+    // watches to a watcher that is shutting down.
+    if (this.stopped || this.degradedReason) return;
     if (this.dirWatchers.has(dir)) return;
     if (this.dirWatchers.has(dir)) return;
     if (this.dirWatchers.size >= maxDirWatches()) {
     if (this.dirWatchers.size >= maxDirWatches()) {
       if (!this.dirCapWarned) {
       if (!this.dirCapWarned) {
@@ -332,14 +416,26 @@ export class FileWatcher {
 
 
     let w: fs.FSWatcher;
     let w: fs.FSWatcher;
     try {
     try {
-      w = fs.watch(dir, { persistent: true }, (_event, filename) =>
+      w = watchImpl(dir, { persistent: true }, (_event, filename) =>
         this.handleDirEvent(dir, filename)
         this.handleDirEvent(dir, filename)
       );
       );
-    } catch {
-      // ENOENT / EACCES / too-many-open-files — skip this directory quietly.
+    } catch (err) {
+      // EMFILE/ENFILE means the PROCESS is out of descriptors — every further
+      // directory would fail too, so degrade the whole watcher rather than
+      // limping along with a partial watch set.
+      if (isWatchResourceExhaustion(err)) {
+        this.degrade(EXHAUSTION_REASON, { error: String(err), dir });
+      }
+      // ENOENT / EACCES on a single directory stays non-fatal: skip it quietly.
       return;
       return;
     }
     }
-    w.on('error', () => this.unwatchDir(dir));
+    w.on('error', (err: unknown) => {
+      if (isWatchResourceExhaustion(err)) {
+        this.degrade(EXHAUSTION_REASON, { error: String(err), dir });
+        return;
+      }
+      this.unwatchDir(dir);
+    });
     this.dirWatchers.set(dir, w);
     this.dirWatchers.set(dir, w);
 
 
     let entries: fs.Dirent[];
     let entries: fs.Dirent[];
@@ -450,6 +546,35 @@ export class FileWatcher {
     return this.ignoreMatcher.ignores(rel + '/');
     return this.ignoreMatcher.ignores(rel + '/');
   }
   }
 
 
+  /**
+   * Permanently disable live watching after a terminal runtime failure
+   * (watch-resource exhaustion, or lock contention past the retry budget).
+   * Idempotent: logs one actionable warning, fires {@link WatchOptions.onDegraded}
+   * once, and stops the watcher. A subsequent start() clears the latch.
+   *
+   * @param reason - Human-readable message stored as the degraded reason and
+   *   passed to `onDegraded`.
+   * @param context - Extra fields merged into the warning log entry only.
+   *
+   * NOTE(review): `onDegraded` is invoked before `stop()`. If the host callback
+   * throws, `stop()` is skipped and the exception propagates to whichever path
+   * called degrade(), with the latch already set — consider running the
+   * callback inside `try { ... } finally { this.stop(); }`.
+   */
+  private degrade(reason: string, context: Record<string, unknown> = {}): void {
+    if (this.degradedReason) return; // already degraded: no second log, callback or stop
+    this.degradedReason = reason; // set before stop(); stop() deliberately leaves it intact
+    logWarn('File watcher disabled', { projectRoot: this.projectRoot, reason, ...context });
+    this.onDegraded?.(reason);
+    this.stop();
+  }
+
+  /**
+   * Whether live watching has degraded permanently (until the next start()).
+   * Distinct from {@link isActive}: a degraded watcher is inactive, but an
+   * inactive watcher is not necessarily degraded (it may simply be stopped or
+   * never started). Hosts use this to tell the user auto-sync is off.
+   *
+   * @returns true once degrade() has recorded a reason; false while the
+   *   recorded reason is null.
+   */
+  isDegraded(): boolean {
+    return this.degradedReason !== null;
+  }
+
+  /**
+   * The reason live watching degraded, or null if it is healthy.
+   *
+   * @returns the exact string that was passed to `onDegraded`, or null when no
+   *   degrade has been recorded since the last start().
+   */
+  getDegradedReason(): string | null {
+    return this.degradedReason;
+  }
+
   /**
   /**
    * Stop watching for file changes.
    * Stop watching for file changes.
    */
    */
@@ -478,6 +603,9 @@ export class FileWatcher {
     }
     }
     this.dirWatchers.clear();
     this.dirWatchers.clear();
     this.dirCapWarned = false;
     this.dirCapWarned = false;
+    this.lockRetryCount = 0;
+    // NB: degradedReason is intentionally NOT reset here — it must survive the
+    // stop() that degrade() triggers so isDegraded() stays true. start() clears it.
     this.inert = false;
     this.inert = false;
 
 
     this.pendingFiles.clear();
     this.pendingFiles.clear();
@@ -528,7 +656,7 @@ export class FileWatcher {
   }
   }
 
 
   /**
   /**
-   * Schedule a debounced sync.
+   * Schedule a normal debounced sync after a source edit.
    */
    */
   private scheduleSync(): void {
   private scheduleSync(): void {
     if (this.debounceTimer) {
     if (this.debounceTimer) {
@@ -540,6 +668,21 @@ export class FileWatcher {
     }, this.debounceMs);
     }, this.debounceMs);
   }
   }
 
 
+  /**
+   * Schedule a retry after a recoverable sync failure (lock contention). Kept
+   * separate from {@link scheduleSync} so prolonged contention backs off
+   * exponentially instead of hammering the lock every debounce cycle.
+   *
+   * @param delayMs - Milliseconds to wait before calling flush(); the caller
+   *   computes the (capped) exponential backoff.
+   *
+   * NOTE(review): this uses the same `debounceTimer` slot as scheduleSync. A
+   * file event that calls scheduleSync() during a backoff presumably replaces
+   * the pending retry with a plain `debounceMs` timer, shortening the backoff
+   * while edits keep arriving — confirm against the event-handling code.
+   */
+  private scheduleRetrySync(delayMs: number): void {
+    if (this.debounceTimer) {
+      clearTimeout(this.debounceTimer); // at most one pending flush timer at a time
+    }
+    this.debounceTimer = setTimeout(() => {
+      this.debounceTimer = null;
+      this.flush();
+    }, delayMs);
+  }
+
   /**
   /**
    * Flush pending changes by running sync.
    * Flush pending changes by running sync.
    *
    *
@@ -561,6 +704,7 @@ export class FileWatcher {
 
 
     try {
     try {
       const result = await this.syncFn();
       const result = await this.syncFn();
+      this.lockRetryCount = 0; // a clean sync clears any contention backoff
       // Remove entries whose most recent event predates this sync — those
       // Remove entries whose most recent event predates this sync — those
       // edits are now in the DB. Entries with lastSeenMs > syncStartedMs
       // edits are now in the DB. Entries with lastSeenMs > syncStartedMs
       // arrived mid-sync; whether the in-flight sync captured them depends
       // arrived mid-sync; whether the in-flight sync captured them depends
@@ -576,13 +720,26 @@ export class FileWatcher {
       this.onSyncComplete?.(result);
       this.onSyncComplete?.(result);
     } catch (err) {
     } catch (err) {
       if (err instanceof LockUnavailableError) {
       if (err instanceof LockUnavailableError) {
+        this.lockRetryCount += 1;
         // Lock-failure no-op (another writer holds the lock). pendingFiles
         // Lock-failure no-op (another writer holds the lock). pendingFiles
-        // stays intact and the `finally` block reschedules. Debug-only —
-        // a long external index would otherwise spam stderr every cycle.
+        // stays intact and the `finally` block reschedules with backoff. Keep
+        // brief contention quiet (debug-only — a long external index would
+        // otherwise spam stderr every cycle), but stop retrying forever: once a
+        // writer holds the lock past the budget, degrade auto-sync explicitly.
         logDebug('Watch sync skipped: file lock unavailable', {
         logDebug('Watch sync skipped: file lock unavailable', {
           pendingFiles: this.pendingFiles.size,
           pendingFiles: this.pendingFiles.size,
+          retryCount: this.lockRetryCount,
         });
         });
+        if (this.lockRetryCount > MAX_LOCK_RETRIES) {
+          this.degrade(
+            'CodeGraph file lock held by another process past the retry budget; ' +
+              'auto-sync disabled. Run `codegraph sync` once the other writer finishes ' +
+              '(or install git sync hooks) to refresh the graph.',
+            { pendingFiles: this.pendingFiles.size, retryCount: this.lockRetryCount }
+          );
+        }
       } else {
       } else {
+        this.lockRetryCount = 0; // a non-lock failure isn't contention; reset backoff
         const error = err instanceof Error ? err : new Error(String(err));
         const error = err instanceof Error ? err : new Error(String(err));
         logWarn('Watch sync failed', { error: error.message });
         logWarn('Watch sync failed', { error: error.message });
         this.onSyncError?.(error);
         this.onSyncError?.(error);
@@ -593,9 +750,21 @@ export class FileWatcher {
       this.syncing = false;
       this.syncing = false;
 
 
       // If pending files remain (mid-sync events, or this sync failed),
       // If pending files remain (mid-sync events, or this sync failed),
-      // schedule another pass.
+      // schedule another pass. After lock contention, back off exponentially
+      // (debounceMs · 2^(n-1), capped) instead of retrying at the normal
+      // debounce cadence; a clean sync resets lockRetryCount so normal edits
+      // keep the fast debounce. A degrade() above already set `stopped`, so
+      // this won't reschedule a watcher that has given up.
       if (this.pendingFiles.size > 0 && !this.stopped) {
       if (this.pendingFiles.size > 0 && !this.stopped) {
-        this.scheduleSync();
+        if (this.lockRetryCount > 0) {
+          const retryDelayMs = Math.min(
+            this.debounceMs * 2 ** Math.max(0, this.lockRetryCount - 1),
+            MAX_LOCK_RETRY_DELAY_MS
+          );
+          this.scheduleRetrySync(retryDelayMs);
+        } else {
+          this.scheduleSync();
+        }
       }
       }
     }
     }
   }
   }