Procházet zdrojové kódy

fix(sync): filesystem-based change detection (catch git pull & non-git edits) (#414)

* fix(sync): detect changes via filesystem, not git status

Incremental sync detected changes with `git status --porcelain`, which only sees uncommitted working-tree changes — so committed changes from git pull/checkout/merge/rebase (clean tree afterward) were never reconciled, and non-git projects leaned on a slow full rescan. Change detection is now filesystem-based and git-independent: a (size, mtime) stat pre-filter skips unchanged files, then a content hash confirms the rest; removals are checked against the filesystem (git ls-files still lists deleted-but-unstaged files). Also adds a non-blocking catch-up sync on MCP connect so changes made while the server was down (e.g. a terminal git pull) are reconciled on connect.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* docs(changelog): add 0.9.5 entry for filesystem-based sync fix

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Colby Mchenry před 4 týdny
rodič
revize
4a94696e44
Změněny 3 soubory: 108 přidání a 77 odebrání
  1. 17 0
      CHANGELOG.md
  2. 63 77
      src/extraction/index.ts
  3. 28 0
      src/mcp/index.ts

+ 17 - 0
CHANGELOG.md

@@ -7,6 +7,22 @@ a [GitHub Release](https://github.com/colbymchenry/codegraph/releases) tagged
 This project follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/)
 and adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.9.5] - 2026-05-25
+
+### Fixed
+- **The index now stays in sync after `git pull`, branch switches, and edits made
+  outside your editor.** Incremental sync detected changes via `git status`, which
+  only sees *uncommitted* edits — so code pulled or checked out (which leaves a
+  clean working tree) was silently missed until a full `codegraph index -f`.
+  Change detection is now filesystem-based and git-independent: a `(size, mtime)`
+  stat pre-filter skips unchanged files, then a content hash confirms the rest. It
+  reconciles committed changes from `pull`/`checkout`/`merge`/`rebase`, plain edits
+  in non-git projects, and deletions alike.
+- **The MCP server catches up on connect.** When your editor connects, codegraph
+  reconciles in the background anything that changed while it wasn't running
+  (e.g. a `git pull` from the terminal), so queries reflect the current code once
+  that catch-up completes instead of a stale snapshot until the next live edit.
+
 ## [0.9.4] - 2026-05-24
 
 ### Added
@@ -228,6 +244,7 @@ and adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
   find its bundle. The release pipeline now verifies every package reached the
   registry (and is idempotent), so a release can't pass green-but-broken again.
 
+[0.9.5]: https://github.com/colbymchenry/codegraph/releases/tag/v0.9.5
 [0.9.4]: https://github.com/colbymchenry/codegraph/releases/tag/v0.9.4
 [0.9.3]: https://github.com/colbymchenry/codegraph/releases/tag/v0.9.3
 [0.9.2]: https://github.com/colbymchenry/codegraph/releases/tag/v0.9.2

+ 63 - 77
src/extraction/index.ts

@@ -1202,8 +1202,12 @@ export class ExtractionOrchestrator {
   }
 
   /**
-   * Sync with current file state.
-   * Uses git status as a fast path when available, falling back to full scan.
+   * Sync the index with the current file state.
+   *
+   * Change detection is filesystem-based, never git: a (size, mtime) stat
+   * pre-filter skips unchanged files, then a content-hash compare confirms real
+   * changes. This works in non-git projects and catches committed changes from
+   * `git pull`/`checkout`/`merge`/`rebase` that `git status` cannot see.
    */
   async sync(onProgress?: (progress: IndexProgress) => void): Promise<SyncResult> {
     await initGrammars(); // Initialize WASM runtime (grammars loaded lazily below)
@@ -1222,93 +1226,75 @@ export class ExtractionOrchestrator {
     });
 
     const filesToIndex: string[] = [];
-    const gitChanges = getGitChangedFiles(this.rootDir);
+    // === Filesystem reconcile (git-independent) ===
+    // The source of truth for "what changed" is the filesystem vs the indexed
+    // state — never git. We enumerate the current source files and reconcile
+    // each against the DB. A cheap (size, mtime) stat pre-filter skips unchanged
+    // files without reading or hashing them, so the expensive read+hash+parse
+    // only runs for files that actually changed. This catches edits/adds/deletes
+    // whether or not the project uses git, and crucially also catches committed
+    // changes from `git pull`/`checkout`/`merge`/`rebase` — which `git status`
+    // cannot see, because the working tree is clean afterward.
+    const currentFiles = scanDirectory(this.rootDir);
+    filesChecked = currentFiles.length;
+    const currentSet = new Set(currentFiles);
 
-    if (gitChanges) {
-      // === Git fast path ===
-      // Only inspect the files git reports as changed instead of scanning everything.
-      filesChecked = gitChanges.modified.length + gitChanges.added.length + gitChanges.deleted.length;
+    const trackedFiles = this.queries.getAllFiles();
+    const trackedMap = new Map<string, FileRecord>();
+    for (const f of trackedFiles) {
+      trackedMap.set(f.path, f);
+    }
 
-      // Handle deleted files
-      for (const filePath of gitChanges.deleted) {
-        const tracked = this.queries.getFileByPath(filePath);
-        if (tracked) {
-          this.queries.deleteFile(filePath);
-          filesRemoved++;
-        }
+    // Removals: tracked in the DB but no longer a present source file. Check the
+    // filesystem directly — `scanDirectory` (via `git ls-files`) still lists a
+    // file deleted from disk but not yet staged, so set membership alone misses it.
+    for (const tracked of trackedFiles) {
+      if (!currentSet.has(tracked.path) || !fs.existsSync(path.join(this.rootDir, tracked.path))) {
+        this.queries.deleteFile(tracked.path);
+        filesRemoved++;
       }
+    }
 
-      // Handle modified + added files — read + hash only these. Untracked
-      // (`??`) files stay untracked in git even after we index them, so they
-      // can't be trusted as "new": re-hash and compare against the DB exactly
-      // like modified files. Otherwise every sync re-indexes them and status
-      // reports them as pending forever. (See issue #206.)
-      for (const filePath of [...gitChanges.modified, ...gitChanges.added]) {
-        const fullPath = path.join(this.rootDir, filePath);
-        let content: string;
+    // Adds / modifications.
+    for (const filePath of currentFiles) {
+      const fullPath = path.join(this.rootDir, filePath);
+      const tracked = trackedMap.get(filePath);
+
+      // Cheap pre-filter: an already-indexed file whose size AND mtime both match
+      // the DB is unchanged — skip it without reading or hashing. (A content
+      // change that preserves both exactly is the blind spot every mtime-based
+      // incremental tool accepts; `index --force` is the escape hatch. Git bumps
+      // mtime on every file it writes during checkout/merge, so pulls are caught.)
+      if (tracked) {
         try {
-          content = fs.readFileSync(fullPath, 'utf-8');
+          const stat = fs.statSync(fullPath);
+          if (stat.size === tracked.size && Math.floor(stat.mtimeMs) === Math.floor(tracked.modifiedAt)) {
+            continue;
+          }
         } catch (error) {
-          logDebug('Skipping unreadable file during sync', { filePath, error: String(error) });
+          logDebug('Skipping unstattable file during sync', { filePath, error: String(error) });
           continue;
         }
-
-        const contentHash = hashContent(content);
-        const tracked = this.queries.getFileByPath(filePath);
-
-        if (!tracked) {
-          filesToIndex.push(filePath);
-          changedFilePaths.push(filePath);
-          filesAdded++;
-        } else if (tracked.contentHash !== contentHash) {
-          filesToIndex.push(filePath);
-          changedFilePaths.push(filePath);
-          filesModified++;
-        }
-      }
-    } else {
-      // === Fallback: full scan (non-git project or git failure) ===
-      const currentFiles = new Set(scanDirectory(this.rootDir));
-      filesChecked = currentFiles.size;
-
-      // Build Map for O(1) lookups instead of .find() per file
-      const trackedFiles = this.queries.getAllFiles();
-      const trackedMap = new Map<string, FileRecord>();
-      for (const f of trackedFiles) {
-        trackedMap.set(f.path, f);
       }
 
-      // Find files to remove (in DB but not on disk)
-      for (const tracked of trackedFiles) {
-        if (!currentFiles.has(tracked.path)) {
-          this.queries.deleteFile(tracked.path);
-          filesRemoved++;
-        }
+      // New, or size/mtime changed — read + hash to confirm a real content change.
+      let content: string;
+      try {
+        content = fs.readFileSync(fullPath, 'utf-8');
+      } catch (error) {
+        logDebug('Skipping unreadable file during sync', { filePath, error: String(error) });
+        continue;
       }
+      const contentHash = hashContent(content);
 
-      // Find files to add or update
-      for (const filePath of currentFiles) {
-        const fullPath = path.join(this.rootDir, filePath);
-        let content: string;
-        try {
-          content = fs.readFileSync(fullPath, 'utf-8');
-        } catch (error) {
-          logDebug('Skipping unreadable file during sync', { filePath, error: String(error) });
-          continue;
-        }
-
-        const contentHash = hashContent(content);
-        const tracked = trackedMap.get(filePath);
-
-        if (!tracked) {
-          filesToIndex.push(filePath);
-          changedFilePaths.push(filePath);
-          filesAdded++;
-        } else if (tracked.contentHash !== contentHash) {
-          filesToIndex.push(filePath);
-          changedFilePaths.push(filePath);
-          filesModified++;
-        }
+      if (!tracked) {
+        filesToIndex.push(filePath);
+        changedFilePaths.push(filePath);
+        filesAdded++;
+      } else if (tracked.contentHash !== contentHash) {
+        filesToIndex.push(filePath);
+        changedFilePaths.push(filePath);
+        filesModified++;
       }
     }
 

+ 28 - 0
src/mcp/index.ts

@@ -243,6 +243,7 @@ export class MCPServer {
       this.cg = await CodeGraph.open(resolvedRoot);
       this.toolHandler.setDefaultCodeGraph(this.cg);
       this.startWatching();
+      this.catchUpSync();
     } catch (err) {
       // Log the error so transient failures are diagnosable (see issue #47)
       const msg = err instanceof Error ? err.message : String(err);
@@ -301,6 +302,7 @@ export class MCPServer {
       this.projectPath = resolvedRoot;
       this.toolHandler.setDefaultCodeGraph(this.cg);
       this.startWatching();
+      this.catchUpSync();
     } catch {
       // Still failing — will retry on next tool call
     }
@@ -370,6 +372,32 @@ export class MCPServer {
     }
   }
 
+  /**
+   * Reconcile the index with the current filesystem once, right after connect —
+   * catches edits, adds, deletes, and `git pull`/`checkout` changes made while no
+   * watcher was running. Fire-and-forget: never delays the `initialize` response;
+   * the outcome is only logged to stderr, and it is a no-op if no project is open.
+   * `sync()` is incremental (a stat pre-filter skips unchanged files) and
+   * mutex-guarded, so it can't collide with the live watcher or a git-hook sync.
+   * Runs even when the watcher is unavailable (e.g. WSL2 /mnt drives), where catch-up matters most.
+   */
+  private catchUpSync(): void {
+    const cg = this.cg; // snapshot the handle so the async chain uses the instance open at call time
+    if (!cg) return; // no project open — nothing to reconcile
+    void cg // `void` marks the promise as intentionally un-awaited
+      .sync()
+      .then((result) => {
+        const changed = result.filesAdded + result.filesModified + result.filesRemoved;
+        if (changed > 0) { // stay silent when nothing drifted while the server was down
+          process.stderr.write(`[CodeGraph MCP] Caught up ${changed} file(s) changed since last run\n`);
+        }
+      })
+      .catch((err) => { // a failed catch-up is logged, never raised as an unhandled rejection
+        const msg = err instanceof Error ? err.message : String(err);
+        process.stderr.write(`[CodeGraph MCP] Catch-up sync failed: ${msg}\n`);
+      });
+  }
+
   /**
    * Stop the server
    */