Pārlūkot izejas kodu

fix(sync): preserve cross-file caller edges across callee re-index (#899) (#927)

`storeExtractionResult` deletes a re-indexed file's nodes via `deleteFile`,
which cascades through `edges.FK ... ON DELETE CASCADE` to delete every edge
whose source OR target is one of those nodes. Edges whose source is in the
re-indexed file are re-emitted by the extractor, but edges whose source is in
a *different* (unchanged) file are not — they are silently dropped. This is
issue #899: re-indexing a callee file severs `calls`/`references` edges from
callers that import it via module-attribute access (`pkg.mod.fn(...)`), so
`codegraph callers fn` reports 0 callers for functions that have real call
sites. A docstring-only edit on the callee is sufficient to trigger it.

The bug affects every incremental path that routes through `sync()` /
`indexFile()`: `codegraph sync`, the file-watcher auto-sync (which calls
`sync()`), and the git sync hooks. `codegraph index` was already fixed by
#894 (it now clears-then-rebuilds, so it's a full re-extraction, not
incremental). `sync` remains the fast incremental path and still has the bug.

Fix: before the delete, snapshot incoming cross-file edges paired with the
target node's (name, kind). After re-inserting the file's nodes + same-file
edges, re-insert the snapshot — re-resolving each edge's target to the
re-indexed node's NEW id by (filePath, kind, name). Node ids are
`sha256(filePath:kind:name:line)`, so any line shift in the callee file (e.g.
a docstring-only edit above the symbol) changes every target id and a naive
re-insert by old id would drop them all. Matching by (kind, name) is stable
across line shifts; if the symbol was renamed/removed, no match is found and
the edge stays dropped (correct). `insertEdges` still filters to endpoints
that exist, so edges whose caller (source) was deleted are also dropped.

Regression tests in `__tests__/sync.test.ts` model the RAGFlow production
case: a `pkg/mod.py` with two callees, both called from `test/test_callers.py`
via `mod.<fn>(...)`. The first test confirms a docstring-only edit that shifts
the second callee's line preserves both incoming edges. The second test
confirms renaming a callee correctly drops its old incoming edge (no phantom
preservation against a non-existent symbol).
Josh 1 dienu atpakaļ
vecāks
revīzija
6110df8b76
3 mainītis faili ar 207 papildinājumiem un 0 dzēšanām
  1. 134 0
      __tests__/sync.test.ts
  2. 26 0
      src/db/queries.ts
  3. 47 0
      src/extraction/index.ts

+ 134 - 0
__tests__/sync.test.ts

@@ -303,4 +303,138 @@ describe('Sync Module', () => {
       expect(result.changedFilePaths).toBeUndefined();
     });
   });
+
+  describe('Cross-file module-attribute caller edges survive callee re-index (#899)', () => {
+    let testDir: string;
+    let cg: CodeGraph;
+
+    beforeEach(async () => {
+      testDir = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-899-'));
+
+      // pkg/mod.py — a module with two functions, both called from a separate
+      // test file via `mod.<fn>(...)` (module-attribute access). This is the
+      // exact shape from the RAGFlow production case in issue #899.
+      fs.mkdirSync(path.join(testDir, 'pkg'), { recursive: true });
+      fs.mkdirSync(path.join(testDir, 'test'), { recursive: true });
+      fs.writeFileSync(
+        path.join(testDir, 'pkg', '__init__.py'),
+        ``
+      );
+      fs.writeFileSync(
+        path.join(testDir, 'pkg', 'mod.py'),
+        [
+          `def callee_one(value):`,
+          `    """First callee — docstring above the second callee so edits here shift its line."""`,
+          `    return value + 1`,
+          ``,
+          ``,
+          `def callee_two(value):`,
+          `    """Second callee, called from the test file via mod.callee_two(...)."""`,
+          `    return value + 2`,
+          ``,
+        ].join('\n')
+      );
+      fs.writeFileSync(
+        path.join(testDir, 'test', 'test_callers.py'),
+        [
+          `from pkg import mod`,
+          ``,
+          ``,
+          `def test_calls_callee_one():`,
+          `    assert mod.callee_one(1) == 2`,
+          ``,
+          ``,
+          `def test_calls_callee_two():`,
+          `    assert mod.callee_two(1) == 3`,
+          ``,
+        ].join('\n')
+      );
+
+      cg = CodeGraph.initSync(testDir, {
+        config: { include: ['**/*.py'], exclude: [] },
+      });
+      await cg.indexAll();
+    });
+
+    afterEach(() => {
+      if (cg) cg.destroy();
+      if (fs.existsSync(testDir)) fs.rmSync(testDir, { recursive: true, force: true });
+    });
+
+    function callerCount(fnName: string): number {
+      const results = cg.searchNodes(fnName);
+      const def = results.map(r => r.node).find(n => n.kind === 'function' && n.name === fnName);
+      if (!def) return -1;
+      return cg.getCallers(def.id).length;
+    }
+
+    it('preserves incoming cross-file calls edges when the callee file is re-indexed', async () => {
+      // Baseline: both callees have one cross-file caller each.
+      expect(callerCount('callee_one')).toBe(1);
+      expect(callerCount('callee_two')).toBe(1);
+
+      // Docstring-only edit to callee_one — adds 1 line, shifting callee_two's
+      // line number. A naive ID-based edge restore would drop callee_two's
+      // incoming edge (its node id changed); the (kind, name) re-resolve
+      // preserves it. A docstring-only edit also confirms zero-AST-change
+      // re-indexes don't sever edges.
+      fs.writeFileSync(
+        path.join(testDir, 'pkg', 'mod.py'),
+        [
+          `def callee_one(value):`,
+          `    """First callee — docstring above the second callee so edits here shift its line."""`,
+          `    """Probe: extra docstring line to shift callee_two's start line by 1."""`,
+          `    return value + 1`,
+          ``,
+          ``,
+          `def callee_two(value):`,
+          `    """Second callee, called from the test file via mod.callee_two(...)."""`,
+          `    return value + 2`,
+          ``,
+        ].join('\n')
+      );
+
+      const result = await cg.sync();
+      expect(result.filesModified).toBe(1);
+
+      // Both incoming cross-file calls edges must survive the callee re-index.
+      expect(callerCount('callee_one')).toBe(1);
+      expect(callerCount('callee_two')).toBe(1);
+    });
+
+    it('drops incoming edges for a callee that was renamed during re-index', async () => {
+      // Baseline.
+      expect(callerCount('callee_one')).toBe(1);
+
+      // Rename callee_one -> callee_one_renamed. The old edge's target
+      // (kind=function, name=callee_one) no longer matches any re-indexed
+      // node, so the edge is correctly dropped (not preserved against a
+      // non-existent symbol).
+      fs.writeFileSync(
+        path.join(testDir, 'pkg', 'mod.py'),
+        [
+          `def callee_one_renamed(value):`,
+          `    """Renamed callee — the old edge targeting callee_one must not be restored."""`,
+          `    return value + 1`,
+          ``,
+          ``,
+          `def callee_two(value):`,
+          `    """Second callee, called from the test file via mod.callee_two(...)."""`,
+          `    return value + 2`,
+          ``,
+        ].join('\n')
+      );
+
+      await cg.sync();
+
+      // The renamed callee has no callers (the test still calls mod.callee_one,
+      // which no longer exists). The old callee_one node is gone, so its
+      // callerCount is -1 (definition not found); callee_one_renamed exists
+      // but has no incoming edges (the test calls the old name).
+      expect(callerCount('callee_one')).toBe(-1);
+      expect(callerCount('callee_one_renamed')).toBe(0);
+      // callee_two is untouched by the rename and its edge survives.
+      expect(callerCount('callee_two')).toBe(1);
+    });
+  });
 });

+ 26 - 0
src/db/queries.ts

@@ -1419,6 +1419,32 @@ export class QueryBuilder {
     return rows.map((r) => r.fp);
   }
 
+  /**
+   * Cross-file edges whose TARGET is a node in `filePath` and whose SOURCE is a
+   * node in a *different* file, paired with the target node's (name, kind) so a
+   * caller can re-resolve the edge to the re-indexed target's new ID (node IDs
+   * are `sha256(filePath:kind:name:line)`, so any line shift in the callee file
+   * changes target IDs and a naive re-insert by old ID silently drops them).
+   * Used by `storeExtractionResult` to preserve incoming edges across a file
+   * re-index (issue #899). Same edge-kind rules as
+   * {@link getDependentFilePaths}: all kinds except `contains`.
+   */
+  getCrossFileIncomingEdgesWithTarget(filePath: string): Array<Edge & { targetName: string; targetKind: NodeKind }> {
+    const sql = `SELECT e.*, tgt.name AS target_name, tgt.kind AS target_kind
+      FROM edges e
+      JOIN nodes tgt ON tgt.id = e.target
+      JOIN nodes src ON src.id = e.source
+      WHERE tgt.file_path = ?
+        AND e.kind != 'contains'
+        AND src.file_path != ?`;
+    const rows = this.db.prepare(sql).all(filePath, filePath) as Array<EdgeRow & { target_name: string; target_kind: NodeKind }>;
+    return rows.map(row => ({
+      ...rowToEdge(row),
+      targetName: row.target_name,
+      targetKind: row.target_kind,
+    }));
+  }
+
   // ===========================================================================
   // File Operations
   // ===========================================================================

+ 47 - 0
src/extraction/index.ts

@@ -14,6 +14,7 @@ import {
   FileRecord,
   ExtractionResult,
   ExtractionError,
+  Edge,
 } from '../types';
 import { QueryBuilder } from '../db/queries';
 import { extractFromSource } from './tree-sitter';
@@ -1597,6 +1598,26 @@ export class ExtractionOrchestrator {
       return; // No changes
     }
 
+    // Snapshot incoming cross-file edges BEFORE deleting this file's nodes.
+    // `deleteFile` cascades to delete every edge whose source OR target is a
+    // node in this file (edges.FK ... ON DELETE CASCADE). Edges whose SOURCE is
+    // in this file are re-emitted by the extractor below, but edges whose SOURCE
+    // is in a *different* (unchanged) file are not — they would be silently
+    // dropped, which is issue #899: re-indexing a callee file severs `calls`/
+    // `references` edges from callers that import it via module-attribute
+    // access (`pkg.mod.fn(...)`).
+    //
+    // We snapshot the edge plus the target node's (name, kind) so we can
+    // re-resolve to the re-indexed target's NEW id. Node ids are
+    // `sha256(filePath:kind:name:line)`, so any line shift in the callee file
+    // (e.g. a docstring-only edit above the symbol) changes every target id and
+    // a naive re-insert by old id would silently drop every edge. Matching by
+    // (filePath, kind, name) is stable across line shifts; if the symbol was
+    // renamed/removed, no match is found and the edge stays dropped (correct).
+    const crossFileIncomingEdges = existingFile
+      ? this.queries.getCrossFileIncomingEdgesWithTarget(filePath)
+      : [];
+
     // Delete existing data for this file
     if (existingFile) {
       this.queries.deleteFile(filePath);
@@ -1623,6 +1644,32 @@ export class ExtractionOrchestrator {
       }
     }
 
+    // Re-insert cross-file incoming edges snapshotted before the delete,
+    // re-resolving each edge's target to the re-indexed node's new id by
+    // (filePath, kind, name). Node ids include the source line, so any line
+    // shift in the callee file (e.g. a docstring-only edit above the symbol)
+    // changes every target id and a naive re-insert by old id would drop them
+    // all. `insertEdges` still filters to endpoints that exist, so edges whose
+    // caller (source) was deleted, or whose callee (target) was renamed/removed
+    // during the re-index (no match in `newTargetIds`), are dropped. This
+    // closes the #899 edge-drop on `sync`.
+    if (crossFileIncomingEdges.length > 0) {
+      const newNodesByKindName = new Map<string, string>();
+      for (const n of validNodes) {
+        newNodesByKindName.set(`${n.kind}\0${n.name}`, n.id);
+      }
+      const reinserted: Edge[] = [];
+      for (const e of crossFileIncomingEdges) {
+        const newTargetId = newNodesByKindName.get(`${e.targetKind}\0${e.targetName}`);
+        if (newTargetId) {
+          reinserted.push({ source: e.source, target: newTargetId, kind: e.kind, metadata: e.metadata, line: e.line, column: e.column, provenance: e.provenance });
+        }
+      }
+      if (reinserted.length > 0) {
+        this.queries.insertEdges(reinserted);
+      }
+    }
+
     // Insert unresolved references in batch with denormalized filePath/language
     if (result.unresolvedReferences.length > 0) {
       const insertedIds = new Set(validNodes.map((n) => n.id));