Jelajahi Sumber

feat(extraction): add Python to value-reference edges (+ conditional-def-safe prune)

Extend value-reference edges to Python (module-level `NAME = …`; the `assignment`
declarator added to the per-grammar shadow-prune switch). A synthetic probe found
the expected shadow FP — a module `CONFIG` shadowed by a local `CONFIG = …` —
fixed by that addition.

Python also exposed a flaw in the prune RULE. Python conditionally defines module
constants (`try: X = a; except: X = b`), a very common idiom that binds a name
twice AT MODULE SCOPE. The old "declarators > 1 → drop" over-pruned these,
dropping a real const and all its readers. Refine the rule to compare declarator
count against the number of FILE-SCOPE NODES the name has: a conditional def makes
them equal (kept); a real local shadow makes declarators exceed file-scope nodes
(pruned). Strictly more correct for every language. Track file-scope node count
per name in captureValueRefScope; also suppress same-name value-ref edges (the two
halves of a conditional def would otherwise cross-reference via their own names).

Validated small/medium/large on public Python OSS (requests, sqlalchemy, django):
node count identical on/off, precision samples all true positives, guard holds,
conditional defs now kept (django recall +18 edges), zero same-name edges, and the
impact win reproduces (django `_trans` 1→138, sqlalchemy `COMPARE_FAILED` 1→26).

Adds 3 Python test cases (module read; the local-`=` shadow, verified to fail
without the prune extension; the conditional-def keep case). Design matrix,
playbook, and CHANGELOG updated. Full suite green (1,555 passed).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Colby McHenry 1 Minggu lalu
induk
melakukan
7b0f204604

+ 1 - 1
CHANGELOG.md

@@ -11,7 +11,7 @@ and adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ### New Features
 
-- Impact and blast-radius analysis for TypeScript, JavaScript, and Go now understands the readers of a constant. When you change a file-scope or package-level `const`/`var` — a config object, a lookup table, a shared constant — the other symbols in that file that read it now show up as affected, where before they were invisible (impact only followed calls, imports, and inheritance, so a constant's consumers looked like "nothing depends on this"). This makes `codegraph impact`, and the impact trail in `codegraph_explore`/`codegraph_node`, catch the "change this table, break its readers" class of change. It's on by default and adds no nodes to your graph; bundled/minified files and ambiguously-shadowed names are skipped to keep results precise. Set `CODEGRAPH_VALUE_REFS=0` to turn it off.
+- Impact and blast-radius analysis for TypeScript, JavaScript, Go, and Python now understands the readers of a constant. When you change a file-scope, package-level, or module-level constant — a config object, a lookup table, a shared constant — the other symbols in that file that read it now show up as affected, where before they were invisible (impact only followed calls, imports, and inheritance, so a constant's consumers looked like "nothing depends on this"). This makes `codegraph impact`, and the impact trail in `codegraph_explore`/`codegraph_node`, catch the "change this table, break its readers" class of change. It's on by default and adds no nodes to your graph; bundled/minified files and ambiguously-shadowed names are skipped to keep results precise. Set `CODEGRAPH_VALUE_REFS=0` to turn it off.
 
 ### Fixes
 

+ 36 - 7
__tests__/value-reference-edges.test.ts

@@ -12,13 +12,20 @@ import * as os from 'os';
 import CodeGraph from '../src';
 
 function valueRefReaders(cg: CodeGraph, constName: string): string[] {
-  const target = cg.searchNodes(constName).map((r) => r.node).find((n) => n.name === constName);
-  if (!target) return [];
-  return cg
-    .getIncomingEdges(target.id)
-    .filter((e) => e.kind === 'references' && (e.metadata as { valueRef?: boolean } | undefined)?.valueRef)
-    .map((e) => cg.getNode(e.source)?.name)
-    .filter((n): n is string => Boolean(n));
+  // Aggregate across ALL nodes of this name — a conditionally-defined module
+  // const (`try: X=…; except: X=…`) has more than one, and the edge targets
+  // whichever one ended up in the target map.
+  const targets = cg.searchNodes(constName).map((r) => r.node).filter((n) => n.name === constName);
+  const readers = new Set<string>();
+  for (const t of targets) {
+    for (const e of cg.getIncomingEdges(t.id)) {
+      if (e.kind === 'references' && (e.metadata as { valueRef?: boolean } | undefined)?.valueRef) {
+        const r = cg.getNode(e.source)?.name;
+        if (r) readers.add(r);
+      }
+    }
+  }
+  return [...readers];
 }
 
 describe('value-reference edges', () => {
@@ -161,6 +168,28 @@ describe('value-reference edges', () => {
     expect(valueRefReaders(cg, 'Timeout')).toEqual([]);
   });
 
+  it('keeps a conditionally-defined module const (try/except), not a shadow (Python)', async () => {
+    // `HAS_SSL` is defined twice but BOTH at module scope (a conditional def, a
+    // very common Python idiom). It is one logical const, not a shadow, so its
+    // reader must stay edged — and the two halves must not edge each other.
+    fs.writeFileSync(
+      path.join(dir, 'cond.py'),
+      [
+        'try:',
+        '\tHAS_SSL = True',
+        'except ImportError:',
+        '\tHAS_SSL = False',
+        '',
+        'def uses_ssl():',
+        '\treturn HAS_SSL',
+      ].join('\n'),
+    );
+    cg = index();
+    await cg.indexAll();
+
+    expect(valueRefReaders(cg, 'HAS_SSL')).toEqual(['uses_ssl']);
+  });
+
   it('emits nothing when CODEGRAPH_VALUE_REFS=0', async () => {
     const prev = process.env.CODEGRAPH_VALUE_REFS;
     process.env.CODEGRAPH_VALUE_REFS = '0';

+ 21 - 11
docs/design/value-reference-edges-playbook.md

@@ -45,7 +45,7 @@ agent read-reduction (see §4.3).
 
 | Symbol | Role |
 |---|---|
-| `VALUE_REF_LANGS` (static Set) | languages the feature runs for. Currently `typescript`, `javascript`, `tsx`. **Add the new language here.** |
+| `VALUE_REF_LANGS` (static Set) | languages the feature runs for. Currently `typescript`, `javascript`, `tsx`, `go`, `python`. **Add the new language here.** |
 | `valueRefsEnabled` | `process.env.CODEGRAPH_VALUE_REFS !== '0'` — default ON, env opts out. |
 | `MAX_VALUE_REF_NODES` (20_000) | per-scope traversal cap (and the shadow-scan cap). |
 | `captureValueRefScope(kind, name, id, node)` | called from `createNode` on every node. Records **targets** (file-scope `const`/`var`) and **reader scopes** (`function`/`method`/`const`/`var`). |
@@ -66,10 +66,10 @@ targets** (see §3).
 
 ## 2. Current state (what's shipped + validated)
 
-- **Default ON** for TS/JS/tsx + Go (`CODEGRAPH_VALUE_REFS=0` disables). Shipped in **PR #895**
+- **Default ON** for TS/JS/tsx + Go + Python (`CODEGRAPH_VALUE_REFS=0` disables). Shipped in **PR #895**
   (flip-on + the shadow prune); Go added in a later PR (the shadow-prune declarator switch +
   `VALUE_REF_LANGS`).
-- **Validated S/M/L** in **TypeScript, JavaScript, tsx, and Go** — see the matrix in the
+- **Validated S/M/L** in **TypeScript, JavaScript, tsx, Go, and Python** — see the matrix in the
   design doc. All clean: node count identical on/off, precision guards held, impact win
   reproduced. Go required extending the shadow prune (per-grammar declarators) — the worked
   example of "step B is load-bearing."
@@ -87,13 +87,14 @@ Guards run in `flushValueRefs`, in order:
 1. **`isGeneratedFile(path)`** (`src/extraction/generated-detection.ts`) — skips
    *suffix-recognised* generated files (`.pb.ts`, `.min.js`, …). **Path-only** — cannot catch
    content-minified bundles.
-2. **Shadow prune (the #895 fix)** — drop any target whose name is bound by **more than one
-   `variable_declarator`** in the file. Rationale: a bundled/Emscripten `const Module`
-   re-declared as an inner `var Module` / param resolves to the *inner* binding for nested
-   readers, so a file-scope edge is wrong. The inner re-declarations aren't graph nodes, so we
-   count them at the **syntax-tree** level. *This is the per-language-sensitive guard:* it
-   scans for `variable_declarator` nodes — **a different grammar uses a different node type**
-   (§5 step B).
+2. **Shadow prune** — drop a target when its **declarator count exceeds its file-scope node
+   count** (so it's also bound in an inner/local scope). Rationale: a bundled/Emscripten `const
+   Module` re-declared as an inner `var Module`, a Go package const shadowed by a local `:=`, or
+   a Python module const shadowed by a local `=` resolves to the *inner* binding for nested
+   readers, so a file-scope edge is wrong. Inner re-bindings aren't graph nodes, so declarators
+   are counted at the **syntax-tree** level. *This is the per-language-sensitive guard:* the
+   declarator node types differ per grammar (§5 step B), and comparing against file-scope node
+   count (not a flat `>1`) is what keeps **conditional module defs** (`try: X=…; except: X=…`).
 3. **Distinctive-name + same-file** (the target gate).
 
 **What a real FP looks like** (fix it): a reader edged to a file-scope const it does **not**
@@ -220,11 +221,20 @@ silently does nothing for the new language and intra-file shadowing produces fal
 |---|---|---|---|
 | TS/JS/tsx | `variable_declarator` | `namedChild(0)` | done |
 | Go | `const_spec`, `var_spec`, `short_var_declaration` | spec → `namedChild(0)`; short-var → identifiers in the `left` field | **done** |
-| Python | `assignment` (module-level `NAME = …`) | LHS identifier(s) | to verify |
+| Python | `assignment` | `left` field: identifier, or iterate a `pattern_list`/`tuple_pattern` | **done** |
 | Rust | `const_item` / `static_item` (`let_declaration` = locals) | `name` field | to verify |
 | Ruby | `assignment` with constant LHS (`CONST`) | LHS | to verify |
 | C/C++ | `init_declarator` in a file-scope `declaration` | declarator id | to verify |
 
+**The prune rule is `declarators > file-scope-node-count`, NOT `> 1`.** A name can be bound
+twice *at file scope* legitimately — a **conditional module def** (`try: X = a; except: X = b`,
+or `if cond: X = a else: X = b`). Those make N file-scope nodes AND N declarators, so they're
+kept; a real local shadow makes declarators exceed file-scope nodes. Python forced this
+refinement (try/except const defs are everywhere); it's strictly more correct for all
+languages. `fileScopeValueCounts` (incremented in `captureValueRefScope`) tracks the file-scope
+node count per name. Also: same-name value-ref edges are suppressed (`refName !== scope.name`),
+since the two halves of a conditional def would otherwise cross-reference.
+
 **Go was the worked example of "step B matters":** the first pass added `go` to
 `VALUE_REF_LANGS` only, and a synthetic probe immediately showed a false positive —
 `func withShadow() { TimeoutSeconds := 5; return TimeoutSeconds }` got edged to the package

+ 33 - 10
docs/design/value-reference-edges.md

@@ -1,6 +1,6 @@
 # Design + status: same-file value-reference edges
 
-**Status:** SHIPPED (default-on for TS/JS/tsx + Go; `CODEGRAPH_VALUE_REFS=0` disables). The
+**Status:** SHIPPED (default-on for TS/JS/tsx + Go + Python; `CODEGRAPH_VALUE_REFS=0` disables). The
 emitter lives in `TreeSitterExtractor.flushValueRefs` (`src/extraction/tree-sitter.ts`).
 **Motivation:** close the impact-analysis hole for *value consumers*. Static
 extraction edges calls, imports, and inheritance, but never edges a constant to the
@@ -13,7 +13,7 @@ readers" class of change (the ReScript-PR false positive that motivated the work
 ## TL;DR for a new session
 
 We emit a `references` edge (`metadata: { valueRef: true }`) from a reader symbol to
-the **file/package-scope `const`/`var` it reads**, same-file only, for TS/JS/tsx + Go. Those edges
+the **file/package-scope `const`/`var` it reads**, same-file only, for TS/JS/tsx + Go + Python. Those edges
 flow straight into `getImpactRadius` / `codegraph impact` and the impact trail in
 `codegraph_explore` / `codegraph_node` — no agent-behaviour change required.
 
@@ -33,15 +33,19 @@ The win is **impact-radius correctness**, not agent read-reduction (see "Agent A
 
 1. **`isGeneratedFile(path)`** — skip suffix-recognised generated files (`.pb.ts`,
    `.min.js`, …). Path-only; it cannot catch content-minified bundles.
-2. **Shadow prune (#895)** — drop any target whose name is bound by **more than one
-   `variable_declarator`** in the file. A bundled/Emscripten `const Module` re-declared
-   as an inner `var Module` / param resolves to the *inner* binding for nested readers,
-   so a file-scope edge to it is a false positive. The inner re-declarations aren't
-   extracted as graph nodes, so we count them at the syntax level. This is what catches
+2. **Shadow prune** — drop a target when its **declarator count exceeds its file-scope node
+   count**, i.e. it's also bound in an *inner* (local) scope. A bundled/Emscripten `const
+   Module` re-declared as an inner `var Module`, a Go package const shadowed by a local `:=`,
+   or a Python module const shadowed by a local `=` all resolve to the inner binding for nested
+   readers — a file-scope edge would be a false positive. Inner re-bindings aren't graph nodes,
+   so declarators are counted at the syntax level (per-grammar node types: `variable_declarator`
+   for TS/JS, `const_spec`/`var_spec`/`short_var_declaration` for Go, `assignment` for Python).
+   Comparing against file-scope node count (not a flat ">1") keeps **conditional module defs**
+   (`try: X=…; except: X=…`), which legitimately bind a name twice at file scope. This catches
    the content-minified bundles guard #1 misses.
 3. **Distinctive-name + same-file** as above.
 
-## Validation matrix — TypeScript / JavaScript / Go
+## Validation matrix — TypeScript / JavaScript / Go / Python
 
 Method per repo: index the same tree twice (value-refs on vs `CODEGRAPH_VALUE_REFS=0`),
 diff node/edge counts, spot-check precision, and measure `codegraph impact` on a few
@@ -72,8 +76,16 @@ file-scope consts. Node count must be **identical** on/off (edges-only feature).
 | prometheus/prometheus | large | 1,329 | 23,322 (stable) | +3,466 (3.3%) | all sampled TP; guard holds | `rdsLabelInstance` 1→**82**, `ec2Label` 1→24 |
 | kubernetes/kubernetes | very large | 19,160 | 251,086 (stable) | +20,574 (1.9%) | all sampled TP; guard holds on 250 targets | `KubeletSubsystem` 3→**138**, `LEVEL_0` 1→102 |
 
-Across S/M/L in all three languages: node count never moved, the precision guards held, and
-the `impact` OFF column is the bug — a const that 80–90 symbols read reports "1 affected"
+**Python** (module-level `NAME = …`; required extending the prune *and* refining its rule — see below)
+
+| Repo | size | files | nodes (on=off) | +value-ref edges | precision | `impact` on→off example |
+|---|---|---|---|---|---|---|
+| psf/requests | small | 49 | 1,299 (stable) | +85 (2.9%) | all sampled TP; guard holds | `ITER_CHUNK_SIZE` 1→4, `DEFAULT_POOLBLOCK` 1→4 |
+| sqlalchemy/sqlalchemy | medium | 679 | 59,963 (stable) | +1,929 (0.8%) | all sampled TP; guard holds | `COMPARE_FAILED` 1→**26**, `DB_LINK_PLACEHOLDER` 1→19 |
+| django/django | large | 3,005 | 61,748 (stable) | +1,328 (0.7%) | all sampled TP; guard holds | `_trans` 1→**138**, `SEARCH_VAR` 4→8 |
+
+Across S/M/L in all four languages: node count never moved, the precision guards held, and
+the `impact` OFF column is the bug — a const that 80–140 symbols read reports "1 affected"
 without value-refs.
 
 **Go required a code change** (unlike JS/tsx, which the existing guards covered unchanged).
@@ -85,6 +97,17 @@ types fixed it (one synthetic repro, then clean across gin/hugo/prometheus). Thi
 template for the next language: **the shadow prune is per-grammar and must be wired per
 language** (see the playbook).
 
+**Python forced a refinement of the prune *rule* — a general improvement.** Python's
+declarator is `assignment` (added to the switch). But Python also **conditionally defines
+module constants** (`try: HAS_SSL = True; except: HAS_SSL = False`) — a very common idiom that
+binds the name twice *at module scope*. The old "bound more than once → drop" rule over-pruned
+these (dropping a real const and its readers). The fix distinguishes a conditional module def
+from a real shadow by comparing declarator count against the number of **file-scope nodes** the
+name has: a conditional def makes them equal (both bindings are file-scope), a local shadow
+makes declarators exceed file-scope nodes (the excess is the local). This is strictly more
+correct for *all* languages. (It also made the two halves of a conditional def cross-reference
+via their own names, so same-name value-ref edges are now suppressed.)
+
 **`tsx` is covered by the TS rows** — excalidraw is a React/.tsx codebase, so the headline
 `tablerIconProps` (1→170) and most of its targets live in `.tsx` files. The one
 tsx-specific path — a const read *only* inside JSX (`<Foo x={CONST}/>`) — relies on the

+ 35 - 17
src/extraction/tree-sitter.ts

@@ -224,11 +224,12 @@ export class TreeSitterExtractor {
   // Value-reference edges (default ON; set CODEGRAPH_VALUE_REFS=0 to disable; see flushValueRefs).
   // Same-file reads of file-scope const/var symbols → `references` edges so impact analysis catches
   // value consumers ("change this constant/table, affect its readers").
-  private static readonly VALUE_REF_LANGS = new Set<string>(['typescript', 'javascript', 'tsx', 'go']);
+  private static readonly VALUE_REF_LANGS = new Set<string>(['typescript', 'javascript', 'tsx', 'go', 'python']);
   private static readonly MAX_VALUE_REF_NODES = 20_000;
   private readonly valueRefsEnabled = process.env.CODEGRAPH_VALUE_REFS !== '0';
   private fileScopeValues = new Map<string, string>();
-  private valueRefScopes: Array<{ id: string; node: SyntaxNode }> = [];
+  private fileScopeValueCounts = new Map<string, number>(); // file-scope nodes per name (conditional-def detection)
+  private valueRefScopes: Array<{ id: string; node: SyntaxNode; name: string }> = [];
   private errors: ExtractionError[] = [];
   private extractor: LanguageExtractor | null = null;
   private nodeStack: string[] = []; // Stack of parent node IDs
@@ -533,10 +534,16 @@ export class TreeSitterExtractor {
   private captureValueRefScope(kind: NodeKind, name: string, id: string, node: SyntaxNode): void {
     if ((kind === 'constant' || kind === 'variable') && name.length >= 3 && /[A-Z_]/.test(name)) {
       const parentId = this.nodeStack[this.nodeStack.length - 1];
-      if (parentId?.startsWith('file:')) this.fileScopeValues.set(name, id);
+      if (parentId?.startsWith('file:')) {
+        this.fileScopeValues.set(name, id);
+        // How many file-scope nodes carry this name. A conditional module-level
+        // def (`try: X = a; except: X = b`) makes >1 — distinct from a local
+        // shadow, which adds a binding the prune must catch (see flushValueRefs).
+        this.fileScopeValueCounts.set(name, (this.fileScopeValueCounts.get(name) ?? 0) + 1);
+      }
     }
     if (kind === 'function' || kind === 'method' || kind === 'constant' || kind === 'variable') {
-      this.valueRefScopes.push({ id, node });
+      this.valueRefScopes.push({ id, node, name });
     }
   }
 
@@ -551,20 +558,26 @@ export class TreeSitterExtractor {
   private flushValueRefs(): void {
     const scopes = this.valueRefScopes;
     const targets = this.fileScopeValues;
+    const fileScopeCounts = this.fileScopeValueCounts;
     this.valueRefScopes = [];
     this.fileScopeValues = new Map();
+    this.fileScopeValueCounts = new Map();
     if (!this.valueRefsEnabled || !TreeSitterExtractor.VALUE_REF_LANGS.has(this.language)) return;
     if (targets.size === 0 || scopes.length === 0 || isGeneratedFile(this.filePath)) return;
 
-    // Prune SHADOWED targets. A name bound more than once in the file (e.g. a
-    // bundled/Emscripten `const Module` re-declared as an inner `var Module`, or
-    // a Go package `const Timeout` shadowed by a local `Timeout := …`) may
-    // resolve to the INNER binding for nested readers, so a file-scope edge to
-    // it is a false positive. Those inner re-declarations aren't extracted as
-    // graph nodes, so detect them at the syntax level: count every declarator
-    // name across the tree and drop any target bound twice or more. Single-
-    // binding (unambiguous) names are kept. Complements the path-based
-    // isGeneratedFile() check, which can't catch content-minified bundles.
+    // Prune SHADOWED targets. A target re-bound in an INNER scope (a
+    // bundled/Emscripten `const Module` re-declared as a nested `var Module`; a
+    // Go package `const Timeout` shadowed by a local `Timeout := …`; a Python
+    // module `CONFIG` shadowed by a local `CONFIG = …`) resolves to the inner
+    // binding for nested readers, so a file-scope edge is a false positive.
+    // Inner re-bindings aren't graph nodes, so detect them at the syntax level:
+    // count every declarator of the name across the tree and compare against how
+    // many FILE-SCOPE nodes carry it. A real shadow makes (declarators >
+    // file-scope nodes) — the excess is the local binding. A conditional
+    // module-level def (`try: X = a; except: X = b`) makes them EQUAL (both
+    // declarators are file-scope nodes), so it's correctly kept. Complements the
+    // path-based isGeneratedFile() check, which can't catch content-minified
+    // bundles.
     //
     // Declarator node types are per-grammar; a file only contains its own
     // language's nodes, so matching all of them in one switch is safe.
@@ -587,7 +600,8 @@ export class TreeSitterExtractor {
           case 'var_spec':            // Go  `var X = …`
             bump(n.namedChild(0));
             break;
-          case 'short_var_declaration': { // Go  `x, Y := …`
+          case 'short_var_declaration': // Go  `x, Y := …`
+          case 'assignment': {          // Python  `X = …` / `X: T = …` / `A, B = …`
             const left = getChildByField(n, 'left') ?? n.namedChild(0);
             if (left?.type === 'identifier') bump(left);
             else if (left) for (const c of left.namedChildren) bump(c);
@@ -599,7 +613,7 @@ export class TreeSitterExtractor {
           if (c) dstack.push(c);
         }
       }
-      for (const [nm, c] of declCounts) if (c > 1) targets.delete(nm);
+      for (const [nm, c] of declCounts) if (c > (fileScopeCounts.get(nm) ?? 1)) targets.delete(nm);
       if (targets.size === 0) return;
     }
 
@@ -611,8 +625,12 @@ export class TreeSitterExtractor {
         const n = stack.pop()!;
         visited++;
         if (n.type === 'identifier') {
-          const targetId = targets.get(getNodeText(n, this.source));
-          if (targetId && targetId !== scope.id && !seen.has(targetId)) {
+          const refName = getNodeText(n, this.source);
+          const targetId = targets.get(refName);
+          // Skip self and same-name targets: a symbol referencing a file-scope
+          // sibling of its own name (the two halves of a conditional `try: X=…;
+          // except: X=…`) is never a meaningful value read.
+          if (targetId && targetId !== scope.id && refName !== scope.name && !seen.has(targetId)) {
             seen.add(targetId);
             this.edges.push({
               source: scope.id,