Просмотр исходного кода

feat(resolution): close dynamic-dispatch coverage holes (callback synthesis + django ORM)

Static tree-sitter extraction misses calls whose target is computed or indirect,
so flows through callbacks, observers, and descriptors were absent from the graph.

- callback-synthesizer.ts: whole-graph pass after base resolution. Detects
  registrar/dispatcher channels (field-backed observers + string-keyed
  EventEmitters), correlates registration sites, and synthesizes
  dispatcher->callback `calls` edges (provenance:'heuristic'). Records the
  registration site (registeredAt) in edge metadata. Precision guards: named
  handlers only, registrar-name match, event fan-out cap.
- frameworks/python.ts + resolution/{index,types}.ts: claimsReference hook +
  django ORM resolver (_iterable_class -> ModelIterable.__iter__).
- extraction/tree-sitter.ts: extract named nested functions so inline named
  handlers become linkable nodes.

trace(mutateElement, triggerRender) and trace(_fetch_all, execute_sql) now
connect; node count stable (no explosion).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Colby McHenry 1 месяц назад
Родитель
Commit
60457be7ea

+ 15 - 0
src/extraction/tree-sitter.ts

@@ -1678,6 +1678,21 @@ export class TreeSitterExtractor {
         }
       }
 
+      // Nested NAMED functions inside a body — function declarations and named
+      // function expressions like `.on('mount', function onmount(){})` — become
+      // their own nodes so the graph can link to them (callback handlers, local
+      // helpers). Anonymous arrows/expressions fall through to the default
+      // recursion below, keeping their inner calls attributed to the enclosing
+      // function: this bounds the new nodes to NAMED functions only (no explosion,
+      // no lost edges). extractFunction walks the nested body itself, so we return.
+      if (this.extractor!.functionTypes.includes(nodeType)) {
+        const nestedName = extractName(node, this.source, this.extractor!);
+        if (nestedName && nestedName !== '<anonymous>') {
+          this.extractFunction(node);
+          return;
+        }
+      }
+
       // Extract structural nodes found inside function bodies.
       // Each extract method visits its own children, so we return after extracting.
       if (this.extractor!.classTypes.includes(nodeType)) {

+ 203 - 0
src/resolution/callback-synthesizer.ts

@@ -0,0 +1,203 @@
+/**
+ * Callback / observer edge synthesis — Phase 1 + 2.
+ *
+ * Closes dynamic-dispatch holes where a dispatcher invokes callbacks registered
+ * elsewhere. Two channel shapes:
+ *
+ *  (1) Field-backed observer (Phase 1):
+ *      onUpdate(cb) { this.callbacks.add(cb); }            // registrar
+ *      triggerUpdate() { for (cb of this.callbacks) cb(); } // dispatcher
+ *      scene.onUpdate(this.triggerRender)                  // registration
+ *      → synthesize triggerUpdate → triggerRender
+ *
+ *  (2) String-keyed EventEmitter (Phase 2):
+ *      this.on('mount', function onmount(){...})           // registration
+ *      fn.emit('mount', this)                              // dispatch
+ *      → synthesize (method containing emit('mount')) → onmount
+ *
+ * Whole-graph pass after base resolution. High-precision/low-recall by design:
+ * named callbacks only; field channels paired by file+field; EventEmitter
+ * channels capped by event fan-out (generic names like 'error' skipped — they
+ * need receiver-type matching, deferred to Phase 3). All synthesized edges are
+ * tagged `provenance:'heuristic'`. See docs/design/callback-edge-synthesis.md.
+ */
+import type { Edge, Node } from '../types';
+import type { QueryBuilder } from '../db/queries';
+import type { ResolutionContext } from './types';
+
+const REGISTRAR_NAME = /^(on[A-Z]\w*|subscribe|addListener|addEventListener|register|watch|listen|addCallback)$/; // anchored: the WHOLE method name must be one of these registrar-style names
+const DISPATCHER_NAME = /(emit|trigger|notify|dispatch|fire|publish|flush)/i; // unanchored + case-insensitive: any of these verbs anywhere in the method name
+const MAX_CALLBACKS_PER_CHANNEL = 40; // upper bound on the per-registrar `added` counter in fieldChannelEdges
+const EVENT_FANOUT_CAP = 6; // skip events with more handlers/dispatchers than this (too generic without type info)
+
+const ON_RE = /\.(?:on|once|addListener)\(\s*['"]([^'"]+)['"]\s*,\s*(?:function\s+(\w+)|(?:this\.)?(\w+))/g; // group 1 = event name; group 2 = name of a named function expression; group 3 = bare or `this.`-prefixed identifier. Global flag: reset lastIndex before each scan.
+const EMIT_RE = /\.(?:emit|fire|dispatchEvent)\(\s*['"]([^'"]+)['"]/g; // group 1 = event name passed as a string literal. Global flag: reset lastIndex before each scan.
+
+/**
+ * Extract lines `startLine`..`endLine` (1-based, inclusive) from `content`.
+ *
+ * @returns the selected lines joined with '\n', or null when either bound is
+ *          missing or falsy (undefined, 0, NaN).
+ */
+function sliceLines(content: string, startLine?: number, endLine?: number): string | null {
+  if (startLine && endLine) {
+    const rows = content.split('\n');
+    // slice() takes a 0-based start and an exclusive end, hence only the start shifts.
+    return rows.slice(startLine - 1, endLine).join('\n');
+  }
+  return null;
+}
+
+/**
+ * Find the `this.<field>` store a registrar body writes a callback into.
+ *
+ * Looks for the first `this.<field>.add(`, `.push(` or `.set(` call in `src`.
+ *
+ * @returns the field name, or null when no such call is present.
+ */
+function registrarField(src: string): string | null {
+  const storeCall = /this\.(\w+)\.(?:add|push|set)\(/.exec(src);
+  if (storeCall === null) return null;
+  return storeCall[1]!;
+}
+
+/**
+ * Find the `this.<field>` store a dispatcher body iterates to invoke callbacks.
+ *
+ * Recognised shapes:
+ *  - `for (<x> of this.<field>)` / `for (<x> of Array.from(this.<field>))`
+ *  - `this.<field>.forEach(`
+ *
+ * For the `for…of` shape the body must actually call something. When the loop
+ * variable is a plain identifier we require that THAT variable is invoked
+ * (`cb(…)`, `cb?.(…)`, `cb.call(…)`, `cb.apply(…)`). Testing for "any call at
+ * all" is not enough: `src` includes the method's own signature (`name(`), so
+ * such a test passes for every method and lets plain data loops through.
+ * When the loop variable cannot be identified (e.g. destructuring), the loose
+ * "any call" test is kept so those loops are still accepted.
+ *
+ * @returns the field name, or null when `src` matches neither shape.
+ */
+function dispatcherField(src: string): string | null {
+  // Group 1 (optional) = simple loop variable directly before `of`; group 2 = field.
+  const forOf = src.match(/(?:\b(\w+)\s+)?\bof\s+(?:Array\.from\(\s*)?this\.(\w+)/);
+  if (forOf) {
+    const loopVar = forOf[1];
+    const invoked = loopVar
+      ? new RegExp(`\\b${loopVar}\\s*(?:\\?\\.\\s*|\\.(?:call|apply)\\s*)?\\(`).test(src)
+      : /\b\w+\s*\(/.test(src);
+    if (invoked) return forOf[2]!;
+  }
+  const forEach = src.match(/this\.(\w+)\.forEach\(/);
+  if (forEach) return forEach[1]!;
+  return null;
+}
+
+// Node kinds that count as a "function" when looking for an enclosing scope.
+const FN_KINDS = new Set(['method', 'function', 'component']);
+
+/**
+ * Pick the innermost function-like node whose line range contains `line`.
+ *
+ * "Innermost" is approximated as the containing node with the greatest
+ * startLine; on a tie the later entry of `nodesInFile` wins. A node without an
+ * endLine is treated as a single-line range.
+ *
+ * @returns the selected node, or null when no function-like node contains `line`.
+ */
+function enclosingFn(nodesInFile: Node[], line: number): Node | null {
+  return nodesInFile.reduce<Node | null>((tightest, candidate) => {
+    if (!FN_KINDS.has(candidate.kind)) return tightest;
+    const lastLine = candidate.endLine ?? candidate.startLine;
+    const containsLine = candidate.startLine <= line && lastLine >= line;
+    if (!containsLine) return tightest;
+    if (tightest === null || candidate.startLine >= tightest.startLine) return candidate;
+    return tightest;
+  }, null);
+}
+
+/**
+ * Phase 1: field-backed observer channels (registrar/dispatcher share a store).
+ *
+ * 1. Scan every method/function whose name looks like a registrar or a
+ *    dispatcher and record the `this.<field>` store its body touches.
+ * 2. Pair each registrar with the dispatchers in the SAME file that use the
+ *    SAME field.
+ * 3. For every incoming `calls` edge of the registrar, read the call-site line,
+ *    pull out the first argument identifier, and look up a method/function
+ *    with that name.
+ * 4. Emit one heuristic `calls` edge dispatcher → callback per pair, deduplicated
+ *    across the whole pass.
+ *
+ * @returns the synthesized edges (nothing is persisted here).
+ */
+function fieldChannelEdges(queries: QueryBuilder, ctx: ResolutionContext): Edge[] {
+  const candidates = [...queries.getNodesByKind('method'), ...queries.getNodesByKind('function')];
+  const registrars: Array<{ node: Node; field: string }> = [];
+  const dispatchers: Array<{ node: Node; field: string }> = [];
+
+  for (const m of candidates) {
+    const isReg = REGISTRAR_NAME.test(m.name);
+    const isDisp = DISPATCHER_NAME.test(m.name);
+    if (!isReg && !isDisp) continue;
+    const content = ctx.readFile(m.filePath);
+    const src = content && sliceLines(content, m.startLine, m.endLine);
+    if (!src) continue;
+    if (isReg) { const f = registrarField(src); if (f) registrars.push({ node: m, field: f }); }
+    if (isDisp) { const f = dispatcherField(src); if (f) dispatchers.push({ node: m, field: f }); }
+  }
+
+  // Call-site files are split into lines at most once, instead of once per
+  // incoming edge. `null` records "file could not be read".
+  const linesByFile = new Map<string, string[] | null>();
+  const linesOf = (filePath: string): string[] | null => {
+    let lines = linesByFile.get(filePath);
+    if (lines === undefined) {
+      lines = ctx.readFile(filePath)?.split('\n') ?? null;
+      linesByFile.set(filePath, lines);
+    }
+    return lines;
+  };
+
+  const edges: Edge[] = [];
+  const seen = new Set<string>();
+  for (const reg of registrars) {
+    const chDispatchers = dispatchers.filter(
+      (d) => d.node.filePath === reg.node.filePath && d.field === reg.field
+    );
+    if (chDispatchers.length === 0) continue;
+    // Leading \b: without it `register` also matches inside `unregister(x)` on
+    // the same line and captures the wrong argument. The name needs no escaping
+    // because REGISTRAR_NAME only admits word characters.
+    const argRe = new RegExp(`\\b${reg.node.name}\\s*\\(\\s*(?:this\\.)?(\\w+)`);
+    let added = 0;
+    for (const e of queries.getIncomingEdges(reg.node.id, ['calls'])) {
+      if (added >= MAX_CALLBACKS_PER_CHANNEL) break;
+      if (!e.line) continue;
+      const caller = queries.getNodeById(e.source);
+      if (!caller) continue;
+      const line = linesOf(caller.filePath)?.[e.line - 1];
+      const am = line?.match(argRe);
+      if (!am) continue;
+      const fn = ctx.getNodesByName(am[1]!).find((n) => n.kind === 'method' || n.kind === 'function');
+      if (!fn) continue;
+      for (const disp of chDispatchers) {
+        // A dispatcher registered as its own callback would be a self-loop.
+        if (disp.node.id === fn.id) continue;
+        const key = `${disp.node.id}>${fn.id}`;
+        if (seen.has(key)) continue;
+        seen.add(key);
+        edges.push({
+          source: disp.node.id, target: fn.id, kind: 'calls', line: disp.node.startLine,
+          provenance: 'heuristic',
+          metadata: {
+            synthesizedBy: 'callback', via: reg.node.name, field: reg.field,
+            // Where the callback was wired up (`scene.onUpdate(this.triggerRender)`).
+            // This is the #1 thing an agent reads/greps to explain the flow — surface
+            // it so node/trace/context can show it without a callers() + Read round-trip.
+            registeredAt: `${caller.filePath}:${e.line}`,
+          },
+        });
+        added++;
+      }
+    }
+  }
+  return edges;
+}
+
+/**
+ * Phase 2: string-keyed EventEmitter channels (on('e', fn) ↔ emit('e')).
+ *
+ * Scans every file's text for `.emit/.fire/.dispatchEvent('<event>'` and
+ * `.on/.once/.addListener('<event>', <handler>` occurrences. Each emit is
+ * attributed to its innermost enclosing function (the dispatcher). Each
+ * handler name is resolved to the first function/method node with that name.
+ * Dispatchers and handlers that share an event name are then linked with
+ * heuristic `calls` edges, unless the event exceeds EVENT_FANOUT_CAP on
+ * either side.
+ *
+ * @returns the synthesized edges (nothing is persisted here).
+ */
+function eventEmitterEdges(ctx: ResolutionContext): Edge[] {
+  const emitsByEvent = new Map<string, Set<string>>();          // event → dispatcher node ids
+  const handlersByEvent = new Map<string, Map<string, string>>(); // event → handler id → registration site (file:line)
+
+  for (const file of ctx.getAllFiles()) {
+    const content = ctx.readFile(file);
+    if (!content) continue;
+    // Cheap substring pre-filter before running the regexes.
+    const hasEmit = content.includes('.emit(') || content.includes('.fire(') || content.includes('.dispatchEvent(');
+    const hasOn = content.includes('.on(') || content.includes('.once(') || content.includes('.addListener(');
+    if (!hasEmit && !hasOn) continue;
+    const nodesInFile = ctx.getNodesInFile(file);
+
+    // Offset at which each line starts, built once per file. Mapping a match
+    // offset to its line is then a binary search, instead of re-slicing and
+    // re-splitting the whole prefix for every match.
+    const lineStarts: number[] = [0];
+    for (let nl = content.indexOf('\n'); nl !== -1; nl = content.indexOf('\n', nl + 1)) {
+      lineStarts.push(nl + 1);
+    }
+    // 1-based line of `idx` = number of line starts at or before `idx`.
+    const lineOf = (idx: number): number => {
+      let lo = 0;
+      let hi = lineStarts.length;
+      while (lo < hi) {
+        const mid = (lo + hi) >> 1;
+        if (lineStarts[mid]! <= idx) lo = mid + 1;
+        else hi = mid;
+      }
+      return lo;
+    };
+
+    if (hasEmit) {
+      EMIT_RE.lastIndex = 0; // shared global regex: restart the scan for this file
+      let m: RegExpExecArray | null;
+      while ((m = EMIT_RE.exec(content))) {
+        const disp = enclosingFn(nodesInFile, lineOf(m.index));
+        if (!disp) continue;
+        const set = emitsByEvent.get(m[1]!) ?? new Set<string>();
+        set.add(disp.id); emitsByEvent.set(m[1]!, set);
+      }
+    }
+    if (hasOn) {
+      ON_RE.lastIndex = 0; // shared global regex: restart the scan for this file
+      let m: RegExpExecArray | null;
+      while ((m = ON_RE.exec(content))) {
+        // Group 2 = named function expression, group 3 = identifier reference.
+        const handlerName = m[2] || m[3];
+        if (!handlerName) continue;
+        const handler = ctx.getNodesByName(handlerName).find((n) => n.kind === 'function' || n.kind === 'method');
+        if (!handler) continue;
+        const map = handlersByEvent.get(m[1]!) ?? new Map<string, string>();
+        map.set(handler.id, `${file}:${lineOf(m.index)}`); handlersByEvent.set(m[1]!, map);
+      }
+    }
+  }
+
+  const edges: Edge[] = [];
+  const seen = new Set<string>();
+  for (const [event, dispatchers] of emitsByEvent) {
+    const handlers = handlersByEvent.get(event);
+    if (!handlers) continue;
+    // Precision guard: a generic event name with many handlers/dispatchers can't
+    // be matched without receiver-type info (Phase 3) — skip rather than over-link.
+    if (dispatchers.size > EVENT_FANOUT_CAP || handlers.size > EVENT_FANOUT_CAP) continue;
+    for (const d of dispatchers) for (const [h, registeredAt] of handlers) {
+      if (d === h) continue; // a function that both emits and handles the event: no self-loop
+      const key = `${d}>${h}`;
+      if (seen.has(key)) continue;
+      seen.add(key);
+      edges.push({ source: d, target: h, kind: 'calls', provenance: 'heuristic', metadata: { synthesizedBy: 'event-emitter', event, registeredAt } });
+    }
+  }
+  return edges;
+}
+
+/**
+ * Run both synthesis phases (field observers, then EventEmitters), drop
+ * duplicate source→target pairs keeping the first occurrence, and persist the
+ * result through `queries.insertEdges`.
+ *
+ * @returns the number of edges inserted (0 means nothing was written).
+ * Never throws into indexing — callers wrap in try/catch.
+ */
+export function synthesizeCallbackEdges(queries: QueryBuilder, ctx: ResolutionContext): number {
+  const candidates = fieldChannelEdges(queries, ctx).concat(eventEmitterEdges(ctx));
+
+  // Map keeps insertion order, so a field-channel edge wins over a later
+  // emitter edge for the same source→target pair.
+  const byPair = new Map<string, Edge>();
+  for (const edge of candidates) {
+    const pair = `${edge.source}>${edge.target}`;
+    if (!byPair.has(pair)) byPair.set(pair, edge);
+  }
+
+  const unique = Array.from(byPair.values());
+  if (unique.length > 0) queries.insertEdges(unique);
+  return unique.length;
+}

+ 32 - 0
src/resolution/frameworks/python.ts

@@ -35,9 +35,25 @@ export const djangoResolver: FrameworkResolver = {
       const result = resolveByNameAndKind(ref.referenceName, CLASS_KINDS, FORM_DIRS, context);
       if (result) return { original: ref, targetNodeId: result, confidence: 0.8, resolvedBy: 'framework' };
     }
+    // ORM dynamic dispatch: QuerySet._fetch_all (and siblings) call
+    // `self._iterable_class(self)` — a runtime dispatch to the iterable class
+    // (default ModelIterable) whose __iter__ runs the SQL compiler. Static
+    // parsing can't resolve an attribute-as-callable, so it leaves an unresolved
+    // `_iterable_class` ref and a hole in the QuerySet→compiler chain. Bridge it
+    // to ModelIterable.__iter__ so the flow actually exists in the graph.
+    if (ref.referenceName === '_iterable_class') {
+      const target = resolveModelIterableIter(context);
+      if (target) return { original: ref, targetNodeId: target, confidence: 0.7, resolvedBy: 'framework' };
+    }
     return null;
   },
 
+  // Let the ORM dynamic-dispatch ref reach resolve() despite no symbol being
+  // named `_iterable_class` (it's a QuerySet attribute, not a declared method).
+  claimsReference(name) {
+    return name === '_iterable_class';
+  },
+
   extract(filePath, content) {
     if (!filePath.endsWith('.py')) return { nodes: [], references: [] };
 
@@ -90,6 +106,22 @@ export const djangoResolver: FrameworkResolver = {
   },
 };
 
+/**
+ * Locate ModelIterable.__iter__, the target used for the dynamic
+ * `self._iterable_class(self)` dispatch in QuerySet.
+ *
+ * Takes the first class node named `ModelIterable`, then the first `__iter__`
+ * node in the same file whose start line falls within that class's line range.
+ * (Over-approximates to the default iterable; .values()/.values_list() swap in
+ * other BaseIterable subclasses, but ModelIterable is the canonical path.)
+ *
+ * @returns the node id of that `__iter__`, or null when either lookup fails.
+ */
+function resolveModelIterableIter(context: ResolutionContext): string | null {
+  const modelIterable = context.getNodesByName('ModelIterable').find((node) => node.kind === 'class');
+  if (modelIterable === undefined) return null;
+
+  for (const candidate of context.getNodesByName('__iter__')) {
+    const sameFile = candidate.filePath === modelIterable.filePath;
+    const insideClass =
+      candidate.startLine >= modelIterable.startLine && candidate.startLine <= modelIterable.endLine;
+    if (sameFile && insideClass) return candidate.id;
+  }
+  return null;
+}
+
 /**
  * Parse a Django URL handler expression and return the symbol/module to link.
  * Returns null for shapes we can't confidently link (e.g. lambdas).

+ 16 - 1
src/resolution/index.ts

@@ -19,6 +19,7 @@ import {
 import { matchReference } from './name-matcher';
 import { resolveViaImport, extractImportMappings, extractReExports } from './import-resolver';
 import { detectFrameworks } from './frameworks';
+import { synthesizeCallbackEdges } from './callback-synthesizer';
 import { loadProjectAliases, type AliasMap } from './path-aliases';
 import { logDebug } from '../errors';
 import type { ReExport } from './types';
@@ -493,7 +494,11 @@ export class ReferenceResolver {
     // from './barrel'` where the barrel has `export { signIn as login }
     // from './auth'`) intentionally call a name that has no
     // declaration anywhere — only the renamed upstream symbol does.
-    if (!this.hasAnyPossibleMatch(ref.referenceName) && !this.matchesAnyImport(ref)) {
+    if (
+      !this.hasAnyPossibleMatch(ref.referenceName) &&
+      !this.matchesAnyImport(ref) &&
+      !this.frameworks.some((f) => f.claimsReference?.(ref.referenceName))
+    ) {
       return null;
     }
 
@@ -681,6 +686,16 @@ export class ReferenceResolver {
       }
     }
 
+    // Dynamic-edge synthesis: now that all base `calls` edges are persisted,
+    // synthesize observer/callback dispatch edges (dispatcher → registered
+    // callbacks) that static parsing leaves out. Best-effort — never fail the
+    // index on it. See docs/design/callback-edge-synthesis.md.
+    try {
+      aggregateStats.byMethod['callback-synthesis'] = synthesizeCallbackEdges(this.queries, this.context);
+    } catch {
+      // synthesis is additive and optional; ignore failures
+    }
+
     return {
       resolved: [],
       unresolved: [],

+ 8 - 0
src/resolution/types.ts

@@ -131,6 +131,14 @@ export interface FrameworkResolver {
   detect(context: ResolutionContext): boolean;
   /** Resolve a reference using framework-specific patterns */
   resolve(ref: UnresolvedRef, context: ResolutionContext): ResolvedRef | null;
+  /**
+   * Opt a reference NAME through the resolver's name-exists pre-filter, even when
+   * no node is named that. Needed for dynamic dispatch where the call target is
+   * an attribute/descriptor, not a declared symbol (e.g. Django's
+   * `self._iterable_class(...)`, React effect callbacks). Returning true lets the
+   * ref reach `resolve()` instead of being dropped for having no name match.
+   */
+  claimsReference?(name: string): boolean;
   /**
    * Extract framework-specific nodes and references from a file.
    *