dynamic-boundaries.ts 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397
  1. /**
  2. * Dynamic-dispatch boundary detection for codegraph_explore (#687).
  3. *
  4. * When the flow an agent asked about does NOT connect statically, the cause is
  5. * almost always a dynamic-dispatch site: a computed member call, getattr,
  6. * reflection, a string-keyed bus, a typed command/mediator dispatch. Guessing
  7. * the missing edge was rejected (silent beats wrong — a wrong edge poisons the
  8. * map and teaches abandonment). Instead, explore ANNOUNCES the boundary
  9. * honestly: the exact site where the static path ends, the dispatch form, and
  10. * — when a key is statically visible (string literal, `:symbol`, `new Type`)
  11. * — that key, so the caller can shortlist candidate targets.
  12. *
  13. * Detection is deterministic regex over the comment/string-stripped bodies of
  14. * the symbols the agent named, at QUERY TIME only. The graph is never mutated;
  15. * an unbroken flow never triggers a scan. Matching runs on the stripped text
  16. * (so commented-out / string-embedded code can't fire) but snippets and keys
  17. * are sliced from the ORIGINAL source at the same offsets — both strippers
  18. * blank contents in place, preserving offsets, precisely for this.
  19. * (`stripCommentsForRegex` blanks comments but deliberately KEEPS string
  20. * contents — framework extractors need route literals; here a dispatch shape
  21. * inside a string is a false positive, so {@link blankStringContents} blanks
  22. * them too, quotes preserved.)
  23. */
  24. import { stripCommentsForRegex, type CommentLang } from '../resolution/strip-comments';
  25. export interface BoundaryMatch {
  26. /** Stable form id, e.g. 'computed-call' — used for per-form dedupe. */
  27. form: string;
  28. /** Human label for the dispatch form, e.g. 'computed member call'. */
  29. label: string;
  30. /** One-line source snippet of the site (from the original, untrimmed text). */
  31. snippet: string;
  32. /** 1-based line within the scanned body's FILE (absolute, ready to print). */
  33. line: number;
  34. /**
  35. * Statically-visible dispatch key, when one exists: the string literal in
  36. * `handlers['save']`, the `:symbol` in ruby `send`, the type name in
  37. * `Send(new CreateCmd(...))`. Drives candidate lookup. Undefined when the
  38. * key is a runtime value (variable, computed expression).
  39. */
  40. key?: string;
  41. /** For typed-bus matches the key is a TYPE name (candidates ~ `${key}Handler`). */
  42. keyIsType?: boolean;
  43. /** Additional sites of the same form+key in this body beyond the reported one. */
  44. moreSites?: number;
  45. }
  46. interface FormSpec {
  47. form: string;
  48. label: string;
  49. /** Languages this form applies to; undefined = all. Node.language values. */
  50. langs?: Set<string>;
  51. re: RegExp;
  52. /**
  53. * Derive the dispatch key from the ORIGINAL-source snippet around the match
  54. * (match start .. match end + keyWindow). Return undefined when no static key.
  55. */
  56. keyFrom?: (orig: string) => { key: string; keyIsType?: boolean } | undefined;
  57. /**
  58. * Extra ORIGINAL chars after the match end handed to keyFrom, capped at the
  59. * first newline — for forms whose key trails the matched prefix, e.g.
  60. * `.getMethod(` → `"handlePing"`. Forms with $-anchored keyFrom regexes
  61. * must leave this unset (the anchor relies on the slice ending at the match).
  62. */
  63. keyWindow?: number;
  64. }
  65. const JS_FAMILY = new Set(['typescript', 'javascript', 'tsx', 'jsx', 'vue', 'svelte', 'astro']);
  66. const PY = new Set(['python']);
  67. const RB = new Set(['ruby']);
  68. const PHP = new Set(['php']);
  69. const JVM_CS_GO = new Set(['java', 'kotlin', 'scala', 'csharp', 'go']);
  70. const SWIFT_OBJC = new Set(['swift', 'objc', 'objcpp', 'objective-c']);
  71. /** Exactly one quoted literal and no concatenation → that literal is the key. */
  72. function singleStringLiteral(text: string): string | undefined {
  73. const m = text.match(/^[^'"`]*(['"`])([\w.:-]{2,64})\1[^'"`]*$/);
  74. return m ? m[2] : undefined;
  75. }
  76. const FORMS: FormSpec[] = [
  77. {
  78. // handlers[action.type](payload) / registry[key](args) / table[k](...) —
  79. // the `](` adjacency is the gate; a word/`)`/`]` char must precede `[` so
  80. // array literals and markdown-ish text in prose can't fire.
  81. form: 'computed-call',
  82. label: 'computed member call',
  83. re: /[\w$)\]]\s*\[([^[\]\n]{1,80})\]\s*\(/g,
  84. keyFrom: (orig) => {
  85. const inner = orig.match(/\[([^[\]\n]{1,80})\]\s*\($/);
  86. const key = inner ? singleStringLiteral(inner[1]!) : undefined;
  87. return key ? { key } : undefined;
  88. },
  89. },
  90. {
  91. // import(expr) / require(expr) with a NON-literal argument → runtime module
  92. // choice. Literal imports are ordinary edges and never reach this scanner.
  93. form: 'dynamic-import',
  94. label: 'dynamic import',
  95. langs: JS_FAMILY,
  96. re: /\b(?:import|require)\s*\(\s*(?![\s'"`)])/g,
  97. },
  98. {
  99. form: 'dynamic-import',
  100. label: 'dynamic import',
  101. langs: PY,
  102. re: /\bimportlib\.import_module\s*\(|\b__import__\s*\(/g,
  103. },
  104. {
  105. // obj.send(:method_name) / public_send / method(:name) — ruby metaprogramming.
  106. form: 'ruby-send',
  107. label: 'send dispatch',
  108. langs: RB,
  109. re: /\.(?:public_)?send\s*\(\s*:?\w+|\bmethod\s*\(\s*:\w+\s*\)/g,
  110. keyFrom: (orig) => {
  111. const m = orig.match(/:(\w+)/);
  112. return m ? { key: m[1]! } : undefined;
  113. },
  114. },
  115. {
  116. // call_user_func([$this, 'method']) / $this->$method() / $callback() —
  117. // PHP variable functions and callables.
  118. form: 'php-dynamic',
  119. label: 'dynamic call',
  120. langs: PHP,
  121. re: /\bcall_user_func(?:_array)?\s*\(|\$this\s*->\s*\$\w+\s*\(|\$\w+\s*\(/g,
  122. keyWindow: 80,
  123. keyFrom: (orig) => {
  124. const key = singleStringLiteral(orig);
  125. return key ? { key } : undefined;
  126. },
  127. },
  128. {
  129. // Reflection: Method.invoke / getMethod("x") / Class.forName / Go
  130. // reflect MethodByName / C# Activator.CreateInstance, GetMethod.
  131. form: 'reflection',
  132. label: 'reflective dispatch',
  133. langs: JVM_CS_GO,
  134. re: /\.invoke\s*\(|\.get(?:Declared)?Method\s*\(|\.GetMethod\s*\(|MethodByName\s*\(|Activator\.CreateInstance|Class\.forName\s*\(/g,
  135. keyWindow: 80,
  136. keyFrom: (orig) => {
  137. const key = singleStringLiteral(orig);
  138. return key ? { key } : undefined;
  139. },
  140. },
  141. {
  142. // new Proxy(target, handler) / Reflect.get|apply — JS metaobject dispatch.
  143. form: 'proxy-reflect',
  144. label: 'Proxy/Reflect dispatch',
  145. langs: JS_FAMILY,
  146. re: /\bnew\s+Proxy\s*\(|\bReflect\.(?:get|apply|construct)\s*\(/g,
  147. },
  148. {
  149. // mediator.Send(new CreateTodoItemCommand(...)) / bus.publish(new OrderEvent(...))
  150. // — typed message dispatch (MediatR/CQRS/event-bus). The request TYPE is the
  151. // key; the conventional target is `<Type>Handler`.
  152. form: 'typed-bus',
  153. label: 'typed message dispatch',
  154. re: /\.(?:[Ss]end|[Pp]ublish|[Dd]ispatch|[Ee]xecute|[Pp]ost|[Ee]mit)(?:Async)?\s*(?:<[^<>\n]{0,80}>)?\s*\(\s*new\s+([A-Z]\w*)/g,
  155. keyFrom: (orig) => {
  156. const m = orig.match(/new\s+([A-Z]\w*)$/);
  157. return m ? { key: m[1]!, keyIsType: true } : undefined;
  158. },
  159. },
  160. {
  161. // emitter.emit(eventVar, ...) / store.dispatch(action) — string-keyed
  162. // dispatch where the key is a RUNTIME value. (Literal-keyed emits are the
  163. // synthesizer's territory and connect statically when a handler matches.)
  164. form: 'var-key-dispatch',
  165. label: 'string-keyed dispatch (runtime key)',
  166. re: /\.(?:emit|dispatch|trigger|fire|publish|broadcast)\s*\(\s*[A-Za-z_$][\w$]*(?:\.[\w$]+){0,3}\s*[,)]/g,
  167. },
  168. {
  169. // Swift/ObjC: #selector(name) / NSClassFromString — runtime selector dispatch.
  170. form: 'selector',
  171. label: 'selector dispatch',
  172. langs: SWIFT_OBJC,
  173. re: /#selector\s*\(\s*([\w.]+)|NSClassFromString\s*\(/g,
  174. keyFrom: (orig) => {
  175. const m = orig.match(/#selector\s*\(\s*([\w.]+)/);
  176. if (!m) return undefined;
  177. const segs = m[1]!.split('.');
  178. return { key: segs[segs.length - 1]! };
  179. },
  180. },
  181. ];
  182. /** Map a Node.language to the comment-stripper's language set. */
  183. function commentLang(language: string): CommentLang | null {
  184. switch (language) {
  185. case 'python': return 'python';
  186. case 'ruby': return 'ruby';
  187. case 'rust': return 'rust';
  188. case 'php': return 'php';
  189. case 'go': return 'go';
  190. case 'javascript':
  191. case 'jsx':
  192. return 'javascript';
  193. case 'typescript':
  194. case 'tsx':
  195. case 'vue':
  196. case 'svelte':
  197. case 'astro':
  198. return 'typescript';
  199. case 'java':
  200. case 'kotlin':
  201. case 'scala':
  202. case 'dart':
  203. return 'java';
  204. case 'csharp': return 'csharp';
  205. case 'swift': return 'swift';
  206. case 'c':
  207. case 'cpp':
  208. case 'objc':
  209. case 'objcpp':
  210. return 'java'; // C-style comments + double-quoted strings — close enough for blanking
  211. default: return null;
  212. }
  213. }
  214. const MAX_MATCHES_PER_BODY = 3;
  215. const MAX_BODY_CHARS = 60_000; // a god-function tail is still scannable; beyond this, truncate
  216. /**
  217. * Blank the CONTENTS of string literals (quotes preserved, offsets preserved)
  218. * so dispatch-shaped prose — docs, error messages, template text — can't fire
  219. * a matcher. Run AFTER comment stripping (comments are already spaces).
  220. * Backslash escapes are honored; `'`/`"` strings end at a newline (treated as
  221. * unterminated, matching the comment stripper); backticks span lines, and
  222. * `${...}` interpolations inside them are blanked too — missing a dispatch
  223. * inside a template literal is acceptable, false-firing on prose is not.
  224. */
  225. export function blankStringContents(text: string): string {
  226. const out = text.split('');
  227. let i = 0;
  228. const n = text.length;
  229. while (i < n) {
  230. const c = text[i]!;
  231. if (c === '"' || c === "'" || c === '`') {
  232. const quote = c;
  233. i++;
  234. while (i < n && text[i] !== quote) {
  235. if (text[i] === '\\' && i + 1 < n) {
  236. out[i] = ' ';
  237. out[i + 1] = ' ';
  238. i += 2;
  239. continue;
  240. }
  241. if (quote !== '`' && text[i] === '\n') break; // unterminated — stop blanking
  242. if (text[i] !== '\n') out[i] = ' '; // keep newlines for line math
  243. i++;
  244. }
  245. if (i < n && text[i] === quote) i++;
  246. continue;
  247. }
  248. i++;
  249. }
  250. return out.join('');
  251. }
  252. /**
  253. * Scan one symbol's body for dynamic-dispatch sites.
  254. *
  255. * @param body the symbol's source text (sliced from the file)
  256. * @param language Node.language of the symbol
  257. * @param fileStartLine 1-based line where `body` starts in its file — returned
  258. * line numbers are absolute file lines.
  259. */
  260. export function scanDynamicDispatch(body: string, language: string, fileStartLine: number): BoundaryMatch[] {
  261. const original = body.length > MAX_BODY_CHARS ? body.slice(0, MAX_BODY_CHARS) : body;
  262. const lang = commentLang(language);
  263. const stripped = blankStringContents(lang ? stripCommentsForRegex(original, lang) : original);
  264. const out: BoundaryMatch[] = [];
  265. const seen = new Map<string, BoundaryMatch>(); // form+key → first match (counts extras)
  266. if (language === 'python') scanPythonGetattr(stripped, original, fileStartLine, out, seen);
  267. for (const spec of FORMS) {
  268. if (out.length >= MAX_MATCHES_PER_BODY) break;
  269. if (spec.langs && !spec.langs.has(language)) continue;
  270. spec.re.lastIndex = 0;
  271. let m: RegExpExecArray | null;
  272. while ((m = spec.re.exec(stripped)) !== null) {
  273. let sliceEnd = m.index + m[0].length;
  274. if (spec.keyWindow) {
  275. const windowEnd = Math.min(original.length, sliceEnd + spec.keyWindow);
  276. const nl = original.indexOf('\n', sliceEnd);
  277. sliceEnd = nl !== -1 && nl < windowEnd ? nl : windowEnd;
  278. }
  279. const origSlice = original.slice(m.index, sliceEnd);
  280. const derived = spec.keyFrom?.(origSlice);
  281. const dedupeKey = `${spec.form}|${derived?.key ?? ''}`;
  282. const prior = seen.get(dedupeKey);
  283. if (prior) {
  284. prior.moreSites = (prior.moreSites ?? 0) + 1;
  285. continue;
  286. }
  287. const line = fileStartLine + countNewlines(original, m.index);
  288. const match: BoundaryMatch = {
  289. form: spec.form,
  290. label: spec.label,
  291. snippet: snippetAround(original, m.index),
  292. line,
  293. ...(derived ?? {}),
  294. };
  295. seen.set(dedupeKey, match);
  296. out.push(match);
  297. if (out.length >= MAX_MATCHES_PER_BODY) return out;
  298. }
  299. }
  300. return out;
  301. }
  302. /**
  303. * Python getattr dispatch — handled in code, not the FORMS table, because real
  304. * getattr calls have nested-call arguments spanning lines
  305. * (`getattr(self, request.method.lower(),\n self.http_method_not_allowed)` —
  306. * DRF's APIView.dispatch) that a regex argument class can't bound. Two shapes:
  307. * getattr(obj, name)(args) → immediate call
  308. * handler = getattr(obj, name) ... handler(...) → assigned, called later
  309. */
  310. const GETATTR_RE = /\bgetattr\s*\(/g;
  311. const MAX_GETATTR_ARGS = 300;
  312. function scanPythonGetattr(stripped: string, original: string, fileStartLine: number, out: BoundaryMatch[], seen: Map<string, BoundaryMatch>): void {
  313. GETATTR_RE.lastIndex = 0;
  314. let m: RegExpExecArray | null;
  315. while ((m = GETATTR_RE.exec(stripped)) !== null && out.length < MAX_MATCHES_PER_BODY) {
  316. const open = m.index + m[0].length - 1;
  317. const close = matchBalancedParen(stripped, open);
  318. if (close === -1) continue;
  319. let form: string | undefined;
  320. let label = '';
  321. // Immediate call: getattr(...)(
  322. const after = stripped.slice(close + 1, close + 8);
  323. if (/^\s*\(/.test(after)) {
  324. form = 'getattr-call';
  325. label = 'getattr dispatch';
  326. } else {
  327. // Assigned form: look back for `name =` and forward for `name(`.
  328. const lineStart = stripped.lastIndexOf('\n', m.index) + 1;
  329. const before = stripped.slice(lineStart, m.index);
  330. const assign = before.match(/(\w+)\s*=\s*$/);
  331. if (assign && new RegExp(`\\b${assign[1]}\\s*\\(`).test(stripped.slice(close + 1))) {
  332. form = 'getattr-assign';
  333. label = 'getattr dispatch (assigned, called later)';
  334. }
  335. }
  336. if (!form) continue;
  337. const key = singleStringLiteral(original.slice(open + 1, close));
  338. const dedupeKey = `${form}|${key ?? ''}`;
  339. const prior = seen.get(dedupeKey);
  340. if (prior) {
  341. prior.moreSites = (prior.moreSites ?? 0) + 1;
  342. continue;
  343. }
  344. const match: BoundaryMatch = {
  345. form,
  346. label,
  347. snippet: snippetAround(original, m.index),
  348. line: fileStartLine + countNewlines(original, m.index),
  349. ...(key ? { key } : {}),
  350. };
  351. seen.set(dedupeKey, match);
  352. out.push(match);
  353. }
  354. }
  355. /** Index of the `)` balancing `text[open]`, or -1 (cap: MAX_GETATTR_ARGS chars). */
  356. function matchBalancedParen(text: string, open: number): number {
  357. let depth = 0;
  358. const end = Math.min(text.length, open + MAX_GETATTR_ARGS);
  359. for (let i = open; i < end; i++) {
  360. const c = text[i];
  361. if (c === '(') depth++;
  362. else if (c === ')' && --depth === 0) return i;
  363. }
  364. return -1;
  365. }
  366. function countNewlines(text: string, end: number): number {
  367. let n = 0;
  368. for (let i = 0; i < end; i++) if (text.charCodeAt(i) === 10) n++;
  369. return n;
  370. }
  371. /** The full source line containing `index`, trimmed and capped for display. */
  372. function snippetAround(text: string, index: number): string {
  373. const lineStart = text.lastIndexOf('\n', index) + 1;
  374. let lineEnd = text.indexOf('\n', index);
  375. if (lineEnd === -1) lineEnd = text.length;
  376. const line = text.slice(lineStart, lineEnd).trim();
  377. return line.length > 120 ? line.slice(0, 117) + '...' : line;
  378. }