index.ts 55 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372
  1. /**
  2. * Context Builder
  3. *
  4. * Builds rich context for tasks by combining FTS search with graph traversal.
  5. * Outputs structured context ready to inject into Claude.
  6. */
  7. import * as fs from 'fs';
  8. import * as path from 'path';
  9. import {
  10. Node,
  11. Edge,
  12. NodeKind,
  13. EdgeKind,
  14. Subgraph,
  15. CodeBlock,
  16. TaskContext,
  17. TaskInput,
  18. BuildContextOptions,
  19. FindRelevantContextOptions,
  20. SearchResult,
  21. } from '../types';
  22. import { QueryBuilder } from '../db/queries';
  23. import { GraphTraverser } from '../graph';
  24. import { formatContextAsMarkdown, formatContextAsJson } from './formatter';
  25. import { logDebug } from '../errors';
  26. import { validatePathWithinRoot, isConfigLeafNode } from '../utils';
  27. import { isTestFile, extractSearchTerms, scorePathRelevance, getStemVariants, isDistinctiveIdentifier } from '../search/query-utils';
  28. import { LOW_CONFIDENCE_MARKER } from './markers';
  29. /**
  30. * Extract likely symbol names from a natural language query
  31. *
  32. * Identifies potential code symbols using patterns:
  33. * - CamelCase: UserService, signInWithGoogle
  34. * - snake_case: user_service, sign_in
  35. * - SCREAMING_SNAKE: MAX_RETRIES
  36. * - dot.notation: app.isPackaged (extracts both sides)
  37. * - Single words that look like identifiers (no spaces, not common English words)
  38. *
  39. * @param query - Natural language query
  40. * @returns Array of potential symbol names
  41. */
  42. function extractSymbolsFromQuery(query: string): string[] {
  43. const symbols = new Set<string>();
  44. // Extract CamelCase identifiers (2+ chars, starts with letter)
  45. const camelCasePattern = /\b([A-Z][a-z]+(?:[A-Z][a-z]*)*|[a-z]+(?:[A-Z][a-z]*)+)\b/g;
  46. let match;
  47. while ((match = camelCasePattern.exec(query)) !== null) {
  48. if (match[1] && match[1].length >= 2) {
  49. symbols.add(match[1]);
  50. }
  51. }
  52. // Extract snake_case identifiers
  53. const snakeCasePattern = /\b([a-z][a-z0-9]*(?:_[a-z0-9]+)+)\b/gi;
  54. while ((match = snakeCasePattern.exec(query)) !== null) {
  55. if (match[1] && match[1].length >= 3) {
  56. symbols.add(match[1]);
  57. }
  58. }
  59. // Extract SCREAMING_SNAKE_CASE
  60. const screamingPattern = /\b([A-Z][A-Z0-9]*(?:_[A-Z0-9]+)+)\b/g;
  61. while ((match = screamingPattern.exec(query)) !== null) {
  62. if (match[1]) {
  63. symbols.add(match[1]);
  64. }
  65. }
  66. // Extract ALL_CAPS acronyms (2+ chars, e.g., REST, HTTP, LRU, API)
  67. const acronymPattern = /\b([A-Z]{2,})\b/g;
  68. while ((match = acronymPattern.exec(query)) !== null) {
  69. if (match[1]) {
  70. symbols.add(match[1]);
  71. }
  72. }
  73. // Extract dot.notation and split into parts (e.g., "app.isPackaged" -> ["app", "isPackaged"])
  74. const dotPattern = /\b([a-zA-Z][a-zA-Z0-9]*(?:\.[a-zA-Z][a-zA-Z0-9]*)+)\b/g;
  75. while ((match = dotPattern.exec(query)) !== null) {
  76. if (match[1]) {
  77. // Add both the full path and individual parts
  78. symbols.add(match[1]);
  79. const parts = match[1].split('.');
  80. for (const part of parts) {
  81. if (part.length >= 2) {
  82. symbols.add(part);
  83. }
  84. }
  85. }
  86. }
  87. // Extract plain lowercase identifiers (3+ chars, not already matched)
  88. // Catches symbol names like "undo", "redo", "history", "render", "parse"
  89. const lowercasePattern = /\b([a-z][a-z0-9]{2,})\b/g;
  90. while ((match = lowercasePattern.exec(query)) !== null) {
  91. if (match[1]) {
  92. symbols.add(match[1]);
  93. }
  94. }
  95. // Filter out common English words that aren't likely symbol names
  96. const commonWords = new Set([
  97. 'the', 'and', 'for', 'with', 'from', 'this', 'that', 'have', 'been',
  98. 'will', 'would', 'could', 'should', 'does', 'done', 'make', 'made',
  99. 'use', 'used', 'using', 'work', 'works', 'find', 'found', 'show',
  100. 'call', 'called', 'calling', 'get', 'set', 'add', 'all', 'any',
  101. 'how', 'what', 'when', 'where', 'which', 'who', 'why',
  102. 'not', 'but', 'are', 'was', 'were', 'has', 'had', 'its',
  103. 'can', 'did', 'may', 'also', 'into', 'than', 'then', 'them',
  104. 'each', 'other', 'some', 'such', 'only', 'same', 'about',
  105. 'after', 'before', 'between', 'through', 'during', 'without',
  106. 'again', 'further', 'once', 'here', 'there', 'both', 'just',
  107. 'more', 'most', 'very', 'being', 'having', 'doing',
  108. 'system', 'need', 'needs', 'want', 'wants', 'like', 'look',
  109. 'change', 'changes', 'changed', 'changing',
  110. // Common English nouns/verbs that match thousands of unrelated code symbols
  111. 'layer', 'handle', 'handles', 'handling', 'incoming', 'outgoing',
  112. 'data', 'flow', 'flows', 'level', 'levels', 'request', 'requests',
  113. 'response', 'responses', 'implement', 'implements', 'implementation',
  114. 'interface', 'interfaces', 'class', 'classes', 'method', 'methods',
  115. 'trigger', 'triggers', 'affected', 'affect', 'affects',
  116. 'else', 'code', 'failing', 'failed', 'silently', 'decide', 'decides',
  117. 'return', 'returns', 'returned', 'take', 'takes', 'taken',
  118. 'check', 'checks', 'checked', 'create', 'creates', 'created',
  119. 'read', 'reads', 'write', 'writes', 'written',
  120. 'start', 'starts', 'stop', 'stops', 'run', 'runs', 'running',
  121. ]);
  122. return Array.from(symbols).filter(s => !commonWords.has(s.toLowerCase()));
  123. }
  124. /**
  125. * Default options for context building
  126. *
  127. * Tuned for minimal context usage while still providing useful results:
  128. * - Fewer nodes and code blocks by default
  129. * - Smaller code block size limit
  130. * - Shallower traversal
  131. */
  132. const DEFAULT_BUILD_OPTIONS: Required<BuildContextOptions> = {
  133. maxNodes: 20, // Reduced from 50 - most tasks don't need 50 symbols
  134. maxCodeBlocks: 5, // Reduced from 10 - only show most relevant code
  135. maxCodeBlockSize: 1500, // Reduced from 2000
  136. includeCode: true,
  137. format: 'markdown',
  138. searchLimit: 3, // Reduced from 5 - fewer entry points
  139. traversalDepth: 1, // Reduced from 2 - shallower graph expansion
  140. minScore: 0.3,
  141. };
  142. /**
  143. * Node kinds that provide high information value in context results.
  144. * Imports/exports are excluded because they have near-zero information density -
  145. * they tell you something exists, not how it works.
  146. */
  147. const HIGH_VALUE_NODE_KINDS: NodeKind[] = [
  148. 'function', 'method', 'class', 'interface', 'type_alias', 'struct', 'trait',
  149. 'component', 'route', 'variable', 'constant', 'enum', 'module', 'namespace',
  150. ];
  151. /**
  152. * Default options for finding relevant context
  153. */
  154. const DEFAULT_FIND_OPTIONS: Required<FindRelevantContextOptions> = {
  155. searchLimit: 3, // Reduced from 5
  156. traversalDepth: 1, // Reduced from 2
  157. maxNodes: 20, // Reduced from 50
  158. minScore: 0.3,
  159. edgeKinds: [],
  160. nodeKinds: HIGH_VALUE_NODE_KINDS, // Filter out imports/exports by default
  161. };
  162. // Re-export the low-confidence sentinel (defined in a dependency-free leaf so
  163. // the MCP layer can import it without pulling this module's deps onto the
  164. // cold-start path). Builder code below uses the imported binding directly.
  165. export { LOW_CONFIDENCE_MARKER } from './markers';
  166. /**
  167. * Context Builder
  168. *
  169. * Coordinates semantic search and graph traversal to build
  170. * comprehensive context for tasks.
  171. */
  172. export class ContextBuilder {
  173. private projectRoot: string;
  174. private queries: QueryBuilder;
  175. private traverser: GraphTraverser;
  176. constructor(
  177. projectRoot: string,
  178. queries: QueryBuilder,
  179. traverser: GraphTraverser
  180. ) {
  181. this.projectRoot = projectRoot;
  182. this.queries = queries;
  183. this.traverser = traverser;
  184. }
  185. /**
  186. * Build context for a task
  187. *
  188. * Pipeline:
  189. * 1. Parse task input (string or {title, description})
  190. * 2. Run semantic search to find entry points
  191. * 3. Expand graph around entry points
  192. * 4. Extract code blocks for key nodes
  193. * 5. Format output for Claude
  194. *
  195. * @param input - Task description or object with title/description
  196. * @param options - Build options
  197. * @returns TaskContext (structured) or formatted string
  198. */
  199. async buildContext(
  200. input: TaskInput,
  201. options: BuildContextOptions = {}
  202. ): Promise<TaskContext | string> {
  203. const opts = { ...DEFAULT_BUILD_OPTIONS, ...options };
  204. // Parse input
  205. const query = typeof input === 'string' ? input : `${input.title}${input.description ? `: ${input.description}` : ''}`;
  206. // Find relevant context (semantic search + graph expansion)
  207. const subgraph = await this.findRelevantContext(query, {
  208. searchLimit: opts.searchLimit,
  209. traversalDepth: opts.traversalDepth,
  210. maxNodes: opts.maxNodes,
  211. minScore: opts.minScore,
  212. });
  213. // Get entry points (nodes from semantic search)
  214. const entryPoints = this.getEntryPoints(subgraph);
  215. // Extract code blocks for key nodes
  216. const codeBlocks = opts.includeCode
  217. ? await this.extractCodeBlocks(subgraph, opts.maxCodeBlocks, opts.maxCodeBlockSize)
  218. : [];
  219. // Get related files
  220. const relatedFiles = this.getRelatedFiles(subgraph);
  221. // Generate summary
  222. const summary = this.generateSummary(query, subgraph, entryPoints);
  223. // Calculate stats
  224. const stats = {
  225. nodeCount: subgraph.nodes.size,
  226. edgeCount: subgraph.edges.length,
  227. fileCount: relatedFiles.length,
  228. codeBlockCount: codeBlocks.length,
  229. totalCodeSize: codeBlocks.reduce((sum, block) => sum + block.content.length, 0),
  230. };
  231. const context: TaskContext = {
  232. query,
  233. subgraph,
  234. entryPoints,
  235. codeBlocks,
  236. relatedFiles,
  237. summary,
  238. stats,
  239. };
  240. // Return formatted output or raw context
  241. if (opts.format === 'markdown') {
  242. return formatContextAsMarkdown(context)
  243. + this.buildCallPathsSection(subgraph)
  244. + (subgraph.confidence === 'low' ? this.buildLowConfidenceNote(entryPoints) : '');
  245. } else if (opts.format === 'json') {
  246. return formatContextAsJson(context);
  247. }
  248. return context;
  249. }
  250. /**
  251. * Honest handoff appended when retrieval confidence is low (the query matched
  252. * mostly common words). Instead of the usual "this covers the surface" framing
  253. * — which, when wrong, sends the agent off to Read/Grep — it admits the
  254. * uncertainty and routes the agent to the precise tools (explore with real
  255. * symbol names, search, or files to browse the closest areas we *did* surface).
  256. */
  257. private buildLowConfidenceNote(entryPoints: Node[]): string {
  258. const dirs: string[] = [];
  259. const seen = new Set<string>();
  260. for (const n of entryPoints) {
  261. const slash = n.filePath.lastIndexOf('/');
  262. const dir = slash > 0 ? n.filePath.slice(0, slash) : n.filePath;
  263. if (!seen.has(dir)) { seen.add(dir); dirs.push(dir); }
  264. if (dirs.length >= 4) break;
  265. }
  266. const dirLine = dirs.length
  267. ? `\n- \`codegraph_files\` a likely area: ${dirs.map(d => `\`${d}\``).join(', ')}`
  268. : '';
  269. return `\n\n${LOW_CONFIDENCE_MARKER}\n\n`
  270. + 'This query matched mostly on common words, so the entry points above may '
  271. + 'be off-target — treat them as a starting point, not a complete answer. '
  272. + 'For a reliable result:\n'
  273. + '- `codegraph_explore` with the **exact symbol names** you are after '
  274. + '(class / function / method names), or\n'
  275. + '- `codegraph_search <name>` for one specific symbol'
  276. + dirLine
  277. + '\n\nDo not assume the list above is comprehensive.';
  278. }
  279. /**
  280. * Surface short call-paths among the symbols this context already found,
  281. * derived in-memory from the subgraph's `calls` edges (no extra queries).
  282. *
  283. * This bakes the value of path-finding INTO the always-loaded `context` tool.
  284. * Agents reliably read context's output but do NOT discover/adopt a standalone
  285. * trace tool (in deferred-MCP harnesses they only ToolSearch-select tools they
  286. * already know). Delivering the flow here means "how does X reach Y" is
  287. * answered without the agent needing to find, load, or choose a new tool.
  288. * Chains stop where the static call graph ends (e.g. dynamic dispatch) — that
  289. * truncation is honest, and the agent can codegraph_node the last hop to bridge.
  290. */
  291. private buildCallPathsSection(subgraph: Subgraph): string {
  292. const adj = new Map<string, string[]>();
  293. for (const e of subgraph.edges) {
  294. if (e.kind !== 'calls') continue;
  295. if (!subgraph.nodes.has(e.source) || !subgraph.nodes.has(e.target)) continue;
  296. const list = adj.get(e.source);
  297. if (list) list.push(e.target);
  298. else adj.set(e.source, [e.target]);
  299. }
  300. if (adj.size === 0) return '';
  301. const MAX_HOPS = 6;
  302. const chains: string[][] = [];
  303. let budget = 2000; // bound DFS work on dense subgraphs
  304. const dfs = (id: string, path: string[], seen: Set<string>): void => {
  305. if (budget-- <= 0) return;
  306. const next = (adj.get(id) ?? []).filter((t) => !seen.has(t));
  307. if (next.length === 0 || path.length >= MAX_HOPS) {
  308. if (path.length >= 3) chains.push([...path]); // >=3 nodes = a real flow, not a single call
  309. return;
  310. }
  311. for (const t of next) {
  312. seen.add(t);
  313. dfs(t, [...path, t], seen);
  314. seen.delete(t);
  315. }
  316. };
  317. const starts = (subgraph.roots.length > 0
  318. ? subgraph.roots.filter((id) => adj.has(id))
  319. : [...adj.keys()]
  320. ).slice(0, 5);
  321. for (const s of starts) dfs(s, [s], new Set([s]));
  322. if (chains.length === 0) return '';
  323. // Keep only chains that connect TWO OR MORE query-relevant symbols (roots).
  324. // A chain from a root into an arbitrary callee (render → onMagicFrameGenerate)
  325. // is structurally valid but tangential to the question; requiring ≥2 roots
  326. // keeps the chain anchored to what the user actually asked about. Rank by
  327. // #roots then length, and drop any that are a sub-path of a longer kept chain.
  328. const rootSet = new Set(subgraph.roots);
  329. const rootCount = (c: string[]): number => c.reduce((n, id) => n + (rootSet.has(id) ? 1 : 0), 0);
  330. const relevant = chains.filter((c) => rootCount(c) >= 2);
  331. relevant.sort((a, b) => rootCount(b) - rootCount(a) || b.length - a.length);
  332. const kept: string[][] = [];
  333. for (const c of relevant) {
  334. const key = c.join('>');
  335. if (kept.some((k) => k.join('>').includes(key))) continue;
  336. kept.push(c);
  337. if (kept.length >= 3) break;
  338. }
  339. if (kept.length === 0) return '';
  340. const name = (id: string): string => subgraph.nodes.get(id)?.name ?? id;
  341. // Synthesized (dynamic-dispatch) hops are real `calls` edges but invisible to
  342. // static parsing — mark them inline so the agent sees WHERE the callback was
  343. // wired up (`registered @file:line`) instead of grepping for it. Keyed by
  344. // "source>target".
  345. const synthByPair = new Map<string, string>();
  346. for (const e of subgraph.edges) {
  347. if (e.kind !== 'calls' || e.provenance !== 'heuristic') continue;
  348. const m = e.metadata as Record<string, unknown> | undefined;
  349. if (!m?.synthesizedBy) continue;
  350. const at = typeof m.registeredAt === 'string' ? ` @${m.registeredAt}` : '';
  351. const label = m.synthesizedBy === 'callback'
  352. ? `callback via ${m.via ? `\`${String(m.via)}\`` : 'registrar'}${at}`
  353. : m.synthesizedBy === 'react-render'
  354. ? `React re-render via setState${at}`
  355. : m.synthesizedBy === 'jsx-render'
  356. ? `renders <${String(m.via || 'child')}>`
  357. : m.synthesizedBy === 'vue-handler'
  358. ? `Vue @${String(m.event || 'event')} handler`
  359. : `event ${m.event ? `\`${String(m.event)}\`` : ''}${at}`;
  360. synthByPair.set(`${e.source}>${e.target}`, label);
  361. }
  362. const renderChain = (c: string[]): string => {
  363. let s = name(c[0]!);
  364. for (let i = 1; i < c.length; i++) {
  365. const synth = synthByPair.get(`${c[i - 1]}>${c[i]}`);
  366. s += synth ? ` →[${synth}] ${name(c[i]!)}` : ` → ${name(c[i]!)}`;
  367. }
  368. return s;
  369. };
  370. const hasSynth = kept.some((c) => c.some((_, i) => i > 0 && synthByPair.has(`${c[i - 1]}>${c[i]}`)));
  371. const lines = [
  372. '',
  373. '## Call paths',
  374. '',
  375. 'Execution flow among the key symbols (traced through the call graph):',
  376. '',
  377. ...kept.map((c) => `- ${renderChain(c)}`),
  378. '',
  379. hasSynth
  380. ? '_Hops marked `[callback/event …]` are dynamic dispatch bridged by codegraph (with the registration site); the rest are direct calls. codegraph_node any symbol for its body._'
  381. : '_codegraph_node any symbol above for its source + its own callers/callees._',
  382. ];
  383. return '\n' + lines.join('\n') + '\n';
  384. }
  385. /**
  386. * Find relevant subgraph for a query
  387. *
  388. * Uses hybrid search combining exact symbol lookup with semantic search:
  389. * 1. Extract potential symbol names from query
  390. * 2. Look up exact matches for those symbols (high confidence)
  391. * 3. Use semantic search for concept matching
  392. * 4. Merge results, prioritizing exact matches
  393. * 5. Traverse graph from entry points
  394. *
  395. * @param query - Natural language query
  396. * @param options - Search and traversal options
  397. * @returns Subgraph of relevant nodes and edges
  398. */
  399. async findRelevantContext(
  400. query: string,
  401. options: FindRelevantContextOptions = {}
  402. ): Promise<Subgraph> {
  403. const opts = { ...DEFAULT_FIND_OPTIONS, ...options };
  404. // Start with empty subgraph
  405. const nodes = new Map<string, Node>();
  406. const edges: Edge[] = [];
  407. const roots: string[] = [];
  408. // Handle empty query - return empty subgraph
  409. if (!query || query.trim().length === 0) {
  410. return { nodes, edges, roots };
  411. }
  412. // === HYBRID SEARCH ===
  413. // Step 1: Extract potential symbol names from query
  414. const symbolsFromQuery = extractSymbolsFromQuery(query);
  415. logDebug('Extracted symbols from query', { query, symbols: symbolsFromQuery });
  416. // Step 2: Look up exact matches for extracted symbols
  417. let exactMatches: SearchResult[] = [];
  418. if (symbolsFromQuery.length > 0) {
  419. try {
  420. // Get more results so we can apply co-location boosting before trimming
  421. exactMatches = this.queries.findNodesByExactName(symbolsFromQuery, {
  422. limit: Math.ceil(opts.searchLimit * 5),
  423. kinds: opts.nodeKinds && opts.nodeKinds.length > 0 ? opts.nodeKinds : undefined,
  424. });
  425. // Co-location boost: when multiple extracted symbols appear in the same file,
  426. // those results are much more likely to be what the user is looking for.
  427. // E.g., "scrapeLoop" + "run" both in scrape/scrape.go → boost both.
  428. if (exactMatches.length > 1) {
  429. // Build a map of files → how many distinct symbol names matched in that file
  430. const fileSymbolCounts = new Map<string, Set<string>>();
  431. for (const r of exactMatches) {
  432. const names = fileSymbolCounts.get(r.node.filePath) || new Set();
  433. names.add(r.node.name.toLowerCase());
  434. fileSymbolCounts.set(r.node.filePath, names);
  435. }
  436. // Boost results in files where multiple query symbols co-occur
  437. exactMatches = exactMatches.map(r => {
  438. const symbolCount = fileSymbolCounts.get(r.node.filePath)?.size || 1;
  439. return {
  440. ...r,
  441. score: symbolCount > 1 ? r.score + (symbolCount - 1) * 20 : r.score,
  442. };
  443. });
  444. exactMatches.sort((a, b) => b.score - a.score);
  445. }
  446. // Trim back to reasonable size
  447. exactMatches = exactMatches.slice(0, Math.ceil(opts.searchLimit * 2));
  448. logDebug('Exact symbol matches', { count: exactMatches.length });
  449. } catch (error) {
  450. logDebug('Exact symbol lookup failed', { error: String(error) });
  451. }
  452. }
  453. // Step 2b: Search for extracted symbols as definition (class/interface) prefixes.
  454. // When the user writes "REST", "bulk", or "allocation", they usually mean classes
  455. // like RestController, BulkRequest, AllocationService — not nodes named exactly that.
  456. // Also tries stem variants: "caching" → "cache" finds Cache, CacheBuilder.
  457. if (symbolsFromQuery.length > 0) {
  458. const definitionKinds: NodeKind[] = ['class', 'interface', 'struct', 'trait',
  459. 'protocol', 'enum', 'type_alias'];
  460. // Expand symbols with stem variants for broader definition matching
  461. const expandedSymbols = new Set(symbolsFromQuery);
  462. for (const sym of symbolsFromQuery) {
  463. for (const variant of getStemVariants(sym)) {
  464. expandedSymbols.add(variant);
  465. }
  466. }
  467. for (const sym of expandedSymbols) {
  468. // Title-case the symbol: "REST" → "Rest", "bulk" → "Bulk", "allocation" → "Allocation"
  469. const titleCased = sym.charAt(0).toUpperCase() + sym.slice(1).toLowerCase();
  470. if (titleCased === sym) continue; // already title-case (e.g., "Engine") — handled by exact match
  471. // Fetch more results since popular prefixes have many matches
  472. const prefixResults = this.queries.searchNodes(titleCased, {
  473. limit: 30,
  474. kinds: definitionKinds,
  475. });
  476. const matched: SearchResult[] = [];
  477. for (const r of prefixResults) {
  478. if (r.node.name.toLowerCase().startsWith(titleCased.toLowerCase())) {
  479. // Favor shorter names: "AllocationService" (18 chars) over
  480. // "AllocationBalancingRoundMetrics" (31 chars). Core classes tend
  481. // to have concise names; test/helper classes are verbose.
  482. const brevityBonus = Math.max(0, 10 - (r.node.name.length - titleCased.length) / 3);
  483. matched.push({ ...r, score: r.score + 15 + brevityBonus });
  484. }
  485. }
  486. matched.sort((a, b) => b.score - a.score);
  487. for (const r of matched.slice(0, Math.ceil(opts.searchLimit))) {
  488. const existing = exactMatches.find(e => e.node.id === r.node.id);
  489. if (!existing) {
  490. exactMatches.push(r);
  491. }
  492. }
  493. }
  494. exactMatches.sort((a, b) => b.score - a.score);
  495. exactMatches = exactMatches.slice(0, Math.ceil(opts.searchLimit * 3));
  496. }
  497. // Step 3: Run text search for natural language term matching
  498. // This catches file-name and node-name matches that semantic search may miss,
  499. // which is critical for template-heavy codebases (e.g., Liquid/Shopify themes)
  500. // where file names are the primary identifiers.
  501. let textResults: SearchResult[] = [];
  502. try {
  503. const searchTerms = extractSearchTerms(query);
  504. if (searchTerms.length > 0) {
  505. // Search each term individually to get broader coverage,
  506. // then boost results that match multiple terms
  507. const termResultsMap = new Map<string, { result: SearchResult; termHits: number }>();
  508. // When no explicit kind filter is set, exclude imports — they flood FTS
  509. // results with qualified name matches (e.g., "REST" matches 445K import paths)
  510. // but are almost never what exploration queries want.
  511. const searchKinds = opts.nodeKinds && opts.nodeKinds.length > 0
  512. ? opts.nodeKinds
  513. : ['file', 'module', 'class', 'struct', 'interface', 'trait', 'protocol',
  514. 'function', 'method', 'property', 'field', 'variable', 'constant',
  515. 'enum', 'enum_member', 'type_alias', 'namespace', 'export',
  516. 'route', 'component'] as NodeKind[];
  517. for (const term of searchTerms) {
  518. const termResults = this.queries.searchNodes(term, {
  519. limit: opts.searchLimit * 2,
  520. kinds: searchKinds,
  521. });
  522. for (const r of termResults) {
  523. const existing = termResultsMap.get(r.node.id);
  524. if (existing) {
  525. existing.termHits++;
  526. existing.result.score = Math.max(existing.result.score, r.score);
  527. } else {
  528. termResultsMap.set(r.node.id, { result: r, termHits: 1 });
  529. }
  530. }
  531. }
  532. // Boost results matching multiple terms and sort
  533. textResults = Array.from(termResultsMap.values())
  534. .map(({ result, termHits }) => ({
  535. ...result,
  536. score: result.score + (termHits - 1) * 5,
  537. }))
  538. .sort((a, b) => b.score - a.score)
  539. .slice(0, opts.searchLimit * 2);
  540. }
  541. logDebug('Text search results', { count: textResults.length });
  542. } catch (error) {
  543. logDebug('Text search failed', { query, error: String(error) });
  544. }
  545. // Step 4: Merge results, taking the max score when duplicates appear
  546. // across search channels. Exact matches may have lower scores than FTS
  547. // results for the same node — use the best score from any channel.
  548. const resultById = new Map<string, SearchResult>();
  549. let searchResults: SearchResult[] = [];
  550. // Add exact matches first
  551. for (const result of exactMatches) {
  552. const existing = resultById.get(result.node.id);
  553. if (existing) {
  554. existing.score = Math.max(existing.score, result.score);
  555. } else {
  556. resultById.set(result.node.id, result);
  557. searchResults.push(result);
  558. }
  559. }
  560. // Add text search results, upgrading scores for duplicates
  561. for (const result of textResults) {
  562. const existing = resultById.get(result.node.id);
  563. if (existing) {
  564. existing.score = Math.max(existing.score, result.score);
  565. } else {
  566. resultById.set(result.node.id, result);
  567. searchResults.push(result);
  568. }
  569. }
  570. const queryLower = query.toLowerCase();
  571. const isTestQuery = queryLower.includes('test') || queryLower.includes('spec');
  572. // Deprioritize test files early so they don't take multi-term boost slots
  573. if (!isTestQuery) {
  574. for (const result of searchResults) {
  575. if (isTestFile(result.node.filePath)) {
  576. result.score *= 0.3;
  577. }
  578. }
  579. }
  580. // Iter7 — Core-directory boost. On projects with one file that holds
  581. // the dense majority of internal call edges (e.g. sinatra's
  582. // `lib/sinatra/base.rb` at 85% of all in-file edges), the agent's
  583. // task usually asks about the framework's core. Without this boost,
  584. // ranking favors small focused extension files (e.g. text search
  585. // picks `sinatra-contrib/lib/sinatra/multi_route.rb`'s 10-line
  586. // `route` method over `base.rb`'s `route!` because the extension
  587. // file's `route` matches the query verbatim AND the file is small,
  588. // dwarfing the longer name `route!` in a 1500-line file). Boost
  589. // results that share a directory prefix with the dominant file's
  590. // directory so the core file's siblings outrank sibling-package
  591. // extensions.
  592. try {
  593. const dominant = this.queries.getDominantFile?.();
  594. if (dominant && dominant.edgeCount >= 3 * dominant.nextEdgeCount) {
  595. // Take the directory of the dominant file (everything up to the
  596. // last slash). For `lib/sinatra/base.rb` → `lib/sinatra/`.
  597. const slash = dominant.filePath.lastIndexOf('/');
  598. if (slash > 0) {
  599. const coreDir = dominant.filePath.slice(0, slash + 1);
  600. for (const result of searchResults) {
  601. if (result.node.filePath.startsWith(coreDir)) {
  602. result.score += 25;
  603. }
  604. }
  605. }
  606. }
  607. } catch {
  608. // SQL failure — fall through, scoring works without the boost
  609. }
  610. // Step 5a: Multi-term co-occurrence re-ranking (applied BEFORE truncation).
  611. // For multi-word queries like "search execution from request to shard",
  612. // nodes matching 2+ query terms in their name or path are far more relevant
  613. // than nodes matching just one generic term. Without this, "ExecutionUtils"
  614. // (matches only "execution") fills budget slots meant for "ShardSearchRequest"
  615. // (matches "shard" + "search" + "request").
  616. const queryTermsForBoost = extractSearchTerms(query);
  617. if (queryTermsForBoost.length >= 2) {
  618. // Group terms that are substrings of each other (stem variants of the same
  619. // root word). "indexed", "indexe", "index" should count as ONE concept match,
  620. // not three. Without this, stem variants inflate matchCount and give false
  621. // multi-term boosts to symbols matching one root word multiple times.
  622. const termGroups: string[][] = [];
  623. const sorted = [...queryTermsForBoost].sort((a, b) => b.length - a.length);
  624. const assigned = new Set<string>();
  625. for (const term of sorted) {
  626. if (assigned.has(term)) continue;
  627. const group = [term];
  628. assigned.add(term);
  629. for (const other of sorted) {
  630. if (assigned.has(other)) continue;
  631. if (term.includes(other) || other.includes(term)) {
  632. group.push(other);
  633. assigned.add(other);
  634. }
  635. }
  636. termGroups.push(group);
  637. }
  638. // Build a set of exact-match node IDs so we can exempt them from dampening.
  639. // When the query is "LiveEditMode DevServerPreview", these are specific
  640. // symbols the user asked for — dampening them because they only match 1
  641. // term group is counter-productive.
  642. const exactMatchIds = new Set(exactMatches.map(r => r.node.id));
  643. // ...but only exempt exact matches the user *named as an identifier*
  644. // (camelCase/snake_case/acronym). A plain dictionary word that happens to
  645. // exact-match an unrelated symbol — query "flat object" → a constant named
  646. // FLAT — must NOT be exempt, or the +exact-name bonus floats it to the top
  647. // of a prose query with zero corroboration from any other term. Classify by
  648. // the QUERY token (what the user typed), not the matched symbol's name.
  649. const distinctiveTokens = new Set(
  650. symbolsFromQuery.filter(isDistinctiveIdentifier).map(s => s.toLowerCase())
  651. );
  652. const distinctiveExactMatchIds = new Set(
  653. exactMatches
  654. .filter(r => distinctiveTokens.has(r.node.name.toLowerCase()))
  655. .map(r => r.node.id)
  656. );
  657. for (const result of searchResults) {
  658. // Check term matches in name (substring) and path DIRECTORIES (exact).
  659. // Directory segments must match exactly — "search" matches directory
  660. // "search/" but NOT "elasticsearch/". The class name is checked
  661. // separately via substring match on the node name.
  662. const nameLower = result.node.name.toLowerCase();
  663. const dirSegments = path.dirname(result.node.filePath).toLowerCase().split('/');
  664. let matchCount = 0;
  665. for (const group of termGroups) {
  666. const groupMatches = group.some(term => {
  667. const inName = nameLower.includes(term);
  668. const inDir = dirSegments.some(seg => seg === term);
  669. return inName || inDir;
  670. });
  671. if (groupMatches) matchCount++;
  672. }
  673. if (matchCount >= 2) {
  674. // Multiplicative boost — 2 terms → 2x, 3 terms → 2.5x
  675. result.score *= 1 + matchCount * 0.5;
  676. } else if (distinctiveExactMatchIds.has(result.node.id)) {
  677. // Exact match on a distinctive identifier the user explicitly named —
  678. // keep full score (e.g. "LiveEditMode DevServerPreview").
  679. } else if (exactMatchIds.has(result.node.id)) {
  680. // Exact match on a COMMON word (e.g. "flat" → FLAT): high-scoring noise
  681. // inflated by the +exact-name bonus, corroborated by no other query
  682. // term. Demote hard so corroborated matches win.
  683. result.score *= 0.3;
  684. } else {
  685. // Mild dampen for generic single-term matches — they might be generic
  686. // but could also be the right result (e.g., "Protocol" class for an IPC query).
  687. result.score *= 0.6;
  688. }
  689. }
  690. searchResults.sort((a, b) => b.score - a.score);
  691. }
  692. // Step 5b: CamelCase-boundary matching via LIKE query.
  693. // FTS can't find "Search" inside "TransportSearchAction" (one FTS token).
  694. // LIKE reliably finds these substring matches. Results are appended with
  695. // guaranteed slots so they don't compete with higher-scoring prefix matches.
  696. if (symbolsFromQuery.length > 0) {
  697. const camelDefinitionKinds: NodeKind[] = ['class', 'interface', 'struct', 'trait',
  698. 'protocol', 'enum', 'type_alias'];
  699. const camelSearchedTerms = new Set<string>();
  700. const searchIdSet = new Set(searchResults.map(r => r.node.id));
  701. // Track per-node term hits for multi-term boosting
  702. const camelNodeTerms = new Map<string, { result: SearchResult; termCount: number }>();
  703. const maxCamelPerTerm = Math.ceil(opts.searchLimit / 2);
  704. for (const sym of symbolsFromQuery) {
  705. const titleCased = sym.charAt(0).toUpperCase() + sym.slice(1).toLowerCase();
  706. if (titleCased.length < 3) continue;
  707. const termKey = titleCased.toLowerCase();
  708. if (camelSearchedTerms.has(termKey)) continue;
  709. camelSearchedTerms.add(termKey);
  710. // Fetch a large batch — popular terms like "Search" in Elasticsearch
  711. // have hundreds of substring matches. The LIKE scan cost is the same
  712. // regardless of LIMIT (SQLite scans all matches to sort), so we fetch
  713. // generously and let path-relevance scoring pick the best ones.
  714. const likeResults = this.queries.findNodesByNameSubstring(titleCased, {
  715. limit: 200,
  716. kinds: camelDefinitionKinds,
  717. excludePrefix: true,
  718. });
  719. // Filter to CamelCase boundaries, score by path relevance, and take top N
  720. const termCandidates: SearchResult[] = [];
  721. for (const r of likeResults) {
  722. const name = r.node.name;
  723. const idx = name.indexOf(titleCased);
  724. if (idx <= 0) continue;
  725. // Accept CamelCase boundary (lowercase before match) OR
  726. // acronym boundary (uppercase before match, e.g., RPCProtocol)
  727. if (!/[a-zA-Z]/.test(name.charAt(idx - 1))) continue;
  728. if (searchIdSet.has(r.node.id)) continue;
  729. if (isTestFile(r.node.filePath) && !isTestQuery) continue;
  730. const pathScore = scorePathRelevance(r.node.filePath, query);
  731. const brevityBonus = Math.max(0, 6 - (name.length - titleCased.length) / 4);
  732. termCandidates.push({ node: r.node, score: 8 + brevityBonus + pathScore });
  733. }
  734. termCandidates.sort((a, b) => b.score - a.score);
  735. // Widen the per-term pool for accumulation so multi-term co-occurrences
  736. // can be discovered. A class matching 3 query terms at CamelCase boundaries
  737. // is far more relevant than one matching just 1, but it needs to survive
  738. // the per-term cut for EACH term to accumulate its count.
  739. const accumPerTerm = maxCamelPerTerm * 4;
  740. for (const r of termCandidates.slice(0, accumPerTerm)) {
  741. const existing = camelNodeTerms.get(r.node.id);
  742. if (existing) {
  743. existing.termCount++;
  744. } else {
  745. camelNodeTerms.set(r.node.id, {
  746. result: r,
  747. termCount: 1,
  748. });
  749. }
  750. }
  751. }
  752. // Append CamelCase matches with multi-term boost.
  753. // These are structurally important (class names containing query terms at
  754. // CamelCase boundaries) but score much lower than FTS results. Scale their
  755. // scores up so multi-term CamelCase matches can compete with FTS results.
  756. const camelResults: SearchResult[] = [];
  757. for (const [, info] of camelNodeTerms) {
  758. // Multi-term CamelCase matches are extremely relevant — a class matching
  759. // 3+ query terms in its name (e.g., ExtensionHostProcess) is almost
  760. // certainly what the user wants. Scale aggressively.
  761. info.result.score = info.result.score * (1 + info.termCount) + (info.termCount - 1) * 30;
  762. camelResults.push(info.result);
  763. }
  764. camelResults.sort((a, b) => b.score - a.score);
  765. const maxCamelTotal = opts.searchLimit;
  766. for (const r of camelResults.slice(0, maxCamelTotal)) {
  767. searchResults.push(r);
  768. searchIdSet.add(r.node.id);
  769. }
  770. // Step 5c: Compound term matching — find classes whose name contains 2+
  771. // query terms at ANY position (not just CamelCase boundaries).
  772. // The CamelCase step above requires idx > 0, which misses classes that
  773. // START with a query term (e.g., "SearchShardsRequest" starts with "Search").
  774. // For multi-word queries, a class matching multiple query terms in its name
  775. // is almost certainly relevant regardless of position.
  776. if (symbolsFromQuery.length >= 2) {
  777. // Collect ALL LIKE results per term (reusing findNodesByNameSubstring)
  778. // but without the CamelCase boundary or prefix exclusion filters.
  779. const compoundTermMap = new Map<string, { node: Node; terms: Set<string> }>();
  780. for (const sym of symbolsFromQuery) {
  781. const titleCased = sym.charAt(0).toUpperCase() + sym.slice(1).toLowerCase();
  782. if (titleCased.length < 3) continue;
  783. const likeResults = this.queries.findNodesByNameSubstring(titleCased, {
  784. limit: 200,
  785. kinds: camelDefinitionKinds,
  786. excludePrefix: false,
  787. });
  788. for (const r of likeResults) {
  789. if (searchIdSet.has(r.node.id)) continue;
  790. if (isTestFile(r.node.filePath) && !isTestQuery) continue;
  791. const entry = compoundTermMap.get(r.node.id);
  792. if (entry) {
  793. entry.terms.add(titleCased);
  794. } else {
  795. compoundTermMap.set(r.node.id, { node: r.node, terms: new Set([titleCased]) });
  796. }
  797. }
  798. }
  799. // Keep only nodes matching 2+ distinct terms
  800. const compoundResults: SearchResult[] = [];
  801. for (const [, entry] of compoundTermMap) {
  802. if (entry.terms.size >= 2) {
  803. const pathScore = scorePathRelevance(entry.node.filePath, query);
  804. const brevityBonus = Math.max(0, 6 - entry.node.name.length / 8);
  805. compoundResults.push({
  806. node: entry.node,
  807. score: 10 + (entry.terms.size - 1) * 20 + pathScore + brevityBonus,
  808. });
  809. }
  810. }
  811. compoundResults.sort((a, b) => b.score - a.score);
  812. const maxCompound = Math.ceil(opts.searchLimit / 2);
  813. for (const r of compoundResults.slice(0, maxCompound)) {
  814. searchResults.push(r);
  815. searchIdSet.add(r.node.id);
  816. }
  817. }
  818. }
  819. // Final sort and truncation — all search channels (exact, text, CamelCase,
  820. // compound) have now contributed. Sort by score so multi-term matches from
  821. // later steps can outrank dampened single-term matches from earlier steps.
  822. searchResults.sort((a, b) => b.score - a.score);
  823. searchResults = searchResults.slice(0, opts.searchLimit * 3);
  824. // Filter by minimum score
  825. let filteredResults = searchResults.filter((r) => r.score >= opts.minScore);
  826. // Resolve imports/exports to their actual definitions
  827. // If someone searches "terminal" and finds `import { TerminalPanel }`,
  828. // they want the TerminalPanel class, not the import statement
  829. filteredResults = this.resolveImportsToDefinitions(filteredResults);
  830. // Cap entry points so traversal budget isn't spread too thin.
  831. // With 36 entry points and maxNodes=120, each gets only 3 nodes — useless.
  832. // Cap to searchLimit so each entry point gets a meaningful traversal budget.
  833. if (filteredResults.length > opts.searchLimit) {
  834. filteredResults = filteredResults.slice(0, opts.searchLimit);
  835. }
  836. // Confidence signal for the honest-handoff footer (consumed in buildContext).
  837. // A multi-term prose query that resolves only to isolated common-word matches
  838. // — no entry point corroborated by 2+ distinct query terms, and none a
  839. // distinctive identifier the user explicitly named — is LOW confidence: the
  840. // results are best-effort, not a located answer, so the agent should be told
  841. // to drill in with explore/trace rather than trust the list as comprehensive.
  842. // Single-keyword and symbol-name queries are exempt (their single match IS the
  843. // answer), so the handoff never fires on them.
  844. let confidence: 'high' | 'low' = 'high';
  845. const confTerms = extractSearchTerms(query, { stems: false }).filter(t => t.length >= 3);
  846. if (confTerms.length >= 2 && filteredResults.length > 0) {
  847. const distinctive = new Set(
  848. symbolsFromQuery.filter(isDistinctiveIdentifier).map(s => s.toLowerCase())
  849. );
  850. const anyStrong = filteredResults.some(r => {
  851. if (distinctive.has(r.node.name.toLowerCase())) return true;
  852. const nameLower = r.node.name.toLowerCase();
  853. const dirSegs = path.dirname(r.node.filePath).toLowerCase().split('/');
  854. let hits = 0;
  855. for (const t of confTerms) {
  856. if (nameLower.includes(t) || dirSegs.includes(t)) {
  857. if (++hits >= 2) return true;
  858. }
  859. }
  860. return false;
  861. });
  862. if (!anyStrong) confidence = 'low';
  863. }
  864. // Add entry points to subgraph
  865. for (const result of filteredResults) {
  866. nodes.set(result.node.id, result.node);
  867. roots.push(result.node.id);
  868. }
  869. // Expand type hierarchy for class/interface entry points.
  870. // BFS often exhausts its per-entry-point budget on contained methods
  871. // before reaching extends/implements neighbors. This dedicated step
  872. // ensures subclasses and superclasses always appear in results.
  873. // Budget: up to maxNodes/4 hierarchy nodes to avoid flooding.
  874. const typeHierarchyKinds = new Set<string>(['class', 'interface', 'struct', 'trait', 'protocol']);
  875. const maxHierarchyNodes = Math.ceil(opts.maxNodes / 4);
  876. let hierarchyNodesAdded = 0;
  877. for (const result of filteredResults) {
  878. if (hierarchyNodesAdded >= maxHierarchyNodes) break;
  879. if (typeHierarchyKinds.has(result.node.kind)) {
  880. const hierarchy = this.traverser.getTypeHierarchy(result.node.id);
  881. for (const [id, node] of hierarchy.nodes) {
  882. if (!nodes.has(id)) {
  883. nodes.set(id, node);
  884. hierarchyNodesAdded++;
  885. }
  886. }
  887. for (const edge of hierarchy.edges) {
  888. const exists = edges.some(
  889. (e) => e.source === edge.source && e.target === edge.target && e.kind === edge.kind
  890. );
  891. if (!exists) {
  892. edges.push(edge);
  893. }
  894. }
  895. }
  896. }
  897. // Pass 2: expand hierarchy of newly-discovered parent types to find siblings.
  898. // E.g., InternalEngine → Engine (parent, from pass 1) → ReadOnlyEngine (sibling).
  899. if (hierarchyNodesAdded > 0) {
  900. const pass2Candidates = [...nodes.values()].filter(
  901. n => typeHierarchyKinds.has(n.kind) && !roots.includes(n.id)
  902. );
  903. for (const candidate of pass2Candidates) {
  904. if (hierarchyNodesAdded >= maxHierarchyNodes) break;
  905. const siblingHierarchy = this.traverser.getTypeHierarchy(candidate.id);
  906. for (const [id, node] of siblingHierarchy.nodes) {
  907. if (!nodes.has(id) && hierarchyNodesAdded < maxHierarchyNodes) {
  908. nodes.set(id, node);
  909. hierarchyNodesAdded++;
  910. }
  911. }
  912. for (const edge of siblingHierarchy.edges) {
  913. if (nodes.has(edge.source) && nodes.has(edge.target)) {
  914. const exists = edges.some(
  915. (e) => e.source === edge.source && e.target === edge.target && e.kind === edge.kind
  916. );
  917. if (!exists) {
  918. edges.push(edge);
  919. }
  920. }
  921. }
  922. }
  923. }
  924. // Traverse from each entry point
  925. for (const result of filteredResults) {
  926. const traversalResult = this.traverser.traverseBFS(result.node.id, {
  927. maxDepth: opts.traversalDepth,
  928. edgeKinds: opts.edgeKinds && opts.edgeKinds.length > 0 ? opts.edgeKinds : undefined,
  929. nodeKinds: opts.nodeKinds && opts.nodeKinds.length > 0 ? opts.nodeKinds : undefined,
  930. direction: 'both',
  931. limit: Math.ceil(opts.maxNodes / Math.max(1, filteredResults.length)),
  932. });
  933. // Merge nodes
  934. for (const [id, node] of traversalResult.nodes) {
  935. if (!nodes.has(id)) {
  936. nodes.set(id, node);
  937. }
  938. }
  939. // Merge edges (avoid duplicates)
  940. for (const edge of traversalResult.edges) {
  941. const exists = edges.some(
  942. (e) => e.source === edge.source && e.target === edge.target && e.kind === edge.kind
  943. );
  944. if (!exists) {
  945. edges.push(edge);
  946. }
  947. }
  948. }
  949. // Trim to max nodes if needed
  950. let finalNodes = nodes;
  951. let finalEdges = edges;
  952. if (nodes.size > opts.maxNodes) {
  953. // Prioritize entry points and their direct neighbors
  954. const priorityIds = new Set(roots);
  955. for (const edge of edges) {
  956. if (priorityIds.has(edge.source)) {
  957. priorityIds.add(edge.target);
  958. }
  959. if (priorityIds.has(edge.target)) {
  960. priorityIds.add(edge.source);
  961. }
  962. }
  963. // Keep priority nodes, then fill remaining slots
  964. finalNodes = new Map<string, Node>();
  965. for (const id of priorityIds) {
  966. const node = nodes.get(id);
  967. if (node && finalNodes.size < opts.maxNodes) {
  968. finalNodes.set(id, node);
  969. }
  970. }
  971. // Fill remaining from other nodes
  972. for (const [id, node] of nodes) {
  973. if (finalNodes.size >= opts.maxNodes) break;
  974. if (!finalNodes.has(id)) {
  975. finalNodes.set(id, node);
  976. }
  977. }
  978. // Filter edges to only include kept nodes
  979. finalEdges = edges.filter(
  980. (e) => finalNodes.has(e.source) && finalNodes.has(e.target)
  981. );
  982. }
  983. // Per-file diversity cap: prevent any single file from monopolizing the
  984. // node budget. When BFS traverses from a method, it follows `contains`
  985. // to the parent class, then back down to all sibling methods. With
  986. // multiple entry points in the same class, one file can consume 30-40%
  987. // of maxNodes. Cap each file to ~20% to ensure cross-file diversity.
  988. const maxPerFile = Math.max(5, Math.ceil(opts.maxNodes * 0.2));
  989. const fileCounts = new Map<string, string[]>();
  990. for (const [id, node] of finalNodes) {
  991. const ids = fileCounts.get(node.filePath) || [];
  992. ids.push(id);
  993. fileCounts.set(node.filePath, ids);
  994. }
  995. const rootSet = new Set(roots);
  996. for (const [, nodeIds] of fileCounts) {
  997. if (nodeIds.length <= maxPerFile) continue;
  998. // Sort: entry points first, then classes/interfaces, then others
  999. const kindPriority: Record<string, number> = {
  1000. class: 3, interface: 3, struct: 3, trait: 3, protocol: 3, enum: 3,
  1001. method: 1, function: 1, property: 0, field: 0, variable: 0,
  1002. };
  1003. nodeIds.sort((a, b) => {
  1004. const aRoot = rootSet.has(a) ? 10 : 0;
  1005. const bRoot = rootSet.has(b) ? 10 : 0;
  1006. const aKind = kindPriority[finalNodes.get(a)!.kind] ?? 0;
  1007. const bKind = kindPriority[finalNodes.get(b)!.kind] ?? 0;
  1008. return (bRoot + bKind) - (aRoot + aKind);
  1009. });
  1010. // Remove excess nodes (keep the highest-priority ones)
  1011. for (const id of nodeIds.slice(maxPerFile)) {
  1012. finalNodes.delete(id);
  1013. }
  1014. }
  1015. // Non-production node cap: limit test/sample/integration/example files to
  1016. // at most 15% of the budget. Many codebases have dozens of near-identical
  1017. // test implementations (e.g., 6 Guard classes in integration tests) that
  1018. // individually survive score dampening but collectively flood the result.
  1019. // Test entry points are NOT exempt — they should be evicted too.
  1020. if (!isTestQuery) {
  1021. const maxNonProd = Math.max(3, Math.ceil(opts.maxNodes * 0.15));
  1022. const nonProdIds: string[] = [];
  1023. for (const [id, node] of finalNodes) {
  1024. if (isTestFile(node.filePath)) {
  1025. nonProdIds.push(id);
  1026. }
  1027. }
  1028. if (nonProdIds.length > maxNonProd) {
  1029. for (const id of nonProdIds.slice(maxNonProd)) {
  1030. finalNodes.delete(id);
  1031. // Also remove from roots — test file entry points shouldn't anchor results
  1032. const rootIdx = roots.indexOf(id);
  1033. if (rootIdx !== -1) roots.splice(rootIdx, 1);
  1034. }
  1035. }
  1036. }
  1037. // Re-filter edges after per-file and non-production caps
  1038. finalEdges = finalEdges.filter(
  1039. (e) => finalNodes.has(e.source) && finalNodes.has(e.target)
  1040. );
  1041. // Edge recovery: BFS with many entry points leaves most nodes disconnected.
  1042. // Discover edges between already-selected nodes to recover connectivity.
  1043. const recoveryKinds: EdgeKind[] = ['calls', 'extends', 'implements', 'references', 'overrides'];
  1044. const recoveredEdges = this.queries.findEdgesBetweenNodes(
  1045. [...finalNodes.keys()],
  1046. recoveryKinds,
  1047. );
  1048. const existingEdgeKeys = new Set(
  1049. finalEdges.map((e) => `${e.source}:${e.target}:${e.kind}`)
  1050. );
  1051. for (const edge of recoveredEdges) {
  1052. const key = `${edge.source}:${edge.target}:${edge.kind}`;
  1053. if (!existingEdgeKeys.has(key)) {
  1054. finalEdges.push(edge);
  1055. existingEdgeKeys.add(key);
  1056. }
  1057. }
  1058. return { nodes: finalNodes, edges: finalEdges, roots, confidence };
  1059. }
  1060. /**
  1061. * Get the source code for a node
  1062. *
  1063. * Reads the file and extracts the code between startLine and endLine.
  1064. *
  1065. * @param nodeId - ID of the node
  1066. * @returns Code string or null if not found
  1067. */
  1068. async getCode(nodeId: string): Promise<string | null> {
  1069. const node = this.queries.getNodeById(nodeId);
  1070. if (!node) {
  1071. return null;
  1072. }
  1073. return this.extractNodeCode(node);
  1074. }
  1075. /**
  1076. * Extract code from a node's source file
  1077. */
  1078. private async extractNodeCode(node: Node): Promise<string | null> {
  1079. // SECURITY (#383): a config-leaf node's on-disk line is `key = <secret>`.
  1080. // Return the KEY only — never read the value off disk. This closes the
  1081. // includeCode / buildContext code-block path, mirroring the explore source
  1082. // renderer; an agent that genuinely needs a value can read the file itself.
  1083. if (isConfigLeafNode(node)) {
  1084. return node.signature || node.qualifiedName || node.name;
  1085. }
  1086. const filePath = validatePathWithinRoot(this.projectRoot, node.filePath);
  1087. if (!filePath || !fs.existsSync(filePath)) {
  1088. return null;
  1089. }
  1090. try {
  1091. const content = fs.readFileSync(filePath, 'utf-8');
  1092. const lines = content.split('\n');
  1093. // Extract lines (1-indexed to 0-indexed)
  1094. const startIdx = Math.max(0, node.startLine - 1);
  1095. const endIdx = Math.min(lines.length, node.endLine);
  1096. return lines.slice(startIdx, endIdx).join('\n');
  1097. } catch (error) {
  1098. logDebug('Failed to extract code from node', { nodeId: node.id, filePath: node.filePath, error: String(error) });
  1099. return null;
  1100. }
  1101. }
  1102. /**
  1103. * Get entry points from a subgraph (the root nodes)
  1104. */
  1105. private getEntryPoints(subgraph: Subgraph): Node[] {
  1106. return subgraph.roots
  1107. .map((id) => subgraph.nodes.get(id))
  1108. .filter((n): n is Node => n !== undefined);
  1109. }
  1110. /**
  1111. * Extract code blocks for key nodes in the subgraph
  1112. */
  1113. private async extractCodeBlocks(
  1114. subgraph: Subgraph,
  1115. maxBlocks: number,
  1116. maxBlockSize: number
  1117. ): Promise<CodeBlock[]> {
  1118. const blocks: CodeBlock[] = [];
  1119. // Prioritize entry points, then functions/methods
  1120. const priorityNodes: Node[] = [];
  1121. // First: entry points
  1122. for (const id of subgraph.roots) {
  1123. const node = subgraph.nodes.get(id);
  1124. if (node) {
  1125. priorityNodes.push(node);
  1126. }
  1127. }
  1128. // Then: functions and methods
  1129. for (const node of subgraph.nodes.values()) {
  1130. if (!subgraph.roots.includes(node.id)) {
  1131. if (node.kind === 'function' || node.kind === 'method') {
  1132. priorityNodes.push(node);
  1133. }
  1134. }
  1135. }
  1136. // Then: classes
  1137. for (const node of subgraph.nodes.values()) {
  1138. if (!subgraph.roots.includes(node.id)) {
  1139. if (node.kind === 'class') {
  1140. priorityNodes.push(node);
  1141. }
  1142. }
  1143. }
  1144. // Extract code for priority nodes
  1145. for (const node of priorityNodes) {
  1146. if (blocks.length >= maxBlocks) break;
  1147. const code = await this.extractNodeCode(node);
  1148. if (code) {
  1149. // Truncate if too long. Language-neutral marker (no `//` — not a
  1150. // comment in Python, Ruby, etc.); this renders inside a fenced
  1151. // source block whose language varies.
  1152. const truncated = code.length > maxBlockSize
  1153. ? code.slice(0, maxBlockSize) + '\n... (truncated) ...'
  1154. : code;
  1155. blocks.push({
  1156. content: truncated,
  1157. filePath: node.filePath,
  1158. startLine: node.startLine,
  1159. endLine: node.endLine,
  1160. language: node.language,
  1161. node,
  1162. });
  1163. }
  1164. }
  1165. return blocks;
  1166. }
  1167. /**
  1168. * Get unique files from a subgraph
  1169. */
  1170. private getRelatedFiles(subgraph: Subgraph): string[] {
  1171. const files = new Set<string>();
  1172. for (const node of subgraph.nodes.values()) {
  1173. files.add(node.filePath);
  1174. }
  1175. return Array.from(files).sort();
  1176. }
  1177. /**
  1178. * Generate a summary of the context
  1179. */
  1180. private generateSummary(_query: string, subgraph: Subgraph, entryPoints: Node[]): string {
  1181. const nodeCount = subgraph.nodes.size;
  1182. const edgeCount = subgraph.edges.length;
  1183. const files = this.getRelatedFiles(subgraph);
  1184. const entryPointNames = entryPoints
  1185. .slice(0, 3)
  1186. .map((n) => n.name)
  1187. .join(', ');
  1188. const remaining = entryPoints.length > 3 ? ` and ${entryPoints.length - 3} more` : '';
  1189. return `Found ${nodeCount} relevant code symbols across ${files.length} files. ` +
  1190. `Key entry points: ${entryPointNames}${remaining}. ` +
  1191. `${edgeCount} relationships identified.`;
  1192. }
  1193. /**
  1194. * Resolve import/export nodes to their actual definitions
  1195. *
  1196. * When search returns `import { TerminalPanel }`, users want the TerminalPanel
  1197. * class definition, not the import statement. This follows the `imports` edge
  1198. * to find and return the actual definition instead.
  1199. *
  1200. * @param results - Search results that may include import/export nodes
  1201. * @returns Results with imports resolved to definitions where possible
  1202. */
  1203. private resolveImportsToDefinitions(results: SearchResult[]): SearchResult[] {
  1204. const resolved: SearchResult[] = [];
  1205. const seenIds = new Set<string>();
  1206. for (const result of results) {
  1207. const { node, score } = result;
  1208. // If it's not an import/export, keep it as-is
  1209. if (node.kind !== 'import' && node.kind !== 'export') {
  1210. if (!seenIds.has(node.id)) {
  1211. seenIds.add(node.id);
  1212. resolved.push(result);
  1213. }
  1214. continue;
  1215. }
  1216. // For imports/exports, try to find what they reference
  1217. // Imports have outgoing 'imports' edges to the definition
  1218. // Exports have outgoing 'exports' edges to the definition
  1219. const edgeKind = node.kind === 'import' ? 'imports' : 'exports';
  1220. const outgoingEdges = this.queries.getOutgoingEdges(node.id, [edgeKind as EdgeKind]);
  1221. let foundDefinition = false;
  1222. for (const edge of outgoingEdges) {
  1223. const targetNode = this.queries.getNodeById(edge.target);
  1224. if (targetNode && !seenIds.has(targetNode.id)) {
  1225. // Found the definition - use it instead of the import
  1226. seenIds.add(targetNode.id);
  1227. resolved.push({
  1228. node: targetNode,
  1229. score: score, // Preserve the original score
  1230. });
  1231. foundDefinition = true;
  1232. logDebug('Resolved import to definition', {
  1233. import: node.name,
  1234. definition: targetNode.name,
  1235. kind: targetNode.kind,
  1236. });
  1237. }
  1238. }
  1239. // If we couldn't resolve the import, skip it (it's low-value on its own)
  1240. if (!foundDefinition) {
  1241. logDebug('Skipping unresolved import', { name: node.name, file: node.filePath });
  1242. }
  1243. }
  1244. return resolved;
  1245. }
  1246. }
  1247. /**
  1248. * Create a context builder
  1249. */
  1250. export function createContextBuilder(
  1251. projectRoot: string,
  1252. queries: QueryBuilder,
  1253. traverser: GraphTraverser
  1254. ): ContextBuilder {
  1255. return new ContextBuilder(projectRoot, queries, traverser);
  1256. }
  1257. // Re-export formatter
  1258. export { formatContextAsMarkdown, formatContextAsJson } from './formatter';