index.ts 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652
  1. /**
  2. * Context Builder
  3. *
  4. * Builds rich context for tasks by combining semantic search with graph traversal.
  5. * Outputs structured context ready to inject into Claude.
  6. */
  7. import * as fs from 'fs';
  8. import {
  9. Node,
  10. Edge,
  11. NodeKind,
  12. EdgeKind,
  13. Subgraph,
  14. CodeBlock,
  15. TaskContext,
  16. TaskInput,
  17. BuildContextOptions,
  18. FindRelevantContextOptions,
  19. SearchResult,
  20. } from '../types';
  21. import { QueryBuilder } from '../db/queries';
  22. import { GraphTraverser } from '../graph';
  23. import { VectorManager } from '../vectors';
  24. import { formatContextAsMarkdown, formatContextAsJson } from './formatter';
  25. import { logDebug, logWarn } from '../errors';
  26. import { validatePathWithinRoot } from '../utils';
  27. import { isTestFile } from '../search/query-utils';
  28. /**
  29. * Extract likely symbol names from a natural language query
  30. *
  31. * Identifies potential code symbols using patterns:
  32. * - CamelCase: UserService, signInWithGoogle
  33. * - snake_case: user_service, sign_in
  34. * - SCREAMING_SNAKE: MAX_RETRIES
  35. * - dot.notation: app.isPackaged (extracts both sides)
  36. * - Single words that look like identifiers (no spaces, not common English words)
  37. *
  38. * @param query - Natural language query
  39. * @returns Array of potential symbol names
  40. */
  41. function extractSymbolsFromQuery(query: string): string[] {
  42. const symbols = new Set<string>();
  43. // Extract CamelCase identifiers (2+ chars, starts with letter)
  44. const camelCasePattern = /\b([A-Z][a-z]+(?:[A-Z][a-z]*)*|[a-z]+(?:[A-Z][a-z]*)+)\b/g;
  45. let match;
  46. while ((match = camelCasePattern.exec(query)) !== null) {
  47. if (match[1] && match[1].length >= 2) {
  48. symbols.add(match[1]);
  49. }
  50. }
  51. // Extract snake_case identifiers
  52. const snakeCasePattern = /\b([a-z][a-z0-9]*(?:_[a-z0-9]+)+)\b/gi;
  53. while ((match = snakeCasePattern.exec(query)) !== null) {
  54. if (match[1] && match[1].length >= 3) {
  55. symbols.add(match[1]);
  56. }
  57. }
  58. // Extract SCREAMING_SNAKE_CASE
  59. const screamingPattern = /\b([A-Z][A-Z0-9]*(?:_[A-Z0-9]+)+)\b/g;
  60. while ((match = screamingPattern.exec(query)) !== null) {
  61. if (match[1]) {
  62. symbols.add(match[1]);
  63. }
  64. }
  65. // Extract dot.notation and split into parts (e.g., "app.isPackaged" -> ["app", "isPackaged"])
  66. const dotPattern = /\b([a-zA-Z][a-zA-Z0-9]*(?:\.[a-zA-Z][a-zA-Z0-9]*)+)\b/g;
  67. while ((match = dotPattern.exec(query)) !== null) {
  68. if (match[1]) {
  69. // Add both the full path and individual parts
  70. symbols.add(match[1]);
  71. const parts = match[1].split('.');
  72. for (const part of parts) {
  73. if (part.length >= 2) {
  74. symbols.add(part);
  75. }
  76. }
  77. }
  78. }
  79. // Filter out common English words that might match patterns
  80. const commonWords = new Set([
  81. 'the', 'and', 'for', 'with', 'from', 'this', 'that', 'have', 'been',
  82. 'will', 'would', 'could', 'should', 'does', 'done', 'make', 'made',
  83. 'use', 'used', 'using', 'work', 'works', 'find', 'found', 'show',
  84. 'call', 'called', 'calling', 'get', 'set', 'add', 'all', 'any',
  85. 'how', 'what', 'when', 'where', 'which', 'who', 'why'
  86. ]);
  87. return Array.from(symbols).filter(s => !commonWords.has(s.toLowerCase()));
  88. }
  89. /**
  90. * Default options for context building
  91. *
  92. * Tuned for minimal context usage while still providing useful results:
  93. * - Fewer nodes and code blocks by default
  94. * - Smaller code block size limit
  95. * - Shallower traversal
  96. */
  97. const DEFAULT_BUILD_OPTIONS: Required<BuildContextOptions> = {
  98. maxNodes: 20, // Reduced from 50 - most tasks don't need 50 symbols
  99. maxCodeBlocks: 5, // Reduced from 10 - only show most relevant code
  100. maxCodeBlockSize: 1500, // Reduced from 2000
  101. includeCode: true,
  102. format: 'markdown',
  103. searchLimit: 3, // Reduced from 5 - fewer entry points
  104. traversalDepth: 1, // Reduced from 2 - shallower graph expansion
  105. minScore: 0.3,
  106. };
  107. /**
  108. * Node kinds that provide high information value in context results.
  109. * Imports/exports are excluded because they have near-zero information density -
  110. * they tell you something exists, not how it works.
  111. */
  112. const HIGH_VALUE_NODE_KINDS: NodeKind[] = [
  113. 'function', 'method', 'class', 'interface', 'type_alias', 'struct', 'trait',
  114. 'component', 'route', 'variable', 'constant', 'enum', 'module', 'namespace',
  115. ];
  116. /**
  117. * Default options for finding relevant context
  118. */
  119. const DEFAULT_FIND_OPTIONS: Required<FindRelevantContextOptions> = {
  120. searchLimit: 3, // Reduced from 5
  121. traversalDepth: 1, // Reduced from 2
  122. maxNodes: 20, // Reduced from 50
  123. minScore: 0.3,
  124. edgeKinds: [],
  125. nodeKinds: HIGH_VALUE_NODE_KINDS, // Filter out imports/exports by default
  126. };
  127. /**
  128. * Context Builder
  129. *
  130. * Coordinates semantic search and graph traversal to build
  131. * comprehensive context for tasks.
  132. */
  133. export class ContextBuilder {
  134. private projectRoot: string;
  135. private queries: QueryBuilder;
  136. private traverser: GraphTraverser;
  137. private vectorManager: VectorManager | null;
  138. constructor(
  139. projectRoot: string,
  140. queries: QueryBuilder,
  141. traverser: GraphTraverser,
  142. vectorManager: VectorManager | null
  143. ) {
  144. this.projectRoot = projectRoot;
  145. this.queries = queries;
  146. this.traverser = traverser;
  147. this.vectorManager = vectorManager;
  148. }
  149. /**
  150. * Build context for a task
  151. *
  152. * Pipeline:
  153. * 1. Parse task input (string or {title, description})
  154. * 2. Run semantic search to find entry points
  155. * 3. Expand graph around entry points
  156. * 4. Extract code blocks for key nodes
  157. * 5. Format output for Claude
  158. *
  159. * @param input - Task description or object with title/description
  160. * @param options - Build options
  161. * @returns TaskContext (structured) or formatted string
  162. */
  163. async buildContext(
  164. input: TaskInput,
  165. options: BuildContextOptions = {}
  166. ): Promise<TaskContext | string> {
  167. const opts = { ...DEFAULT_BUILD_OPTIONS, ...options };
  168. // Parse input
  169. const query = typeof input === 'string' ? input : `${input.title}${input.description ? `: ${input.description}` : ''}`;
  170. // Find relevant context (semantic search + graph expansion)
  171. const subgraph = await this.findRelevantContext(query, {
  172. searchLimit: opts.searchLimit,
  173. traversalDepth: opts.traversalDepth,
  174. maxNodes: opts.maxNodes,
  175. minScore: opts.minScore,
  176. });
  177. // Get entry points (nodes from semantic search)
  178. const entryPoints = this.getEntryPoints(subgraph);
  179. // Extract code blocks for key nodes
  180. const codeBlocks = opts.includeCode
  181. ? await this.extractCodeBlocks(subgraph, opts.maxCodeBlocks, opts.maxCodeBlockSize)
  182. : [];
  183. // Get related files
  184. const relatedFiles = this.getRelatedFiles(subgraph);
  185. // Generate summary
  186. const summary = this.generateSummary(query, subgraph, entryPoints);
  187. // Calculate stats
  188. const stats = {
  189. nodeCount: subgraph.nodes.size,
  190. edgeCount: subgraph.edges.length,
  191. fileCount: relatedFiles.length,
  192. codeBlockCount: codeBlocks.length,
  193. totalCodeSize: codeBlocks.reduce((sum, block) => sum + block.content.length, 0),
  194. };
  195. const context: TaskContext = {
  196. query,
  197. subgraph,
  198. entryPoints,
  199. codeBlocks,
  200. relatedFiles,
  201. summary,
  202. stats,
  203. };
  204. // Return formatted output or raw context
  205. if (opts.format === 'markdown') {
  206. return formatContextAsMarkdown(context);
  207. } else if (opts.format === 'json') {
  208. return formatContextAsJson(context);
  209. }
  210. return context;
  211. }
  212. /**
  213. * Find relevant subgraph for a query
  214. *
  215. * Uses hybrid search combining exact symbol lookup with semantic search:
  216. * 1. Extract potential symbol names from query
  217. * 2. Look up exact matches for those symbols (high confidence)
  218. * 3. Use semantic search for concept matching
  219. * 4. Merge results, prioritizing exact matches
  220. * 5. Traverse graph from entry points
  221. *
  222. * @param query - Natural language query
  223. * @param options - Search and traversal options
  224. * @returns Subgraph of relevant nodes and edges
  225. */
  226. async findRelevantContext(
  227. query: string,
  228. options: FindRelevantContextOptions = {}
  229. ): Promise<Subgraph> {
  230. const opts = { ...DEFAULT_FIND_OPTIONS, ...options };
  231. // Start with empty subgraph
  232. const nodes = new Map<string, Node>();
  233. const edges: Edge[] = [];
  234. const roots: string[] = [];
  235. // Handle empty query - return empty subgraph
  236. if (!query || query.trim().length === 0) {
  237. return { nodes, edges, roots };
  238. }
  239. // === HYBRID SEARCH ===
  240. // Step 1: Extract potential symbol names from query
  241. const symbolsFromQuery = extractSymbolsFromQuery(query);
  242. logDebug('Extracted symbols from query', { query, symbols: symbolsFromQuery });
  243. // Step 2: Look up exact matches for extracted symbols
  244. let exactMatches: SearchResult[] = [];
  245. if (symbolsFromQuery.length > 0) {
  246. try {
  247. exactMatches = this.queries.findNodesByExactName(symbolsFromQuery, {
  248. limit: Math.ceil(opts.searchLimit * 2), // Get more since we'll merge
  249. kinds: opts.nodeKinds && opts.nodeKinds.length > 0 ? opts.nodeKinds : undefined,
  250. });
  251. logDebug('Exact symbol matches', { count: exactMatches.length });
  252. } catch (error) {
  253. logDebug('Exact symbol lookup failed', { error: String(error) });
  254. }
  255. }
  256. // Step 3: Try semantic search if vector manager is available
  257. let semanticResults: SearchResult[] = [];
  258. if (this.vectorManager && this.vectorManager.isInitialized()) {
  259. try {
  260. semanticResults = await this.vectorManager.search(query, {
  261. limit: opts.searchLimit,
  262. kinds: opts.nodeKinds && opts.nodeKinds.length > 0 ? opts.nodeKinds : undefined,
  263. });
  264. logDebug('Semantic search results', { count: semanticResults.length });
  265. } catch (error) {
  266. logDebug('Semantic search failed, falling back to text search', { query, error: String(error) });
  267. }
  268. }
  269. // Step 4: Fall back to text search if no semantic results
  270. if (semanticResults.length === 0 && exactMatches.length === 0) {
  271. try {
  272. const textResults = this.queries.searchNodes(query, {
  273. limit: opts.searchLimit,
  274. kinds: opts.nodeKinds && opts.nodeKinds.length > 0 ? opts.nodeKinds : undefined,
  275. });
  276. semanticResults = textResults;
  277. } catch (error) {
  278. logWarn('Text search failed', { query, error: String(error) });
  279. // Return empty results
  280. }
  281. }
  282. // Step 5: Merge results, prioritizing exact matches
  283. const seenIds = new Set<string>();
  284. let searchResults: SearchResult[] = [];
  285. // Add exact matches first (highest priority)
  286. for (const result of exactMatches) {
  287. if (!seenIds.has(result.node.id)) {
  288. seenIds.add(result.node.id);
  289. searchResults.push(result);
  290. }
  291. }
  292. // Add semantic/text results
  293. for (const result of semanticResults) {
  294. if (!seenIds.has(result.node.id)) {
  295. seenIds.add(result.node.id);
  296. searchResults.push(result);
  297. }
  298. }
  299. // Limit total results
  300. searchResults = searchResults.slice(0, opts.searchLimit * 2);
  301. // Deprioritize test files unless the query is about tests
  302. const queryLower = query.toLowerCase();
  303. const isTestQuery = queryLower.includes('test') || queryLower.includes('spec');
  304. if (!isTestQuery) {
  305. searchResults = searchResults.map(r => ({
  306. ...r,
  307. score: isTestFile(r.node.filePath) ? r.score * 0.3 : r.score,
  308. }));
  309. searchResults.sort((a, b) => b.score - a.score);
  310. }
  311. // Filter by minimum score
  312. let filteredResults = searchResults.filter((r) => r.score >= opts.minScore);
  313. // Resolve imports/exports to their actual definitions
  314. // If someone searches "terminal" and finds `import { TerminalPanel }`,
  315. // they want the TerminalPanel class, not the import statement
  316. filteredResults = this.resolveImportsToDefinitions(filteredResults);
  317. // Add entry points to subgraph
  318. for (const result of filteredResults) {
  319. nodes.set(result.node.id, result.node);
  320. roots.push(result.node.id);
  321. }
  322. // Traverse from each entry point
  323. for (const result of filteredResults) {
  324. const traversalResult = this.traverser.traverseBFS(result.node.id, {
  325. maxDepth: opts.traversalDepth,
  326. edgeKinds: opts.edgeKinds && opts.edgeKinds.length > 0 ? opts.edgeKinds : undefined,
  327. nodeKinds: opts.nodeKinds && opts.nodeKinds.length > 0 ? opts.nodeKinds : undefined,
  328. direction: 'both',
  329. limit: Math.ceil(opts.maxNodes / Math.max(1, filteredResults.length)),
  330. });
  331. // Merge nodes
  332. for (const [id, node] of traversalResult.nodes) {
  333. if (!nodes.has(id)) {
  334. nodes.set(id, node);
  335. }
  336. }
  337. // Merge edges (avoid duplicates)
  338. for (const edge of traversalResult.edges) {
  339. const exists = edges.some(
  340. (e) => e.source === edge.source && e.target === edge.target && e.kind === edge.kind
  341. );
  342. if (!exists) {
  343. edges.push(edge);
  344. }
  345. }
  346. }
  347. // Trim to max nodes if needed
  348. if (nodes.size > opts.maxNodes) {
  349. // Prioritize entry points and their direct neighbors
  350. const priorityIds = new Set(roots);
  351. for (const edge of edges) {
  352. if (priorityIds.has(edge.source)) {
  353. priorityIds.add(edge.target);
  354. }
  355. if (priorityIds.has(edge.target)) {
  356. priorityIds.add(edge.source);
  357. }
  358. }
  359. // Keep priority nodes, then fill remaining slots
  360. const trimmedNodes = new Map<string, Node>();
  361. for (const id of priorityIds) {
  362. const node = nodes.get(id);
  363. if (node && trimmedNodes.size < opts.maxNodes) {
  364. trimmedNodes.set(id, node);
  365. }
  366. }
  367. // Fill remaining from other nodes
  368. for (const [id, node] of nodes) {
  369. if (trimmedNodes.size >= opts.maxNodes) break;
  370. if (!trimmedNodes.has(id)) {
  371. trimmedNodes.set(id, node);
  372. }
  373. }
  374. // Filter edges to only include kept nodes
  375. const trimmedEdges = edges.filter(
  376. (e) => trimmedNodes.has(e.source) && trimmedNodes.has(e.target)
  377. );
  378. return { nodes: trimmedNodes, edges: trimmedEdges, roots };
  379. }
  380. return { nodes, edges, roots };
  381. }
  382. /**
  383. * Get the source code for a node
  384. *
  385. * Reads the file and extracts the code between startLine and endLine.
  386. *
  387. * @param nodeId - ID of the node
  388. * @returns Code string or null if not found
  389. */
  390. async getCode(nodeId: string): Promise<string | null> {
  391. const node = this.queries.getNodeById(nodeId);
  392. if (!node) {
  393. return null;
  394. }
  395. return this.extractNodeCode(node);
  396. }
  397. /**
  398. * Extract code from a node's source file
  399. */
  400. private async extractNodeCode(node: Node): Promise<string | null> {
  401. const filePath = validatePathWithinRoot(this.projectRoot, node.filePath);
  402. if (!filePath || !fs.existsSync(filePath)) {
  403. return null;
  404. }
  405. try {
  406. const content = fs.readFileSync(filePath, 'utf-8');
  407. const lines = content.split('\n');
  408. // Extract lines (1-indexed to 0-indexed)
  409. const startIdx = Math.max(0, node.startLine - 1);
  410. const endIdx = Math.min(lines.length, node.endLine);
  411. return lines.slice(startIdx, endIdx).join('\n');
  412. } catch (error) {
  413. logDebug('Failed to extract code from node', { nodeId: node.id, filePath: node.filePath, error: String(error) });
  414. return null;
  415. }
  416. }
  417. /**
  418. * Get entry points from a subgraph (the root nodes)
  419. */
  420. private getEntryPoints(subgraph: Subgraph): Node[] {
  421. return subgraph.roots
  422. .map((id) => subgraph.nodes.get(id))
  423. .filter((n): n is Node => n !== undefined);
  424. }
  425. /**
  426. * Extract code blocks for key nodes in the subgraph
  427. */
  428. private async extractCodeBlocks(
  429. subgraph: Subgraph,
  430. maxBlocks: number,
  431. maxBlockSize: number
  432. ): Promise<CodeBlock[]> {
  433. const blocks: CodeBlock[] = [];
  434. // Prioritize entry points, then functions/methods
  435. const priorityNodes: Node[] = [];
  436. // First: entry points
  437. for (const id of subgraph.roots) {
  438. const node = subgraph.nodes.get(id);
  439. if (node) {
  440. priorityNodes.push(node);
  441. }
  442. }
  443. // Then: functions and methods
  444. for (const node of subgraph.nodes.values()) {
  445. if (!subgraph.roots.includes(node.id)) {
  446. if (node.kind === 'function' || node.kind === 'method') {
  447. priorityNodes.push(node);
  448. }
  449. }
  450. }
  451. // Then: classes
  452. for (const node of subgraph.nodes.values()) {
  453. if (!subgraph.roots.includes(node.id)) {
  454. if (node.kind === 'class') {
  455. priorityNodes.push(node);
  456. }
  457. }
  458. }
  459. // Extract code for priority nodes
  460. for (const node of priorityNodes) {
  461. if (blocks.length >= maxBlocks) break;
  462. const code = await this.extractNodeCode(node);
  463. if (code) {
  464. // Truncate if too long
  465. const truncated = code.length > maxBlockSize
  466. ? code.slice(0, maxBlockSize) + '\n// ... truncated ...'
  467. : code;
  468. blocks.push({
  469. content: truncated,
  470. filePath: node.filePath,
  471. startLine: node.startLine,
  472. endLine: node.endLine,
  473. language: node.language,
  474. node,
  475. });
  476. }
  477. }
  478. return blocks;
  479. }
  480. /**
  481. * Get unique files from a subgraph
  482. */
  483. private getRelatedFiles(subgraph: Subgraph): string[] {
  484. const files = new Set<string>();
  485. for (const node of subgraph.nodes.values()) {
  486. files.add(node.filePath);
  487. }
  488. return Array.from(files).sort();
  489. }
  490. /**
  491. * Generate a summary of the context
  492. */
  493. private generateSummary(_query: string, subgraph: Subgraph, entryPoints: Node[]): string {
  494. const nodeCount = subgraph.nodes.size;
  495. const edgeCount = subgraph.edges.length;
  496. const files = this.getRelatedFiles(subgraph);
  497. const entryPointNames = entryPoints
  498. .slice(0, 3)
  499. .map((n) => n.name)
  500. .join(', ');
  501. const remaining = entryPoints.length > 3 ? ` and ${entryPoints.length - 3} more` : '';
  502. return `Found ${nodeCount} relevant code symbols across ${files.length} files. ` +
  503. `Key entry points: ${entryPointNames}${remaining}. ` +
  504. `${edgeCount} relationships identified.`;
  505. }
  506. /**
  507. * Resolve import/export nodes to their actual definitions
  508. *
  509. * When search returns `import { TerminalPanel }`, users want the TerminalPanel
  510. * class definition, not the import statement. This follows the `imports` edge
  511. * to find and return the actual definition instead.
  512. *
  513. * @param results - Search results that may include import/export nodes
  514. * @returns Results with imports resolved to definitions where possible
  515. */
  516. private resolveImportsToDefinitions(results: SearchResult[]): SearchResult[] {
  517. const resolved: SearchResult[] = [];
  518. const seenIds = new Set<string>();
  519. for (const result of results) {
  520. const { node, score } = result;
  521. // If it's not an import/export, keep it as-is
  522. if (node.kind !== 'import' && node.kind !== 'export') {
  523. if (!seenIds.has(node.id)) {
  524. seenIds.add(node.id);
  525. resolved.push(result);
  526. }
  527. continue;
  528. }
  529. // For imports/exports, try to find what they reference
  530. // Imports have outgoing 'imports' edges to the definition
  531. // Exports have outgoing 'exports' edges to the definition
  532. const edgeKind = node.kind === 'import' ? 'imports' : 'exports';
  533. const outgoingEdges = this.queries.getOutgoingEdges(node.id, [edgeKind as EdgeKind]);
  534. let foundDefinition = false;
  535. for (const edge of outgoingEdges) {
  536. const targetNode = this.queries.getNodeById(edge.target);
  537. if (targetNode && !seenIds.has(targetNode.id)) {
  538. // Found the definition - use it instead of the import
  539. seenIds.add(targetNode.id);
  540. resolved.push({
  541. node: targetNode,
  542. score: score, // Preserve the original score
  543. });
  544. foundDefinition = true;
  545. logDebug('Resolved import to definition', {
  546. import: node.name,
  547. definition: targetNode.name,
  548. kind: targetNode.kind,
  549. });
  550. }
  551. }
  552. // If we couldn't resolve the import, skip it (it's low-value on its own)
  553. if (!foundDefinition) {
  554. logDebug('Skipping unresolved import', { name: node.name, file: node.filePath });
  555. }
  556. }
  557. return resolved;
  558. }
  559. }
  560. /**
  561. * Create a context builder
  562. */
  563. export function createContextBuilder(
  564. projectRoot: string,
  565. queries: QueryBuilder,
  566. traverser: GraphTraverser,
  567. vectorManager: VectorManager | null
  568. ): ContextBuilder {
  569. return new ContextBuilder(projectRoot, queries, traverser, vectorManager);
  570. }
  571. // Re-export formatter
  572. export { formatContextAsMarkdown, formatContextAsJson } from './formatter';