Procházet zdrojové kódy

feat: Add comprehensive evaluation framework for CodeGraph API testing

Introduces automated testing infrastructure to measure CodeGraph performance across searchNodes and findRelevantContext APIs. Includes recall/MRR scoring metrics, predefined test cases for symbol lookup and context exploration, and JSON report generation. Enhances context building with acronym extraction, definition prefix matching, and improved FTS filtering to exclude imports by default.
Colby McHenry před 2 měsíci
rodič
revize
13d3ff3613

+ 123 - 0
__tests__/evaluation/runner.ts

@@ -0,0 +1,123 @@
+import { execSync } from 'child_process';
+import * as fs from 'fs';
+import * as path from 'path';
+import { CodeGraph } from '../../src/index.js';
+import { scoreSearchNodes, scoreFindRelevantContext } from './scoring.js';
+import { testCases } from './test-cases.js';
+import type { EvalReport, EvalResult } from './types.js';
+
+// Codebase to evaluate: the EVAL_CODEBASE environment variable wins, otherwise
+// the first command-line argument is used.
+const codebasePath = process.env.EVAL_CODEBASE || process.argv[2];
+if (!codebasePath) {
+  console.error('Usage: EVAL_CODEBASE=/path/to/codebase npx tsx __tests__/evaluation/runner.ts');
+  console.error('   or: npx tsx __tests__/evaluation/runner.ts /path/to/codebase');
+  process.exit(1);
+}
+
+// Require an existing CodeGraph database before doing any work.
+const resolvedPath = path.resolve(codebasePath);
+if (!fs.existsSync(path.join(resolvedPath, '.codegraph', 'codegraph.db'))) {
+  console.error(`No .codegraph/codegraph.db found at ${resolvedPath}`);
+  process.exit(1);
+}
+
+// Short commit hash of the repository in the current working directory,
+// recorded in the report; stays 'unknown' when it cannot be determined.
+let codegraphSha = 'unknown';
+try {
+  codegraphSha = execSync('git rev-parse --short HEAD', {
+    encoding: 'utf-8',
+    // Capture stdout only; discarding stderr keeps git's own error message
+    // (e.g. "not a git repository") out of the evaluation output.
+    stdio: ['ignore', 'pipe', 'ignore'],
+  }).trim();
+} catch {
+  // Best effort: without git or outside a checkout, keep 'unknown'.
+}
+
+console.log(`\nCodeGraph Eval — ${path.basename(resolvedPath)}`);
+console.log(`Codebase: ${resolvedPath}`);
+console.log(`Commit:   ${codegraphSha}`);
+console.log(`Cases:    ${testCases.length}`);
+console.log('');
+
+/**
+ * Runs every evaluation case against the CodeGraph index at `resolvedPath`,
+ * prints a per-case table plus a summary line, writes a JSON report into a
+ * `results` directory next to this file, and exits the process with status 1
+ * when at least one case failed (0 otherwise).
+ */
+async function run(): Promise<void> {
+  const cg = CodeGraph.openSync(resolvedPath);
+  const results: EvalResult[] = [];
+
+  try {
+    for (const tc of testCases) {
+      const start = performance.now();
+
+      if (tc.api === 'searchNodes') {
+        // Per-case options are spread last so they override the defaults.
+        const searchResults = cg.searchNodes(tc.query, {
+          limit: 10,
+          kinds: tc.kinds,
+          ...(tc.options as Record<string, unknown>),
+        });
+        const latency = performance.now() - start;
+        results.push(scoreSearchNodes(tc.id, tc.expectedSymbols, searchResults, latency));
+      } else {
+        const subgraph = await cg.findRelevantContext(tc.query, {
+          searchLimit: 8,
+          traversalDepth: 3,
+          maxNodes: 80,
+          minScore: 0.2,
+          ...(tc.options as Record<string, unknown>),
+        });
+        const latency = performance.now() - start;
+        results.push(scoreFindRelevantContext(tc.id, tc.expectedSymbols, subgraph, latency));
+      }
+    }
+  } finally {
+    // Close the CodeGraph instance even when a case throws.
+    cg.close();
+  }
+
+  // Print results table.
+  // Width of the id column; seeded with 0 so an empty result list is harmless.
+  const maxIdLen = results.reduce((widest, r) => Math.max(widest, r.caseId.length), 0);
+
+  for (const r of results) {
+    const status = r.pass ? '\x1b[32mPASS\x1b[0m' : '\x1b[31mFAIL\x1b[0m';
+    const id = r.caseId.padEnd(maxIdLen);
+    const recall = `recall=${r.recall.toFixed(2)}`;
+    // Context cases carry an edge density; search cases report MRR instead.
+    const extra =
+      r.edgeDensity !== undefined
+        ? `density=${r.edgeDensity.toFixed(2)}`
+        : `mrr=${r.mrr.toFixed(2)}`;
+    const latency = `${Math.round(r.latencyMs)}ms`;
+
+    console.log(`  ${id}  ${status}  ${recall}  ${extra}  ${latency}`);
+
+    if (r.missedSymbols.length > 0) {
+      console.log(`  ${' '.repeat(maxIdLen)}        missed: ${r.missedSymbols.join(', ')}`);
+    }
+  }
+
+  // Summary
+  const passed = results.filter((r) => r.pass).length;
+  const failed = results.length - passed;
+  // Guard the division so an empty run reports 0 rather than NaN (which would
+  // serialize as null in the JSON report).
+  const meanRecall =
+    results.length > 0 ? results.reduce((s, r) => s + r.recall, 0) / results.length : 0;
+  // MRR is averaged only over cases that have one: a non-zero MRR, or an id
+  // using the 'search-' prefix.
+  const mrrResults = results.filter((r) => r.mrr > 0 || r.caseId.startsWith('search-'));
+  const meanMRR =
+    mrrResults.length > 0 ? mrrResults.reduce((s, r) => s + r.mrr, 0) / mrrResults.length : 0;
+
+  console.log('');
+  const summaryColor = failed === 0 ? '\x1b[32m' : '\x1b[33m';
+  console.log(
+    `${summaryColor}SUMMARY: ${passed}/${results.length} passed | recall=${meanRecall.toFixed(2)} | mrr=${meanMRR.toFixed(2)}\x1b[0m`
+  );
+
+  // Save JSON report. One timestamp feeds both the report body and the file
+  // name so the two always agree.
+  const timestamp = new Date().toISOString();
+  const report: EvalReport = {
+    timestamp,
+    codebasePath: resolvedPath,
+    codegraphSha,
+    summary: { total: results.length, passed, failed, meanRecall, meanMRR },
+    results,
+  };
+
+  const resultsDir = path.join(__dirname, 'results');
+  fs.mkdirSync(resultsDir, { recursive: true });
+  // ':' and '.' are replaced so the ISO timestamp is usable as a file name.
+  const reportFile = path.join(resultsDir, `${timestamp.replace(/[:.]/g, '-')}.json`);
+  fs.writeFileSync(reportFile, JSON.stringify(report, null, 2));
+  console.log(`\nReport saved: ${reportFile}`);
+
+  process.exit(failed > 0 ? 1 : 0);
+}
+
+// Any unexpected error is printed and turned into a failing exit status.
+run().catch((err) => {
+  console.error(err);
+  process.exit(1);
+});

+ 82 - 0
__tests__/evaluation/scoring.ts

@@ -0,0 +1,82 @@
+import type { EvalResult } from './types.js';
+
+/** Minimum recall a case must reach to be reported as a pass. */
+export const PASS_THRESHOLD = 0.5;
+
+/**
+ * Scores a ranked `searchNodes` result list against the expected symbol names.
+ *
+ * Names are compared case-insensitively. Recall is the fraction of expected
+ * symbols present anywhere in `results`; MRR is the reciprocal of the best
+ * (lowest) 1-based rank at which any expected symbol appears.
+ *
+ * @param caseId - Identifier copied into the result.
+ * @param expectedSymbols - Symbol names the search should return.
+ * @param results - Search results in ranked order (best first).
+ * @param latencyMs - Measured latency, copied into the result.
+ * @returns The scored result; recall and MRR are 0 when nothing is expected or found.
+ */
+export function scoreSearchNodes(
+  caseId: string,
+  expectedSymbols: string[],
+  results: Array<{ node: { name: string }; score: number }>,
+  latencyMs: number
+): EvalResult {
+  const resultNames = results.map((r) => r.node.name.toLowerCase());
+
+  const found: string[] = [];
+  const missed: string[] = [];
+  // Best 1-based rank of any expected symbol; 0 means none was found yet.
+  let bestRank = 0;
+
+  for (const sym of expectedSymbols) {
+    const idx = resultNames.indexOf(sym.toLowerCase());
+    if (idx === -1) {
+      missed.push(sym);
+      continue;
+    }
+    found.push(sym);
+    // Keep the minimum rank across all hits, independent of the order in
+    // which the expected symbols happen to be listed.
+    const rank = idx + 1;
+    if (bestRank === 0 || rank < bestRank) bestRank = rank;
+  }
+
+  const recall = expectedSymbols.length > 0 ? found.length / expectedSymbols.length : 0;
+  const mrr = bestRank > 0 ? 1 / bestRank : 0;
+
+  return {
+    caseId,
+    pass: recall >= PASS_THRESHOLD,
+    recall,
+    mrr,
+    foundSymbols: found,
+    missedSymbols: missed,
+    latencyMs,
+  };
+}
+
+/**
+ * Scores a `findRelevantContext` subgraph against the expected symbol names.
+ *
+ * A symbol counts as found when any node in the subgraph has that name
+ * (case-insensitive). Besides recall, the result reports the subgraph's node
+ * count, edge count and edge density (edges per node, 0 for an empty graph).
+ * MRR is always 0 here because no ranking is evaluated.
+ *
+ * @param caseId - Identifier copied into the result.
+ * @param expectedSymbols - Symbol names the subgraph should contain.
+ * @param subgraph - Subgraph to score; only `nodes` and `edges` are read.
+ * @param latencyMs - Measured latency, copied into the result.
+ * @returns The scored result, including the graph size metrics.
+ */
+export function scoreFindRelevantContext(
+  caseId: string,
+  expectedSymbols: string[],
+  subgraph: { nodes: Map<string, { name: string }>; edges: unknown[]; roots: string[] },
+  latencyMs: number
+): EvalResult {
+  const nodeNames = new Set<string>();
+  for (const node of subgraph.nodes.values()) {
+    nodeNames.add(node.name.toLowerCase());
+  }
+
+  const found: string[] = [];
+  const missed: string[] = [];
+
+  for (const sym of expectedSymbols) {
+    if (nodeNames.has(sym.toLowerCase())) {
+      found.push(sym);
+    } else {
+      missed.push(sym);
+    }
+  }
+
+  const recall = expectedSymbols.length > 0 ? found.length / expectedSymbols.length : 0;
+  const nodeCount = subgraph.nodes.size;
+  const edgeCount = subgraph.edges.length;
+  const edgeDensity = nodeCount > 0 ? edgeCount / nodeCount : 0;
+
+  return {
+    caseId,
+    pass: recall >= PASS_THRESHOLD,
+    recall,
+    mrr: 0,
+    foundSymbols: found,
+    missedSymbols: missed,
+    nodeCount,
+    edgeCount,
+    edgeDensity,
+    latencyMs,
+  };
+}

+ 93 - 0
__tests__/evaluation/test-cases.ts

@@ -0,0 +1,93 @@
+import type { EvalTestCase } from './types.js';
+
+type CaseKinds = NonNullable<EvalTestCase['kinds']>;
+
+/** Settings attached to every findRelevantContext case. */
+const exploreSettings = { searchLimit: 8, traversalDepth: 3, maxNodes: 80, minScore: 0.2 };
+
+/** Builds a `searchNodes` case restricted to the given node kinds. */
+function searchCase(
+  id: string,
+  query: string,
+  expectedSymbols: string[],
+  kinds: CaseKinds
+): EvalTestCase {
+  return { id, query, api: 'searchNodes', expectedSymbols, kinds };
+}
+
+/** Builds a `findRelevantContext` case carrying its own copy of the shared settings. */
+function exploreCase(id: string, query: string, expectedSymbols: string[]): EvalTestCase {
+  return {
+    id,
+    query,
+    api: 'findRelevantContext',
+    expectedSymbols,
+    options: { ...exploreSettings },
+  };
+}
+
+/** Evaluation cases, in the order they are run and reported. */
+export const testCases: EvalTestCase[] = [
+  // === searchNodes: Symbol Lookup Precision ===
+  searchCase('search-class-exact', 'TransportService', ['TransportService'], ['class']),
+  searchCase('search-method-qualified', 'TransportService sendRequest', ['sendRequest'], ['method']),
+  searchCase('search-interface', 'ActionListener', ['ActionListener'], ['interface']),
+  searchCase('search-enum', 'RestStatus', ['RestStatus'], ['enum']),
+  searchCase(
+    'search-exception',
+    'SearchPhaseExecutionException',
+    ['SearchPhaseExecutionException'],
+    ['class']
+  ),
+  searchCase('search-nested-class', 'Engine Index', ['Index'], ['class']),
+
+  // === findRelevantContext: Exploration Quality ===
+  exploreCase('explore-rest-layer', 'How does the REST layer handle HTTP requests?', [
+    'RestController',
+    'RestHandler',
+    'BaseRestHandler',
+    'RestRequest',
+  ]),
+  exploreCase('explore-search-execution', 'How does search execution work from request to shard?', [
+    'TransportSearchAction',
+    'AbstractSearchAsyncAction',
+    'QueryPhase',
+    'FetchPhase',
+  ]),
+  exploreCase('explore-bulk-indexing', 'How does bulk indexing work?', [
+    'TransportBulkAction',
+    'BulkRequest',
+    'BulkResponse',
+  ]),
+  exploreCase('explore-shard-allocation', 'How does shard rebalancing and allocation work?', [
+    'AllocationService',
+    'BalancedShardsAllocator',
+  ]),
+  exploreCase(
+    'explore-transport-search',
+    'How does TransportService connect to SearchTransportService?',
+    ['TransportService', 'SearchTransportService']
+  ),
+  exploreCase(
+    'explore-engine-implementations',
+    'What are the Engine implementations for indexing?',
+    ['InternalEngine', 'ReadOnlyEngine', 'Engine']
+  ),
+];

+ 37 - 0
__tests__/evaluation/types.ts

@@ -0,0 +1,37 @@
+import type { NodeKind } from '../../src/types.js';
+
+/** One evaluation case: a query, the API to run it against, and the symbols expected back. */
+export interface EvalTestCase {
+  /** Case identifier; reported as `caseId` in the result. */
+  id: string;
+  /** Query text passed to the API under test. */
+  query: string;
+  /** Which CodeGraph API the runner calls for this case. */
+  api: 'searchNodes' | 'findRelevantContext';
+  /** Symbol names expected in the output; scoring compares them case-insensitively. */
+  expectedSymbols: string[];
+  /** Node-kind filter the runner passes to `searchNodes`. */
+  kinds?: NodeKind[];
+  /** Extra API options; the runner spreads them over its defaults, so they take precedence. */
+  options?: Record<string, unknown>;
+}
+
+/** Scored outcome of a single evaluation case. */
+export interface EvalResult {
+  /** Identifier of the case this result belongs to. */
+  caseId: string;
+  /** True when `recall` reaches the scoring module's pass threshold. */
+  pass: boolean;
+  /** Fraction of expected symbols that were found; 0 when no symbols were expected. */
+  recall: number;
+  /** Reciprocal rank for `searchNodes` cases (0 when nothing was found); always 0 for `findRelevantContext`. */
+  mrr: number;
+  /** Expected symbols that were present in the output. */
+  foundSymbols: string[];
+  /** Expected symbols that were absent from the output. */
+  missedSymbols: string[];
+  /** Number of nodes in the returned subgraph; set only for `findRelevantContext` cases. */
+  nodeCount?: number;
+  /** Number of edges in the returned subgraph; set only for `findRelevantContext` cases. */
+  edgeCount?: number;
+  /** `edgeCount / nodeCount` (0 for an empty subgraph); set only for `findRelevantContext` cases. */
+  edgeDensity?: number;
+  /** Time the API call took, in milliseconds, as measured by the runner. */
+  latencyMs: number;
+}
+
+/** JSON report the runner writes after evaluating all cases. */
+export interface EvalReport {
+  /** ISO 8601 time at which the report was created. */
+  timestamp: string;
+  /** Resolved path of the evaluated codebase. */
+  codebasePath: string;
+  /** Output of `git rev-parse --short HEAD` at run time, or 'unknown' when that fails. */
+  codegraphSha: string;
+  /** Aggregates over all results. */
+  summary: {
+    /** Number of cases that were run. */
+    total: number;
+    /** Number of cases whose result has `pass` set. */
+    passed: number;
+    /** `total - passed`. */
+    failed: number;
+    /** Mean recall over all cases. */
+    meanRecall: number;
+    /** Mean MRR over cases with a non-zero MRR or an id starting with 'search-'; 0 when there are none. */
+    meanMRR: number;
+  };
+  /** Per-case results, in execution order. */
+  results: EvalResult[];
+}

+ 68 - 1
src/context/index.ts

@@ -68,6 +68,14 @@ function extractSymbolsFromQuery(query: string): string[] {
     }
   }
 
+  // Extract ALL_CAPS acronyms (2+ chars, e.g., REST, HTTP, LRU, API)
+  const acronymPattern = /\b([A-Z]{2,})\b/g;
+  while ((match = acronymPattern.exec(query)) !== null) {
+    if (match[1]) {
+      symbols.add(match[1]);
+    }
+  }
+
   // Extract dot.notation and split into parts (e.g., "app.isPackaged" -> ["app", "isPackaged"])
   const dotPattern = /\b([a-zA-Z][a-zA-Z0-9]*(?:\.[a-zA-Z][a-zA-Z0-9]*)+)\b/g;
   while ((match = dotPattern.exec(query)) !== null) {
@@ -107,6 +115,19 @@ function extractSymbolsFromQuery(query: string): string[] {
     'more', 'most', 'very', 'being', 'having', 'doing',
     'system', 'need', 'needs', 'want', 'wants', 'like', 'look',
     'change', 'changes', 'changed', 'changing',
+    // Common English nouns/verbs that match thousands of unrelated code symbols
+    'layer', 'handle', 'handles', 'handling', 'incoming', 'outgoing',
+    'data', 'flow', 'flows', 'level', 'levels', 'request', 'requests',
+    'response', 'responses', 'implement', 'implements', 'implementation',
+    'interface', 'interfaces', 'class', 'classes', 'method', 'methods',
+    'trigger', 'triggers', 'affected', 'affect', 'affects',
+    'else', 'code', 'failing', 'failed', 'silently', 'decide', 'decides',
+    'connect', 'connection', 'connections',
+    'return', 'returns', 'returned', 'take', 'takes', 'taken',
+    'send', 'sends', 'receive', 'receives', 'process', 'processes',
+    'check', 'checks', 'checked', 'create', 'creates', 'created',
+    'read', 'reads', 'write', 'writes', 'written',
+    'start', 'starts', 'stop', 'stops', 'run', 'runs', 'running',
   ]);
 
   return Array.from(symbols).filter(s => !commonWords.has(s.toLowerCase()));
@@ -327,6 +348,43 @@ export class ContextBuilder {
       }
     }
 
+    // Step 2b: Search for extracted symbols as definition (class/interface) prefixes.
+    // When the user writes "REST", "bulk", or "allocation", they usually mean classes
+    // like RestController, BulkRequest, AllocationService — not nodes named exactly that.
+    if (symbolsFromQuery.length > 0) {
+      const definitionKinds: NodeKind[] = ['class', 'interface', 'struct', 'trait',
+        'protocol', 'enum', 'type_alias'];
+      for (const sym of symbolsFromQuery) {
+        // Title-case the symbol: "REST" → "Rest", "bulk" → "Bulk", "allocation" → "Allocation"
+        const titleCased = sym.charAt(0).toUpperCase() + sym.slice(1).toLowerCase();
+        if (titleCased === sym) continue; // already title-case (e.g., "Engine") — handled by exact match
+        // Fetch more results since popular prefixes have many matches
+        const prefixResults = this.queries.searchNodes(titleCased, {
+          limit: 30,
+          kinds: definitionKinds,
+        });
+        const matched: SearchResult[] = [];
+        for (const r of prefixResults) {
+          if (r.node.name.toLowerCase().startsWith(titleCased.toLowerCase())) {
+            // Favor shorter names: "AllocationService" (17 chars) over
+            // "AllocationBalancingRoundMetrics" (31 chars). Core classes tend
+            // to have concise names; test/helper classes are verbose.
+            const brevityBonus = Math.max(0, 10 - (r.node.name.length - titleCased.length) / 3);
+            matched.push({ ...r, score: r.score + 15 + brevityBonus });
+          }
+        }
+        matched.sort((a, b) => b.score - a.score);
+        for (const r of matched.slice(0, Math.ceil(opts.searchLimit))) {
+          const existing = exactMatches.find(e => e.node.id === r.node.id);
+          if (!existing) {
+            exactMatches.push(r);
+          }
+        }
+      }
+      exactMatches.sort((a, b) => b.score - a.score);
+      exactMatches = exactMatches.slice(0, Math.ceil(opts.searchLimit * 3));
+    }
+
     // Step 3: Try semantic search if vector manager is available
     let semanticResults: SearchResult[] = [];
     if (this.vectorManager && this.vectorManager.isInitialized()) {
@@ -352,10 +410,19 @@ export class ContextBuilder {
         // Search each term individually to get broader coverage,
         // then boost results that match multiple terms
         const termResultsMap = new Map<string, { result: SearchResult; termHits: number }>();
+        // When no explicit kind filter is set, exclude imports — they flood FTS
+        // results with qualified name matches (e.g., "REST" matches 445K import paths)
+        // but are almost never what exploration queries want.
+        const searchKinds = opts.nodeKinds && opts.nodeKinds.length > 0
+          ? opts.nodeKinds
+          : ['file', 'module', 'class', 'struct', 'interface', 'trait', 'protocol',
+             'function', 'method', 'property', 'field', 'variable', 'constant',
+             'enum', 'enum_member', 'type_alias', 'namespace', 'export',
+             'route', 'component'] as NodeKind[];
         for (const term of searchTerms) {
           const termResults = this.queries.searchNodes(term, {
             limit: opts.searchLimit * 2,
-            kinds: opts.nodeKinds && opts.nodeKinds.length > 0 ? opts.nodeKinds : undefined,
+            kinds: searchKinds,
           });
           for (const r of termResults) {
             const existing = termResultsMap.get(r.node.id);