runner.ts 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374
  1. /**
  2. * Evaluation Runner
  3. *
  4. * Runs test cases against CodeGraph fixtures and measures precision/recall.
  5. */
  6. import * as path from 'path';
  7. import * as fs from 'fs';
  8. import CodeGraph from '../../src/index';
  9. import type { Node, SearchResult, NodeKind } from '../../src/types';
  10. import type {
  11. TestCase,
  12. TestCaseResult,
  13. FixtureGroundTruth,
  14. FixtureEvaluationResult,
  15. EvaluationSummary,
  16. } from './types';
  17. // Import fixtures
  18. import { typescriptFixture } from './fixtures/typescript-project/ground-truth';
  19. import { pythonFixture } from './fixtures/python-project/ground-truth';
  20. /**
  21. * Simple token counter (approximation using word count * 1.3)
  22. */
  23. function countTokens(text: string): number {
  24. const words = text.split(/\s+/).filter(w => w.length > 0);
  25. return Math.ceil(words.length * 1.3);
  26. }
  27. /**
  28. * Extract symbol names from CodeGraph results
  29. */
  30. function extractSymbolNames(nodes: Node[]): Set<string> {
  31. const names = new Set<string>();
  32. for (const node of nodes) {
  33. // Add the simple name
  34. names.add(node.name);
  35. // Add qualified name if we have parent info (Class.method format)
  36. // This is a simplification - real implementation would use containment edges
  37. if (node.kind === 'method' || node.kind === 'function') {
  38. // Try to infer class from file path or other context
  39. const fileName = path.basename(node.filePath, path.extname(node.filePath));
  40. names.add(`${fileName}.${node.name}`);
  41. }
  42. }
  43. return names;
  44. }
  45. /**
  46. * Normalize symbol name for comparison
  47. */
  48. function normalizeSymbol(symbol: string): string {
  49. // Remove common prefixes and normalize
  50. return symbol
  51. .replace(/^(db\.|authService\.|paymentService\.|auth_service\.|task_service\.)/, '')
  52. .toLowerCase();
  53. }
  54. /**
  55. * Check if a symbol matches any in a set (with fuzzy matching)
  56. */
  57. function symbolMatches(symbol: string, candidates: Set<string>): boolean {
  58. const normalized = normalizeSymbol(symbol);
  59. for (const candidate of candidates) {
  60. const normalizedCandidate = normalizeSymbol(candidate);
  61. // Exact match
  62. if (normalized === normalizedCandidate) return true;
  63. // Partial match (e.g., "login" matches "AuthService.login")
  64. if (normalizedCandidate.endsWith(`.${normalized}`)) return true;
  65. if (normalized.endsWith(`.${normalizedCandidate}`)) return true;
  66. // Simple name match
  67. const simpleName = normalized.split('.').pop();
  68. const simpleCandidateName = normalizedCandidate.split('.').pop();
  69. if (simpleName === simpleCandidateName) return true;
  70. }
  71. return false;
  72. }
  73. /**
  74. * Run a single test case
  75. */
  76. async function runTestCase(
  77. cg: CodeGraph,
  78. testCase: TestCase,
  79. fixtureTokens: number
  80. ): Promise<TestCaseResult> {
  81. const startTime = Date.now();
  82. let retrievedNodes: Node[] = [];
  83. let contextText = '';
  84. try {
  85. switch (testCase.type) {
  86. case 'search': {
  87. const results = cg.searchNodes(testCase.query, { limit: 20 });
  88. retrievedNodes = results.map(r => r.node);
  89. break;
  90. }
  91. case 'context': {
  92. const context = await cg.buildContext(testCase.query, {
  93. maxNodes: 30,
  94. includeCode: true,
  95. format: 'markdown',
  96. });
  97. contextText = typeof context === 'string' ? context : '';
  98. // Also get the nodes that were used to build context
  99. const results = cg.searchNodes(testCase.query, { limit: 30 });
  100. retrievedNodes = results.map(r => r.node);
  101. break;
  102. }
  103. case 'callers': {
  104. if (testCase.targetSymbol) {
  105. const results = cg.searchNodes(testCase.targetSymbol, { limit: 1 });
  106. if (results.length > 0 && results[0]) {
  107. const callers = cg.getCallers(results[0].node.id);
  108. retrievedNodes = callers.map(c => c.node);
  109. }
  110. }
  111. break;
  112. }
  113. case 'callees': {
  114. if (testCase.targetSymbol) {
  115. const results = cg.searchNodes(testCase.targetSymbol, { limit: 1 });
  116. if (results.length > 0 && results[0]) {
  117. const callees = cg.getCallees(results[0].node.id);
  118. retrievedNodes = callees.map(c => c.node);
  119. }
  120. }
  121. break;
  122. }
  123. case 'impact': {
  124. if (testCase.targetSymbol) {
  125. const results = cg.searchNodes(testCase.targetSymbol, { limit: 1 });
  126. if (results.length > 0 && results[0]) {
  127. const impact = cg.getImpactRadius(results[0].node.id, 2);
  128. retrievedNodes = Array.from(impact.nodes.values());
  129. }
  130. }
  131. break;
  132. }
  133. }
  134. } catch (err) {
  135. console.error(`Error running test case ${testCase.id}:`, err);
  136. }
  137. const executionTimeMs = Date.now() - startTime;
  138. // Extract retrieved symbol names
  139. const retrievedSymbols = extractSymbolNames(retrievedNodes);
  140. // Calculate metrics
  141. const expectedSet = new Set(testCase.expectedSymbols.map(s => normalizeSymbol(s)));
  142. const irrelevantSet = new Set(testCase.irrelevantSymbols.map(s => normalizeSymbol(s)));
  143. const truePositives: string[] = [];
  144. const falsePositives: string[] = [];
  145. for (const symbol of retrievedSymbols) {
  146. const normalized = normalizeSymbol(symbol);
  147. if (symbolMatches(symbol, new Set(testCase.expectedSymbols))) {
  148. truePositives.push(symbol);
  149. } else if (symbolMatches(symbol, new Set(testCase.irrelevantSymbols))) {
  150. falsePositives.push(symbol);
  151. }
  152. // Symbols not in either list are ignored (neutral)
  153. }
  154. // Find false negatives (expected but not retrieved)
  155. const falseNegatives: string[] = [];
  156. for (const expected of testCase.expectedSymbols) {
  157. if (!symbolMatches(expected, retrievedSymbols)) {
  158. falseNegatives.push(expected);
  159. }
  160. }
  161. // Calculate precision and recall
  162. const totalRetrieved = truePositives.length + falsePositives.length;
  163. const precision = totalRetrieved > 0 ? truePositives.length / totalRetrieved : 0;
  164. const totalRelevant = testCase.expectedSymbols.length;
  165. const recall = totalRelevant > 0 ? truePositives.length / totalRelevant : 0;
  166. const f1Score = precision + recall > 0
  167. ? 2 * (precision * recall) / (precision + recall)
  168. : 0;
  169. // Count context tokens
  170. const contextTokens = contextText
  171. ? countTokens(contextText)
  172. : retrievedNodes.reduce((sum, node) => {
  173. // Estimate tokens from node info
  174. return sum + countTokens(node.name + ' ' + (node.signature || ''));
  175. }, 0);
  176. // Determine if test passed
  177. const meetsRecall = !testCase.minRecall || recall >= testCase.minRecall;
  178. const meetsPrecision = !testCase.minPrecision || precision >= testCase.minPrecision;
  179. const passed = meetsRecall && meetsPrecision;
  180. return {
  181. testCaseId: testCase.id,
  182. passed,
  183. precision,
  184. recall,
  185. f1Score,
  186. truePositives,
  187. falsePositives,
  188. falseNegatives,
  189. contextTokens,
  190. executionTimeMs,
  191. };
  192. }
  193. /**
  194. * Run evaluation on a single fixture
  195. */
  196. async function evaluateFixture(
  197. fixture: FixtureGroundTruth
  198. ): Promise<FixtureEvaluationResult> {
  199. const fixturePath = path.resolve(process.cwd(), fixture.path);
  200. const startTime = Date.now();
  201. console.log(`\nEvaluating fixture: ${fixture.name}`);
  202. console.log(` Path: ${fixturePath}`);
  203. // Initialize CodeGraph for this fixture
  204. let cg: CodeGraph;
  205. if (CodeGraph.isInitialized(fixturePath)) {
  206. console.log(' Opening existing index...');
  207. cg = await CodeGraph.open(fixturePath);
  208. } else {
  209. console.log(' Initializing and indexing...');
  210. cg = await CodeGraph.init(fixturePath, { index: true });
  211. }
  212. const stats = cg.getStats();
  213. console.log(` Indexed ${stats.fileCount} files, ${stats.nodeCount} nodes`);
  214. // Run all test cases
  215. const testCaseResults: TestCaseResult[] = [];
  216. for (const testCase of fixture.testCases) {
  217. console.log(` Running: ${testCase.id}...`);
  218. const result = await runTestCase(cg, testCase, fixture.approximateTokens);
  219. testCaseResults.push(result);
  220. const status = result.passed ? '✓' : '✗';
  221. console.log(` ${status} P=${(result.precision * 100).toFixed(0)}% R=${(result.recall * 100).toFixed(0)}% F1=${(result.f1Score * 100).toFixed(0)}%`);
  222. }
  223. // Close CodeGraph
  224. cg.destroy();
  225. // Calculate aggregate metrics
  226. const totalTimeMs = Date.now() - startTime;
  227. const passedTestCases = testCaseResults.filter(r => r.passed).length;
  228. const averagePrecision = testCaseResults.reduce((sum, r) => sum + r.precision, 0) / testCaseResults.length;
  229. const averageRecall = testCaseResults.reduce((sum, r) => sum + r.recall, 0) / testCaseResults.length;
  230. const averageF1Score = testCaseResults.reduce((sum, r) => sum + r.f1Score, 0) / testCaseResults.length;
  231. const averageContextTokens = testCaseResults.reduce((sum, r) => sum + r.contextTokens, 0) / testCaseResults.length;
  232. const tokenReductionPercent = fixture.approximateTokens > 0
  233. ? ((fixture.approximateTokens - averageContextTokens) / fixture.approximateTokens) * 100
  234. : 0;
  235. return {
  236. fixtureName: fixture.name,
  237. totalTestCases: testCaseResults.length,
  238. passedTestCases,
  239. averagePrecision,
  240. averageRecall,
  241. averageF1Score,
  242. fullCodebaseTokens: fixture.approximateTokens,
  243. averageContextTokens,
  244. tokenReductionPercent,
  245. testCaseResults,
  246. totalTimeMs,
  247. };
  248. }
  249. /**
  250. * Run full evaluation across all fixtures
  251. */
  252. export async function runEvaluation(): Promise<EvaluationSummary> {
  253. console.log('╔════════════════════════════════════════════════════════════════╗');
  254. console.log('║ CodeGraph Evaluation Suite ║');
  255. console.log('╚════════════════════════════════════════════════════════════════╝');
  256. const fixtures: FixtureGroundTruth[] = [
  257. typescriptFixture,
  258. pythonFixture,
  259. ];
  260. const fixtureResults: FixtureEvaluationResult[] = [];
  261. for (const fixture of fixtures) {
  262. const result = await evaluateFixture(fixture);
  263. fixtureResults.push(result);
  264. }
  265. // Calculate overall metrics
  266. const totalTests = fixtureResults.reduce((sum, r) => sum + r.totalTestCases, 0);
  267. const totalPassed = fixtureResults.reduce((sum, r) => sum + r.passedTestCases, 0);
  268. const overallPrecision = fixtureResults.reduce((sum, r) => sum + r.averagePrecision, 0) / fixtureResults.length;
  269. const overallRecall = fixtureResults.reduce((sum, r) => sum + r.averageRecall, 0) / fixtureResults.length;
  270. const overallF1Score = fixtureResults.reduce((sum, r) => sum + r.averageF1Score, 0) / fixtureResults.length;
  271. const overallTokenReduction = fixtureResults.reduce((sum, r) => sum + r.tokenReductionPercent, 0) / fixtureResults.length;
  272. // Print summary
  273. console.log('\n╔════════════════════════════════════════════════════════════════╗');
  274. console.log('║ EVALUATION SUMMARY ║');
  275. console.log('╚════════════════════════════════════════════════════════════════╝');
  276. console.log(`\nTest Results: ${totalPassed}/${totalTests} passed`);
  277. console.log(`\nOverall Metrics:`);
  278. console.log(` Precision: ${(overallPrecision * 100).toFixed(1)}%`);
  279. console.log(` Recall: ${(overallRecall * 100).toFixed(1)}%`);
  280. console.log(` F1 Score: ${(overallF1Score * 100).toFixed(1)}%`);
  281. console.log(` Token Reduction: ${overallTokenReduction.toFixed(1)}%`);
  282. console.log('\nPer-Fixture Results:');
  283. for (const result of fixtureResults) {
  284. console.log(` ${result.fixtureName}:`);
  285. console.log(` Tests: ${result.passedTestCases}/${result.totalTestCases} passed`);
  286. console.log(` P=${(result.averagePrecision * 100).toFixed(0)}% R=${(result.averageRecall * 100).toFixed(0)}% F1=${(result.averageF1Score * 100).toFixed(0)}%`);
  287. }
  288. const summary: EvaluationSummary = {
  289. timestamp: new Date(),
  290. version: '0.1.0',
  291. fixtureResults,
  292. overallPrecision,
  293. overallRecall,
  294. overallF1Score,
  295. overallTokenReduction,
  296. };
  297. // Save results to file
  298. const resultsPath = path.join(__dirname, 'results', `eval-${Date.now()}.json`);
  299. const resultsDir = path.dirname(resultsPath);
  300. if (!fs.existsSync(resultsDir)) {
  301. fs.mkdirSync(resultsDir, { recursive: true });
  302. }
  303. fs.writeFileSync(resultsPath, JSON.stringify(summary, null, 2));
  304. console.log(`\nResults saved to: ${resultsPath}`);
  305. return summary;
  306. }
  307. // Run if called directly
  308. if (require.main === module) {
  309. runEvaluation()
  310. .then(() => process.exit(0))
  311. .catch(err => {
  312. console.error('Evaluation failed:', err);
  313. process.exit(1);
  314. });
  315. }