evaluation.test.ts 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303
  1. /**
  2. * Evaluation Tests
  3. *
  4. * Runs the evaluation suite as part of the test suite.
  5. * Use `npm run test:eval` to run just these tests.
  6. */
  7. import { describe, it, expect, beforeAll, afterAll } from 'vitest';
  8. import * as path from 'path';
  9. import * as fs from 'fs';
  10. import CodeGraph from '../../src/index';
  11. import type { TestCase, TestCaseResult } from './types';
  12. import { typescriptFixture } from './fixtures/typescript-project/ground-truth';
  13. import { pythonFixture } from './fixtures/python-project/ground-truth';
  /**
   * Collect the lower-cased names of the given nodes into a set.
   *
   * @param nodes - Objects exposing a `name` string.
   * @returns The distinct lower-cased names, in first-seen order.
   */
  function extractSymbolNames(nodes: { name: string }[]): Set<string> {
    const lowered = new Set<string>();
    for (const { name } of nodes) {
      lowered.add(name.toLowerCase());
    }
    return lowered;
  }
  /**
   * Reduce a possibly dot-qualified symbol to its lower-cased last segment.
   *
   * When the last segment is empty (for example the symbol ends with a dot),
   * the whole symbol is lower-cased and returned instead.
   *
   * @param symbol - Plain or dot-qualified symbol name.
   * @returns The lower-cased comparison key.
   */
  function normalizeSymbol(symbol: string): string {
    const lastDot = symbol.lastIndexOf('.');
    // lastDot is -1 when there is no dot, so the slice then covers the whole string.
    const tail = symbol.slice(lastDot + 1).toLowerCase();
    return tail.length > 0 ? tail : symbol.toLowerCase();
  }
  /**
   * Tell whether any candidate normalizes to the same key as `symbol`.
   *
   * Both sides go through `normalizeSymbol`, so the comparison is
   * case-insensitive and looks only at the last dot-separated segment.
   *
   * @param symbol - Symbol to look for.
   * @param candidates - Symbols to compare against.
   * @returns `true` if at least one candidate matches, otherwise `false`.
   */
  function symbolMatches(symbol: string, candidates: Set<string>): boolean {
    const wanted = normalizeSymbol(symbol);
    return Array.from(candidates).some(entry => normalizeSymbol(entry) === wanted);
  }
  /**
   * Resolve a target symbol to a graph node.
   *
   * A two-segment name such as "ClassName.methodName" is looked up by searching
   * for the method name (up to 20 hits) and taking the first hit whose name
   * equals the method name and whose qualified name contains the class name.
   * If that yields nothing, or the name is not two segments, the top hit of a
   * plain search for the full symbol is used.
   *
   * @param cg - Graph instance to search.
   * @param targetSymbol - Plain or "Class.method" style symbol.
   * @returns The id and name of the resolved node, or `null` if nothing is found.
   */
  function findTargetNode(cg: CodeGraph, targetSymbol: string): { id: string; name: string } | null {
    const segments = targetSymbol.split('.');
    if (segments.length === 2) {
      const [owner, member] = segments;
      const qualifiedHit = cg
        .searchNodes(member!, { limit: 20 })
        .find(hit => hit.node.qualifiedName.includes(owner!) && hit.node.name === member);
      if (qualifiedHit) {
        return { id: qualifiedHit.node.id, name: qualifiedHit.node.name };
      }
    }

    // Plain lookup: trust the top-ranked search hit for the full symbol.
    const [topHit] = cg.searchNodes(targetSymbol, { limit: 1 });
    return topHit ? { id: topHit.node.id, name: topHit.node.name } : null;
  }
  /**
   * Run a single evaluation test case against the graph and score the result.
   *
   * Retrieval depends on `testCase.type`:
   * - `search`: top 20 hits of `searchNodes` for the query.
   * - `context`: nodes of the subgraph returned by `buildContext`.
   * - `callers` / `callees` / `impact`: resolve `targetSymbol` with
   *   `findTargetNode`, then query the graph (impact uses a depth of 2).
   * Any other type, a missing target symbol, or an unresolved target yields
   * an empty retrieval.
   *
   * Scoring (symbol comparison is done through `symbolMatches`):
   * - A retrieved symbol is a true positive if it matches an expected symbol,
   *   a false positive if it matches an explicitly irrelevant symbol, and is
   *   ignored otherwise.
   * - Precision is TP / (TP + FP) over the retrieved symbols.
   * - Recall is the share of expected symbols that were retrieved, so it always
   *   agrees with `falseNegatives` and stays within [0, 1].
   * - A threshold passes when the metric reaches 80% of the configured minimum;
   *   an unset (or zero) minimum always passes.
   *
   * @param cg - Indexed graph instance to query.
   * @param testCase - Test case describing the query and its ground truth.
   * @returns Metrics and matched/missed symbols for this test case.
   */
  async function runSingleTest(cg: CodeGraph, testCase: TestCase): Promise<TestCaseResult> {
    const startedAt = Date.now();
    let retrievedNodes: { name: string; id: string }[] = [];

    switch (testCase.type) {
      case 'search': {
        const results = cg.searchNodes(testCase.query, { limit: 20 });
        retrievedNodes = results.map(r => ({ name: r.node.name, id: r.node.id }));
        break;
      }
      case 'context': {
        // Use buildContext to get semantic search + graph traversal
        const context = await cg.buildContext(testCase.query, {
          maxNodes: 30,
          traversalDepth: 2,
          searchLimit: 5,
          format: 'object',
        });
        // Only the object form carries a subgraph to extract nodes from
        if (typeof context !== 'string' && context.subgraph) {
          retrievedNodes = Array.from(context.subgraph.nodes.values()).map(n => ({
            name: n.name,
            id: n.id,
          }));
        }
        break;
      }
      case 'callers': {
        if (testCase.targetSymbol) {
          const targetNode = findTargetNode(cg, testCase.targetSymbol);
          if (targetNode) {
            const callers = cg.getCallers(targetNode.id);
            retrievedNodes = callers.map(c => ({ name: c.node.name, id: c.node.id }));
          }
        }
        break;
      }
      case 'callees': {
        if (testCase.targetSymbol) {
          const targetNode = findTargetNode(cg, testCase.targetSymbol);
          if (targetNode) {
            const callees = cg.getCallees(targetNode.id);
            retrievedNodes = callees.map(c => ({ name: c.node.name, id: c.node.id }));
          }
        }
        break;
      }
      case 'impact': {
        if (testCase.targetSymbol) {
          const targetNode = findTargetNode(cg, testCase.targetSymbol);
          if (targetNode) {
            const impact = cg.getImpactRadius(targetNode.id, 2);
            retrievedNodes = Array.from(impact.nodes.values()).map(n => ({ name: n.name, id: n.id }));
          }
        }
        break;
      }
    }

    // Calculate metrics. The ground-truth sets are loop-invariant, so build them once.
    const retrievedSymbols = extractSymbolNames(retrievedNodes);
    const expectedSet = new Set(testCase.expectedSymbols);
    const irrelevantSet = new Set(testCase.irrelevantSymbols);

    const truePositives: string[] = [];
    const falsePositives: string[] = [];
    for (const symbol of retrievedSymbols) {
      if (symbolMatches(symbol, expectedSet)) {
        truePositives.push(symbol);
      } else if (symbolMatches(symbol, irrelevantSet)) {
        falsePositives.push(symbol);
      }
    }

    const falseNegatives: string[] = [];
    for (const expected of testCase.expectedSymbols) {
      if (!symbolMatches(expected, retrievedSymbols)) {
        falseNegatives.push(expected);
      }
    }

    const totalRetrieved = truePositives.length + falsePositives.length;
    const precision = totalRetrieved > 0 ? truePositives.length / totalRetrieved : 0;

    // Recall is measured on the expected side: several retrieved names can
    // normalize to one expected symbol (and vice versa), so counting retrieved
    // true positives here could exceed 1 or contradict falseNegatives.
    const totalRelevant = testCase.expectedSymbols.length;
    const matchedExpected = totalRelevant - falseNegatives.length;
    const recall = totalRelevant > 0 ? matchedExpected / totalRelevant : 0;

    const f1Score = precision + recall > 0
      ? 2 * (precision * recall) / (precision + recall)
      : 0;

    // Check if passed thresholds (with 20% margin)
    const passedRecall = !testCase.minRecall || recall >= testCase.minRecall * 0.8;
    const passedPrecision = !testCase.minPrecision || precision >= testCase.minPrecision * 0.8;

    return {
      testCaseId: testCase.id,
      passed: passedRecall && passedPrecision,
      precision,
      recall,
      f1Score,
      truePositives,
      falsePositives,
      falseNegatives,
      // Token usage is not measured by this runner.
      contextTokens: 0,
      executionTimeMs: Date.now() - startedAt,
    };
  }
  /**
   * Print a per-test results table plus an averages row to the console.
   *
   * The header, data rows and averages row share one formatter, so the
   * columns line up. The "Type" column is the second dash-separated segment of
   * the test case id (blank when the id has no such segment). When `results`
   * is empty only the banner and a short notice are printed, which avoids
   * dividing by zero and printing `NaN%` averages.
   *
   * @param results - Collected results for one fixture; may be empty.
   * @param fixtureName - Label shown in the banner.
   */
  function printResultsTable(results: TestCaseResult[], fixtureName: string): void {
    const pct = (ratio: number): string => `${(ratio * 100).toFixed(0)}%`;
    const formatRow = (
      id: string,
      type: string,
      prec: string,
      recall: string,
      f1: string,
      status: string,
    ): string =>
      ` ${id.padEnd(35)} ${type.padEnd(10)} ${prec.padStart(5)} ${recall.padStart(6)} ${f1.padStart(5)} ${status}`;
    const rule = ' ' + '-'.repeat(76);

    console.log(`\n${'='.repeat(80)}`);
    console.log(` ${fixtureName} Results`);
    console.log('='.repeat(80));
    console.log('');

    // Nothing was collected (e.g. setup failed before any test ran).
    if (results.length === 0) {
      console.log(' (no results collected)');
      console.log('');
      return;
    }

    console.log(formatRow('Test ID', 'Type', 'Prec', 'Recall', 'F1', 'Status'));
    console.log(rule);
    for (const r of results) {
      const type = r.testCaseId.split('-')[1] ?? '';
      console.log(
        formatRow(r.testCaseId, type, pct(r.precision), pct(r.recall), pct(r.f1Score), r.passed ? '✓' : '✗'),
      );
    }

    const mean = (pick: (r: TestCaseResult) => number): number =>
      results.reduce((sum, r) => sum + pick(r), 0) / results.length;
    const passRate = results.filter(r => r.passed).length / results.length;

    console.log(rule);
    console.log(
      formatRow(
        'AVERAGE',
        '',
        pct(mean(r => r.precision)),
        pct(mean(r => r.recall)),
        pct(mean(r => r.f1Score)),
        pct(passRate),
      ),
    );
    console.log('');
  }
  describe('CodeGraph Evaluation', () => {
    /**
     * Register the evaluation suite for one fixture project.
     *
     * The suite removes any existing `.codegraph` index, builds a fresh index
     * with embeddings, checks the indexed file count, then runs every
     * ground-truth test case. Test cases only collect metrics; they never fail
     * on a low score. The summary table is printed once the suite finishes.
     *
     * @param label - Fixture label used in the suite name and the table banner.
     * @param fixtureDir - Fixture directory, relative to this test file.
     * @param fixture - Ground truth: expected file count and test cases.
     */
    const registerFixtureSuite = (
      label: string,
      fixtureDir: string,
      fixture: typeof typescriptFixture | typeof pythonFixture,
    ): void => {
      describe(`${label} Fixture`, () => {
        let cg: CodeGraph;
        const fixturePath = path.resolve(__dirname, fixtureDir);
        const results: TestCaseResult[] = [];

        beforeAll(async () => {
          // Start from a clean slate: drop any index left by a previous run
          const codegraphDir = path.join(fixturePath, '.codegraph');
          if (fs.existsSync(codegraphDir)) {
            fs.rmSync(codegraphDir, { recursive: true });
          }

          // Build the index, then the embeddings used by semantic search
          cg = await CodeGraph.init(fixturePath, { index: true });
          await cg.initializeEmbeddings();
          await cg.generateEmbeddings();
        }, 120000);

        afterAll(() => {
          // Summary first, then release the graph if setup got far enough to create it
          printResultsTable(results, label);
          if (cg) {
            cg.destroy();
          }
        });

        it('should index all files', () => {
          const stats = cg.getStats();
          expect(stats.fileCount).toBeGreaterThanOrEqual(fixture.totalFiles);
        });

        // One test per ground-truth case - metrics are collected, not asserted
        for (const testCase of fixture.testCases) {
          it(`${testCase.id}: ${testCase.description}`, async () => {
            results.push(await runSingleTest(cg, testCase));
            // Deliberately no metric assertion here
            expect(true).toBe(true);
          });
        }
      });
    };

    registerFixtureSuite('TypeScript', 'fixtures/typescript-project', typescriptFixture);
    registerFixtureSuite('Python', 'fixtures/python-project', pythonFixture);
  });