| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374 |
- /**
- * Evaluation Runner
- *
- * Runs test cases against CodeGraph fixtures and measures precision/recall.
- */
- import * as path from 'path';
- import * as fs from 'fs';
- import CodeGraph from '../../src/index';
- import type { Node, SearchResult, NodeKind } from '../../src/types';
- import type {
- TestCase,
- TestCaseResult,
- FixtureGroundTruth,
- FixtureEvaluationResult,
- EvaluationSummary,
- } from './types';
- // Import fixtures
- import { typescriptFixture } from './fixtures/typescript-project/ground-truth';
- import { pythonFixture } from './fixtures/python-project/ground-truth';
/**
 * Approximate how many tokens a piece of text would occupy.
 *
 * The text is split on runs of whitespace, empty fragments are discarded,
 * and the remaining word count is scaled by 1.3 and rounded up.
 *
 * @param text - Text to measure.
 * @returns Estimated token count; 0 for empty or whitespace-only text.
 */
function countTokens(text: string): number {
  let wordCount = 0;
  for (const fragment of text.split(/\s+/)) {
    // Leading/trailing whitespace yields empty fragments; skip those.
    if (fragment.length > 0) {
      wordCount += 1;
    }
  }
  return Math.ceil(wordCount * 1.3);
}
/**
 * Collect the symbol names that a list of CodeGraph nodes can be matched by.
 *
 * Every node contributes its plain `name`. Nodes whose kind is `method` or
 * `function` also contribute `<file stem>.<name>`, where the file stem is the
 * node's file name without its extension. This is a stand-in for a real
 * qualified name, which would have to come from containment edges.
 *
 * @param nodes - Nodes returned by a CodeGraph query.
 * @returns De-duplicated set of simple and file-qualified names.
 */
function extractSymbolNames(nodes: Node[]): Set<string> {
  const collected = new Set<string>();

  nodes.forEach(node => {
    collected.add(node.name);

    const isCallable = node.kind === 'method' || node.kind === 'function';
    if (!isCallable) {
      return;
    }

    // Use the file stem as a rough owner name, e.g. "auth.ts" -> "auth.login".
    const extension = path.extname(node.filePath);
    const fileStem = path.basename(node.filePath, extension);
    collected.add(`${fileStem}.${node.name}`);
  });

  return collected;
}
/**
 * Bring a symbol name into canonical form for comparison.
 *
 * Strips one leading receiver prefix (`db.`, `authService.`,
 * `paymentService.`, `auth_service.` or `task_service.`) if present, then
 * lower-cases the remainder. The prefix match is case-sensitive and only
 * applies at the very start of the string.
 *
 * @param symbol - Raw symbol name, possibly qualified.
 * @returns The normalized, lower-cased symbol.
 */
function normalizeSymbol(symbol: string): string {
  const receiverPrefix = /^(db\.|authService\.|paymentService\.|auth_service\.|task_service\.)/;
  const withoutReceiver = symbol.replace(receiverPrefix, '');
  return withoutReceiver.toLowerCase();
}
/**
 * Decide whether a symbol fuzzily matches any member of a candidate set.
 *
 * Both sides are passed through `normalizeSymbol` first. A candidate matches
 * when any of the following holds:
 *  - the normalized names are identical;
 *  - one is a dotted suffix of the other (e.g. "login" vs "AuthService.login");
 *  - their final dot-separated segments are identical.
 *
 * @param symbol - Symbol to look for.
 * @param candidates - Names to compare against.
 * @returns `true` as soon as one candidate matches, otherwise `false`.
 */
function symbolMatches(symbol: string, candidates: Set<string>): boolean {
  const wanted = normalizeSymbol(symbol);
  const lastSegment = (qualified: string): string | undefined => qualified.split('.').pop();

  return Array.from(candidates).some(candidate => {
    const other = normalizeSymbol(candidate);
    return (
      wanted === other ||
      other.endsWith(`.${wanted}`) ||
      wanted.endsWith(`.${other}`) ||
      lastSegment(wanted) === lastSegment(other)
    );
  });
}
/**
 * Find the node that best matches a test case's target symbol.
 *
 * @param cg - Open CodeGraph instance to search.
 * @param targetSymbol - Symbol name from the test case; may be absent.
 * @returns The top search hit's node, or `undefined` when no target symbol
 *   was given or the search returned nothing.
 */
function findTargetNode(
  cg: CodeGraph,
  targetSymbol: TestCase['targetSymbol']
): Node | undefined {
  if (!targetSymbol) {
    return undefined;
  }
  const matches = cg.searchNodes(targetSymbol, { limit: 1 });
  return matches.length > 0 && matches[0] ? matches[0].node : undefined;
}

/**
 * Run a single test case and score what CodeGraph retrieved.
 *
 * Depending on `testCase.type` this performs a search, builds a context,
 * or looks up callers / callees / impact radius for the target symbol.
 * Errors thrown by CodeGraph are logged and the case is scored as if
 * nothing had been retrieved, so one failing query does not abort the run.
 *
 * Scoring:
 *  - a retrieved name counts as a true positive if it matches an expected
 *    symbol, as a false positive if it matches an irrelevant symbol, and is
 *    ignored otherwise;
 *  - precision = TP / (TP + FP) over retrieved names;
 *  - recall = (expected symbols matched by at least one retrieved name) /
 *    (expected symbols). It is deliberately NOT derived from the true-positive
 *    list: one node can contribute both `name` and `file.name`, and both may
 *    match the same expected symbol, which would push recall above 1.
 *
 * @param cg - Open CodeGraph instance for the fixture.
 * @param testCase - Query, expectations and pass thresholds.
 * @param fixtureTokens - Approximate token size of the whole fixture.
 *   Currently unused; kept so existing callers keep working.
 * @returns Metrics, matched/missed symbols, token estimate and timing.
 */
async function runTestCase(
  cg: CodeGraph,
  testCase: TestCase,
  fixtureTokens: number
): Promise<TestCaseResult> {
  const startTime = Date.now();
  let retrievedNodes: Node[] = [];
  let contextText = '';

  try {
    switch (testCase.type) {
      case 'search': {
        const results = cg.searchNodes(testCase.query, { limit: 20 });
        retrievedNodes = results.map(r => r.node);
        break;
      }
      case 'context': {
        const context = await cg.buildContext(testCase.query, {
          maxNodes: 30,
          includeCode: true,
          format: 'markdown',
        });
        contextText = typeof context === 'string' ? context : '';
        // Also get the nodes that were used to build context
        const results = cg.searchNodes(testCase.query, { limit: 30 });
        retrievedNodes = results.map(r => r.node);
        break;
      }
      case 'callers': {
        const target = findTargetNode(cg, testCase.targetSymbol);
        if (target) {
          retrievedNodes = cg.getCallers(target.id).map(c => c.node);
        }
        break;
      }
      case 'callees': {
        const target = findTargetNode(cg, testCase.targetSymbol);
        if (target) {
          retrievedNodes = cg.getCallees(target.id).map(c => c.node);
        }
        break;
      }
      case 'impact': {
        const target = findTargetNode(cg, testCase.targetSymbol);
        if (target) {
          const impact = cg.getImpactRadius(target.id, 2);
          retrievedNodes = Array.from(impact.nodes.values());
        }
        break;
      }
    }
  } catch (err) {
    // Best effort: log and score the case with whatever was retrieved so far.
    console.error(`Error running test case ${testCase.id}:`, err);
  }

  const executionTimeMs = Date.now() - startTime;

  // Extract retrieved symbol names
  const retrievedSymbols = extractSymbolNames(retrievedNodes);

  // Build the comparison sets once rather than once per retrieved symbol.
  const expectedSymbols = new Set(testCase.expectedSymbols);
  const irrelevantSymbols = new Set(testCase.irrelevantSymbols);

  const truePositives: string[] = [];
  const falsePositives: string[] = [];
  for (const symbol of retrievedSymbols) {
    if (symbolMatches(symbol, expectedSymbols)) {
      truePositives.push(symbol);
    } else if (symbolMatches(symbol, irrelevantSymbols)) {
      falsePositives.push(symbol);
    }
    // Symbols not in either list are ignored (neutral)
  }

  // Find false negatives (expected but not retrieved)
  const falseNegatives = testCase.expectedSymbols.filter(
    expected => !symbolMatches(expected, retrievedSymbols)
  );

  // Calculate precision and recall
  const totalRetrieved = truePositives.length + falsePositives.length;
  const precision = totalRetrieved > 0 ? truePositives.length / totalRetrieved : 0;

  // Recall counts each expected symbol at most once, so it stays within [0, 1].
  const totalRelevant = testCase.expectedSymbols.length;
  const matchedExpected = totalRelevant - falseNegatives.length;
  const recall = totalRelevant > 0 ? matchedExpected / totalRelevant : 0;

  const f1Score = precision + recall > 0
    ? 2 * (precision * recall) / (precision + recall)
    : 0;

  // Count context tokens: prefer the built context text, otherwise estimate
  // from each node's name and signature.
  const contextTokens = contextText
    ? countTokens(contextText)
    : retrievedNodes.reduce(
        (sum, node) => sum + countTokens(node.name + ' ' + (node.signature || '')),
        0
      );

  // Determine if test passed; a missing (or zero) threshold always passes.
  const meetsRecall = !testCase.minRecall || recall >= testCase.minRecall;
  const meetsPrecision = !testCase.minPrecision || precision >= testCase.minPrecision;
  const passed = meetsRecall && meetsPrecision;

  return {
    testCaseId: testCase.id,
    passed,
    precision,
    recall,
    f1Score,
    truePositives,
    falsePositives,
    falseNegatives,
    contextTokens,
    executionTimeMs,
  };
}
/**
 * Run every test case of one fixture and aggregate the results.
 *
 * Opens the fixture's existing CodeGraph index when one is present, otherwise
 * initializes and indexes it. Test cases run sequentially. The CodeGraph
 * instance is destroyed in a `finally` block so it is released even when a
 * test case or the stats query throws.
 *
 * Averages are reported as 0 when the fixture has no test cases, instead of
 * the `NaN` a division by zero would produce (which serializes to `null` in
 * the saved JSON results).
 *
 * @param fixture - Ground truth describing the fixture path, its approximate
 *   token size and its test cases.
 * @returns Per-test results plus averaged precision/recall/F1, token usage
 *   and total wall-clock time.
 */
async function evaluateFixture(
  fixture: FixtureGroundTruth
): Promise<FixtureEvaluationResult> {
  const fixturePath = path.resolve(process.cwd(), fixture.path);
  const startTime = Date.now();

  console.log(`\nEvaluating fixture: ${fixture.name}`);
  console.log(` Path: ${fixturePath}`);

  // Initialize CodeGraph for this fixture
  let cg: CodeGraph;
  if (CodeGraph.isInitialized(fixturePath)) {
    console.log(' Opening existing index...');
    cg = await CodeGraph.open(fixturePath);
  } else {
    console.log(' Initializing and indexing...');
    cg = await CodeGraph.init(fixturePath, { index: true });
  }

  const testCaseResults: TestCaseResult[] = [];
  try {
    const stats = cg.getStats();
    console.log(` Indexed ${stats.fileCount} files, ${stats.nodeCount} nodes`);

    // Run all test cases
    for (const testCase of fixture.testCases) {
      console.log(` Running: ${testCase.id}...`);
      const result = await runTestCase(cg, testCase, fixture.approximateTokens);
      testCaseResults.push(result);

      const status = result.passed ? '✓' : '✗';
      console.log(` ${status} P=${(result.precision * 100).toFixed(0)}% R=${(result.recall * 100).toFixed(0)}% F1=${(result.f1Score * 100).toFixed(0)}%`);
    }
  } finally {
    // Close CodeGraph even if a test case failed unexpectedly.
    cg.destroy();
  }

  // Calculate aggregate metrics
  const totalTimeMs = Date.now() - startTime;
  const caseCount = testCaseResults.length;
  const passedTestCases = testCaseResults.filter(r => r.passed).length;

  // Mean of one numeric field across all results; 0 when nothing ran.
  const mean = (pick: (r: TestCaseResult) => number): number =>
    caseCount > 0
      ? testCaseResults.reduce((sum, r) => sum + pick(r), 0) / caseCount
      : 0;

  const averagePrecision = mean(r => r.precision);
  const averageRecall = mean(r => r.recall);
  const averageF1Score = mean(r => r.f1Score);
  const averageContextTokens = mean(r => r.contextTokens);

  // With no test cases there is no context to compare, so claim no reduction.
  const tokenReductionPercent = caseCount > 0 && fixture.approximateTokens > 0
    ? ((fixture.approximateTokens - averageContextTokens) / fixture.approximateTokens) * 100
    : 0;

  return {
    fixtureName: fixture.name,
    totalTestCases: caseCount,
    passedTestCases,
    averagePrecision,
    averageRecall,
    averageF1Score,
    fullCodebaseTokens: fixture.approximateTokens,
    averageContextTokens,
    tokenReductionPercent,
    testCaseResults,
    totalTimeMs,
  };
}
/**
 * Evaluate every registered fixture, print a report, and persist the summary.
 *
 * Fixtures are evaluated one after another. Overall metrics are the plain
 * average of each fixture's averages. The summary is written as pretty-printed
 * JSON to `results/eval-<epoch ms>.json` next to this module; the directory is
 * created when missing.
 *
 * @returns The summary that was written to disk.
 */
export async function runEvaluation(): Promise<EvaluationSummary> {
  console.log('╔════════════════════════════════════════════════════════════════╗');
  console.log('║ CodeGraph Evaluation Suite ║');
  console.log('╚════════════════════════════════════════════════════════════════╝');

  const fixtures: FixtureGroundTruth[] = [typescriptFixture, pythonFixture];

  const fixtureResults: FixtureEvaluationResult[] = [];
  for (const fixture of fixtures) {
    fixtureResults.push(await evaluateFixture(fixture));
  }

  // Sum one numeric field over all fixture results, in order.
  const sumOf = (pick: (r: FixtureEvaluationResult) => number): number => {
    let total = 0;
    for (const entry of fixtureResults) {
      total = total + pick(entry);
    }
    return total;
  };
  const fixtureCount = fixtureResults.length;

  // Overall metrics
  const totalTests = sumOf(r => r.totalTestCases);
  const totalPassed = sumOf(r => r.passedTestCases);
  const overallPrecision = sumOf(r => r.averagePrecision) / fixtureCount;
  const overallRecall = sumOf(r => r.averageRecall) / fixtureCount;
  const overallF1Score = sumOf(r => r.averageF1Score) / fixtureCount;
  const overallTokenReduction = sumOf(r => r.tokenReductionPercent) / fixtureCount;

  // Report
  console.log('\n╔════════════════════════════════════════════════════════════════╗');
  console.log('║ EVALUATION SUMMARY ║');
  console.log('╚════════════════════════════════════════════════════════════════╝');
  console.log(`\nTest Results: ${totalPassed}/${totalTests} passed`);
  console.log(`\nOverall Metrics:`);
  console.log(` Precision: ${(overallPrecision * 100).toFixed(1)}%`);
  console.log(` Recall: ${(overallRecall * 100).toFixed(1)}%`);
  console.log(` F1 Score: ${(overallF1Score * 100).toFixed(1)}%`);
  console.log(` Token Reduction: ${overallTokenReduction.toFixed(1)}%`);
  console.log('\nPer-Fixture Results:');
  fixtureResults.forEach(result => {
    console.log(` ${result.fixtureName}:`);
    console.log(` Tests: ${result.passedTestCases}/${result.totalTestCases} passed`);
    console.log(` P=${(result.averagePrecision * 100).toFixed(0)}% R=${(result.averageRecall * 100).toFixed(0)}% F1=${(result.averageF1Score * 100).toFixed(0)}%`);
  });

  const summary: EvaluationSummary = {
    timestamp: new Date(),
    version: '0.1.0',
    fixtureResults,
    overallPrecision,
    overallRecall,
    overallF1Score,
    overallTokenReduction,
  };

  // Persist the summary alongside this module.
  const resultsPath = path.join(__dirname, 'results', `eval-${Date.now()}.json`);
  const resultsDir = path.dirname(resultsPath);
  const dirAlreadyThere = fs.existsSync(resultsDir);
  if (!dirAlreadyThere) {
    fs.mkdirSync(resultsDir, { recursive: true });
  }
  const serialized = JSON.stringify(summary, null, 2);
  fs.writeFileSync(resultsPath, serialized);
  console.log(`\nResults saved to: ${resultsPath}`);

  return summary;
}
// Entry point: only run the suite when this file is executed directly,
// not when it is imported by another module.
if (require.main === module) {
  const exitOk = (): never => process.exit(0);
  const exitWithError = (err: unknown): never => {
    console.error('Evaluation failed:', err);
    return process.exit(1);
  };

  runEvaluation().then(exitOk).catch(exitWithError);
}
|