haiany
/
codegraph
mirror of https://github.com/colbymchenry/codegraph.git


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374
							/**
 * Evaluation Runner
 *
 * Runs test cases against CodeGraph fixtures and measures precision/recall.
 */

import * as path from 'path';
import * as fs from 'fs';
import CodeGraph from '../../src/index';
import type { Node, SearchResult, NodeKind } from '../../src/types';
import type {
  TestCase,
  TestCaseResult,
  FixtureGroundTruth,
  FixtureEvaluationResult,
  EvaluationSummary,
} from './types';

// Import fixtures
import { typescriptFixture } from './fixtures/typescript-project/ground-truth';
import { pythonFixture } from './fixtures/python-project/ground-truth';

/**
 * Rough token estimate for a piece of text.
 *
 * Counts whitespace-separated words and scales the count by 1.3, rounding up.
 *
 * @param text - Text to measure.
 * @returns Estimated token count; 0 for empty or whitespace-only text.
 */
function countTokens(text: string): number {
  // Each run of non-whitespace characters is one word.
  const wordRuns = text.match(/\S+/g);
  const wordCount = wordRuns === null ? 0 : wordRuns.length;
  return Math.ceil(wordCount * 1.3);
}

/**
 * Collect the names under which a list of graph nodes can be matched.
 *
 * Every node contributes its plain `name`. Functions and methods additionally
 * contribute `<fileStem>.<name>`, where `fileStem` is the node's file name
 * without its extension. The file stem stands in for the owning class/module;
 * containment edges are not consulted here.
 *
 * @param nodes - Nodes returned by a CodeGraph query.
 * @returns The set of plain and file-qualified names.
 */
function extractSymbolNames(nodes: Node[]): Set<string> {
  return nodes.reduce((collected, { name, kind, filePath }) => {
    collected.add(name);

    const isCallable = kind === 'function' || kind === 'method';
    if (isCallable) {
      // Qualify with the file stem, e.g. "auth.login" for login() in auth.ts.
      const fileStem = path.basename(filePath, path.extname(filePath));
      collected.add(`${fileStem}.${name}`);
    }

    return collected;
  }, new Set<string>());
}

/**
 * Canonical form of a symbol name used for comparisons.
 *
 * Strips one leading, case-sensitive receiver prefix (`db.`, `authService.`,
 * `paymentService.`, `auth_service.` or `task_service.`) and lower-cases
 * what remains.
 *
 * @param symbol - Symbol name as written in ground truth or returned by the graph.
 * @returns The prefix-free, lower-cased name.
 */
function normalizeSymbol(symbol: string): string {
  const receiverPrefix = /^(db\.|authService\.|paymentService\.|auth_service\.|task_service\.)/;
  const withoutReceiver = symbol.replace(receiverPrefix, '');
  return withoutReceiver.toLowerCase();
}

/**
 * Fuzzy membership test: does `symbol` correspond to any entry in `candidates`?
 *
 * Both sides are normalized first. A candidate matches when any of these hold:
 *  - the normalized names are equal;
 *  - one is a dotted suffix of the other (e.g. "login" vs "AuthService.login");
 *  - their last dot-separated segments are equal.
 *
 * @param symbol - Name to look for.
 * @param candidates - Names to compare against.
 * @returns `true` as soon as one candidate matches, otherwise `false`.
 */
function symbolMatches(symbol: string, candidates: Set<string>): boolean {
  const wanted = normalizeSymbol(symbol);
  const wantedLeaf = wanted.split('.').pop();

  return Array.from(candidates).some(rawCandidate => {
    const other = normalizeSymbol(rawCandidate);
    return (
      other === wanted ||
      other.endsWith(`.${wanted}`) ||
      wanted.endsWith(`.${other}`) ||
      other.split('.').pop() === wantedLeaf
    );
  });
}

/**
 * Run a single test case against an opened CodeGraph and score what it retrieves.
 *
 * Retrieval depends on `testCase.type`:
 *  - `search`:  nodes from `searchNodes(query)` with a limit of 20.
 *  - `context`: `buildContext(query)` provides the text used for token counting,
 *               while a separate `searchNodes(query)` call (limit 30) provides
 *               the nodes that are scored.
 *  - `callers` / `callees` / `impact`: the first search hit for `targetSymbol`
 *               is taken as the target; its callers, callees or impact-radius
 *               nodes are scored. Nothing is retrieved when `targetSymbol` is
 *               unset or the search has no hit.
 *
 * Scoring:
 *  - A retrieved name is a true positive if it fuzzily matches an expected
 *    symbol, otherwise a false positive if it matches an irrelevant symbol.
 *    Names matching neither list are neutral and ignored.
 *  - Precision = TP / (TP + FP), counted over retrieved names.
 *  - Recall = expected symbols matched / expected symbols, so it always lies
 *    in [0, 1]. It is deliberately NOT derived from the true-positive list:
 *    one node can contribute both `name` and `file.name`, so that list can be
 *    longer than the expected list.
 *
 * Errors thrown during retrieval are logged and the case is then scored as if
 * nothing had been retrieved.
 *
 * @param cg - CodeGraph instance to query.
 * @param testCase - Query, expectations and optional pass thresholds.
 * @param fixtureTokens - Token size of the whole fixture. Accepted for the
 *   caller's benefit; not used by this function.
 * @returns Metrics, matched/missed symbol lists, token estimate and timing.
 */
async function runTestCase(
  cg: CodeGraph,
  testCase: TestCase,
  fixtureTokens: number
): Promise<TestCaseResult> {
  const startTime = Date.now();

  let retrievedNodes: Node[] = [];
  let contextText = '';

  try {
    switch (testCase.type) {
      case 'search': {
        const results = cg.searchNodes(testCase.query, { limit: 20 });
        retrievedNodes = results.map(r => r.node);
        break;
      }

      case 'context': {
        const context = await cg.buildContext(testCase.query, {
          maxNodes: 30,
          includeCode: true,
          format: 'markdown',
        });
        // Only a string result is usable for token counting.
        contextText = typeof context === 'string' ? context : '';

        // The scored nodes come from a separate search with the same query.
        const results = cg.searchNodes(testCase.query, { limit: 30 });
        retrievedNodes = results.map(r => r.node);
        break;
      }

      case 'callers': {
        if (testCase.targetSymbol) {
          const results = cg.searchNodes(testCase.targetSymbol, { limit: 1 });
          if (results.length > 0 && results[0]) {
            const callers = cg.getCallers(results[0].node.id);
            retrievedNodes = callers.map(c => c.node);
          }
        }
        break;
      }

      case 'callees': {
        if (testCase.targetSymbol) {
          const results = cg.searchNodes(testCase.targetSymbol, { limit: 1 });
          if (results.length > 0 && results[0]) {
            const callees = cg.getCallees(results[0].node.id);
            retrievedNodes = callees.map(c => c.node);
          }
        }
        break;
      }

      case 'impact': {
        if (testCase.targetSymbol) {
          const results = cg.searchNodes(testCase.targetSymbol, { limit: 1 });
          if (results.length > 0 && results[0]) {
            // NOTE(review): the second argument (2) is presumably the traversal depth — confirm.
            const impact = cg.getImpactRadius(results[0].node.id, 2);
            retrievedNodes = Array.from(impact.nodes.values());
          }
        }
        break;
      }
    }
  } catch (err) {
    // Best effort: a failing query is reported and scored as an empty retrieval.
    console.error(`Error running test case ${testCase.id}:`, err);
  }

  const executionTimeMs = Date.now() - startTime;

  // Every name (plain and file-qualified) the retrieved nodes can be known by.
  const retrievedSymbols = extractSymbolNames(retrievedNodes);

  // Build the candidate sets once rather than once per retrieved symbol.
  const expectedCandidates = new Set(testCase.expectedSymbols);
  const irrelevantCandidates = new Set(testCase.irrelevantSymbols);

  const truePositives: string[] = [];
  const falsePositives: string[] = [];

  for (const symbol of retrievedSymbols) {
    if (symbolMatches(symbol, expectedCandidates)) {
      truePositives.push(symbol);
    } else if (symbolMatches(symbol, irrelevantCandidates)) {
      falsePositives.push(symbol);
    }
    // Symbols in neither list are neutral and do not affect the metrics.
  }

  // Expected symbols that nothing retrieved corresponds to.
  const falseNegatives: string[] = testCase.expectedSymbols.filter(
    expected => !symbolMatches(expected, retrievedSymbols)
  );

  // Precision is measured over the retrieved names that were classified.
  const totalRetrieved = truePositives.length + falsePositives.length;
  const precision = totalRetrieved > 0 ? truePositives.length / totalRetrieved : 0;

  // Recall is measured over the expected symbols, which keeps it within [0, 1].
  const totalRelevant = testCase.expectedSymbols.length;
  const matchedExpected = totalRelevant - falseNegatives.length;
  const recall = totalRelevant > 0 ? matchedExpected / totalRelevant : 0;

  const f1Score = precision + recall > 0
    ? 2 * (precision * recall) / (precision + recall)
    : 0;

  // Prefer the built context text; otherwise estimate from names and signatures.
  const contextTokens = contextText
    ? countTokens(contextText)
    : retrievedNodes.reduce((sum, node) => {
        return sum + countTokens(node.name + ' ' + (node.signature || ''));
      }, 0);

  // A missing (or zero) threshold always passes.
  const meetsRecall = !testCase.minRecall || recall >= testCase.minRecall;
  const meetsPrecision = !testCase.minPrecision || precision >= testCase.minPrecision;
  const passed = meetsRecall && meetsPrecision;

  return {
    testCaseId: testCase.id,
    passed,
    precision,
    recall,
    f1Score,
    truePositives,
    falsePositives,
    falseNegatives,
    contextTokens,
    executionTimeMs,
  };
}

/**
 * Run every test case of one fixture and aggregate the results.
 *
 * The fixture path is resolved against the current working directory. An
 * existing index is opened if `CodeGraph.isInitialized` reports one; otherwise
 * the fixture is initialized and indexed. The CodeGraph instance is destroyed
 * when the run finishes, including when it fails part-way.
 *
 * Averages are plain means over the test cases and are 0 when the fixture has
 * no test cases (instead of NaN). Token reduction compares the mean context
 * size with `fixture.approximateTokens` and is 0 when either is unavailable.
 *
 * @param fixture - Ground truth describing the fixture and its test cases.
 * @returns Per-test results plus fixture-level aggregates and total run time.
 */
async function evaluateFixture(
  fixture: FixtureGroundTruth
): Promise<FixtureEvaluationResult> {
  const fixturePath = path.resolve(process.cwd(), fixture.path);
  const startTime = Date.now();

  console.log(`\nEvaluating fixture: ${fixture.name}`);
  console.log(`  Path: ${fixturePath}`);

  // Initialize CodeGraph for this fixture
  let cg: CodeGraph;

  if (CodeGraph.isInitialized(fixturePath)) {
    console.log('  Opening existing index...');
    cg = await CodeGraph.open(fixturePath);
  } else {
    console.log('  Initializing and indexing...');
    cg = await CodeGraph.init(fixturePath, { index: true });
  }

  const testCaseResults: TestCaseResult[] = [];

  try {
    const stats = cg.getStats();
    console.log(`  Indexed ${stats.fileCount} files, ${stats.nodeCount} nodes`);

    // Test cases run sequentially so their log lines stay grouped.
    for (const testCase of fixture.testCases) {
      console.log(`  Running: ${testCase.id}...`);
      const result = await runTestCase(cg, testCase, fixture.approximateTokens);
      testCaseResults.push(result);

      const status = result.passed ? '✓' : '✗';
      console.log(`    ${status} P=${(result.precision * 100).toFixed(0)}% R=${(result.recall * 100).toFixed(0)}% F1=${(result.f1Score * 100).toFixed(0)}%`);
    }
  } finally {
    // Release the graph even if stats or a test case threw.
    cg.destroy();
  }

  // Calculate aggregate metrics
  const totalTimeMs = Date.now() - startTime;
  const totalTestCases = testCaseResults.length;
  const passedTestCases = testCaseResults.filter(r => r.passed).length;

  // Mean of one metric across all test cases; 0 for an empty fixture.
  const average = (pick: (result: TestCaseResult) => number): number =>
    totalTestCases > 0
      ? testCaseResults.reduce((sum, r) => sum + pick(r), 0) / totalTestCases
      : 0;

  const averagePrecision = average(r => r.precision);
  const averageRecall = average(r => r.recall);
  const averageF1Score = average(r => r.f1Score);
  const averageContextTokens = average(r => r.contextTokens);

  // Without test cases there is no context size to compare, so report no reduction.
  const tokenReductionPercent = fixture.approximateTokens > 0 && totalTestCases > 0
    ? ((fixture.approximateTokens - averageContextTokens) / fixture.approximateTokens) * 100
    : 0;

  return {
    fixtureName: fixture.name,
    totalTestCases,
    passedTestCases,
    averagePrecision,
    averageRecall,
    averageF1Score,
    fullCodebaseTokens: fixture.approximateTokens,
    averageContextTokens,
    tokenReductionPercent,
    testCaseResults,
    totalTimeMs,
  };
}

/**
 * Evaluate every registered fixture, print a report and persist it as JSON.
 *
 * Fixtures are evaluated one after another. Overall precision, recall, F1 and
 * token reduction are the unweighted means of the per-fixture averages. The
 * summary is written to `results/eval-<timestamp>.json` next to this module;
 * the directory is created if it does not exist.
 *
 * @returns The summary that was printed and saved.
 */
export async function runEvaluation(): Promise<EvaluationSummary> {
  const boxTop = '╔════════════════════════════════════════════════════════════════╗';
  const boxBottom = '╚════════════════════════════════════════════════════════════════╝';

  console.log(boxTop);
  console.log('║              CodeGraph Evaluation Suite                        ║');
  console.log(boxBottom);

  const fixtures: FixtureGroundTruth[] = [typescriptFixture, pythonFixture];

  // Sequential on purpose: each fixture logs its own progress.
  const fixtureResults: FixtureEvaluationResult[] = [];
  for (const groundTruth of fixtures) {
    fixtureResults.push(await evaluateFixture(groundTruth));
  }

  // Small folds over the per-fixture results.
  const sumOf = (pick: (r: FixtureEvaluationResult) => number): number =>
    fixtureResults.reduce((sum, r) => sum + pick(r), 0);
  const meanOf = (pick: (r: FixtureEvaluationResult) => number): number =>
    sumOf(pick) / fixtureResults.length;

  const totalTests = sumOf(r => r.totalTestCases);
  const totalPassed = sumOf(r => r.passedTestCases);

  const overallPrecision = meanOf(r => r.averagePrecision);
  const overallRecall = meanOf(r => r.averageRecall);
  const overallF1Score = meanOf(r => r.averageF1Score);
  const overallTokenReduction = meanOf(r => r.tokenReductionPercent);

  // Report
  console.log(`\n${boxTop}`);
  console.log('║                      EVALUATION SUMMARY                         ║');
  console.log(boxBottom);

  console.log(`\nTest Results: ${totalPassed}/${totalTests} passed`);
  console.log(`\nOverall Metrics:`);
  console.log(`  Precision:        ${(overallPrecision * 100).toFixed(1)}%`);
  console.log(`  Recall:           ${(overallRecall * 100).toFixed(1)}%`);
  console.log(`  F1 Score:         ${(overallF1Score * 100).toFixed(1)}%`);
  console.log(`  Token Reduction:  ${overallTokenReduction.toFixed(1)}%`);

  console.log('\nPer-Fixture Results:');
  fixtureResults.forEach(entry => {
    const asPercent = (ratio: number): string => (ratio * 100).toFixed(0);
    console.log(`  ${entry.fixtureName}:`);
    console.log(`    Tests: ${entry.passedTestCases}/${entry.totalTestCases} passed`);
    console.log(`    P=${asPercent(entry.averagePrecision)}% R=${asPercent(entry.averageRecall)}% F1=${asPercent(entry.averageF1Score)}%`);
  });

  const summary: EvaluationSummary = {
    timestamp: new Date(),
    version: '0.1.0',
    fixtureResults,
    overallPrecision,
    overallRecall,
    overallF1Score,
    overallTokenReduction,
  };

  // Persist the summary alongside this module.
  const resultsPath = path.join(__dirname, 'results', `eval-${Date.now()}.json`);
  const resultsDir = path.dirname(resultsPath);
  const dirAlreadyThere = fs.existsSync(resultsDir);
  if (!dirAlreadyThere) {
    fs.mkdirSync(resultsDir, { recursive: true });
  }
  const serialized = JSON.stringify(summary, null, 2);
  fs.writeFileSync(resultsPath, serialized);
  console.log(`\nResults saved to: ${resultsPath}`);

  return summary;
}

// Entry point when this file is executed directly rather than imported:
// exit 0 once the evaluation completes, exit 1 (after logging) if it rejects.
if (require.main === module) {
  const exitCleanly = (): never => process.exit(0);
  const exitWithFailure = (err: unknown): never => {
    console.error('Evaluation failed:', err);
    return process.exit(1);
  };

  runEvaluation().then(exitCleanly).catch(exitWithFailure);
}