1
0

probe-sweep.mjs 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119
  1. #!/usr/bin/env node
  2. // probe-sweep — direct MCP test across N repos × N tools, no claude needed.
  3. //
  4. // Measures response characteristics (size, sections present, signals fired)
  5. // for each (repo, query) pair against the built dist/. Sub-second per probe;
  6. // the full sweep below runs in ~10-30s vs hours for a real claude audit.
  7. //
  8. // Use this to iterate on backend changes rapidly: change tools.ts /
  9. // context-builder, npm run build, re-run probe-sweep, compare. Once a
  10. // change looks good on probe metrics, run a focused claude audit for the
  11. // few repos that matter to confirm end-to-end cost behavior.
  12. //
  13. // Usage: node scripts/agent-eval/probe-sweep.mjs [--tool=context|explore|trace] [--repos=a,b,c]
  14. import { pathToFileURL } from 'node:url';
  15. import { resolve } from 'node:path';
  /**
   * Parse CLI arguments into a plain key/value map.
   *
   *   --key=value  ->  { key: 'value' }  (only the FIRST '=' separates key from
   *                                       value, so values may contain '=')
   *   --flag       ->  { flag: true }
   *   word         ->  { word: true }
   *
   * @param {string[]} argv - Arguments after the script path.
   * @returns {Record<string, string | true>} Parsed options.
   */
  function parseArgs(argv) {
    return Object.fromEntries(
      argv.map((arg) => {
        if (!arg.startsWith('--')) return [arg, true];
        const body = arg.slice(2);
        const eq = body.indexOf('=');
        return eq === -1 ? [body, true] : [body.slice(0, eq), body.slice(eq + 1)];
      })
    );
  }

  const args = parseArgs(process.argv.slice(2));
  // Only a non-empty string selects a tool; a bare `--tool` or `--tool=` falls
  // back to the default instead of producing a nonsense tool name.
  const TOOL = typeof args.tool === 'string' && args.tool !== '' ? args.tool : 'context';
  // Dynamically import a module given a path relative to the current working
  // directory. The path is made absolute and converted to a file:// URL first.
  function load(rel) {
    const absolutePath = resolve(rel);
    const fileUrl = pathToFileURL(absolutePath);
    return import(fileUrl.href);
  }

  const idx = await load('dist/index.js');
  const tools = await load('dist/mcp/tools.js');

  // Pick the exports out of whichever shape the built modules expose:
  // a doubly-nested default, a plain default, or a named export.
  const idxDefault = idx.default;
  const CodeGraph = idxDefault?.default ?? idxDefault ?? idx.CodeGraph;
  const toolsDefault = tools.default;
  const ToolHandler = tools.ToolHandler ?? toolsDefault?.ToolHandler;
  // Sweep subjects. Every entry has a short id, the path of a corpus checkout,
  // and the query text `q`. The query is the same prompt used in the real
  // claude audits, so probe output is directly comparable to the agent's
  // would-be input. Trace endpoints (from, to) are described as an optional
  // extra per entry; no entry below defines them.
  const CORPUS_ROOT = '/tmp/codegraph-corpus';
  const corpusEntry = (id, dir, q) => ({ id, repo: `${CORPUS_ROOT}/${dir}`, q });

  const SWEEP = [
    // Small realworld template repos (the loss cases from the cross-language sweep)
    corpusEntry('gin-rw', 'gin-realworld', 'How does this Gin app route a request through its middleware chain to a handler?'),
    corpusEntry('go-mux', 'go-mux', 'How does this gorilla/mux app route a request to its handler?'),
    corpusEntry('fastapi-rw', 'fastapi-realworld', 'How does FastAPI route a request through its dependencies to a handler?'),
    corpusEntry('spring-pc', 'spring-petclinic', 'How does Spring route an HTTP request to a controller method?'),
    corpusEntry('axum-rw', 'rust-axum-realworld', 'How does Axum route a request to its handler in this app?'),
    corpusEntry('express-rw', 'express-realworld', 'How does this Express app route a request through middleware to a handler?'),
    corpusEntry('kotlin-pc', 'kotlin-petclinic', 'How does the Kotlin Spring app route an HTTP request to its handler?'),
    corpusEntry('flask-mb', 'flask-microblog', 'How does this Flask app route a request to a view function?'),
    corpusEntry('vapor-tpl', 'vapor-template', 'How does Vapor route an HTTP request to its handler?'),
    corpusEntry('cpp-leveldb', 'cpp-leveldb', 'How does LevelDB handle a Put operation through to disk?'),
    corpusEntry('lualine', 'lualine.nvim', 'How does lualine assemble and render the statusline?'),
    corpusEntry('drupal-admin', 'drupal-admintoolbar', 'How does the Drupal admin toolbar module render its toolbar?'),
    corpusEntry('svelte-rw', 'svelte-realworld', 'How does this SvelteKit app route a request to a handler?'),
    corpusEntry('react-rw', 'react-realworld', 'How does this React app fetch and display articles?'),
    corpusEntry('rails-rw', 'rails-realworld', 'How does Rails route a request to a controller action?'),
    corpusEntry('flask-rest', 'flask-restful-realworld', 'How does Flask-RESTful route a request to a resource method?'),
    corpusEntry('laravel-rw', 'laravel-realworld', 'How does Laravel route a request to the controller method?'),
    corpusEntry('aspnet-rw', 'aspnet-realworld', 'How does ASP.NET route a request to the controller action?'),
    // The iter7 wins/ties (to make sure we don't regress)
    corpusEntry('cobra', 'cobra', 'How does cobra parse commands and flags?'),
    corpusEntry('sinatra', 'sinatra', 'How does sinatra route a request to its handler?'),
    corpusEntry('slim', 'slim', 'How does slim route a request and apply middleware?'),
  ];
  // Signal detection: each pattern marks one lever in the response text.
  // These are the levers we've added that otherwise only show up downstream
  // as "agent ran X more tool calls". None of the patterns use the g/y flags,
  // so sharing the regex objects across calls is stateless.
  const SIGNAL_PATTERNS = [
    ['hasEntryPoints', /^### Entry Points/m],
    ['hasRelatedSymbols', /^### Related Symbols/m],
    ['hasFlowTrace', /^## Inline flow trace/m],
    ['hasRouteManifest', /^## Routing manifest/m],
    ['hasTopHandler', /^### Top handler file/m],
    ['hasSmallRepoTail', /This project is small/],
  ];

  /**
   * @param {string} text - Response text returned by a tool call.
   * @returns {Record<string, boolean>} One boolean per signal name.
   */
  const detect = (text) =>
    Object.fromEntries(
      SIGNAL_PATTERNS.map(([signal, pattern]) => [signal, pattern.test(text)])
    );
  // Optional `--repos=a,b,c` filter on subject ids. Only a string value is
  // treated as a filter: a non-string value (e.g. a boolean from a bare or
  // positional `repos` argument) must not be stringified into a bogus id that
  // silently empties the sweep. Whitespace and empty segments are ignored.
  const requestedIds =
    typeof args.repos === 'string'
      ? args.repos.split(',').map((id) => id.trim()).filter((id) => id !== '')
      : [];
  const filterRepos = requestedIds.length > 0 ? new Set(requestedIds) : null;
  const subjects = SWEEP.filter((s) => !filterRepos || filterRepos.has(s.id));

  // Surface typos: an id that matches nothing would otherwise just shrink the
  // sweep without explanation.
  if (filterRepos) {
    const knownIds = new Set(SWEEP.map((s) => s.id));
    const unknownIds = requestedIds.filter((id) => !knownIds.has(id));
    if (unknownIds.length > 0) {
      console.warn(`probe-sweep: unknown repo id(s) ignored: ${unknownIds.join(', ')}`);
    }
  }
  // Run the selected tool once per subject and collect one row per subject:
  // either metrics (ms, chars, lines, signal flags) or a truncated error.
  const t0 = Date.now();
  const rows = [];
  for (const s of subjects) {
    let cg;
    try {
      cg = CodeGraph.openSync(s.repo);
      const handler = new ToolHandler(cg);
      // Trace uses the entry's own endpoints when it defines them, and falls
      // back to 'main' -> 'main' otherwise.
      const toolArgs =
        TOOL === 'context' ? { task: s.q }
        : TOOL === 'explore' ? { query: s.q }
        : { from: s.from ?? 'main', to: s.to ?? 'main' };
      const t1 = Date.now();
      const res = await handler.execute(`codegraph_${TOOL}`, toolArgs);
      // Sample the clock right after the call so `ms` covers only the tool
      // execution, not the signal detection below.
      const ms = Date.now() - t1;
      const text = res.content?.[0]?.text ?? '';
      rows.push({
        id: s.id,
        ms,
        chars: text.length,
        lines: text.split('\n').length,
        ...detect(text),
      });
    } catch (e) {
      rows.push({ id: s.id, error: String(e).slice(0, 80) });
    } finally {
      // Release the graph on failure as well as on success. Closing stays
      // best-effort: a close error must not abort the rest of the sweep.
      try {
        cg?.close?.();
      } catch {
        // intentionally ignored
      }
    }
  }
  /**
   * Render one result row as a single line of the compact table.
   *
   * Error rows show the id and the error text. Successful rows show the id,
   * right-aligned chars / lines / ms columns, and one tag per fired signal
   * (EP, TRC, MAN, HND, TAIL), with a blank placeholder for each one that
   * did not fire.
   *
   * @param {object} r - A row produced by the sweep loop.
   * @returns {string} The formatted line.
   */
  const fmt = (r) => {
    const idCol = r.id.padEnd(13);
    if (r.error) {
      return ` ${idCol} ERROR: ${r.error}`;
    }

    const rightAlign = (value, width) => String(value).padStart(width);
    const tag = (fired, label) => (fired ? label : ' ');

    const metrics = ` ${idCol} ${rightAlign(r.chars, 6)}c ${rightAlign(r.lines, 4)}L ${rightAlign(r.ms, 4)}ms`;
    const signalTags = [
      tag(r.hasEntryPoints, 'EP '),
      tag(r.hasFlowTrace, 'TRC '),
      tag(r.hasRouteManifest, 'MAN '),
      tag(r.hasTopHandler, 'HND '),
      tag(r.hasSmallRepoTail, 'TAIL'),
    ].join('');
    return `${metrics} ${signalTags}`;
  };
  // Emit the report: banner with tool / subject count / total elapsed time,
  // the column header, a rule, then one formatted line per row.
  const elapsedMs = Date.now() - t0;
  console.log(`=== probe-sweep tool=${TOOL} n=${subjects.length} (${elapsedMs}ms total) ===`);
  console.log(' id chars lines ms signals');
  console.log(` ${'-'.repeat(56)}`);
  rows.forEach((row) => {
    console.log(fmt(row));
  });
  // Sum + median of response sizes for the size pillar, computed over the rows
  // that succeeded, plus how many rows fired the manifest / top-handler signals.
  const okRows = rows.filter((r) => !r.error);
  const sizes = okRows.map((r) => r.chars).sort((a, b) => a - b);
  const mid = Math.floor(sizes.length / 2);
  // Median: the middle element for odd counts, the rounded mean of the two
  // middle elements for even counts, and null when nothing succeeded.
  const median =
    sizes.length === 0 ? null
    : sizes.length % 2 === 1 ? sizes[mid]
    : Math.round((sizes[mid - 1] + sizes[mid]) / 2);
  const sum = sizes.reduce((a, b) => a + b, 0);
  const manifestCount = rows.filter((r) => r.hasRouteManifest).length;
  const topHandlerCount = rows.filter((r) => r.hasTopHandler).length;
  console.log(` ${'-'.repeat(64)}`);
  console.log(` median=${median === null ? 'n/a' : `${median}c`} total=${sum}c ` +
    `manifest=${manifestCount}/${okRows.length} ` +
    `top-handler=${topHandlerCount}/${okRows.length}`);