| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119 |
- #!/usr/bin/env node
- // probe-sweep — direct MCP test across N repos × N tools, no claude needed.
- //
- // Measures response characteristics (size, sections present, signals fired)
- // for each (repo, query) pair against the built dist/. Sub-second per probe;
- // the full sweep below runs in ~10-30s vs hours for a real claude audit.
- //
- // Use this to iterate on backend changes rapidly: change tools.ts /
- // context-builder, npm run build, re-run probe-sweep, compare. Once a
- // change looks good on probe metrics, run a focused claude audit for the
- // few repos that matter to confirm end-to-end cost behavior.
- //
- // Usage: node scripts/agent-eval/probe-sweep.mjs [--tool=context|explore|trace] [--repos=a,b,c]
- import { pathToFileURL } from 'node:url';
- import { resolve } from 'node:path';
- const args = Object.fromEntries(
- process.argv.slice(2).map(a => a.startsWith('--') ? a.slice(2).split('=') : [a, true])
- );
- const TOOL = args.tool ?? 'context';
- const load = (rel) => import(pathToFileURL(resolve(rel)).href);
- const idx = await load('dist/index.js');
- const tools = await load('dist/mcp/tools.js');
- const CodeGraph = idx.default?.default ?? idx.default ?? idx.CodeGraph;
- const ToolHandler = tools.ToolHandler ?? tools.default?.ToolHandler;
- // Each entry: repo, query, optional 2nd arg for trace (from, to).
- // The query is the same prompt used in the real claude audits, so probe
- // output is directly comparable to the agent's would-be input.
- const SWEEP = [
- // Small realworld template repos (the loss cases from the cross-language sweep)
- { id: 'gin-rw', repo: '/tmp/codegraph-corpus/gin-realworld', q: 'How does this Gin app route a request through its middleware chain to a handler?' },
- { id: 'go-mux', repo: '/tmp/codegraph-corpus/go-mux', q: 'How does this gorilla/mux app route a request to its handler?' },
- { id: 'fastapi-rw', repo: '/tmp/codegraph-corpus/fastapi-realworld', q: 'How does FastAPI route a request through its dependencies to a handler?' },
- { id: 'spring-pc', repo: '/tmp/codegraph-corpus/spring-petclinic', q: 'How does Spring route an HTTP request to a controller method?' },
- { id: 'axum-rw', repo: '/tmp/codegraph-corpus/rust-axum-realworld', q: 'How does Axum route a request to its handler in this app?' },
- { id: 'express-rw', repo: '/tmp/codegraph-corpus/express-realworld', q: 'How does this Express app route a request through middleware to a handler?' },
- { id: 'kotlin-pc', repo: '/tmp/codegraph-corpus/kotlin-petclinic', q: 'How does the Kotlin Spring app route an HTTP request to its handler?' },
- { id: 'flask-mb', repo: '/tmp/codegraph-corpus/flask-microblog', q: 'How does this Flask app route a request to a view function?' },
- { id: 'vapor-tpl', repo: '/tmp/codegraph-corpus/vapor-template', q: 'How does Vapor route an HTTP request to its handler?' },
- { id: 'cpp-leveldb', repo: '/tmp/codegraph-corpus/cpp-leveldb', q: 'How does LevelDB handle a Put operation through to disk?' },
- { id: 'lualine', repo: '/tmp/codegraph-corpus/lualine.nvim', q: 'How does lualine assemble and render the statusline?' },
- { id: 'drupal-admin', repo: '/tmp/codegraph-corpus/drupal-admintoolbar', q: 'How does the Drupal admin toolbar module render its toolbar?' },
- { id: 'svelte-rw', repo: '/tmp/codegraph-corpus/svelte-realworld', q: 'How does this SvelteKit app route a request to a handler?' },
- { id: 'react-rw', repo: '/tmp/codegraph-corpus/react-realworld', q: 'How does this React app fetch and display articles?' },
- { id: 'rails-rw', repo: '/tmp/codegraph-corpus/rails-realworld', q: 'How does Rails route a request to a controller action?' },
- { id: 'flask-rest', repo: '/tmp/codegraph-corpus/flask-restful-realworld', q: 'How does Flask-RESTful route a request to a resource method?' },
- { id: 'laravel-rw', repo: '/tmp/codegraph-corpus/laravel-realworld', q: 'How does Laravel route a request to the controller method?' },
- { id: 'aspnet-rw', repo: '/tmp/codegraph-corpus/aspnet-realworld', q: 'How does ASP.NET route a request to the controller action?' },
- // The iter7 wins/ties (to make sure we don't regress)
- { id: 'cobra', repo: '/tmp/codegraph-corpus/cobra', q: 'How does cobra parse commands and flags?' },
- { id: 'sinatra', repo: '/tmp/codegraph-corpus/sinatra', q: 'How does sinatra route a request to its handler?' },
- { id: 'slim', repo: '/tmp/codegraph-corpus/slim', q: 'How does slim route a request and apply middleware?' },
- ];
- // Detect signals in response text — these are the levers we've added that
- // otherwise only show up via "agent ran X more tool calls" downstream.
- const detect = (text) => ({
- hasEntryPoints: /^### Entry Points/m.test(text),
- hasRelatedSymbols: /^### Related Symbols/m.test(text),
- hasFlowTrace: /^## Inline flow trace/m.test(text),
- hasRouteManifest: /^## Routing manifest/m.test(text),
- hasTopHandler: /^### Top handler file/m.test(text),
- hasSmallRepoTail: /This project is small/.test(text),
- });
- const filterRepos = args.repos ? new Set(String(args.repos).split(',')) : null;
- const subjects = SWEEP.filter(s => !filterRepos || filterRepos.has(s.id));
- const t0 = Date.now();
- const rows = [];
- for (const s of subjects) {
- try {
- const cg = CodeGraph.openSync(s.repo);
- const handler = new ToolHandler(cg);
- const t1 = Date.now();
- const res = await handler.execute('codegraph_' + TOOL,
- TOOL === 'context' ? { task: s.q } :
- TOOL === 'explore' ? { query: s.q } : { from: 'main', to: 'main' });
- const text = res.content?.[0]?.text ?? '';
- const signals = detect(text);
- rows.push({
- id: s.id,
- ms: Date.now() - t1,
- chars: text.length,
- lines: text.split('\n').length,
- ...signals,
- });
- try { cg.close?.(); } catch {}
- } catch (e) {
- rows.push({ id: s.id, error: String(e).slice(0, 80) });
- }
- }
- // Pretty-print as a compact table.
- const fmt = (r) =>
- r.error
- ? ` ${r.id.padEnd(13)} ERROR: ${r.error}`
- : ` ${r.id.padEnd(13)} ${String(r.chars).padStart(6)}c ${String(r.lines).padStart(4)}L ${String(r.ms).padStart(4)}ms` +
- ` ${r.hasEntryPoints ? 'EP ' : ' '}` +
- `${r.hasFlowTrace ? 'TRC ' : ' '}` +
- `${r.hasRouteManifest ? 'MAN ' : ' '}` +
- `${r.hasTopHandler ? 'HND ' : ' '}` +
- `${r.hasSmallRepoTail ? 'TAIL' : ' '}`;
- console.log(`=== probe-sweep tool=${TOOL} n=${subjects.length} (${Date.now() - t0}ms total) ===`);
- console.log(' id chars lines ms signals');
- console.log(' ' + '-'.repeat(56));
- for (const r of rows) console.log(fmt(r));
- // Sum + medians for the size pillar
- const sizes = rows.filter(r => !r.error).map(r => r.chars);
- sizes.sort((a, b) => a - b);
- const median = sizes[Math.floor(sizes.length / 2)];
- const sum = sizes.reduce((a, b) => a + b, 0);
- console.log(` ${'-'.repeat(64)}`);
- console.log(` median=${median}c total=${sum}c ` +
- `manifest=${rows.filter(r => r.hasRouteManifest).length}/${rows.filter(r => !r.error).length} ` +
- `top-handler=${rows.filter(r => r.hasTopHandler).length}/${rows.filter(r => !r.error).length}`);
|