1 月之前 · 7fe64b32be
--- a/.claude/skills/audit/SKILL.md
+++ b/.claude/skills/audit/SKILL.md
@@ -0,0 +1,74 @@
 
				+---
			
 
				+name: audit
			
 
				+description: Benchmark CodeGraph retrieval quality on a real codebase by comparing agent behavior with vs without CodeGraph. Use when the user runs /audit or asks to test, benchmark, audit, or validate a codegraph version (the local dev build or a published npm version) against a language's repo.
			
 
				+---
			
 
				+
			
 
				+# CodeGraph Quality Audit
			
 
				+
			
 
				+Measures how much CodeGraph helps an agent versus plain grep/read, for a chosen
			
 
				+codegraph version on a chosen real-world repo. Drives the harness in
			
 
				+`scripts/agent-eval/`.
			
 
				+
			
 
				+## Prerequisites
			
 
				+- `tmux` 3+, a logged-in `claude` CLI, `node`, `git` (macOS/Linux).
			
 
				+- Run from the codegraph repo root.
			
 
				+
			
 
				+## Workflow
			
 
				+
			
 
				+Copy this checklist:
			
 
				+```
			
 
				+- [ ] 1. Pick version (local or npm)
			
 
				+- [ ] 2. Pick language
			
 
				+- [ ] 3. Pick repo by size
			
 
				+- [ ] 4. Pick harness (headless / tmux / both)
			
 
				+- [ ] 5. Run audit.sh in the background
			
 
				+- [ ] 6. Report results
			
 
				+```
			
 
				+
			
 
				+**Step 1 — version.** Ask with `AskUserQuestion`: which codegraph version to test.
			
 
				+Offer "Local dev build" and "Latest published"; the free-text "Other" lets the
			
 
				+user type a specific version (e.g. `0.7.10`). Map the answer to a VERSION token:
			
 
				+- "Local dev build" → `local`
			
 
				+- "Latest published" → `latest`
			
 
				+- a typed version → that string (e.g. `0.7.10`)
			
 
				+
			
 
				+**Step 2 — language.** Read `.claude/skills/audit/corpus.json`. Ask with
			
 
				+`AskUserQuestion` which language to test, listing the languages that have entries.
			
 
				+
			
 
				+**Step 3 — repo.** From the chosen language's entries, ask which repo. Label each
			
 
				+option with its size and file count, e.g. `excalidraw — Medium (~600 files)`.
			
 
				+Each entry carries the `repo` URL and a representative `question`.
			
 
				+
			
 
				+**Step 4 — harness.** Ask with `AskUserQuestion` which harness to run, and map
			
 
				+the answer to a MODE token:
			
 
				+- "Headless" → `headless` — `claude -p` with stream-json: exact tokens/cost and a
			
 
				+  clean tool sequence (2 runs, fast, no TTY).
			
 
				+- "Interactive (tmux)" → `tmux` — drives the real Claude TUI in tmux: faithful
			
 
				+  Explore-subagent behavior, metrics from session logs (2 runs, slower).
			
 
				+- "Both" → `all` — headless + interactive (4 runs).
			
 
				+
			
 
				+**Step 5 — run.** Launch in the background (sets the version, clones if missing,
			
 
				+wipes + re-indexes, runs the chosen arms — several minutes):
			
 
				+```bash
			
 
				+scripts/agent-eval/audit.sh <VERSION> <repo-name> <repo-url> "<question>" <MODE>
			
 
				+```
			
 
				+
			
 
				+**Step 6 — report.** When the job finishes, read the log and report per arm:
			
 
				+- Headless (`parse-run.mjs`): total tool calls, file `Read`s, Grep/Bash,
			
 
				+  codegraph-tool calls, duration, **total cost**.
			
 
				+- Interactive (`parse-session.mjs`): the `VERDICT: codegraph_explore used Nx |
			
 
				+  Read N | Grep/Bash N` and `TOKENS:` lines.
			
 
				+
			
 
				+Lead with cost + tool/Read counts — they are the reliable signals; raw token
			
 
				+in/out are confounded by subagent delegation and prompt caching. State whether
			
 
				+codegraph reduced effort and whether both arms reached a correct answer.
			
 
				+
			
 
				+## Notes
			
 
				+- The index is rebuilt every run (`audit.sh` wipes `.codegraph`) — different
			
 
				+  versions extract differently, so an index must be served by the same binary
			
 
				+  that built it.
			
 
				+- `audit.sh` temporarily mutates the global `codegraph` install for the test,
			
 
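				+  then restores your dev link via `local-install.sh`. If a run aborts early (clone or indexing failure), the restore is skipped — run `./scripts/local-install.sh` manually.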
			
 
				+- Corpus repos are cloned to `/tmp/codegraph-corpus` (reused if already present).
			
 
				+- Add or edit repos in `corpus.json` (fields: `name`, `repo`, `size`, `files`,
			
 
				+  `question`).
			
--- a/.claude/skills/audit/corpus.json
+++ b/.claude/skills/audit/corpus.json
@@ -0,0 +1,63 @@
 
				+{
			
 
				+  "_comment": "Test corpus for /audit. Add entries freely. size: Small (<~150 files), Medium (~150-1500), Large (>~1500). 'question' is a representative architectural question that exercises cross-file understanding.",
			
 
				+  "TypeScript": [
			
 
				+    { "name": "ky", "repo": "https://github.com/sindresorhus/ky", "size": "Small", "files": "~25", "question": "How does ky implement request retries and timeouts?" },
			
 
				+    { "name": "excalidraw", "repo": "https://github.com/excalidraw/excalidraw", "size": "Medium", "files": "~600", "question": "How does Excalidraw render and update canvas elements?" },
			
 
				+    { "name": "vscode", "repo": "https://github.com/microsoft/vscode", "size": "Large", "files": "~10000", "question": "How does the extension host communicate with the main process?" }
			
 
				+  ],
			
 
				+  "JavaScript": [
			
 
				+    { "name": "express", "repo": "https://github.com/expressjs/express", "size": "Small", "files": "~50", "question": "How does Express route a request through its middleware stack?" }
			
 
				+  ],
			
 
				+  "Go": [
			
 
				+    { "name": "cobra", "repo": "https://github.com/spf13/cobra", "size": "Small", "files": "~50", "question": "How does cobra parse commands and flags?" },
			
 
				+    { "name": "gin", "repo": "https://github.com/gin-gonic/gin", "size": "Medium", "files": "~150", "question": "How does gin route requests through its middleware chain?" },
			
 
				+    { "name": "terraform", "repo": "https://github.com/hashicorp/terraform", "size": "Large", "files": "~4000", "question": "How does Terraform build and walk the resource dependency graph?" }
			
 
				+  ],
			
 
				+  "Python": [
			
 
				+    { "name": "click", "repo": "https://github.com/pallets/click", "size": "Small", "files": "~60", "question": "How does click parse command-line arguments into commands?" },
			
 
				+    { "name": "flask", "repo": "https://github.com/pallets/flask", "size": "Small", "files": "~90", "question": "How does Flask dispatch a request to a view function?" },
			
 
				+    { "name": "django", "repo": "https://github.com/django/django", "size": "Large", "files": "~2700", "question": "How does Django's ORM build and execute a query from a QuerySet?" }
			
 
				+  ],
			
 
				+  "Rust": [
			
 
				+    { "name": "clap", "repo": "https://github.com/clap-rs/clap", "size": "Medium", "files": "~200", "question": "How does clap parse arguments against a derived command definition?" },
			
 
				+    { "name": "tokio", "repo": "https://github.com/tokio-rs/tokio", "size": "Medium", "files": "~700", "question": "How does tokio schedule and run async tasks on its runtime?" },
			
 
				+    { "name": "deno", "repo": "https://github.com/denoland/deno", "size": "Large", "files": "~1500", "question": "How does Deno load and execute a TypeScript module?" }
			
 
				+  ],
			
 
				+  "Java": [
			
 
				+    { "name": "gson", "repo": "https://github.com/google/gson", "size": "Medium", "files": "~200", "question": "How does Gson serialize an object to JSON?" },
			
 
				+    { "name": "okhttp", "repo": "https://github.com/square/okhttp", "size": "Medium", "files": "~640", "question": "How does OkHttp process a request through its interceptor chain?" },
			
 
				+    { "name": "guava", "repo": "https://github.com/google/guava", "size": "Large", "files": "~3000", "question": "How does Guava's CacheBuilder build and configure a cache?" }
			
 
				+  ],
			
 
				+  "Kotlin": [
			
 
				+    { "name": "koin", "repo": "https://github.com/InsertKoinIO/koin", "size": "Medium", "files": "~300", "question": "How does Koin resolve and inject dependencies?" },
			
 
				+    { "name": "leakcanary", "repo": "https://github.com/square/leakcanary", "size": "Medium", "files": "~250", "question": "How does LeakCanary detect and analyze a memory leak?" }
			
 
				+  ],
			
 
				+  "Swift": [
			
 
				+    { "name": "alamofire", "repo": "https://github.com/Alamofire/Alamofire", "size": "Small", "files": "~100", "question": "How does Alamofire build, send, and validate a request?" }
			
 
				+  ],
			
 
				+  "C#": [
			
 
				+    { "name": "serilog", "repo": "https://github.com/serilog/serilog", "size": "Medium", "files": "~250", "question": "How does Serilog route a log event to its sinks?" },
			
 
				+    { "name": "jellyfin", "repo": "https://github.com/jellyfin/jellyfin", "size": "Large", "files": "~2500", "question": "How does Jellyfin scan and identify items in a media library?" }
			
 
				+  ],
			
 
				+  "Ruby": [
			
 
				+    { "name": "sinatra", "repo": "https://github.com/sinatra/sinatra", "size": "Small", "files": "~60", "question": "How does Sinatra match a request to a route handler?" },
			
 
				+    { "name": "discourse", "repo": "https://github.com/discourse/discourse", "size": "Large", "files": "~3000", "question": "How does Discourse create and render a new post?" }
			
 
				+  ],
			
 
				+  "PHP": [
			
 
				+    { "name": "slim", "repo": "https://github.com/slimphp/Slim", "size": "Small", "files": "~80", "question": "How does Slim handle a request through its middleware?" },
			
 
				+    { "name": "laravel", "repo": "https://github.com/laravel/framework", "size": "Large", "files": "~3000", "question": "How does Laravel resolve and dispatch a route to a controller?" }
			
 
				+  ],
			
 
				+  "C": [
			
 
				+    { "name": "redis", "repo": "https://github.com/redis/redis", "size": "Medium", "files": "~600", "question": "How does Redis parse and dispatch a client command?" }
			
 
				+  ],
			
 
				+  "C++": [
			
 
				+    { "name": "json", "repo": "https://github.com/nlohmann/json", "size": "Small", "files": "~100", "question": "How does nlohmann::json parse a JSON string into a value?" },
			
 
				+    { "name": "grpc", "repo": "https://github.com/grpc/grpc", "size": "Large", "files": "~3000", "question": "How does gRPC dispatch an incoming RPC to its handler?" }
			
 
				+  ],
			
 
				+  "Dart": [
			
 
				+    { "name": "flutter", "repo": "https://github.com/flutter/flutter", "size": "Large", "files": "~6000", "question": "How does Flutter build and lay out a widget tree?" }
			
 
				+  ],
			
 
				+  "Svelte": [
			
 
				+    { "name": "shadcn-svelte", "repo": "https://github.com/huntabyte/shadcn-svelte", "size": "Medium", "files": "~600", "question": "How do shadcn-svelte components compose and apply their styling?" }
			
 
				+  ]
			
 
				+}
			
--- a/.claude/skills/publish/SKILL.md
+++ b/.claude/skills/publish/SKILL.md
@@ -0,0 +1,136 @@
 
				+---
			
 
				+name: publish
			
 
				+description: Publishes a new minor or major release of this npm package (codegraph). Reads the latest version from npm, generates a user-perspective CHANGELOG entry from commits since the last tag, bumps package.json, publishes to npm, and creates the matching GitHub release. Use when the user runs /publish or asks to cut, ship, or publish a release / new version.
			
 
				+---
			
 
				+
			
 
				+# Publish a release
			
 
				+
			
 
				+Cut a **minor or major** release: generate the changelog, bump, publish to npm, and create the GitHub release. Patch releases are intentionally not offered here.
			
 
				+
			
 
				+This skill performs the actual publish (npm publish, git push, GitHub release) — that is the whole point of invoking it, so the general "hand the user the commands" rule does **not** apply inside `/publish`. The **confirmation gate in Step 5 is the safeguard**: never run a step past it without explicit approval.
			
 
				+
			
 
				+Run from the repo root.
			
 
				+
			
 
				+## Workflow
			
 
				+
			
 
				+Copy this checklist and work through it in order:
			
 
				+
			
 
				+```
			
 
				+- [ ] 1. Preflight: branch, sync, auth
			
 
				+- [ ] 2. Read base version from npm, compute candidates
			
 
				+- [ ] 3. Ask the user: minor or major
			
 
				+- [ ] 4. Generate the CHANGELOG entry from commits since the last tag
			
 
				+- [ ] 5. CONFIRMATION GATE — show changelog + plan, get explicit approval
			
 
				+- [ ] 6. Write CHANGELOG.md, bump, build
			
 
				+- [ ] 7. Commit + push
			
 
				+- [ ] 8. npm publish
			
 
				+- [ ] 9. scripts/release.sh (GitHub release)
			
 
				+- [ ] 10. Verify on the npm registry
			
 
				+```
			
 
				+
			
 
				+### Step 1 — Preflight
			
 
				+
			
 
				+```bash
			
 
				+git rev-parse --abbrev-ref HEAD   # expect: main
			
 
				+git fetch origin
			
 
				+git status --porcelain            # working tree should be clean
			
 
				+git rev-list --left-right --count origin/main...HEAD   # "<behind> <ahead>"
			
 
				+npm whoami                        # npm auth (publish will fail without it)
			
 
				+gh auth status                    # gh auth (release.sh needs it)
			
 
				+```
			
 
				+
			
 
				+- If not on `main`, stop and ask the user to confirm releasing from this branch.
			
 
				+- If behind origin, `git pull --ff-only` so the final push is a fast-forward.
			
 
				+- If the tree has **unrelated** uncommitted changes, stop and ask — the release commit only stages 3 files, but a dirty tree usually means something's mid-flight.
			
 
				+- If `npm whoami` or `gh auth status` fails, stop and tell the user to authenticate.
			
 
				+
			
 
				+### Step 2 — Base version + candidates
			
 
				+
			
 
				+The latest **published** version is the source of truth, not local `package.json`.
			
 
				+
			
 
				+```bash
			
 
				+PKG=$(node -p "require('./package.json').name")
			
 
				+BASE=$(npm view "$PKG" version)
			
 
				+node -e "const [a,b]=process.argv[1].split('.').map(Number);console.log('minor ->',a+'.'+(b+1)+'.0');console.log('major ->',(a+1)+'.0.0')" "$BASE"
			
 
				+```
			
 
				+
			
 
				+Note if local `package.json` differs from `BASE` (an unpublished bump) — surface it, but still base the new version on npm.
			
 
				+
			
 
				+### Step 3 — Ask minor or major
			
 
				+
			
 
				+Use the **AskUserQuestion** tool with the two computed candidates as options (show the resulting version in each label, e.g. "minor → 0.8.0"). Set the new version from the answer.
			
 
				+
			
 
				+### Step 4 — Generate the changelog entry
			
 
				+
			
 
				+```bash
			
 
				+LAST=$(git describe --tags --abbrev=0 --match 'v*' 2>/dev/null)
			
 
				+git log --no-merges "${LAST}..HEAD" --pretty=format:'%h %s'
			
 
				+```
			
 
				+
			
 
				+Read the commit subjects; for any whose user impact is unclear, inspect the diff (`git show <hash>` or `git diff "${LAST}..HEAD" -- <path>`). Then **write the entry yourself** following the repo's conventions in `CLAUDE.md` → "Writing changelog entries":
			
 
				+
			
 
				+- Header: `## [X.Y.Z] - YYYY-MM-DD` (get the date with `date +%F`).
			
 
				+- Group under `### Added`, `### Changed`, `### Fixed`, `### Removed`, `### Deprecated`, `### Security` — **omit empty sections**.
			
 
				+- Write from the **user's perspective** (observable capability/symptom), not the implementation. Collapse noisy commits ("fix typo", "address review") into the feature they belong to or drop them.
			
 
				+- Plan the bottom link reference: `[X.Y.Z]: https://github.com/colbymchenry/codegraph/releases/tag/vX.Y.Z`.
			
 
				+
			
 
				+Do not write to any file yet — draft it for review first.
			
 
				+
			
 
				+### Step 5 — CONFIRMATION GATE
			
 
				+
			
 
				+Show the user, in chat:
			
 
				+1. The new version (`BASE` → `X.Y.Z`, minor/major).
			
 
				+2. The full drafted changelog entry.
			
 
				+3. The exact actions Steps 6–9 will take (write CHANGELOG.md + bump + build, commit + push, npm publish, GitHub release).
			
 
				+
			
 
				+Then **STOP**. Proceed only on explicit approval ("yes" / "proceed"). If the user requests prose changes, revise the draft and re-show. Do not run any command below until approved.
			
 
				+
			
 
				+### Step 6 — Write changelog, bump, build
			
 
				+
			
 
				+1. Use the **Edit** tool to insert the drafted `## [X.Y.Z]` block at the **top** of `CHANGELOG.md` (under the intro, above the previous version), and add the link reference with the other `[x.y.z]:` links at the bottom.
			
 
				+2. Bump (also updates `package-lock.json`; `--allow-same-version` keeps re-runs safe):
			
 
				+   ```bash
			
 
				+   npm version X.Y.Z --no-git-tag-version --allow-same-version
			
 
				+   ```
			
 
				+3. Build (fail fast before any push/publish):
			
 
				+   ```bash
			
 
				+   npm run build
			
 
				+   ```
			
 
				+
			
 
				+### Step 7 — Commit + push
			
 
				+
			
 
				+`release.sh` tags HEAD, so the bump must be committed first.
			
 
				+
			
 
				+```bash
			
 
				+git add package.json package-lock.json CHANGELOG.md
			
 
				+git commit -m "release: X.Y.Z"
			
 
				+git push
			
 
				+```
			
 
				+
			
 
				+### Step 8 — Publish to npm
			
 
				+
			
 
				+```bash
			
 
				+npm publish --access public
			
 
				+```
			
 
				+
			
 
				+### Step 9 — GitHub release
			
 
				+
			
 
				+`scripts/release.sh` reads the `## [X.Y.Z]` block from CHANGELOG.md, tags `vX.Y.Z`, pushes the tag, and creates the GitHub release. It is idempotent.
			
 
				+
			
 
				+```bash
			
 
				+./scripts/release.sh
			
 
				+```
			
 
				+
			
 
				+### Step 10 — Verify
			
 
				+
			
 
				+Confirm against the **registry**, not the website (the website caches):
			
 
				+
			
 
				+```bash
			
 
				+npm view "$PKG" version   # must equal X.Y.Z
			
 
				+```
			
 
				+
			
 
				+Report the release URL (`scripts/release.sh` prints it) and the published version.
			
 
				+
			
 
				+## If something fails midway
			
 
				+
			
 
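				+Re-running is safe: `npm version --allow-same-version` no-ops if already bumped, `git commit` can be skipped if nothing's staged (check `git diff --cached --quiet`; a bare `git commit` would exit non-zero), `git push` no-ops if up to date, and `scripts/release.sh` skips tag/release steps already done. The one exception is `npm publish`: the registry rejects a version that is already published, so if Step 8 already succeeded, skip it and resume at Step 9. Otherwise, re-run from the failed step.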
			
--- a/publish.js
+++ b/publish.js
@@ -1,65 +0,0 @@
 
				-#!/usr/bin/env node
			
 
				-const { execSync } = require('child_process');
			
 
				-const fs = require('fs');
			
 
				-const path = require('path');
			
 
				-const readline = require('readline');
			
 
				-
			
 
				-const PKG_PATH = path.join(__dirname, 'package.json');
			
 
				-const pkg = JSON.parse(fs.readFileSync(PKG_PATH, 'utf-8'));
			
 
				-const [major, minor, patch] = pkg.version.split('.').map(Number);
			
 
				-
			
 
				-const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
			
 
				-
			
 
				-function ask(question) {
			
 
				-  return new Promise((resolve) => rl.question(question, resolve));
			
 
				-}
			
 
				-
			
 
				-async function main() {
			
 
				-  console.log(`\nCurrent version: ${pkg.version}\n`);
			
 
				-  console.log('  1) patch  -> ' + `${major}.${minor}.${patch + 1}`);
			
 
				-  console.log('  2) minor  -> ' + `${major}.${minor + 1}.0`);
			
 
				-  console.log('  3) major  -> ' + `${major + 1}.0.0`);
			
 
				-  console.log('');
			
 
				-
			
 
				-  const choice = await ask('Bump version (1/2/3): ');
			
 
				-
			
 
				-  let bump;
			
 
				-  switch (choice.trim()) {
			
 
				-    case '1': bump = 'patch'; break;
			
 
				-    case '2': bump = 'minor'; break;
			
 
				-    case '3': bump = 'major'; break;
			
 
				-    default:
			
 
				-      console.log('Invalid choice. Exiting.');
			
 
				-      rl.close();
			
 
				-      process.exit(1);
			
 
				-  }
			
 
				-
			
 
				-  // Bump version in package.json
			
 
				-  execSync(`npm version ${bump} --no-git-tag-version`, { stdio: 'inherit' });
			
 
				-
			
 
				-  const updated = JSON.parse(fs.readFileSync(PKG_PATH, 'utf-8'));
			
 
				-  console.log(`\nVersion bumped to ${updated.version}`);
			
 
				-
			
 
				-  const confirm = await ask(`Publish ${updated.name}@${updated.version} to npm? (y/n): `);
			
 
				-  if (confirm.trim().toLowerCase() !== 'y') {
			
 
				-    console.log('Aborted.');
			
 
				-    rl.close();
			
 
				-    process.exit(0);
			
 
				-  }
			
 
				-
			
 
				-  // Build and publish
			
 
				-  console.log('\nBuilding...');
			
 
				-  execSync('npm run build', { stdio: 'inherit' });
			
 
				-
			
 
				-  console.log('\nPublishing...');
			
 
				-  execSync('npm publish --access public', { stdio: 'inherit' });
			
 
				-
			
 
				-  console.log(`\nPublished ${updated.name}@${updated.version}`);
			
 
				-  rl.close();
			
 
				-}
			
 
				-
			
 
				-main().catch((err) => {
			
 
				-  console.error(err);
			
 
				-  rl.close();
			
 
				-  process.exit(1);
			
 
				-});
			
--- a/run-interactive-test.md
+++ b/run-interactive-test.md
@@ -0,0 +1,131 @@
 
				+# Running the agent-behavior test (how agents actually use codegraph)
			
 
				+
			
 
				+This explains how to measure **how a Claude Code agent uses the codegraph MCP
			
 
				+tools** on a real repo — which tools it calls (does it lead with
			
 
				+`codegraph_explore`?), how many follow-up `Read`/`Grep`s it does, and the token
			
 
				+cost. Use it when changing tool guidance (`server-instructions.ts`,
			
 
				+`instructions-template.ts`, tool descriptions) or retrieval, to verify the
			
 
				+change actually shifts agent behavior.
			
 
				+
			
 
				+Scripts live in `scripts/agent-eval/`.
			
 
				+
			
 
				+## Why two harnesses (read this first)
			
 
				+
			
 
				+| | Interactive (`itrun.sh`) | Headless (`run-agent.sh`) |
			
 
				+|---|---|---|
			
 
				+| Drives | the real TUI via tmux | `claude -p` print mode |
			
 
				+| Subagent it picks | **Explore** (matches real UX) | general-purpose (diverges) |
			
 
				+| Metrics | tool breakdown (from session logs) + `Done(…)` token summary | exact per-tool calls + tokens/cost (stream-json) |
			
 
				+| Cost | Claude Max subscription | API $ (`total_cost_usd`) |
			
 
				+
			
 
				+**Headless `claude -p` does NOT reproduce what users see** — it silently picks
			
 
				+the general-purpose subagent, while interactive sessions delegate to the
			
 
				+read-first **Explore** subagent. So for "what does my session actually do," use
			
 
				+the interactive harness. For a clean per-tool/token breakdown in one shot, use
			
 
				+headless (and ask for the Explore subagent in the prompt if you want that path).
			
 
				+
			
 
				+## Prerequisites
			
 
				+
			
 
				+- **tmux 3.0+**
			
 
				+- A logged-in `claude` CLI (Claude Max or API).
			
 
				+- codegraph configured as an MCP server (`claude mcp list` shows `codegraph`).
			
 
				+  The interactive harness uses your global config, so it runs whatever
			
 
				+  `codegraph` resolves to — point that at your dev build (`npm link` / the
			
 
				+  symlinked global) to test local changes.
			
 
				+- A target repo, cloned and indexed:
			
 
				+  ```bash
			
 
				+  git clone --depth 1 https://github.com/square/okhttp /tmp/corpus/okhttp
			
 
				+  cd /tmp/corpus/okhttp && codegraph init -i
			
 
				+  ```
			
 
				+  Good scale spread for a sweep: Alamofire (~100 files), Excalidraw (~600),
			
 
				+  OkHttp (~640), VS Code (~10k).
			
 
				+
			
 
				+## Interactive test (the faithful one)
			
 
				+
			
 
				+```bash
			
 
				+scripts/agent-eval/itrun.sh <repo-path> <label> "<question>"
			
 
				+```
			
 
				+
			
 
				+Example:
			
 
				+```bash
			
 
				+scripts/agent-eval/itrun.sh /tmp/corpus/vscode vscode \
			
 
				+  "How does the extension host communicate with the main process?"
			
 
				+```
			
 
				+
			
 
				+It opens `claude` in a tmux session, types the question, waits for the agent to
			
 
				+finish, then prints:
			
 
				+- the `Done (N tool uses · Xk tokens · Ym)` subagent summary (from the pane),
			
 
				+- the `Context Xk/1.0M` main-session size,
			
 
				+- a **tool breakdown** parsed from the session logs (main + subagents), ending
			
 
				+  in a `VERDICT: codegraph_explore used Nx | Read N | Grep/Bash N` line.
			
 
				+
			
 
				+### Startup robustness (so unattended runs don't silently no-op)
			
 
				+
			
 
				+Two things bite an unattended driver before the prompt even runs:
			
 
				+- **The `❯` glyph is drawn ~6s before the input accepts keystrokes.** Waiting
			
 
				+  for `❯` is necessary but not sufficient. The harness sends the prompt, then
			
 
				+  **verifies a chunk of it actually landed in the input box**, retrying until it
			
 
				+  does — so it can't type into a not-yet-live input and submit nothing.
			
 
				+- **First time claude opens a repo it shows "Is this a project you trust?"**
			
 
				+  (which also contains `❯`). The harness detects that dialog and presses Enter
			
 
				+  to accept it before typing.
			
 
				+
			
 
				+If the prompt never lands or work never starts, the harness now **fails loudly**
			
 
				+(non-zero exit) instead of capturing an empty pane and reporting a bogus run.
			
 
				+
			
 
				+### How completion is detected (the tricky part)
			
 
				+
			
 
				+Claude's TUI redraws in place, so you can't just wait for output to stop. The
			
 
				+harness polls `tmux capture-pane` and treats the pane as **busy** when it shows
			
 
				+the spinner's elapsed-time-in-parens — `(8s · …)` / `(1m 3s · …)`, matched by
			
 
				+`\(([0-9]+m )?[0-9]+s ·`. That's the *universal* working signal: it shows during
			
 
				+the pre-stream **thinking** phase (`(8s · thinking with max effort)`, which has
			
 
				+no token arrow yet) *and* during streaming. The `↓ N`/`↑ N` token arrow,
			
 
				+`esc to interrupt`, and `Initializing…` are OR'd in as belt-and-braces (some TUI
			
 
				+versions show one but not the others). It declares **idle** when the `❯` prompt
			
 
				+is present and not busy for 10 consecutive polls (~5s, long enough to ride out
			
 
				+mid-conversation thinking gaps that briefly drop the spinner). (Technique
			
 
				+adapted from devpit's `WaitForIdle`.)
			
 
				+
			
 
				+### Where the breakdown comes from
			
 
				+
			
 
				+`parse-session.mjs` reads the newest session log under
			
 
				+`~/.claude/projects/<escaped-cwd>/<session>.jsonl` and its subagent transcripts
			
 
				+under `<session>/subagents/*.jsonl`. The **subagent** file is where the real
			
 
				+tool calls are — the main log only shows the `Agent` delegation. You can run it
			
 
				+standalone:
			
 
				+```bash
			
 
				+node scripts/agent-eval/parse-session.mjs /tmp/corpus/vscode
			
 
				+```
			
 
				+
			
 
				+## Headless test (clean tokens, forceable Explore path)
			
 
				+
			
 
				+```bash
			
 
				+scripts/agent-eval/run-agent.sh <repo-path> <label> "<question>"
			
 
				+```
			
 
				+Writes stream-json and prints the tool sequence + exact tokens/cost. To
			
 
				+reproduce the Explore-subagent path headlessly, ask for it:
			
 
				+`"Use an Explore subagent to investigate, then answer: …"`.
			
 
				+
			
 
				+## Running a sweep
			
 
				+
			
 
				+Single runs vary a lot (the VS Code question has ranged 26–37 tool uses /
			
 
				+88–105k tokens across runs). For a real signal, run N≥3 and take the median:
			
 
				+```bash
			
 
				+for i in 1 2 3; do
			
 
				+  scripts/agent-eval/itrun.sh /tmp/corpus/vscode "vscode-$i" "<question>"
			
 
				+done
			
 
				+```
			
 
				+
			
 
				+## What "good" looks like
			
 
				+
			
 
				+After the explore-first guidance (PR #191), an understanding question should
			
 
				+show the agent **leading with `codegraph_explore`** and using `search`/`node`
			
 
				+to fill gaps — not a wall of `Read`/`Grep`. Example faithful run:
			
 
				+`VERDICT: codegraph_explore used 3x | Read 8 | Grep/Bash 1`. If `explore` is 0
			
 
				+and `Read`/`Grep` dominate, the guidance regressed.
			
 
				+
			
 
				+## Output artifacts
			
 
				+
			
 
				+Transcripts and logs go to `$AGENT_EVAL_OUT` (default `/tmp/agent-eval/`):
			
 
				+`itrun-<label>.txt` (pane capture), `run-<label>.jsonl` (headless stream-json).
			
--- a/scripts/agent-eval/audit.sh
+++ b/scripts/agent-eval/audit.sh
@@ -0,0 +1,68 @@
 
				+#!/usr/bin/env bash
			
 
				+# One-shot CodeGraph quality audit:
			
 
				+#   set version -> ensure corpus repo -> wipe+reindex with that version ->
			
 
				+#   run with/without A/B -> restore the local dev link.
			
 
				+#
			
 
				+# Usage: audit.sh <version> <repo-name> <repo-url> "<question>" [headless|tmux|all]
			
 
				+#   <version>    "local" (build + npm link this repo) | "latest" | a version (e.g. 0.7.10)
			
 
				+#   <repo-name>  dir name under the corpus dir
			
 
				+#   <repo-url>   git URL (cloned --depth 1 when the repo dir is missing)
			
 
				+#   [mode]       headless (default) | tmux (interactive tmux arms only) | all (headless + interactive tmux arms)
			
 
				+# Env: CORPUS  corpus dir (default: /tmp/codegraph-corpus)
			
 
				+set -uo pipefail
			
 
				+
			
 
				+VERSION="${1:?usage: audit.sh <version> <repo-name> <repo-url> \"<question>\" [mode]}"
			
 
				+NAME="${2:?repo-name required}"
			
 
				+URL="${3:?repo-url required}"
			
 
				+Q="${4:?question required}"
			
 
				+MODE="${5:-headless}"
			
 
				+
			
 
				+HARNESS="$(cd "$(dirname "$0")" && pwd)"
			
 
				+REPO_ROOT="$(cd "$HARNESS/../.." && pwd)"     # codegraph repo root
			
 
				+CORPUS="${CORPUS:-/tmp/codegraph-corpus}"
			
 
				+REPO="$CORPUS/$NAME"
			
 
				+PKG="@colbymchenry/codegraph"
			
 
				+
			
 
				+echo "==================== CodeGraph audit ===================="
			
 
				+echo "version=$VERSION  repo=$NAME  mode=$MODE  corpus=$CORPUS"
			
 
				+echo
			
 
				+
			
 
				+# 1. Set the codegraph version under test (mutates the global install).
			
 
				+if [ "$VERSION" = local ]; then
			
 
				+  echo "→ [1/4] building + linking local dev build (local-install.sh)"
			
 
				+  ( cd "$REPO_ROOT" && ./scripts/local-install.sh ) || { echo "local-install.sh failed"; exit 1; }
			
 
				+else
			
 
				+  echo "→ [1/4] installing $PKG@$VERSION globally"
			
 
				+  npm install -g "$PKG@$VERSION" || { echo "npm install -g $PKG@$VERSION failed"; exit 1; }
			
 
				+fi
			
 
				+ACTUAL="$(codegraph --version 2>/dev/null || echo '?')"
			
 
				+echo "  codegraph on PATH: $(command -v codegraph) -> $ACTUAL"
			
 
				+
			
 
				+# 2. Ensure the corpus repo exists (clone shallow if missing, reuse if present).
			
 
				+mkdir -p "$CORPUS"
			
 
				+if [ -d "$REPO/.git" ]; then
			
 
				+  echo "→ [2/4] reusing existing checkout: $REPO"
			
 
				+else
			
 
				+  echo "→ [2/4] cloning $URL"
			
 
				+  git clone --depth 1 "$URL" "$REPO" || { echo "git clone failed"; exit 1; }
			
 
				+fi
			
 
				+
			
 
				+# 3. Wipe + re-index with THIS version (the index must be built by the same
			
 
				+#    binary that serves it — different versions extract differently).
			
 
				+echo "→ [3/4] wiping .codegraph and re-indexing with $ACTUAL"
			
 
				+rm -rf "$REPO/.codegraph"
			
 
				+( cd "$REPO" && codegraph init -i ) || { echo "indexing failed"; exit 1; }
			
 
				+
			
 
				+# 4. Run the with/without A/B.
			
 
				+echo "→ [4/4] running A/B harness (mode=$MODE)"
			
 
				+bash "$HARNESS/run-all.sh" "$REPO" "$Q" "$MODE"
			
 
				+
			
 
				+# Restore the dev link (the normal working state in this repo).
			
 
				+echo
			
 
				+echo "→ restoring local dev link (local-install.sh)"
			
 
				+if ( cd "$REPO_ROOT" && ./scripts/local-install.sh >/dev/null 2>&1 ); then
			
 
				+  echo "  global codegraph restored to dev build"
			
 
				+else
			
 
				+  echo "  WARN: restore failed — run ./scripts/local-install.sh manually"
			
 
				+fi
			
 
				+echo "==================== audit complete ===================="
			
--- a/scripts/agent-eval/itrun.sh
+++ b/scripts/agent-eval/itrun.sh
@@ -0,0 +1,107 @@
 
				+#!/usr/bin/env bash
			
 
				+# Drive an INTERACTIVE Claude Code session in tmux, send a prompt, wait for the
			
 
				+# agent to finish, then print the tool-call breakdown from the session logs.
			
 
				+#
			
 
				+# Why interactive (not `claude -p`): headless print-mode picks the
			
 
				+# general-purpose subagent, while real interactive sessions delegate to the
			
 
				+# Explore subagent (or drive codegraph from the main thread). Only the
			
 
				+# interactive TUI reproduces the behavior users actually see. (Idle-detection
			
 
				+# technique borrowed from devpit's WaitForIdle.)
			
 
				+#
			
 
				+# Usage: itrun.sh <repo-path> <label> "<prompt>"
			
 
				+# Output dir: $AGENT_EVAL_OUT (default /tmp/agent-eval)
			
 
				+# Requires: tmux 3.0+, a logged-in `claude` CLI, codegraph MCP configured.
			
 
				+set -uo pipefail
			
 
				+REPO="$1"; LABEL="$2"; PROMPT="$3"
			
 
				+SESSION="cgt_${LABEL}"
			
 
				+OUT_DIR="${AGENT_EVAL_OUT:-/tmp/agent-eval}"; mkdir -p "$OUT_DIR"
			
 
				+OUT="$OUT_DIR/itrun-${LABEL}.txt"
			
 
				+HERE="$(cd "$(dirname "$0")" && pwd)"
			
 
				+
			
 
				+cap() { tmux capture-pane -p -t "$SESSION" -S -40; }
			
 
				+
			
 
				+tmux kill-session -t "$SESSION" 2>/dev/null
			
 
				+
			
 
				+# Wide pane so the TUI doesn't hard-wrap tool lines.
			
 
				+tmux new-session -d -s "$SESSION" -x 230 -y 60
			
 
				+# The command line below is typed into the pane's shell, so the repo path must
			
 
				+# be shell-quoted or a path with spaces/metacharacters breaks the `cd`.
			
 
				+# CLAUDE_EXTRA_ARGS is deliberately left unquoted: it carries several flags.
			
 
				+repo_q="$(printf '%q' "$REPO")"
			
 
				+tmux send-keys -t "$SESSION" "cd $repo_q && claude --dangerously-skip-permissions ${CLAUDE_EXTRA_ARGS:-}" Enter
			
 
				+
			
 
				+# Wait for the ❯ prompt (claude drew its UI), up to 60s. NOTE: ❯ appears on the
			
 
				+# welcome screen seconds before the input actually accepts keystrokes, so this is
			
 
				+# necessary but NOT sufficient — the type-and-verify loop below is what proves
			
 
				+# the input is live.
			
 
				+ready=0
			
 
				+for _ in $(seq 1 120); do
			
 
				+  cap | grep -q "❯" && { ready=1; break; }
			
 
				+  sleep 0.5
			
 
				+done
			
 
				+[ "$ready" = 1 ] || { echo "claude never drew its UI"; cap; tmux kill-session -t "$SESSION" 2>/dev/null; exit 1; }
			
 
				+
			
 
				+# Accept the per-folder "Is this a project you trust?" dialog if it shows (first
			
 
				+# time claude opens a given repo). Option 1 ("Yes, I trust this folder") is
			
 
				+# pre-selected, so Enter accepts. This dialog also contains ❯, so it must be
			
 
				+# cleared before the type-and-verify loop or keystrokes land on the menu.
			
 
				+for _ in $(seq 1 20); do
			
 
				+  cap | grep -q "trust this folder" || break
			
 
				+  tmux send-keys -t "$SESSION" Enter
			
 
				+  sleep 1
			
 
				+done
			
 
				+
			
 
				+# Type-and-verify: send the prompt, confirm a distinctive chunk of it actually
			
 
				+# landed in the input box, retry if it didn't (handles the early-❯ race where
			
 
				+# the welcome screen shows the prompt glyph but MCP init is still eating keys).
			
 
				+needle="${PROMPT:0:24}"
			
 
				+typed=0
			
 
				+for _ in $(seq 1 30); do
			
 
				+  tmux send-keys -l -t "$SESSION" "$PROMPT"
			
 
				+  sleep 1
			
 
				+  if cap | grep -Fq "$needle"; then typed=1; break; fi
			
 
				+  # Clear whatever partial text may have landed, then retry.
			
 
				+  tmux send-keys -t "$SESSION" C-u
			
 
				+  sleep 1
			
 
				+done
			
 
				+[ "$typed" = 1 ] || { echo "prompt never landed in the input box"; cap; tmux kill-session -t "$SESSION" 2>/dev/null; exit 1; }
			
 
				+sleep 0.5
			
 
				+tmux send-keys -t "$SESSION" Enter
			
 
				+
			
 
				+# Busy signals. The robust one is the spinner's elapsed-time-in-parens, which
			
 
				+# EVERY working state shows — both the pre-stream thinking phase
			
 
				+# "(8s · thinking with max effort)" and the streaming phase
			
 
				+# "(24s · ↑ 2.5k tokens · …)", and it survives the 32s→"1m 3s" rollover. We OR
			
 
				+# in the token arrows, "esc to interrupt", and "Initializing" as belt-and-braces
			
 
				+# (some TUI versions/states show one but not the others).
			
 
				+BUSY_RE='esc to interrupt|↓ [0-9]|↑ [0-9]|Initializing|\(([0-9]+m )?[0-9]+s ·'
			
 
				+
			
 
				+# Wait for work to START (busy indicator appears), up to 60s. If it never starts,
			
 
				+# fail loudly rather than silently reporting an empty run.
			
 
				+started=0
			
 
				+for _ in $(seq 1 120); do
			
 
				+  cap | grep -qE "$BUSY_RE" && { started=1; break; }
			
 
				+  sleep 0.5
			
 
				+done
			
 
				+[ "$started" = 1 ] || { echo "agent never started working"; cap; tmux kill-session -t "$SESSION" 2>/dev/null; exit 1; }
			
 
				+
			
 
				+# Poll for idle: not busy AND ❯ present, for 10 consecutive polls (~5s) to ride
			
 
				+# out mid-conversation thinking gaps that briefly drop the spinner. Up to ~15min
			
 
				+# (1800 polls x 0.5s). If the budget runs out we still capture the pane, but say
			
 
				+# so loudly instead of silently reporting a partial run as if it had finished.
			
 
				+consec=0
			
 
				+idle=0
			
 
				+for _ in $(seq 1 1800); do
			
 
				+  pane=$(cap)
			
 
				+  if echo "$pane" | grep -qE "$BUSY_RE"; then
			
 
				+    consec=0
			
 
				+  elif echo "$pane" | grep -q "❯"; then
			
 
				+    consec=$((consec+1)); [ "$consec" -ge 10 ] && { idle=1; break; }
			
 
				+  else
			
 
				+    consec=0
			
 
				+  fi
			
 
				+  sleep 0.5
			
 
				+done
			
 
				+[ "$idle" = 1 ] || echo "WARN: agent still not idle after ~15min — capturing a partial run"
			
 
				+sleep 1
			
 
				+
			
 
				+tmux capture-pane -p -t "$SESSION" -S - > "$OUT"
			
 
				+echo "captured $(wc -l < "$OUT") lines -> $OUT"
			
 
				+grep -oE "Done \([^)]*\)" "$OUT" | tail -1
			
 
				+grep -oE "[0-9.]+k?/[0-9.]+M" "$OUT" | tail -1 | sed 's/^/Context /'
			
 
				+tmux kill-session -t "$SESSION" 2>/dev/null
			
 
				+
			
 
				+# Clean tool breakdown from the session logs (main + subagents).
			
 
				+node "$HERE/parse-session.mjs" "$REPO" 2>/dev/null || true
			
--- a/scripts/agent-eval/parse-run.mjs
+++ b/scripts/agent-eval/parse-run.mjs
@@ -0,0 +1,45 @@
 
				+#!/usr/bin/env node
			
 
				+// Parse a Claude Code stream-json run log: tool-call sequence + token usage.
			
 
				+import { readFileSync } from 'fs';
			
 
				+const file = process.argv[2];
			
 
				+const lines = readFileSync(file, 'utf8').split('\n').filter(Boolean);
			
 
				+
			
 
				+const toolCalls = [];
			
 
				+let result = null;
			
 
				+let initTools = null;
			
 
				+
			
 
				+for (const line of lines) {
			
 
				+  let ev;
			
 
				+  try { ev = JSON.parse(line); } catch { continue; }
			
 
				+  if (ev.type === 'system' && ev.subtype === 'init') {
			
 
				+    initTools = (ev.tools || []).filter(t => /codegraph/.test(t));
			
 
				+  }
			
 
				+  if (ev.type === 'assistant' && ev.message?.content) {
			
 
				+    for (const block of ev.message.content) {
			
 
				+      if (block.type === 'tool_use') {
			
 
				+        let detail = '';
			
 
				+        if (block.name === 'Task') detail = ` [subagent_type=${block.input?.subagent_type ?? '?'}] ${(block.input?.description ?? '').slice(0,40)}`;
			
 
				+        else if (/codegraph/.test(block.name)) detail = ` ${JSON.stringify(block.input?.query ?? block.input?.task ?? block.input?.symbol ?? '').slice(0,60)}`;
			
 
				+        else if (block.name === 'Bash') detail = ` ${(block.input?.command ?? '').slice(0,50)}`;
			
 
				+        else if (block.name === 'Read') detail = ` ${(block.input?.file_path ?? '').split('/').slice(-1)[0]}`;
			
 
				+        toolCalls.push(`${block.name}${detail}`);
			
 
				+      }
			
 
				+    }
			
 
				+  }
			
 
				+  if (ev.type === 'result') result = ev;
			
 
				+}
			
 
				+
			
 
				+console.log(`\n=== ${file.split('/').pop()} ===`);
			
 
				+console.log(`codegraph tools exposed: ${initTools ? initTools.length : '?'}`);
			
 
				+console.log(`\nTool calls (${toolCalls.length}):`);
			
 
				+const counts = {};
			
 
				+for (const tc of toolCalls) { const n = tc.split(' ')[0]; counts[n] = (counts[n]||0)+1; }
			
 
				+console.log('  by type:', JSON.stringify(counts));
			
 
				+toolCalls.forEach((tc, i) => console.log(`  ${i+1}. ${tc}`));
			
 
				+
			
 
				+if (result) {
			
 
				+  const u = result.usage || {};
			
 
				+  const totalIn = (u.input_tokens||0) + (u.cache_read_input_tokens||0) + (u.cache_creation_input_tokens||0);
			
 
				+  console.log(`\nResult: ${result.subtype} | duration ${(result.duration_ms/1000).toFixed(0)}s | turns ${result.num_turns}`);
			
 
				+  console.log(`  tokens: in=${totalIn} out=${u.output_tokens||0} | cost $${(result.total_cost_usd||0).toFixed(3)}`);
			
 
				+}
			
--- a/scripts/agent-eval/parse-session.mjs
+++ b/scripts/agent-eval/parse-session.mjs
@@ -0,0 +1,93 @@
 
				+#!/usr/bin/env node
			
 
				+// Parse the newest Claude Code session log for a project + its subagent logs,
			
 
				+// and report the tool-call breakdown (main + subagents). Works for interactive
			
 
				+// runs (driven via itrun.sh) — Claude Code writes full transcripts to
			
 
				+// ~/.claude/projects/<escaped-cwd>/<session>.jsonl with subagents/ alongside.
			
 
				+import { readFileSync, readdirSync, statSync, existsSync, realpathSync } from 'fs';
			
 
				+import { join } from 'path';
			
 
				+import { homedir } from 'os';
			
 
				+
			
 
				+const projectArg = process.argv[2];
			
 
				+if (!projectArg) { console.error('usage: parse-session.mjs <project-dir>'); process.exit(1); }
			
 
				+
			
 
				+// Claude Code escapes the (real) cwd by replacing every "/" with "-".
			
 
				+const real = realpathSync(projectArg);
			
 
				+const escaped = real.replace(/\//g, '-');
			
 
				+const projDir = join(homedir(), '.claude', 'projects', escaped);
			
 
				+if (!existsSync(projDir)) { console.error('no session logs at', projDir); process.exit(1); }
			
 
				+
			
 
				+// Newest top-level session .jsonl
			
 
				+const sessions = readdirSync(projDir)
			
 
				+  .filter(f => f.endsWith('.jsonl'))
			
 
				+  .map(f => ({ f, m: statSync(join(projDir, f)).mtimeMs }))
			
 
				+  .sort((a, b) => b.m - a.m);
			
 
				+if (sessions.length === 0) { console.error('no .jsonl sessions in', projDir); process.exit(1); }
			
 
				+const sessionId = sessions[0].f.replace('.jsonl', '');
			
 
				+
			
 
				+function tally(file) {
			
 
				+  const counts = {};
			
 
				+  for (const line of readFileSync(file, 'utf8').split('\n')) {
			
 
				+    if (!line) continue;
			
 
				+    let ev; try { ev = JSON.parse(line); } catch { continue; }
			
 
				+    const content = ev.message?.content;
			
 
				+    if (!Array.isArray(content)) continue;
			
 
				+    for (const b of content) {
			
 
				+      if (b.type === 'tool_use') counts[b.name] = (counts[b.name] || 0) + 1;
			
 
				+    }
			
 
				+  }
			
 
				+  return counts;
			
 
				+}
			
 
				+
			
 
				+// Sum token usage from a transcript. The TUI's "Done (…Xk tokens…)" line only
			
 
				+// covers a subagent's throughput; this works for main-thread runs too and is
			
 
				+// consistent across both paths. `gen` = output, `fresh` = uncached input
			
 
				+// (input + cache_creation), `cached` = cache reads (≈free), `total` = all.
			
 
				+function sumTokens(file) {
			
 
				+  const t = { gen: 0, fresh: 0, cached: 0 };
			
 
				+  for (const line of readFileSync(file, 'utf8').split('\n')) {
			
 
				+    if (!line) continue;
			
 
				+    let ev; try { ev = JSON.parse(line); } catch { continue; }
			
 
				+    const u = ev.message?.usage;
			
 
				+    if (!u) continue;
			
 
				+    t.gen += u.output_tokens || 0;
			
 
				+    t.fresh += (u.input_tokens || 0) + (u.cache_creation_input_tokens || 0);
			
 
				+    t.cached += u.cache_read_input_tokens || 0;
			
 
				+  }
			
 
				+  return t;
			
 
				+}
			
 
				+
			
 
				+const mainCounts = tally(join(projDir, sessionId + '.jsonl'));
			
 
				+
			
 
				+// Subagent transcripts live under <session>/subagents/*.jsonl
			
 
				+const subDir = join(projDir, sessionId, 'subagents');
			
 
				+const subCounts = {};
			
 
				+let subAgentFiles = 0;
			
 
				+if (existsSync(subDir)) {
			
 
				+  for (const f of readdirSync(subDir).filter(f => f.endsWith('.jsonl'))) {
			
 
				+    subAgentFiles++;
			
 
				+    const c = tally(join(subDir, f));
			
 
				+    for (const [k, v] of Object.entries(c)) subCounts[k] = (subCounts[k] || 0) + v;
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+const fmt = (counts) => Object.entries(counts).sort((a, b) => b[1] - a[1])
			
 
				+  .map(([k, v]) => `    ${String(v).padStart(3)}  ${k}`).join('\n') || '    (none)';
			
 
				+
			
 
				+console.log(`session: ${sessionId}`);
			
 
				+console.log(`\nMAIN thread tools:\n${fmt(mainCounts)}`);
			
 
				+console.log(`\nSUBAGENT tools (${subAgentFiles} subagent transcript${subAgentFiles === 1 ? '' : 's'}):\n${fmt(subCounts)}`);
			
 
				+
			
 
				+const explore = subCounts['mcp__codegraph__codegraph_explore'] || mainCounts['mcp__codegraph__codegraph_explore'] || 0;
			
 
				+const reads = (subCounts['Read'] || 0) + (mainCounts['Read'] || 0);
			
 
				+const greps = (subCounts['Grep'] || 0) + (mainCounts['Grep'] || 0) + (subCounts['Bash'] || 0) + (mainCounts['Bash'] || 0);
			
 
				+console.log(`\nVERDICT: codegraph_explore used ${explore}x | Read ${reads} | Grep/Bash ${greps}`);
			
 
				+
			
 
				+// Token totals (main + subagents), consistent across main-thread and subagent runs.
			
 
				+const tok = { gen: 0, fresh: 0, cached: 0 };
			
 
				+const addTok = (t) => { tok.gen += t.gen; tok.fresh += t.fresh; tok.cached += t.cached; };
			
 
				+addTok(sumTokens(join(projDir, sessionId + '.jsonl')));
			
 
				+if (existsSync(subDir)) {
			
 
				+  for (const f of readdirSync(subDir).filter(f => f.endsWith('.jsonl'))) addTok(sumTokens(join(subDir, f)));
			
 
				+}
			
 
				+const k = (n) => (n / 1000).toFixed(1) + 'k';
			
 
				+console.log(`TOKENS: gen ${k(tok.gen)} | fresh-in ${k(tok.fresh)} | cached-in ${k(tok.cached)} | billable≈ ${k(tok.gen + tok.fresh)}`);
			
--- a/scripts/agent-eval/run-agent.sh
+++ b/scripts/agent-eval/run-agent.sh
@@ -0,0 +1,34 @@
 
				+#!/usr/bin/env bash
			
 
				+# Headless Claude Code run against a repo with codegraph MCP, capturing the
			
 
				+# full stream-json so we can see tool calls + token usage. Complements the
			
 
				+# interactive itrun.sh: headless gives a clean per-tool breakdown + exact
			
 
				+# tokens/cost, but defaults to the general-purpose subagent (not Explore).
			
 
				+# To force the Explore path, ask for it in the prompt.
			
 
				+#
			
 
				+# Usage: run-agent.sh <repo-path> <label> "<prompt>"
			
 
				+# Env: AGENT_EVAL_OUT (default /tmp/agent-eval), CG_BIN (codegraph dist binary)
			
 
				+set -uo pipefail
			
 
				+
			
 
				+REPO="$1"; LABEL="$2"; PROMPT="$3"
			
 
				+CG_BIN="${CG_BIN:-$(command -v codegraph || echo /usr/local/bin/codegraph)}"
			
 
				+OUT_DIR="${AGENT_EVAL_OUT:-/tmp/agent-eval}"; mkdir -p "$OUT_DIR"
			
 
				+OUT="$OUT_DIR/run-${LABEL}.jsonl"
			
 
				+
			
 
				+MCP_CONFIG=$(cat <<JSON
			
 
				+{"mcpServers":{"codegraph":{"command":"${CG_BIN}","args":["serve","--mcp","--path","${REPO}"]}}}
			
 
				+JSON
			
 
				+)
			
 
				+
			
 
				+echo "→ running [$LABEL] in $REPO"
			
 
				+cd "$REPO" || exit 1
			
 
				+
			
 
				+claude -p "$PROMPT" \
			
 
				+  --output-format stream-json --verbose \
			
 
				+  --permission-mode bypassPermissions \
			
 
				+  --model opus \
			
 
				+  --max-budget-usd 2 \
			
 
				+  --strict-mcp-config --mcp-config "$MCP_CONFIG" \
			
 
				+  > "$OUT" 2>"$OUT_DIR/run-${LABEL}.err"
			
 
				+
			
 
				+echo "exit: $? | wrote $OUT ($(wc -l < "$OUT") lines)"
			
 
				+node "$(cd "$(dirname "$0")" && pwd)/parse-run.mjs" "$OUT" 2>/dev/null || true
			
--- a/scripts/agent-eval/run-all.sh
+++ b/scripts/agent-eval/run-all.sh
@@ -0,0 +1,67 @@
 
				+#!/usr/bin/env bash
			
 
				+# With/without A/B (and optional interactive) eval for a codegraph version on a
			
 
				+# repo. Codegraph is the ONLY variable: both arms launch claude with
			
 
				+# --strict-mcp-config — with = codegraph-only MCP (pointed at $CG_BIN),
			
 
				+# without = empty MCP. Built-in Read/Grep/Bash stay available in both arms.
			
 
				+#
			
 
				+# Usage: run-all.sh <repo-path> "<question>" [headless|tmux|all]
			
 
				+# Env:   CG_BIN          codegraph binary (default: command -v codegraph)
			
 
				+#        AGENT_EVAL_OUT  output dir (default: /tmp/agent-eval)
			
 
				+set -uo pipefail
			
 
				+
			
 
				+# Positional args: repo path and question are required; mode defaults to headless.
			
 
				+REPO="${1:?usage: run-all.sh <repo-path> \"<question>\" [headless|tmux|all]}"
			
 
				+Q="${2:?question required}"
			
 
				+MODE="${3:-headless}"
			
 
				+CG_BIN="${CG_BIN:-$(command -v codegraph)}"
			
 
				+OUT="${AGENT_EVAL_OUT:-/tmp/agent-eval}"
			
 
				+HARNESS="$(cd "$(dirname "$0")" && pwd)"
			
 
				+mkdir -p "$OUT"
			
 
				+
			
 
				+[ -n "$CG_BIN" ] || { echo "no codegraph binary on PATH (set CG_BIN)"; exit 1; }
			
 
				+[ -d "$REPO/.codegraph" ] || { echo "no .codegraph index at $REPO — index it first"; exit 1; }
			
 
				+case "$MODE" in headless|tmux|all) ;; *) echo "mode must be headless|tmux|all (got '$MODE')"; exit 1;; esac
			
 
				+
			
 
				+# Make both paths absolute: the arms below `cd` into the repo before launching
			
 
				+# claude, so a relative output dir or repo path would otherwise resolve against
			
 
				+# the wrong directory (redirects, --mcp-config, and the MCP server's --path).
			
 
				+REPO="$(cd "$REPO" && pwd)" || { echo "cannot resolve repo path"; exit 1; }
			
 
				+OUT="$(cd "$OUT" && pwd)" || { echo "cannot resolve output dir"; exit 1; }
			
 
				+
			
 
				+# MCP config files (path form avoids inline-JSON quoting through tmux).
			
 
				+cat > "$OUT/mcp-codegraph.json" <<JSON
			
 
				+{"mcpServers":{"codegraph":{"command":"$CG_BIN","args":["serve","--mcp","--path","$REPO"]}}}
			
 
				+JSON
			
 
				+echo '{"mcpServers":{}}' > "$OUT/mcp-empty.json"
			
 
				+
			
 
				+echo "###### codegraph: $CG_BIN"
			
 
				+echo "###### repo:      $REPO"
			
 
				+echo "###### question:  $Q"
			
 
				+echo
			
 
				+
			
 
				+# Headless arm: claude -p with stream-json -> exact tool sequence + tokens/cost.
			
 
				+headless() {
			
 
				+  local label="$1" cfg="$2"
			
 
				+  echo "############################## HEADLESS [$label] ##############################"
			
 
				+  ( cd "$REPO" && claude -p "$Q" \
			
 
				+      --output-format stream-json --verbose \
			
 
				+      --permission-mode bypassPermissions \
			
 
				+      --model opus \
			
 
				+      --max-budget-usd 4 \
			
 
				+      --strict-mcp-config --mcp-config "$cfg" \
			
 
				+      > "$OUT/run-$label.jsonl" 2>"$OUT/run-$label.err" )
			
 
				+  echo "exit $? -> $OUT/run-$label.jsonl ($(wc -l < "$OUT/run-$label.jsonl" | tr -d ' ') lines)"
			
 
				+  tail -2 "$OUT/run-$label.err" 2>/dev/null
			
 
				+  node "$HARNESS/parse-run.mjs" "$OUT/run-$label.jsonl" 2>&1 || true
			
 
				+  echo
			
 
				+}
			
 
				+
			
 
				+if [ "$MODE" = headless ] || [ "$MODE" = all ]; then
			
 
				+  headless "headless-with"    "$OUT/mcp-codegraph.json"
			
 
				+  headless "headless-without" "$OUT/mcp-empty.json"
			
 
				+fi
			
 
				+
			
 
				+if [ "$MODE" = tmux ] || [ "$MODE" = all ]; then
			
 
				+  echo "############################## INTERACTIVE [with] ##############################"
			
 
				+  CLAUDE_EXTRA_ARGS="--model opus --strict-mcp-config --mcp-config $OUT/mcp-codegraph.json" \
			
 
				+    bash "$HARNESS/itrun.sh" "$REPO" "int-with" "$Q" 2>&1 || echo "[itrun WITH failed]"
			
 
				+  echo
			
 
				+  echo "############################## INTERACTIVE [without] ##############################"
			
 
				+  CLAUDE_EXTRA_ARGS="--model opus --strict-mcp-config --mcp-config $OUT/mcp-empty.json" \
			
 
				+    bash "$HARNESS/itrun.sh" "$REPO" "int-without" "$Q" 2>&1 || echo "[itrun WITHOUT failed]"
			
 
				+  echo
			
 
				+fi
			
 
				+echo "############################## RUN-ALL COMPLETE ##############################"