2 minggu lalu · 28c5268ee3
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -137,6 +137,7 @@ For each **language × framework**, validate on **small, medium, and large** rea
 
				 1. **Pick the canonical flow** for the framework ("how does X reach Y": state→render, request→handler→view, query→SQL, action→reducer→store…).
			
 
				 2. **Deterministic probes** (`scripts/agent-eval/probe-{node,explore}.mjs` against the built `dist/`): `codegraph_explore` with the flow's symbol names connects from→to end-to-end with no break (its Flow section shows the path); **no node explosion** (`select count(*) from nodes` stable before/after re-index); synthesized-edge **precision** spot-check (`select … where provenance='heuristic'`).
			
 
				 3. **Agent A/B** (`scripts/agent-eval/run-all.sh <repo> "<Q>"`): with vs without codegraph, **≥2 runs/arm** (run-to-run variance is large — never conclude from n=1). Record **duration, total tool calls, Read, Grep**. Optional forced-Read-0 sufficiency proof via the block-read hook (`scripts/agent-eval/hook-settings.json`).
			
 
				+   - **Run agent-evals in a REAL terminal — NEVER nested inside a Claude Code session** (don't spawn `claude -p` from a Bash tool call). The codegraph MCP server is healthy (full `initialize`→`tools/list` handshake ~165ms, daemon and in-process modes both fine), but a nested `claude -p` marks it `status:"pending"` / 0 tools under CPU/timing contention and the agent silently runs with no codegraph — it can connect early in a session, then degrade to consistent failure as nested spawns pile up. `CODEGRAPH_NO_DAEMON=1` and `< /dev/null` do NOT fix it (it's the nested client, not the server). Confirm via `parse-run.mjs` (`codegraph tools exposed: 0` = void run). To isolate a change — **new-build vs baseline-build, both codegraph-on** (vs run-all.sh's with-vs-without) — use `scripts/agent-eval/ab-new-vs-baseline.sh <indexed-repo> "<task>" [baseline-ref]`.
			
 
				 4. **Pass bar:** a normal flow question reaches **~0 Read/Grep within the repo's explore-call budget**, runs **faster** than without-codegraph, and shows **no regression on a control repo**. Record the numbers in `docs/design/dynamic-dispatch-coverage-playbook.md` (the coverage matrix).
			
 
				 
			
 
				 Full playbook + per-mechanism design: `docs/design/dynamic-dispatch-coverage-playbook.md` and `docs/design/callback-edge-synthesis.md`.
			
--- a/scripts/agent-eval/ab-new-vs-baseline.sh
+++ b/scripts/agent-eval/ab-new-vs-baseline.sh
@@ -0,0 +1,83 @@
 
				+#!/usr/bin/env bash
			
 
				+# A/B a codegraph retrieval/steering change: the NEW build (current HEAD) vs a
			
 
				+# BASELINE build (a git ref) — BOTH with codegraph attached — on the same
			
 
				+# implementation task, measuring how many Read vs codegraph calls the agent
			
 
				+# makes. This ISOLATES the change (unlike run-all.sh, which is with-vs-without
			
 
				+# codegraph). The agent works on a throwaway copy of the target, so its edits
			
 
				+# never touch your repos.
			
 
				+#
			
 
				+# *** RUN THIS IN A REAL TERMINAL — NOT nested inside a Claude Code session. ***
			
 
				+# A `claude -p` spawned from within another Claude session (e.g. from a Bash
			
 
				+# tool call) cannot reliably attach the codegraph MCP server: the server is
			
 
				+# healthy (full handshake ~165ms) but the nested client marks it
			
 
				+# status:"pending" / 0 tools under CPU/timing contention, and degrades to
			
 
				+# consistent failure over a long session. NO_DAEMON + `< /dev/null` do NOT fix
			
 
				+# it — it's the nested client, not the server. See codegraph/CLAUDE.md
			
 
				+# ("Running agent-evals — do NOT nest").
			
 
				+#
			
 
				+# Usage: ab-new-vs-baseline.sh <indexed-repo> "<task>" [baseline-ref]
			
 
				+#   <indexed-repo>  a repo with a .codegraph index (copied per arm)
			
 
				+#   "<task>"        an implementation task, e.g. "Add X to Y and wire it through"
			
 
				+#   [baseline-ref]  git ref for the BEFORE build (default: HEAD~1)
			
 
				+# Env: AGENT_EVAL_OUT (default: /tmp/ab-new-vs-baseline)
			
 
				+set -uo pipefail
			
 
				+
			
 
				+TARGET="${1:?usage: ab-new-vs-baseline.sh <indexed-repo> \"<task>\" [baseline-ref]}"
			
 
				+TASK="${2:?task required}"
			
 
				+BASE_REF="${3:-HEAD~1}"
			
 
				+ENGINE="$(cd "$(dirname "$0")/../.." && pwd)"
			
 
				+BIN="$ENGINE/dist/bin/codegraph.js"
			
 
				+OUT="${AGENT_EVAL_OUT:-/tmp/ab-new-vs-baseline}"
			
 
				+PARSE="$ENGINE/scripts/agent-eval/parse-run.mjs"
			
 
				+
			
 
				+command -v claude >/dev/null || { echo "claude CLI not on PATH"; exit 1; }
			
 
				+[ -d "$TARGET/.codegraph" ] || { echo "target not indexed: run 'codegraph init $TARGET' first"; exit 1; }
			
 
				+if ! git -C "$ENGINE" diff --quiet || ! git -C "$ENGINE" diff --cached --quiet; then
			
 
				+  echo "engine repo has uncommitted changes — commit or stash first (this script checks files out)"; exit 1
			
 
				+fi
			
 
				+CHANGED=$(git -C "$ENGINE" diff --name-only "$BASE_REF" HEAD -- src 2>/dev/null)
			
 
				+[ -n "$CHANGED" ] || { echo "no src/ changes between $BASE_REF and HEAD — nothing to A/B"; exit 1; }
			
 
				+
			
 
				+# Always restore the engine to HEAD on exit, even if interrupted mid-arm.
			
 
				+restore() { git -C "$ENGINE" checkout HEAD -- $CHANGED 2>/dev/null; ( cd "$ENGINE" && npm run build >/dev/null 2>&1 ); }
			
 
				+trap restore EXIT
			
 
				+
			
 
				+mkdir -p "$OUT"
			
 
				+echo "###### engine=$ENGINE  baseline=$BASE_REF"
			
 
				+echo "###### changed: $(echo "$CHANGED" | tr '\n' ' ')"
			
 
				+echo "###### target=$TARGET"
			
 
				+echo "###### task=$TASK"
			
 
				+echo
			
 
				+
			
 
				+# Two pristine copies so each arm starts clean (the agent edits its own copy).
			
 
				+rm -rf "$OUT/t-new" "$OUT/t-base"
			
 
				+rsync -a --exclude node_modules --exclude .git --exclude dist --exclude .codegraph "$TARGET/" "$OUT/t-new/"
			
 
				+cp -R "$OUT/t-new" "$OUT/t-base"
			
 
				+
			
 
				+cfg() { printf '{"mcpServers":{"codegraph":{"command":"%s","args":["serve","--mcp","--path","%s"]}}}' "$BIN" "$1" > "$2"; }
			
 
				+
			
 
				+run_arm() { # label, target-copy
			
 
				+  local label="$1" tgt="$2" c="$OUT/mcp-$1.json"
			
 
				+  cfg "$tgt" "$c"
			
 
				+  echo "############## ARM [$label] ##############"
			
 
				+  ( cd "$tgt" && claude -p "$TASK" \
			
 
				+      --output-format stream-json --verbose --permission-mode bypassPermissions \
			
 
				+      --model opus --max-budget-usd 4 --strict-mcp-config --mcp-config "$c" \
			
 
				+      < /dev/null > "$OUT/run-$label.jsonl" 2>"$OUT/run-$label.err" )
			
 
				+  node "$PARSE" "$OUT/run-$label.jsonl" 2>&1 | grep -E "tools exposed|by type|Result" || echo "  (parse failed — see $OUT/run-$label.jsonl)"
			
 
				+  echo
			
 
				+}
			
 
				+
			
 
				+echo "== NEW build (HEAD) =="
			
 
				+( cd "$ENGINE" && npm run build >/dev/null 2>&1 ) && echo "  built"
			
 
				+node "$BIN" init "$OUT/t-new" >/dev/null 2>&1 && echo "  indexed t-new"
			
 
				+run_arm new "$OUT/t-new"
			
 
				+
			
 
				+echo "== BASELINE build ($BASE_REF) =="
			
 
				+git -C "$ENGINE" checkout "$BASE_REF" -- $CHANGED
			
 
				+( cd "$ENGINE" && npm run build >/dev/null 2>&1 ) && echo "  built"
			
 
				+node "$BIN" init "$OUT/t-base" >/dev/null 2>&1 && echo "  indexed t-base"
			
 
				+run_arm baseline "$OUT/t-base"
			
 
				+
			
 
				+echo "###### DONE. Compare the [new] vs [baseline] 'by type' counts above"
			
 
				+echo "###### (especially Read vs mcp__codegraph__*). Full logs in: $OUT"