1 hafta önce · 0682681175
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -137,6 +137,7 @@ For each **language × framework**, validate on **small, medium, and large** rea
 
				 1. **Pick the canonical flow** for the framework ("how does X reach Y": state→render, request→handler→view, query→SQL, action→reducer→store…).
			
 
				 2. **Deterministic probes** (`scripts/agent-eval/probe-{node,explore}.mjs` against the built `dist/`): `codegraph_explore` with the flow's symbol names connects from→to end-to-end with no break (its Flow section shows the path); **no node explosion** (`select count(*) from nodes` stable before/after re-index); synthesized-edge **precision** spot-check (`select … where provenance='heuristic'`).
			
 
				 3. **Agent A/B** (`scripts/agent-eval/run-all.sh <repo> "<Q>"`): with vs without codegraph, **≥2 runs/arm** (run-to-run variance is large — never conclude from n=1). Record **duration, total tool calls, Read, Grep**. Optional forced-Read-0 sufficiency proof via the block-read hook (`scripts/agent-eval/hook-settings.json`).
			
 
				+   - **Model policy — every A/B arm runs Claude with `--model sonnet --effort high`. Always. Never Opus/Fable.** All `scripts/agent-eval/*.sh` default to this (`MODEL`/`EFFORT` env overrides exist — don't raise them without an explicit reason from the maintainer). Two reasons, and the second matters more than cost: (a) cost — Sonnet doesn't burn tokens the way a larger model does; (b) **Sonnet is the deliberate floor model** — codegraph's real users attach it to whatever agent they already run (Cursor Composer, Gemini, etc.), so we validate on a "dumber" model on purpose: a stronger model's tool-use covers up the salience/sufficiency problems a weaker one exposes. An affordance that lands on Sonnet generalizes up to every host; one that only works on Opus/Fable doesn't generalize down to the agents most users actually have. Both arms always use the same model.
			
 
				    - **MCP attach is a startup-latency issue, not a hard block.** On a multi-step task the agent dives into Read/grep before codegraph finishes its ~2-3s startup (worse when the eval is itself run nested inside a Claude session, under CPU contention), so it runs with no codegraph. Fix: **pre-warm a persistent daemon** for the target (`CODEGRAPH_DAEMON_IDLE_TIMEOUT_MS` high; spawn `serve --mcp --path <target> </dev/null &`; wait for `.codegraph/daemon.sock`) **and skip the startup re-exec** (`CODEGRAPH_WASM_RELAUNCHED=1`) so claude connects before the agent's first turn. Don't trust claude's `init` snapshot — it can read `status:"pending"` / 0 tools even when it then connects; judge by actual codegraph usage in `parse-run.mjs`'s `by type`. To isolate a change — **new-build vs baseline-build, both codegraph-on** (vs run-all.sh's with-vs-without) — use `scripts/agent-eval/ab-new-vs-baseline.sh <indexed-repo> "<task>" [baseline-ref]` (it bakes in the pre-warm).
			
 
				 4. **Pass bar:** a normal flow question reaches **~0 Read/Grep within the repo's explore-call budget**, runs **faster** than without-codegraph, and shows **no regression on a control repo**. Record the numbers in `docs/design/dynamic-dispatch-coverage-playbook.md` (the coverage matrix).
			
 
				 
			
--- a/scripts/agent-eval/ab-adoption.sh
+++ b/scripts/agent-eval/ab-adoption.sh
@@ -71,7 +71,7 @@ run_arm() { # label, N
 
				     echo "----- [$label] run $i -----"
			
 
				     ( cd "$tgt" && claude -p "$TASK" \
			
 
				         --output-format stream-json --verbose --permission-mode bypassPermissions \
			
 
				-        --model opus --max-budget-usd 4 --strict-mcp-config --mcp-config "$c" \
			
 
				+        --model "${MODEL:-sonnet}" --effort "${EFFORT:-high}" --max-budget-usd 4 --strict-mcp-config --mcp-config "$c" \
			
 
				         </dev/null > "$OUT/run-$label-$i.jsonl" 2>"$OUT/run-$label-$i.err" )
			
 
				     count "$OUT/run-$label-$i.jsonl"
			
 
				     pkill -9 -f "serve --mcp --path $tgt" 2>/dev/null
			
--- a/scripts/agent-eval/ab-hook.sh
+++ b/scripts/agent-eval/ab-hook.sh
@@ -72,7 +72,7 @@ run_one() { # arm-label, run-index, use-hook(0|1)
 
				   # array expansion otherwise, which would skip the no-hook arm's claude run.
			
 
				   ( cd "$tgt" && claude -p "$TASK" \
			
 
				       --output-format stream-json --verbose --permission-mode bypassPermissions \
			
 
				-      --model opus --max-budget-usd 4 --strict-mcp-config --mcp-config "$c" ${extra[@]+"${extra[@]}"} \
			
 
				+      --model "${MODEL:-sonnet}" --effort "${EFFORT:-high}" --max-budget-usd 4 --strict-mcp-config --mcp-config "$c" ${extra[@]+"${extra[@]}"} \
			
 
				       </dev/null > "$OUT/run-$label-$idx.jsonl" 2>"$OUT/run-$label-$idx.err" )
			
 
				   node "$PARSE" "$OUT/run-$label-$idx.jsonl" 2>&1 | grep -E "by type|Result" || echo "  (parse failed — see $OUT/run-$label-$idx.jsonl)"
			
 
				   pkill -9 -f "serve --mcp --path $tgt" 2>/dev/null
			
--- a/scripts/agent-eval/ab-impl.sh
+++ b/scripts/agent-eval/ab-impl.sh
@@ -64,7 +64,7 @@ run(){ # label, withCodegraph(0/1)
 
				       prewarm "$tgt"
			
 
				     else cp "$OUT/mcp-empty.json" "$cfg"; fi
			
 
				     ( cd "$tgt" && claude -p "$Q" --output-format stream-json --verbose \
			
 
				-        --permission-mode bypassPermissions --model opus --max-budget-usd 4 \
			
 
				+        --permission-mode bypassPermissions --model "${MODEL:-sonnet}" --effort "${EFFORT:-high}" --max-budget-usd 4 \
			
 
				         --strict-mcp-config --mcp-config "$cfg" </dev/null > "$OUT/$label-$i.jsonl" 2>"$OUT/$label-$i.err" )
			
 
				     echo "[$label] run $i:"; analyze "$OUT/$label-$i.jsonl"
			
 
				     if [ -n "$BUILD_CMD" ]; then ( cd "$tgt" && eval "$BUILD_CMD" >/dev/null 2>&1 && echo "      build: PASS" || echo "      build: FAIL" ); fi
			
--- a/scripts/agent-eval/ab-new-vs-baseline.sh
+++ b/scripts/agent-eval/ab-new-vs-baseline.sh
@@ -75,7 +75,7 @@ run_arm() { # label, target-copy
 
				   echo "############## ARM [$label] ##############"
			
 
				   ( cd "$tgt" && claude -p "$TASK" \
			
 
				       --output-format stream-json --verbose --permission-mode bypassPermissions \
			
 
				-      --model opus --max-budget-usd 4 --strict-mcp-config --mcp-config "$c" \
			
 
				+      --model "${MODEL:-sonnet}" --effort "${EFFORT:-high}" --max-budget-usd 4 --strict-mcp-config --mcp-config "$c" \
			
 
				       </dev/null > "$OUT/run-$label.jsonl" 2>"$OUT/run-$label.err" )
			
 
				   node "$PARSE" "$OUT/run-$label.jsonl" 2>&1 | grep -E "by type|Result" || echo "  (parse failed — see $OUT/run-$label.jsonl)"
			
 
				   pkill -9 -f "serve --mcp --path $tgt" 2>/dev/null
			
--- a/scripts/agent-eval/ab-sufficiency.sh
+++ b/scripts/agent-eval/ab-sufficiency.sh
@@ -66,7 +66,7 @@ run(){ # label, cfg, prewarm(0/1)
 
				   for i in $(seq 1 "$RUNS"); do
			
 
				     [ "$pw" = "1" ] && prewarm
			
 
				     ( cd "$TGT" && claude -p "$Q" --output-format stream-json --verbose \
			
 
				-        --permission-mode bypassPermissions --model opus --max-budget-usd 4 \
			
 
				+        --permission-mode bypassPermissions --model "${MODEL:-sonnet}" --effort "${EFFORT:-high}" --max-budget-usd 4 \
			
 
				         --strict-mcp-config --mcp-config "$cfg" </dev/null > "$OUT/$label-$i.jsonl" 2>"$OUT/$label-$i.err" )
			
 
				     echo "[$label] run $i:"; analyze "$OUT/$label-$i.jsonl"
			
 
				   done
			
--- a/scripts/agent-eval/bench-why-repo.sh
+++ b/scripts/agent-eval/bench-why-repo.sh
@@ -15,7 +15,7 @@ printf '{"mcpServers":{"codegraph":{"command":"%s","args":["serve","--mcp","--pa
 
				 for i in $(seq 1 "$N"); do
			
 
				   pkill -f "serve --mcp" 2>/dev/null; sleep 1; rm -f "$REPO/.codegraph/daemon.sock"
			
 
				   ( cd "$REPO" && claude -p "$Q$WHY" --output-format stream-json --verbose \
			
 
				-      --permission-mode bypassPermissions --model opus --effort "${EFFORT:-high}" --max-budget-usd 4 \
			
 
				+      --permission-mode bypassPermissions --model "${MODEL:-sonnet}" --effort "${EFFORT:-high}" --max-budget-usd 4 \
			
 
				       --strict-mcp-config --mcp-config "$OUT/cg.json" > "$OUT/with$i.jsonl" 2>"$OUT/with$i.err" )
			
 
				   echo "WITH run $i: exit $? ($(wc -l < "$OUT/with$i.jsonl" | tr -d ' ') lines)"
			
 
				 done
			
--- a/scripts/agent-eval/run-agent.sh
+++ b/scripts/agent-eval/run-agent.sh
@@ -25,7 +25,7 @@ cd "$REPO" || exit 1
 
				 claude -p "$PROMPT" \
			
 
				   --output-format stream-json --verbose \
			
 
				   --permission-mode bypassPermissions \
			
 
				-  --model opus \
			
 
				+  --model "${MODEL:-sonnet}" --effort "${EFFORT:-high}" \
			
 
				   --max-budget-usd 2 \
			
 
				   --strict-mcp-config --mcp-config "$MCP_CONFIG" \
			
 
				   > "$OUT" 2>"$OUT_DIR/run-${LABEL}.err"
			
--- a/scripts/agent-eval/run-all.sh
+++ b/scripts/agent-eval/run-all.sh
@@ -7,6 +7,8 @@
 
				 # Usage: run-all.sh <repo-path> "<question>" [headless|tmux|all]
			
 
				 # Env:   CG_BIN          codegraph binary (default: command -v codegraph)
			
 
				 #        AGENT_EVAL_OUT  output dir (default: /tmp/agent-eval)
			
 
				+#        MODEL / EFFORT  claude model/effort (default: sonnet / high — the
			
 
				+#                        standing A/B policy; see CLAUDE.md, don't raise)
			
 
				 set -uo pipefail
			
 
				 
			
 
				 REPO="${1:?usage: run-all.sh <repo-path> \"<question>\" [headless|tmux|all]}"
			
@@ -39,7 +41,7 @@ headless() {
 
				   ( cd "$REPO" && claude -p "$Q" \
			
 
				       --output-format stream-json --verbose \
			
 
				       --permission-mode bypassPermissions \
			
 
				-      --model opus \
			
 
				+      --model "${MODEL:-sonnet}" --effort "${EFFORT:-high}" \
			
 
				       --max-budget-usd 4 \
			
 
				       --strict-mcp-config --mcp-config "$cfg" \
			
 
				       > "$OUT/run-$label.jsonl" 2>"$OUT/run-$label.err" )
			
@@ -56,11 +58,11 @@ fi
 
				 
			
 
				 if [ "$MODE" = tmux ] || [ "$MODE" = all ]; then
			
 
				   echo "############################## INTERACTIVE [with] ##############################"
			
 
				-  CLAUDE_EXTRA_ARGS="--model opus --strict-mcp-config --mcp-config $OUT/mcp-codegraph.json" \
			
 
				+  CLAUDE_EXTRA_ARGS="--model ${MODEL:-sonnet} --effort ${EFFORT:-high} --strict-mcp-config --mcp-config $OUT/mcp-codegraph.json" \
			
 
				     bash "$HARNESS/itrun.sh" "$REPO" "int-with" "$Q" 2>&1 || echo "[itrun WITH failed]"
			
 
				   echo
			
 
				   echo "############################## INTERACTIVE [without] ##############################"
			
 
				-  CLAUDE_EXTRA_ARGS="--model opus --strict-mcp-config --mcp-config $OUT/mcp-empty.json" \
			
 
				+  CLAUDE_EXTRA_ARGS="--model ${MODEL:-sonnet} --effort ${EFFORT:-high} --strict-mcp-config --mcp-config $OUT/mcp-empty.json" \
			
 
				     bash "$HARNESS/itrun.sh" "$REPO" "int-without" "$Q" 2>&1 || echo "[itrun WITHOUT failed]"
			
 
				   echo
			
 
				 fi
			
--- a/scripts/agent-eval/run-arms.sh
+++ b/scripts/agent-eval/run-arms.sh
@@ -48,7 +48,7 @@ fi
 
				 
			
 
				 LOG="$OUT/$ARM-r$RID.jsonl"; ERR="$OUT/$ARM-r$RID.err"
			
 
				 ARGS=( -p "$Q" --output-format stream-json --verbose
			
 
				-       --permission-mode bypassPermissions --model opus --max-budget-usd 4
			
 
				+       --permission-mode bypassPermissions --model "${MODEL:-sonnet}" --effort "${EFFORT:-high}" --max-budget-usd 4
			
 
				        --strict-mcp-config --mcp-config "$CFG" )
			
 
				 [ -n "$STEERING" ] && ARGS+=( --append-system-prompt "$STEERING" )