1
0

ab-new-vs-baseline.sh 5.0 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697
  1. #!/usr/bin/env bash
  2. # A/B a codegraph retrieval/steering change: the NEW build (current HEAD) vs a
  3. # BASELINE build (a git ref) — BOTH with codegraph attached — on the same
  4. # implementation task, measuring how many Read vs codegraph calls the agent
  5. # makes. ISOLATES the change (unlike run-all.sh's with-vs-without). The agent
  6. # works on a throwaway copy of the target, so your repos are never touched.
  7. #
  8. # Reliable attach (works even when this is itself run nested inside a Claude
  9. # session): each arm PRE-WARMS a persistent codegraph daemon for its target so
  10. # claude connects to an already-bound, index-loaded daemon instantly — before
  11. # the agent's first turn — and SKIPS codegraph's startup re-exec via
  12. # CODEGRAPH_WASM_RELAUNCHED=1. Without this, on a multi-step task the agent
  13. # dives into Read/grep before codegraph finishes its ~2-3s startup (worse under
  14. # the CPU contention of a nested run) and runs with NO codegraph.
  15. #
  16. # Gotcha: claude's `system/init` snapshot can read status:"pending" / 0 tools
  17. # even when the server then connects fine — judge by ACTUAL codegraph usage in
  18. # parse-run.mjs's "by type", not the init line.
  19. #
  20. # Usage: ab-new-vs-baseline.sh <indexed-repo> "<task>" [baseline-ref]
  21. # <indexed-repo> a repo with a .codegraph index (copied per arm)
  22. # "<task>" an implementation task, e.g. "Add X to Y and wire it through"
  23. # [baseline-ref] git ref for the BEFORE build (default: HEAD~1)
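  # Env: AGENT_EVAL_OUT (default: /tmp/ab-new-vs-baseline) — output directory
  #      MODEL          (default: sonnet)                  — passed to claude --model
  #      EFFORT         (default: high)                    — passed to claude --effort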
  # Fail on unset variables and on failures inside pipelines. `-e` is not
  # enabled, so every step that matters below is checked explicitly.
  set -uo pipefail

  # Positional arguments: indexed repo, task prompt, optional baseline ref.
  TARGET="${1:?usage: ab-new-vs-baseline.sh <indexed-repo> \"<task>\" [baseline-ref]}"
  TASK="${2:?task required}"
  BASE_REF="${3:-HEAD~1}"

  # The engine checkout is two directories above this script; the CLI entry
  # point and the run parser are addressed relative to it.
  ENGINE="$(cd "$(dirname "$0")/../.." && pwd)"
  [ -n "$ENGINE" ] || { echo "cannot resolve engine root from $0"; exit 1; }
  BIN="$ENGINE/dist/bin/codegraph.js"
  OUT="${AGENT_EVAL_OUT:-/tmp/ab-new-vs-baseline}"
  PARSE="$ENGINE/scripts/agent-eval/parse-run.mjs"

  # Preflight: the claude CLI must exist and the target must carry an index.
  command -v claude >/dev/null || { echo "claude CLI not on PATH"; exit 1; }
  [ -d "$TARGET/.codegraph" ] || { echo "target not indexed: run 'codegraph init $TARGET' first"; exit 1; }

  # The baseline arm overwrites tracked files in the engine repo, so refuse to
  # start with unstaged or staged modifications that could be clobbered.
  if ! git -C "$ENGINE" diff --quiet || ! git -C "$ENGINE" diff --cached --quiet; then
    echo "engine repo has uncommitted changes — commit or stash first (this script checks files out)"; exit 1
  fi

  # Reject a baseline ref that does not name a commit. Without this check the
  # diff below fails silently (stderr is discarded) and the user is told there
  # is "nothing to A/B" instead of being told the ref is wrong.
  git -C "$ENGINE" rev-parse --verify --quiet "$BASE_REF^{commit}" >/dev/null \
    || { echo "baseline ref '$BASE_REF' does not resolve to a commit in $ENGINE"; exit 1; }

  # Newline-separated list of files under src/ that differ between the baseline
  # and HEAD; these are the only files swapped between the two arms.
  CHANGED=$(git -C "$ENGINE" diff --name-only "$BASE_REF" HEAD -- src 2>/dev/null)
  [ -n "$CHANGED" ] || { echo "no src/ changes between $BASE_REF and HEAD — nothing to A/B"; exit 1; }
  # On exit: kill any eval daemons + restore the engine to HEAD.
  #
  # Globals (read): OUT, ENGINE, CHANGED (newline-separated paths, relative to
  #                 the engine root, as printed by `git diff --name-only`).
  #
  # Paths are handled one per line (so names containing spaces survive) and
  # are split by whether they exist in HEAD:
  #   * present in HEAD -> re-checked-out from HEAD;
  #   * absent from HEAD (only the baseline had them) -> removed from index and
  #     worktree again.
  # Passing a path that is absent from HEAD to `git checkout HEAD -- ...` makes
  # git reject the whole command, which would leave every baseline file in place.
  cleanup() {
    local path
    local -a in_head=() not_in_head=()

    pkill -9 -f "serve --mcp --path $OUT/" 2>/dev/null

    while IFS= read -r path; do
      [ -n "$path" ] || continue
      if git -C "$ENGINE" cat-file -e "HEAD:$path" 2>/dev/null; then
        in_head+=("$path")
      else
        not_in_head+=("$path")
      fi
    done <<<"$CHANGED"

    if [ "${#in_head[@]}" -gt 0 ]; then
      git -C "$ENGINE" checkout HEAD -- "${in_head[@]}" 2>/dev/null \
        || echo "WARN: could not restore $ENGINE to HEAD — run 'git status' there and restore src/ manually" >&2
    fi
    if [ "${#not_in_head[@]}" -gt 0 ]; then
      # --ignore-unmatch: the baseline checkout may never have happened.
      git -C "$ENGINE" rm -q -f --ignore-unmatch -- "${not_in_head[@]}" 2>/dev/null
    fi

    # Rebuild so dist/ matches the restored sources.
    ( cd "$ENGINE" && npm run build >/dev/null 2>&1 )
  }
  trap cleanup EXIT
  # Create the output directory and print the run parameters. OUT and TARGET
  # must be set and non-empty: the rm/rsync below would otherwise act on "/".
  : "${OUT:?OUT must be set}" "${TARGET:?TARGET must be set}"
  mkdir -p "$OUT" || { echo "cannot create output dir $OUT"; exit 1; }
  echo "###### engine=$ENGINE baseline=$BASE_REF"
  echo "###### changed: $(echo "$CHANGED" | tr '\n' ' ')"
  echo "###### target=$TARGET"
  echo "###### task=$TASK"
  echo

  # Two pristine copies so each arm starts clean (the agent edits its own copy).
  # node_modules, .git, dist and the existing .codegraph index are not copied.
  # A failed copy aborts the run: an arm executed in an empty or partial tree
  # would produce numbers that mean nothing.
  rm -rf -- "${OUT:?}/t-new" "${OUT:?}/t-base"
  rsync -a --exclude node_modules --exclude .git --exclude dist --exclude .codegraph "$TARGET/" "$OUT/t-new/" \
    || { echo "rsync of $TARGET into $OUT/t-new failed"; exit 1; }
  cp -R "$OUT/t-new" "$OUT/t-base" \
    || { echo "copy of $OUT/t-new to $OUT/t-base failed"; exit 1; }
  # prewarm TARGET — spawn a persistent daemon (current $BIN) for TARGET and
  # wait for its socket file to appear.
  #
  # Globals (read): BIN
  # Arguments:      $1 - directory the daemon is started for (--path)
  # Outputs:        one status line on stdout (" daemon warm: ..." or " WARN: ...")
  # Returns:        0 once TARGET/.codegraph/daemon.sock exists,
  #                 1 if it has not appeared after roughly 15 s of polling.
  prewarm() {
    local tgt="$1"
    local sock="$tgt/.codegraph/daemon.sock"

    pkill -9 -f "serve --mcp --path $tgt" 2>/dev/null
    # A SIGKILLed process cannot unlink its socket, and the readiness probe
    # below only tests that the file exists. Drop any leftover so a stale file
    # is not mistaken for a freshly bound daemon.
    rm -f -- "$sock"

    CODEGRAPH_DAEMON_IDLE_TIMEOUT_MS=1800000 node "$BIN" serve --mcp --path "$tgt" </dev/null >/dev/null 2>&1 &

    # Poll every 100 ms for the socket file; give up after ~150 attempts.
    if node -e 'const fs=require("fs");let n=0;const t=setInterval(()=>{if(fs.existsSync(process.argv[1]+"/.codegraph/daemon.sock")){clearInterval(t);process.exit(0)}if(n++>150){clearInterval(t);process.exit(1)}},100)' "$tgt"; then
      echo " daemon warm: $tgt"
    else
      echo " WARN: daemon never bound for $tgt (arm may run without codegraph)"
      return 1
    fi
  }
  # run_arm LABEL TARGET_COPY — run one arm of the experiment.
  #
  # Writes an MCP config pointing claude at the codegraph server for
  # TARGET_COPY, pre-warms the daemon, runs the task headlessly inside
  # TARGET_COPY, prints the parser's "by type"/"Result" lines, then kills the
  # arm's daemon.
  #
  # Globals (read): OUT, BIN, TASK, PARSE
  # Env:            MODEL (default sonnet), EFFORT (default high),
  #                 MAX_BUDGET_USD (default 4)
  # Files written:  $OUT/mcp-LABEL.json, $OUT/run-LABEL.jsonl, $OUT/run-LABEL.err
  run_arm() { # label, target-copy
    local label="$1" tgt="$2" c="$OUT/mcp-$1.json"
    local bs='\' dq='"' bin_json tgt_json

    # Escape backslashes, then double quotes, so the paths stay valid JSON
    # string contents (control characters in paths are not handled).
    bin_json=${BIN//"$bs"/"$bs$bs"}
    bin_json=${bin_json//"$dq"/"$bs$dq"}
    tgt_json=${tgt//"$bs"/"$bs$bs"}
    tgt_json=${tgt_json//"$dq"/"$bs$dq"}

    # Connect to the pre-warmed daemon; skip the startup re-exec for a fast attach.
    printf '{"mcpServers":{"codegraph":{"command":"env","args":["CODEGRAPH_WASM_RELAUNCHED=1","node","%s","serve","--mcp","--path","%s"]}}}' "$bin_json" "$tgt_json" > "$c"
    prewarm "$tgt"

    echo "############## ARM [$label] ##############"
    # Run in a subshell so the cd does not leak. stdin is closed; the JSON
    # event stream and stderr are captured per arm.
    ( cd "$tgt" && claude -p "$TASK" \
        --output-format stream-json --verbose --permission-mode bypassPermissions \
        --model "${MODEL:-sonnet}" --effort "${EFFORT:-high}" --max-budget-usd "${MAX_BUDGET_USD:-4}" --strict-mcp-config --mcp-config "$c" \
        </dev/null > "$OUT/run-$label.jsonl" 2>"$OUT/run-$label.err" )

    # Show only the summary lines; a failing parser or no matching line both
    # fall through to the hint below.
    node "$PARSE" "$OUT/run-$label.jsonl" 2>&1 | grep -E "by type|Result" || echo " (parse failed — see $OUT/run-$label.jsonl)"

    pkill -9 -f "serve --mcp --path $tgt" 2>/dev/null
    echo
  }
  # ---- Drive the two arms: NEW (HEAD) first, then BASELINE ($BASE_REF) ----
  # Every global used below must be set; abort rather than operate on "".
  : "${ENGINE:?}" "${BIN:?}" "${OUT:?}" "${BASE_REF:?}" "${CHANGED:?}"

  echo "== NEW build (HEAD) =="
  # A failed build would leave a stale dist/ behind and the arm would measure
  # the wrong code, so stop here instead of continuing silently.
  ( cd "$ENGINE" && npm run build >/dev/null 2>&1 ) \
    || { echo " build FAILED for HEAD — aborting"; exit 1; }
  echo " built"
  if node "$BIN" init "$OUT/t-new" >/dev/null 2>&1; then
    echo " indexed t-new"
  else
    echo " WARN: indexing t-new failed (arm may run without codegraph)"
  fi
  run_arm new "$OUT/t-new"

  echo "== BASELINE build ($BASE_REF) =="
  # Split the changed paths by whether the baseline has them. Handing
  # `git checkout <ref> -- ...` a path that does not exist in <ref> makes git
  # reject the whole command, so a single file added after the baseline would
  # leave the engine at HEAD and the "baseline" arm would rerun the NEW build.
  base_paths=()
  head_only_paths=()
  while IFS= read -r changed_path; do
    [ -n "$changed_path" ] || continue
    if git -C "$ENGINE" cat-file -e "$BASE_REF:$changed_path" 2>/dev/null; then
      base_paths+=("$changed_path")
    else
      head_only_paths+=("$changed_path")
    fi
  done <<<"$CHANGED"

  if [ "${#base_paths[@]}" -gt 0 ]; then
    git -C "$ENGINE" checkout "$BASE_REF" -- "${base_paths[@]}" \
      || { echo " checkout of $BASE_REF failed — aborting"; exit 1; }
  fi
  if [ "${#head_only_paths[@]}" -gt 0 ]; then
    # Files the baseline never had: remove them from the worktree only; they
    # remain in HEAD and the index, so the exit handler's checkout restores them.
    for changed_path in "${head_only_paths[@]}"; do
      rm -f -- "$ENGINE/$changed_path"
    done
  fi

  ( cd "$ENGINE" && npm run build >/dev/null 2>&1 ) \
    || { echo " build FAILED for $BASE_REF — aborting"; exit 1; }
  echo " built"
  if node "$BIN" init "$OUT/t-base" >/dev/null 2>&1; then
    echo " indexed t-base"
  else
    echo " WARN: indexing t-base failed (arm may run without codegraph)"
  fi
  run_arm baseline "$OUT/t-base"

  echo "###### DONE. Compare the [new] vs [baseline] 'by type' counts above"
  echo "###### (especially Read vs mcp__codegraph__*). Full logs in: $OUT"