ab-hook.sh 4.4 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586
  1. #!/usr/bin/env bash
  2. # A/B the PreToolUse(Read) REDIRECT hook (P1): does steering Read → codegraph_node
  3. # file-view actually move the agent off Read during implementation? BOTH arms use
  4. # the CURRENT build with codegraph attached and pre-warmed; the only difference is
  5. # the hook. Isolates the hook's behavioral effect from the build/file-view change
  6. # (use ab-new-vs-baseline.sh for the build A/B).
  7. #
  8. # arm [nohook] — codegraph on, no hook (does the better file-view get picked on its own?)
  9. # arm [hook] — codegraph on, + redirect hook (does routing close it?)
  10. #
  11. # Reliable attach (works nested): each arm pre-warms a persistent daemon and skips
  12. # the startup re-exec (CODEGRAPH_WASM_RELAUNCHED=1), so claude connects before the
  13. # agent's first turn. Judge by ACTUAL codegraph usage in parse-run.mjs's "by type",
  14. # not claude's init snapshot (which can read pending even when it then connects).
  15. #
  16. # Usage: ab-hook.sh <indexed-repo> "<implementation task>" [runs-per-arm]
  17. # <indexed-repo> a repo with a .codegraph index (copied per arm; never mutated)
  18. # "<task>" a GENUINELY-NEW implementation task (verify it isn't already done)
  19. # [runs-per-arm] default 2 (n=1 is noisy — the doctrine says >=2)
  20. # Env: AGENT_EVAL_OUT (default: /tmp/ab-hook)
  # Shell options: unset variables are errors and a pipeline fails if any stage
  # fails. `-e` is not set, so a failing command does not abort the script by
  # itself; the steps that matter are checked explicitly below.
  set -uo pipefail

  # Positional arguments (see "Usage" in the header).
  TARGET="${1:?usage: ab-hook.sh <indexed-repo> \"<task>\" [runs-per-arm]}"
  TASK="${2:?task required}"
  RUNS="${3:-2}"

  # Paths derived from this script's location: the engine root is two
  # directories above the script.
  ENGINE="$(cd "$(dirname "$0")/../.." && pwd)"
  BIN="$ENGINE/dist/bin/codegraph.js"
  HOOK="$ENGINE/scripts/agent-eval/redirect-read-hook.sh"
  OUT="${AGENT_EVAL_OUT:-/tmp/ab-hook}"
  PARSE="$ENGINE/scripts/agent-eval/parse-run.mjs"

  # Preconditions: required CLIs, an indexed target, and the hook script itself.
  command -v claude >/dev/null || { echo "claude CLI not on PATH"; exit 1; }
  command -v jq >/dev/null || { echo "jq not on PATH (the hook needs it)"; exit 1; }
  [ -d "$TARGET/.codegraph" ] || { echo "target not indexed: run 'codegraph init $TARGET' first"; exit 1; }
  # Without the hook script the [hook] arm would silently behave like [nohook].
  [ -f "$HOOK" ] || { echo "redirect hook not found: $HOOK"; exit 1; }
  chmod +x "$HOOK"

  # On any exit, kill every daemon that was started for a path under $OUT/.
  cleanup() { pkill -9 -f "serve --mcp --path $OUT/" 2>/dev/null; }
  trap cleanup EXIT
  mkdir -p "$OUT"

  echo "###### engine=$ENGINE"
  echo "###### target=$TARGET runs/arm=$RUNS"
  echo "###### task=$TASK"
  echo

  # Rebuild the engine so both arms run the current build. A failed build is
  # reported instead of being silent; it is fatal only when no binary exists.
  if ( cd "$ENGINE" && npm run build >/dev/null 2>&1 ); then
    echo "built"
  else
    echo "WARN: 'npm run build' failed in $ENGINE — results may come from a stale build"
    [ -f "$BIN" ] || { echo "no build found at $BIN"; exit 1; }
  fi

  # A settings file carrying ONLY the PreToolUse(Read) redirect hook.
  HOOK_SETTINGS="$OUT/hook-settings.json"
  # %q shell-quotes the hook path so the command survives spaces or other
  # special characters in the engine location (a plain path is left unchanged).
  printf -v hook_cmd 'bash %q' "$HOOK"
  jq -n --arg cmd "$hook_cmd" \
    '{hooks:{PreToolUse:[{matcher:"Read",hooks:[{type:"command",command:$cmd}]}]}}' > "$HOOK_SETTINGS" \
    || { echo "could not write $HOOK_SETTINGS"; exit 1; }
  # prewarm <target>
  # Spawn a persistent `serve --mcp` daemon for <target> in the background and
  # wait for its socket file to appear.
  #   $1       target directory (the daemon socket is $1/.codegraph/daemon.sock)
  #   Globals  BIN (read) — path to the codegraph entry script
  #   Outputs  one status line on stdout: " daemon warm: …" or " WARN: …"
  #   Returns  0 once the socket file exists, 1 if it never appeared
  prewarm() {
    local target="$1"
    local sock="$target/.codegraph/daemon.sock"

    # Kill any daemon already serving this path. SIGKILL gives it no chance to
    # unlink its socket, so remove the file as well; otherwise the poll below
    # would see the stale socket and report "warm" before the new daemon binds.
    pkill -9 -f "serve --mcp --path $target" 2>/dev/null
    rm -f -- "$sock"

    # Start the daemon detached from our stdio, with its idle timeout set to
    # 1800000 ms via the environment.
    CODEGRAPH_DAEMON_IDLE_TIMEOUT_MS=1800000 node "$BIN" serve --mcp --path "$target" </dev/null >/dev/null 2>&1 &

    # Poll for the socket every 100 ms; give up (exit 1) after about 150 polls.
    if node -e 'const fs=require("fs");let n=0;const t=setInterval(()=>{if(fs.existsSync(process.argv[1]+"/.codegraph/daemon.sock")){clearInterval(t);process.exit(0)}if(n++>150){clearInterval(t);process.exit(1)}},100)' "$target"; then
      echo " daemon warm: $target"
    else
      echo " WARN: daemon never bound for $target"
      return 1
    fi
  }
  # run_one <arm-label> <run-index> <use-hook(0|1)>
  # Execute one agent run for one arm: copy the target repo, index the copy,
  # write an MCP config pointing at it, pre-warm the daemon, run `claude`, then
  # print the parsed summary lines.
  #   $1  arm label, used in file names and log headers
  #   $2  run index within the arm
  #   $3  "1" to pass the redirect-hook settings file to claude, else no hook
  #   Globals  OUT, TARGET, BIN, PARSE, TASK, HOOK_SETTINGS (read);
  #            MODEL, EFFORT (optional, default sonnet / high)
  #   Outputs  progress and summary lines on stdout; raw logs in
  #            $OUT/run-<label>-<idx>.jsonl and .err
  #   Returns  1 if the run was skipped because setup failed, else 0
  run_one() {
    local label="$1" idx="$2" hook="$3"
    local tgt="$OUT/t-$label-$idx" c="$OUT/mcp-$label.json"
    local log="$OUT/run-$label-$idx.jsonl" errlog="$OUT/run-$label-$idx.err"
    local -a extra=()

    # Drop outputs of an earlier session first: if this run never reaches
    # claude, a leftover .jsonl must not be parsed and reported as this run.
    rm -f -- "$log" "$errlog"

    # Fresh per-run copy of the target; the original repo is never touched.
    rm -rf -- "$tgt"
    if ! rsync -a --exclude node_modules --exclude .git --exclude dist --exclude .codegraph "$TARGET/" "$tgt/"; then
      echo "----- [$label] run $idx: SKIPPED (could not copy $TARGET to $tgt) -----"
      echo
      return 1
    fi
    node "$BIN" init "$tgt" >/dev/null 2>&1 || echo " WARN: 'codegraph init' failed for $tgt"

    # Build the MCP config with jq so that paths containing quotes or
    # backslashes still produce valid JSON.
    if ! jq -n -c --arg bin "$BIN" --arg path "$tgt" \
      '{mcpServers:{codegraph:{command:"env",args:["CODEGRAPH_WASM_RELAUNCHED=1","node",$bin,"serve","--mcp","--path",$path]}}}' > "$c"; then
      echo "----- [$label] run $idx: SKIPPED (could not write $c) -----"
      echo
      return 1
    fi

    prewarm "$tgt"
    [ "$hook" = "1" ] && extra=(--settings "$HOOK_SETTINGS")
    echo "----- [$label] run $idx -----"
    # ${extra[@]+...} guard: bash 3.2 (macOS) under `set -u` errors on an empty
    # array expansion otherwise, which would skip the no-hook arm's claude run.
    ( cd "$tgt" && claude -p "$TASK" \
        --output-format stream-json --verbose --permission-mode bypassPermissions \
        --model "${MODEL:-sonnet}" --effort "${EFFORT:-high}" --max-budget-usd 4 --strict-mcp-config --mcp-config "$c" ${extra[@]+"${extra[@]}"} \
        </dev/null > "$log" 2>"$errlog" )

    # Show only the summary lines; with pipefail a parser failure also lands in
    # the fallback message.
    node "$PARSE" "$log" 2>&1 | grep -E "by type|Result" || echo " (parse failed — see $log)"
    # Stop this run's daemon so the next run starts from a clean state.
    pkill -9 -f "serve --mcp --path $tgt" 2>/dev/null
    echo
  }
  # Run every [nohook] run first, then every [hook] run, one at a time.
  # RUNS is validated here because a non-numeric value would otherwise just
  # produce zero runs without any explanation.
  case "$RUNS" in
    '' | *[!0-9]*)
      echo "runs-per-arm must be a non-negative integer, got: $RUNS"
      exit 2
      ;;
  esac
  # Arithmetic loops instead of `seq 1 "$RUNS"`: BSD/macOS seq counts DOWN when
  # the end is below the start, so RUNS=0 would execute runs 1 and 0 there.
  # The 10# prefix keeps values such as "08" from being parsed as octal.
  for ((i = 1; i <= 10#$RUNS; i++)); do run_one nohook "$i" 0; done
  for ((i = 1; i <= 10#$RUNS; i++)); do run_one hook "$i" 1; done
  echo "###### DONE. Compare [nohook] vs [hook] 'by type' — Read should fall and"
  echo "###### mcp__codegraph__codegraph_node should rise in the [hook] arm. Logs: $OUT"