1
0

ab-impl.sh 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778
  1. #!/usr/bin/env bash
  2. # Sufficiency A/B for an IMPLEMENTATION task (the agent edits): when it uses
  3. # codegraph (explore/node) to understand before editing, does it still Read? Like
  4. # ab-sufficiency.sh but copies+indexes a FRESH target per run (the agent mutates
  5. # it), so runs don't see each other's edits.
  6. #
  7. # WITH codegraph (pre-warmed) vs WITHOUT (empty MCP), N runs each. Reports
  8. # explore/node vs Read/Grep + the files Read, and whether the build still passes.
  9. #
  10. # Usage: ab-impl.sh <indexed-repo> "<task>" [runs] [build-cmd]
# Env: AGENT_EVAL_OUT (default: /tmp/ab-impl), MODEL (default: sonnet),
#      EFFORT (default: high)
  12. set -uo pipefail
  13. REPO="${1:?usage: ab-impl.sh <indexed-repo> \"<task>\" [runs] [build-cmd]}"
  14. Q="${2:?task required}"
  15. RUNS="${3:-2}"
  16. BUILD_CMD="${4:-}"
  17. ENGINE="$(cd "$(dirname "$0")/../.." && pwd)"
  18. BIN="$ENGINE/dist/bin/codegraph.js"
  19. OUT="${AGENT_EVAL_OUT:-/tmp/ab-impl}"
  20. command -v claude >/dev/null || { echo "claude CLI not on PATH"; exit 1; }
  21. [ -d "$REPO/.codegraph" ] || { echo "no .codegraph index at $REPO"; exit 1; }
  22. cleanup(){ pkill -9 -f "serve --mcp --path $OUT/" 2>/dev/null; }
  23. trap cleanup EXIT
  24. mkdir -p "$OUT"
  25. ( cd "$ENGINE" && npm run build >/dev/null 2>&1 ) && echo "built engine"
  26. echo "###### repo=$REPO runs/arm=$RUNS"
  27. echo "###### task=$Q"; echo
  28. echo '{"mcpServers":{}}' > "$OUT/mcp-empty.json"
  29. prewarm(){
  30. pkill -9 -f "serve --mcp --path $1" 2>/dev/null
  31. CODEGRAPH_DAEMON_IDLE_TIMEOUT_MS=1800000 node "$BIN" serve --mcp --path "$1" </dev/null >/dev/null 2>&1 &
  32. node -e 'const fs=require("fs");let n=0;const t=setInterval(()=>{if(fs.existsSync(process.argv[1]+"/.codegraph/daemon.sock")){clearInterval(t);process.exit(0)}if(n++>150){clearInterval(t);process.exit(1)}},100)' "$1" >/dev/null 2>&1
  33. }
  34. analyze(){
  35. node -e '
  36. const fs=require("fs");
  37. const L=fs.readFileSync(process.argv[1],"utf8").split("\n").filter(Boolean);
  38. let ex=0,nf=0,ns=0,oc=0,gr=0,ed=0,exposed="?";const reads=[];
  39. for(const l of L){try{const o=JSON.parse(l);
  40. if(o.type==="system"&&o.subtype==="init")exposed=(o.tools||[]).filter(t=>/codegraph/.test(t)).length;
  41. for(const b of (o.message?.content||[])){if(b.type!=="tool_use")continue;
  42. if(b.name==="mcp__codegraph__codegraph_explore")ex++;
  43. else if(b.name==="mcp__codegraph__codegraph_node"){if(b.input&&b.input.symbol)ns++;else nf++;}
  44. else if(/mcp__codegraph__/.test(b.name))oc++;
  45. else if(b.name==="Read")reads.push((b.input?.file_path||"").split("/").pop());
  46. else if(b.name==="Grep")gr++;
  47. else if(b.name==="Edit"||b.name==="Write")ed++;
  48. }}catch{}}
  49. console.log(` explore=${ex} node[sym]=${ns} node[file]=${nf} other_cg=${oc} | Read=${reads.length}${reads.length?" ("+reads.join(", ")+")":""} Grep=${gr} Edit=${ed} [cg exposed=${exposed}]`);
  50. ' "$1"
  51. }
  52. run(){ # label, withCodegraph(0/1)
  53. local label="$1" wcg="$2"
  54. for i in $(seq 1 "$RUNS"); do
  55. local tgt="$OUT/t-$label-$i" cfg="$OUT/mcp-$label.json"
  56. rm -rf "$tgt"
  57. rsync -a --exclude node_modules --exclude .git --exclude dist --exclude .codegraph "$REPO/" "$tgt/"
  58. node "$BIN" init "$tgt" >/dev/null 2>&1
  59. if [ "$wcg" = "1" ]; then
  60. printf '{"mcpServers":{"codegraph":{"command":"env","args":["CODEGRAPH_WASM_RELAUNCHED=1","node","%s","serve","--mcp","--path","%s"]}}}' "$BIN" "$tgt" > "$cfg"
  61. prewarm "$tgt"
  62. else cp "$OUT/mcp-empty.json" "$cfg"; fi
  63. ( cd "$tgt" && claude -p "$Q" --output-format stream-json --verbose \
  64. --permission-mode bypassPermissions --model "${MODEL:-sonnet}" --effort "${EFFORT:-high}" --max-budget-usd 4 \
  65. --strict-mcp-config --mcp-config "$cfg" </dev/null > "$OUT/$label-$i.jsonl" 2>"$OUT/$label-$i.err" )
  66. echo "[$label] run $i:"; analyze "$OUT/$label-$i.jsonl"
  67. if [ -n "$BUILD_CMD" ]; then ( cd "$tgt" && eval "$BUILD_CMD" >/dev/null 2>&1 && echo " build: PASS" || echo " build: FAIL" ); fi
  68. pkill -9 -f "serve --mcp --path $tgt" 2>/dev/null
  69. done
  70. echo
  71. }
  72. echo "== WITH codegraph =="; run with 1
  73. echo "== WITHOUT (Read/Grep only) =="; run without 0
  74. echo "###### DONE: $OUT"