1
0

ab-sufficiency.sh 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778
  1. #!/usr/bin/env bash
  2. # Sufficiency A/B: on a real understanding/flow question, WHEN the agent uses
  3. # codegraph (explore/node), does it still Read? Premise under test: explore/node
  4. # return source WITH line numbers, so a Read should not be needed.
  5. #
  6. # WITH codegraph (pre-warmed daemon, reliable nested attach) vs WITHOUT (empty
  7. # MCP, Read/Grep only), N runs each, on a throwaway copy of the repo. Reports
  8. # explore/node vs Read/Grep, and LISTS the files Read in the WITH arm so a true
  9. # sufficiency gap (an indexed source file) is distinguishable from out-of-scope
  10. # (configs, docs, a file codegraph didn't index).
  11. #
  12. # Usage: ab-sufficiency.sh <indexed-repo> "<question>" [runs-per-arm]
# Env: AGENT_EVAL_OUT (default: /tmp/ab-sufficiency), MODEL (default: sonnet), EFFORT (default: high)
  14. set -uo pipefail
  15. REPO="${1:?usage: ab-sufficiency.sh <indexed-repo> \"<question>\" [runs]}"
  16. Q="${2:?question required}"
  17. RUNS="${3:-2}"
  18. ENGINE="$(cd "$(dirname "$0")/../.." && pwd)"
  19. BIN="$ENGINE/dist/bin/codegraph.js"
  20. OUT="${AGENT_EVAL_OUT:-/tmp/ab-sufficiency}"
  21. TGT="$OUT/target"
  22. command -v claude >/dev/null || { echo "claude CLI not on PATH"; exit 1; }
  23. [ -d "$REPO/.codegraph" ] || { echo "no .codegraph index at $REPO"; exit 1; }
  24. cleanup(){ pkill -9 -f "serve --mcp --path $TGT" 2>/dev/null; }
  25. trap cleanup EXIT
  26. mkdir -p "$OUT"
  27. ( cd "$ENGINE" && npm run build >/dev/null 2>&1 ) && echo "built"
  28. # Throwaway copy + fresh index (the agent works here; a read-only question won't
  29. # edit, but isolate anyway). Excludes the source repo's index/build/vcs.
  30. rm -rf "$TGT"
  31. rsync -a --exclude node_modules --exclude .git --exclude dist --exclude .codegraph "$REPO/" "$TGT/"
  32. node "$BIN" init "$TGT" >/dev/null 2>&1 && echo "indexed copy ($(node "$BIN" status --json 2>/dev/null | node -e 'let s="";process.stdin.on("data",d=>s+=d).on("end",()=>{try{console.log(JSON.parse(s).fileCount+" files")}catch{console.log("?")}})' 2>/dev/null || echo '?'))"
  33. echo "###### repo=$REPO runs/arm=$RUNS"
  34. echo "###### Q=$Q"; echo
  35. echo '{"mcpServers":{}}' > "$OUT/mcp-empty.json"
  36. printf '{"mcpServers":{"codegraph":{"command":"env","args":["CODEGRAPH_WASM_RELAUNCHED=1","node","%s","serve","--mcp","--path","%s"]}}}' "$BIN" "$TGT" > "$OUT/mcp-cg.json"
  37. prewarm(){
  38. pkill -9 -f "serve --mcp --path $TGT" 2>/dev/null
  39. CODEGRAPH_DAEMON_IDLE_TIMEOUT_MS=1800000 node "$BIN" serve --mcp --path "$TGT" </dev/null >/dev/null 2>&1 &
  40. node -e 'const fs=require("fs");let n=0;const t=setInterval(()=>{if(fs.existsSync(process.argv[1]+"/.codegraph/daemon.sock")){clearInterval(t);process.exit(0)}if(n++>150){clearInterval(t);process.exit(1)}},100)' "$TGT" >/dev/null 2>&1
  41. }
  42. analyze(){
  43. node -e '
  44. const fs=require("fs");
  45. const L=fs.readFileSync(process.argv[1],"utf8").split("\n").filter(Boolean);
  46. let ex=0,nf=0,ns=0,oc=0,gr=0,exposed="?";const reads=[];
  47. for(const l of L){try{const o=JSON.parse(l);
  48. if(o.type==="system"&&o.subtype==="init")exposed=(o.tools||[]).filter(t=>/codegraph/.test(t)).length;
  49. for(const b of (o.message?.content||[])){if(b.type!=="tool_use")continue;
  50. if(b.name==="mcp__codegraph__codegraph_explore")ex++;
  51. else if(b.name==="mcp__codegraph__codegraph_node"){if(b.input&&b.input.symbol)ns++;else nf++;}
  52. else if(/mcp__codegraph__/.test(b.name))oc++;
  53. else if(b.name==="Read")reads.push((b.input?.file_path||"").split("/").pop());
  54. else if(b.name==="Grep")gr++;
  55. }}catch{}}
  56. console.log(` explore=${ex} node[sym]=${ns} node[file]=${nf} other_cg=${oc} | Read=${reads.length}${reads.length?" ("+reads.join(", ")+")":""} Grep=${gr} [cg exposed=${exposed}]`);
  57. ' "$1"
  58. }
  59. run(){ # label, cfg, prewarm(0/1)
  60. local label="$1" cfg="$2" pw="$3"
  61. for i in $(seq 1 "$RUNS"); do
  62. [ "$pw" = "1" ] && prewarm
  63. ( cd "$TGT" && claude -p "$Q" --output-format stream-json --verbose \
  64. --permission-mode bypassPermissions --model "${MODEL:-sonnet}" --effort "${EFFORT:-high}" --max-budget-usd 4 \
  65. --strict-mcp-config --mcp-config "$cfg" </dev/null > "$OUT/$label-$i.jsonl" 2>"$OUT/$label-$i.err" )
  66. echo "[$label] run $i:"; analyze "$OUT/$label-$i.jsonl"
  67. done
  68. echo
  69. }
  70. echo "== WITH codegraph (premise: explore/node used -> Read ~0) =="; run with "$OUT/mcp-cg.json" 1
  71. echo "== WITHOUT (Read/Grep only — the contrast) =="; run without "$OUT/mcp-empty.json" 0
  72. echo "###### DONE. In the WITH arm: are explore/node>0 and Read~0? Any Read of an INDEXED source file = sufficiency gap. Logs: $OUT"