1
0

offload-eval-styles.sh 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172
  1. #!/usr/bin/env bash
  2. # Offload reasoning-OUTPUT-STYLE A/B — all codegraph-on, isolating the Worker's
  3. # output shape's effect on main-session tokens / latency / accuracy:
  4. # raw : CODEGRAPH_OFFLOAD_DISABLE=1 (verbatim explore source, the floor)
  5. # refs : managed offload, default (Cerebras map re-expanded to verbatim, ~24K)
  6. # map : managed offload, STYLE=map (compact reasoned map + file:line anchors, ~1-3K)
  7. # src : managed offload, STYLE=src (map + cited line ranges only, ~1-5K)
  8. # Delegation BLOCKED by default (DISALLOW=Agent) so we measure the offload payload's
  9. # effect on the main Sonnet agent, not whether it spawns a Haiku Explore subagent.
  10. #
  11. # Usage: offload-eval-styles.sh <indexed-repo> <reps> "<question>"
  12. # Env: RESULTS=<file> AGENT_EVAL_OUT=<dir> REP_START=1 DISALLOW=Agent MODEL/EFFORT
  # Shell options: abort on unset variables and surface pipeline failures.
  # `-e` is not enabled, so a failing command does not abort the script by itself.
  set -uo pipefail

  # Paths derived from this script's own location.
  HERE="$(cd "$(dirname "$0")" && pwd)"
  ENGINE="$(cd "$HERE/../.." && pwd)"
  BIN="$ENGINE/dist/bin/codegraph.js"
  EXTRACT="$HERE/offload-eval-metrics.mjs"

  # Positional arguments -- all three are required.
  TARGET="${1:?usage: offload-eval-styles.sh <indexed-repo> <reps> \"<question>\"}"
  REPS="${2:?reps}"
  Q="${3:?question}"

  # Environment overrides.
  OUT="${AGENT_EVAL_OUT:-/tmp/cg-offload-eval}"
  RUNS="$OUT/runs"
  RESULTS="${RESULTS:-$OUT/results-styles.jsonl}"
  # `-` (not `:-`): the default applies only when DISALLOW is unset, so an
  # explicitly empty `DISALLOW=` passes no --disallowedTools flag at all.
  DISALLOW="${DISALLOW-Agent}"
  START="${REP_START:-1}"

  # Validate the numeric inputs before they reach arithmetic expansion, where a
  # non-numeric value would be treated as a variable name.
  case "$REPS" in
    '' | *[!0-9]*) echo "reps must be a positive integer: $REPS" >&2; exit 1 ;;
  esac
  case "$START" in
    '' | *[!0-9]*) echo "REP_START must be a non-negative integer: $START" >&2; exit 1 ;;
  esac
  # Force base 10 so values such as "08" are not parsed as invalid octal.
  REPS=$((10#$REPS))
  START=$((10#$START))
  [ "$REPS" -ge 1 ] || { echo "reps must be a positive integer: $REPS" >&2; exit 1; }
  END=$((START + REPS - 1))

  # Preconditions: the claude CLI must be installed and the repo must be indexed.
  command -v claude >/dev/null || { echo "no claude on PATH" >&2; exit 1; }
  [ -d "$TARGET/.codegraph" ] || { echo "not indexed: $TARGET" >&2; exit 1; }

  # Name the repo after the logical absolute path, so "." or ".." resolve to a
  # real directory name (symlink names are kept). The daemon path itself is
  # the physical path, symlinks resolved.
  REPO=$(basename "$(cd "$TARGET" && pwd -L)")
  TARGET=$(cd "$TARGET" && pwd -P)

  mkdir -p "$RUNS"
  # Restart the codegraph MCP daemon for a repo and wait for its socket.
  #
  # Arguments:
  #   $1 - repo path handed to `serve --mcp --path`
  #   $2 - optional whitespace-separated NAME=VALUE pairs for the daemon's env
  # Globals:  BIN (read)
  # Outputs:  " daemon warm" once <path>/.codegraph/daemon.sock exists, or
  #           " WARN daemon never bound" if it has not appeared after the
  #           poll loop gives up (100 ms interval, roughly 15 s).
  prewarm() { # path extra-env
    local path="$1"
    local sock="$path/.codegraph/daemon.sock"
    local -a extra_env=()

    # Split the env pairs on whitespace WITHOUT pathname expansion; an unquoted
    # expansion would also glob-expand any pair containing *, ? or [.
    # `read -d ''` returns non-zero at end of input, hence the `|| true`.
    IFS=$' \t\n' read -r -d '' -a extra_env <<< "${2:-}" || true

    # Remove any daemon already serving this path, plus its stale socket.
    pkill -9 -f "serve --mcp --path $path" 2>/dev/null
    rm -f "$sock" 2>/dev/null
    sleep 0.6

    # The ${arr[@]+...} form keeps an empty array from tripping `set -u` on
    # older Bash releases.
    env ${extra_env[@]+"${extra_env[@]}"} CODEGRAPH_DAEMON_IDLE_TIMEOUT_MS=1800000 \
      node "$BIN" serve --mcp --path "$path" </dev/null >/dev/null 2>&1 &

    # Poll for the socket file; exit 0 when it exists, 1 when the poll gives up.
    if node -e 'const fs=require("fs");let n=0;const t=setInterval(()=>{if(fs.existsSync(process.argv[1]+"/.codegraph/daemon.sock")){clearInterval(t);process.exit(0)}if(n++>150){clearInterval(t);process.exit(1)}},100)' "$path"; then
      echo " daemon warm"
    else
      echo " WARN daemon never bound"
    fi
  }
  # Stop the daemon serving $TARGET: SIGKILL every process whose command line
  # matches "serve --mcp --path $TARGET", delete its socket file, then pause
  # for one second.
  # Globals: TARGET (read)
  kill_daemon() {
    local sock_file="$TARGET/.codegraph/daemon.sock"
    pkill -9 -f "serve --mcp --path $TARGET" 2>/dev/null
    rm -f "$sock_file" 2>/dev/null
    sleep 1
  }
  # Execute one eval repetition and append its metrics record to $RESULTS.
  #
  # Arguments:
  #   $1 - arm label (also part of the artifact file names)
  #   $2 - repetition number
  #   $3 - MCP config file passed to claude via --mcp-config
  #   $4 - usage-log path, or "-" for none; a real path is truncated first
  # Globals:  REPO, RUNS, TARGET, Q, EXTRACT, RESULTS (read);
  #           MODEL, EFFORT, DISALLOW (read, optional)
  # Outputs:  one summary line on stdout; a warning on stderr if no record
  # Returns:  1 when the extractor produced no record, otherwise the status of
  #           the summary command
  run() { # arm rep mcp-config usage-log-or-dash
    local arm="$1" rep="$2" cfg="$3" usage="$4"
    local tag="$REPO-$arm-$rep"
    local stream="$RUNS/$tag.jsonl" errlog="$RUNS/$tag.err"
    local record
    local -a claude_args

    # Start every repetition with an empty usage log.
    if [ "$usage" != "-" ]; then
      : > "$usage"
    fi

    claude_args=(
      -p "$Q"
      --output-format stream-json --verbose --permission-mode bypassPermissions
      --model "${MODEL:-sonnet}" --effort "${EFFORT:-high}" --max-budget-usd 4
    )
    # The flag is omitted entirely when DISALLOW is empty.
    if [ -n "${DISALLOW:-}" ]; then
      claude_args+=(--disallowedTools "$DISALLOW")
    fi
    claude_args+=(--strict-mcp-config --mcp-config "$cfg")

    # Subshell so the cd does not leak; claude's exit status is not checked.
    ( cd "$TARGET" && claude "${claude_args[@]}" </dev/null > "$stream" 2> "$errlog" )

    # Capture the record rather than appending blindly: re-reading the tail of
    # $RESULTS would report the PREVIOUS run whenever the extractor emits nothing.
    record=$(node "$EXTRACT" --run "$stream" --usage "$usage" --arm "$arm" --rep "$rep" \
      --repo "$REPO" --tier styles --q "$Q")
    if [ -z "$record" ]; then
      echo " [$arm #$rep] WARN no metrics record produced (see $errlog)" >&2
      return 1
    fi
    printf '%s\n' "$record" >> "$RESULTS"

    # One-line human-readable summary of the record just written.
    node -e 'const o=JSON.parse(process.argv[1].trim().split("\n").pop());console.log(` [${o.arm} #${o.rep}] ${o.durationSec}s | ${o.tokBillable} billable tok | read=${o.read} grep=${o.grep} explore=${o.explore} offload=${o.offloadFired} | AI ${o.ai.calls}c/${o.ai.totalTokens}t | ok=${o.ok}`)' "$record"
  }
  # MCP configs: env baked into the daemon-spawn command claude uses.
  # Usage log path handed to the arm configs and to run().
  USAGE="$RUNS/$REPO-usage.jsonl"

  # Print (to stdout, no trailing newline) an MCP config whose "codegraph"
  # server is launched as: env CODEGRAPH_WASM_RELAUNCHED=1 [extra...] node $BIN
  # serve --mcp --path $TARGET.
  #
  # Arguments:
  #   $1 - optional extra env entries as a raw JSON fragment: comma-led, each
  #        entry a quoted "NAME=VALUE" string (e.g. ',"A=1","B=2"'), or empty.
  #        It is inserted verbatim, so the caller must supply valid JSON.
  # Globals:  BIN, TARGET (read)
  mkcfg() { # extra-env-entries (JSON array entries, comma-led or empty)
    local extra="${1:-}" bin="$BIN" target="$TARGET"

    # Escape backslashes first, then double quotes, so both paths stay valid
    # JSON string contents.
    bin=${bin//\\/\\\\}
    bin=${bin//\"/\\\"}
    target=${target//\\/\\\\}
    target=${target//\"/\\\"}

    printf '{"mcpServers":{"codegraph":{"command":"env","args":["CODEGRAPH_WASM_RELAUNCHED=1"%s,"node","%s","serve","--mcp","--path","%s"]}}}' "$extra" "$bin" "$target"
  }
  # One MCP config file per arm, written under $RUNS:
  #   raw  - CODEGRAPH_OFFLOAD_DISABLE=1
  #   refs - usage log only
  #   map  - usage log + CODEGRAPH_OFFLOAD_STYLE=map
  #   src  - usage log + CODEGRAPH_OFFLOAD_STYLE=src
  CFG_RAW="${RUNS}/mcp-sty-raw-${REPO}.json"
  CFG_REFS="${RUNS}/mcp-sty-refs-${REPO}.json"
  CFG_MAP="${RUNS}/mcp-sty-map-${REPO}.json"
  CFG_SRC="${RUNS}/mcp-sty-src-${REPO}.json"

  # JSON fragment shared by the three arms that record usage.
  usage_env=",\"CODEGRAPH_OFFLOAD_USAGE_LOG=${USAGE}\""

  mkcfg ',"CODEGRAPH_OFFLOAD_DISABLE=1"' > "$CFG_RAW"
  mkcfg "${usage_env}" > "$CFG_REFS"
  mkcfg "${usage_env}"',"CODEGRAPH_OFFLOAD_STYLE=map"' > "$CFG_MAP"
  mkcfg "${usage_env}"',"CODEGRAPH_OFFLOAD_STYLE=src"' > "$CFG_SRC"
  # Run every repetition of one arm against a freshly prewarmed daemon.
  #
  # Arguments:
  #   $1 - arm label
  #   $2 - MCP config file for the arm
  #   $3 - usage-log path, or "-" for none
  #   $4 - whitespace-separated NAME=VALUE pairs for the prewarmed daemon
  # Globals:  TARGET, START, END (read)
  run_arm() {
    local arm="$1" cfg="$2" usage="$3" daemon_env="$4"
    local r

    echo "== ARM $arm =="
    prewarm "$TARGET" "$daemon_env"
    # Arithmetic loop instead of `seq`: BSD/macOS seq counts DOWN when the end
    # is below the start, which would run bogus repetitions.
    for ((r = START; r <= END; r++)); do
      run "$arm" "$r" "$cfg" "$usage"
    done
    kill_daemon
  }

  echo "###### repo=$REPO reps=$START..$END model=${MODEL:-sonnet}/${EFFORT:-high} disallow=${DISALLOW:-<none>}"
  echo "###### Q=$Q"

  # The daemon env passed to each arm mirrors what that arm's MCP config sets.
  run_arm raw "$CFG_RAW" "-" "CODEGRAPH_OFFLOAD_DISABLE=1"
  run_arm refs "$CFG_REFS" "$USAGE" "CODEGRAPH_OFFLOAD_USAGE_LOG=$USAGE"
  run_arm map "$CFG_MAP" "$USAGE" "CODEGRAPH_OFFLOAD_USAGE_LOG=$USAGE CODEGRAPH_OFFLOAD_STYLE=map"
  run_arm src "$CFG_SRC" "$USAGE" "CODEGRAPH_OFFLOAD_USAGE_LOG=$USAGE CODEGRAPH_OFFLOAD_STYLE=src"

  echo "###### DONE $REPO — judge: node $HERE/offload-eval-judge.mjs --results $RESULTS --truth $HERE/offload-eval-ground-truth.json --out $OUT/judged-styles.jsonl"