#!/usr/bin/env bash
# ONE offload run on ONE indexed repo at a given offload STYLE (plain|refs|raw), so we can
# watch a single agent transcript at a time (the user's one-run-at-a-time methodology).
# The OFFLOAD reasoning runs in the prewarmed DAEMON process, so the style env must be
# set on BOTH the daemon and the client MCP config. Writes one metrics line to RESULTS
# and leaves the raw stream-json at $RUNS/<repo>-<style>-<n>.jsonl for inspection.
#
# Usage: offload-eval-refs1.sh <indexed-repo> <style> <n> "<question>"
#
# Environment (all optional):
#   AGENT_EVAL_OUT  output root (default /tmp/cg-offload-eval); per-run files go to <root>/runs
#   RESULTS         metrics file to append to (default <root>/results-refs.jsonl)
#   MODEL, EFFORT   forwarded to `claude --model` / `--effort` (defaults: sonnet, high)
#   DISALLOW        when non-empty, forwarded to `claude --disallowedTools`

# No `-e` on purpose: several steps may legitimately fail (pkill with nothing to kill,
# an agent run that exits non-zero); the steps that must succeed are checked explicitly.
set -uo pipefail

# Print a message on stderr and abort with status 1.
die() { printf '%s\n' "$*" >&2; exit 1; }

HERE="$(cd "$(dirname "$0")" && pwd)"
ENGINE="$(cd "$HERE/../.." && pwd)"
BIN="$ENGINE/dist/bin/codegraph.js"
EXTRACT="$HERE/offload-eval-metrics.mjs"

OUT="${AGENT_EVAL_OUT:-/tmp/cg-offload-eval}"
# The agent is launched after `cd "$TARGET"`, and its redirections / --mcp-config are
# resolved from there, so a relative output root must be anchored to the starting dir.
[[ "$OUT" == /* ]] || OUT="$PWD/$OUT"
RUNS="$OUT/runs"

TARGET="${1:?repo}"; STYLE="${2:?style}"; N="${3:?run-tag}"; Q="${4:?question}"
RESULTS="${RESULTS:-$OUT/results-refs.jsonl}"
REPO=$(basename -- "$TARGET")
# An empty TARGET would turn the pkill pattern below into "kill every codegraph daemon",
# so a repo path that cannot be resolved is fatal.
TARGET=$(cd -- "$TARGET" && pwd -P) || die "cannot resolve repo path: $1"
[[ -n "$TARGET" ]] || die "cannot resolve repo path: $1"

# Fail before spending any agent budget if a prerequisite is missing.
mkdir -p "$RUNS" || die "cannot create $RUNS"
command -v claude >/dev/null || die "no claude"
command -v node >/dev/null || die "no node"
[[ -f "$BIN" ]] || die "codegraph build not found: $BIN"
[[ -f "$EXTRACT" ]] || die "metrics extractor not found: $EXTRACT"

USAGE="$RUNS/$REPO-$STYLE-usage.jsonl"
: > "$USAGE" || die "cannot write $USAGE"
CFG="$RUNS/mcp-$REPO-$STYLE.json"

# `raw` is a pseudo-style: codegraph attached but the offload DISABLED (the ceiling —
# verbatim source, no reasoning model). Any other value is an offload style (plain|refs).
# The env assignments live in an array so values containing spaces survive intact and
# the daemon and the MCP config are guaranteed to carry the same settings.
if [[ "$STYLE" == "raw" ]]; then
  OFFLOAD_ENV=("CODEGRAPH_OFFLOAD_DISABLE=1")
  USAGE="-"   # no usage log in raw mode; the extractor is handed "-" instead of a path
else
  OFFLOAD_ENV=("CODEGRAPH_OFFLOAD_STYLE=$STYLE" "CODEGRAPH_OFFLOAD_USAGE_LOG=$USAGE")
fi

# Client MCP config: `env <assignments> node <bin> serve --mcp --path <repo>`.
# Serialised with JSON.stringify so quotes/backslashes in paths are escaped correctly.
node -e 'process.stdout.write(JSON.stringify({mcpServers:{codegraph:{command:"env",args:process.argv.slice(1)}}}))' \
  "CODEGRAPH_WASM_RELAUNCHED=1" "${OFFLOAD_ENV[@]}" node "$BIN" serve --mcp --path "$TARGET" > "$CFG" \
  || die "cannot write MCP config: $CFG"

# pkill -f takes an extended regex: escape metacharacters in the path and require a
# boundary after it, so only servers for exactly this repo are matched (not /a/repo2).
TARGET_RE=$(printf '%s' "$TARGET" | sed 's/[][\.^$*+?(){}|]/\\&/g')
DAEMON_PATTERN="serve --mcp --path ${TARGET_RE}( |\$)"

# Kill any codegraph server for this repo and remove its socket. SIGKILL gives the
# process no chance to unlink the socket itself, hence the explicit rm. Safe to repeat.
stop_daemon() {
  pkill -9 -f "$DAEMON_PATTERN" 2>/dev/null
  rm -f -- "$TARGET/.codegraph/daemon.sock" 2>/dev/null
}
# Never leave the long-lived daemon behind, even when the run is interrupted.
trap stop_daemon EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

# Prewarm a persistent daemon carrying the SAME offload config (it does the reasoning).
stop_daemon
sleep 0.6
env "${OFFLOAD_ENV[@]}" CODEGRAPH_DAEMON_IDLE_TIMEOUT_MS=1800000 \
  node "$BIN" serve --mcp --path "$TARGET" </dev/null >/dev/null 2>&1 &

# Poll every 100 ms (about 15 s in total) for the daemon socket to appear.
if node -e 'const fs=require("fs");let n=0;const t=setInterval(()=>{if(fs.existsSync(process.argv[1]+"/.codegraph/daemon.sock")){clearInterval(t);process.exit(0)}if(n++>150){clearInterval(t);process.exit(1)}},100)' "$TARGET"; then
  echo "daemon warm ($STYLE)"
else
  echo "WARN daemon never bound"
fi

tag="$REPO-$STYLE-$N"
echo "== run $tag =="

# DISALLOW (optional): block tools that confound the offload-sufficiency signal —
# chiefly "Agent" (sub-agent delegation: the spawned Explore subagent has low MCP
# salience, ignores codegraph, and thrashes via Bash+Read, making the A/B noise).
CLAUDE_ARGS=(
  -p "$Q" --output-format stream-json --verbose --permission-mode bypassPermissions
  --model "${MODEL:-sonnet}" --effort "${EFFORT:-high}" --max-budget-usd 4
)
if [[ -n "${DISALLOW:-}" ]]; then
  CLAUDE_ARGS+=(--disallowedTools "$DISALLOW")
fi
CLAUDE_ARGS+=(--strict-mcp-config --mcp-config "$CFG")

# A failed agent run is reported but not fatal: the partial transcript is still
# handed to the extractor, and the script's exit status stays as before.
( cd "$TARGET" && claude "${CLAUDE_ARGS[@]}" </dev/null > "$RUNS/$tag.jsonl" 2> "$RUNS/$tag.err" ) \
  || echo "WARN claude exited non-zero (see $RUNS/$tag.err)" >&2

# Capture the extractor output first so an empty result is noticed here, instead of
# the summary below silently re-printing the last line of an EARLIER run.
METRICS=$(node "$EXTRACT" --run "$RUNS/$tag.jsonl" --usage "$USAGE" --arm "offload-$STYLE" --rep "$N" \
  --repo "$REPO" --tier "complex" --q "$Q")
EXTRACT_RC=$?
(( EXTRACT_RC == 0 )) || echo "WARN metrics extractor exited $EXTRACT_RC" >&2

if [[ -n "$METRICS" ]]; then
  printf '%s\n' "$METRICS" >> "$RESULTS"
  # One-line human summary of the record that was just appended (last line of RESULTS).
  node -e 'const o=JSON.parse(require("fs").readFileSync(process.argv[1],"utf8").trim().split("\n").pop());console.log(` [${o.arm} #${o.rep}] ${o.durationSec}s | main $${o.costUsdMain} ${o.tokBillable} tok | read=${o.read} grep=${o.grep} explore=${o.explore} offload=${o.offloadFired} | AI ${o.ai.calls}call/${o.ai.totalTokens}tok/$${o.ai.costUsd.toFixed(4)} | ok=${o.ok}`)' "$RESULTS"
else
  echo "WARN no metrics produced for $tag; nothing appended to $RESULTS" >&2
fi

stop_daemon
echo "raw transcript: $RUNS/$tag.jsonl"