1
0

offload-eval-matrix.sh 1.1 KB

1234567891011121314151617181920
  #!/usr/bin/env bash
  #
  # offload-eval-matrix.sh - drive the 3-arm campaign (offload/raw/nocg) across
  # all 4 tiers, REPS repetitions each, into one results.jsonl.
  #
  # For every repo the canonical question is read from
  # offload-eval-ground-truth.json (same directory as this script) and passed to
  # offload-eval-3arm.sh as: <repo dir> <tier> <reps> <question>.
  #
  # Environment:
  #   REPS            repetitions handed to the runner (default: 3)
  #   AGENT_EVAL_OUT  scratch dir; repos are expected under <dir>/repos/<name>
  #                   and results.jsonl is written to <dir>
  #                   (default: /tmp/cg-offload-eval)
  #
  # Exports RESULTS (path of results.jsonl) for the child runner.
  # NOTE(review): presumably offload-eval-3arm.sh appends one line per run to
  # $RESULTS - confirm against that script.
  #
  # Exit status: 1 if a precondition is missing or a repo had to be skipped
  # because no usable question was found; 0 otherwise. A non-zero exit from the
  # runner is reported on stderr but does not stop the remaining repos.
  #
  # errexit is intentionally not enabled: failures are handled explicitly below
  # so that one bad repo does not abort the whole matrix.
  set -uo pipefail

  die() { printf 'offload-eval-matrix: %s\n' "$*" >&2; exit 1; }
  warn() { printf 'offload-eval-matrix: warning: %s\n' "$*" >&2; }

  # Print the question stored for repo key $2 in ground-truth file $1.
  # Fails (non-zero, message on stderr) when the entry or its question is
  # missing or blank, instead of printing the literal string "undefined".
  read_question() {
    node -e '
      const [file, key] = process.argv.slice(1);
      const entry = JSON.parse(require("fs").readFileSync(file, "utf8"))[key];
      const question = entry && entry.question;
      if (typeof question !== "string" || question.trim() === "") {
        console.error("no question for \"" + key + "\" in " + file);
        process.exit(1);
      }
      console.log(question);
    ' "$1" "$2"
  }

  HERE="$(cd "$(dirname "$0")" && pwd)" || die "cannot resolve script directory"
  OUT="${AGENT_EVAL_OUT:-/tmp/cg-offload-eval}"
  GT="$HERE/offload-eval-ground-truth.json"
  RUNNER="$HERE/offload-eval-3arm.sh"
  REPS="${REPS:-3}"
  export RESULTS="$OUT/results.jsonl"

  # Check everything we depend on BEFORE truncating a previous results file.
  command -v node >/dev/null 2>&1 || die "node is required but was not found on PATH"
  [[ -r "$GT" ]] || die "ground-truth file not readable: $GT"
  [[ -f "$RUNNER" ]] || die "runner script not found: $RUNNER"
  mkdir -p -- "$OUT" || die "cannot create output directory: $OUT"
  : > "$RESULTS" || die "cannot truncate results file: $RESULTS"

  skipped=0
  # repo:tier pairs; keeping both in one word means a repo can never run with a
  # stale or unset tier.
  for entry in mtkruto:small postybirb:medium shapeshift:complex trezor:large; do
    repo="${entry%%:*}"
    tier="${entry#*:}"

    printf '\n### %s (%s) %s\n' "$repo" "$tier" "$(date +%H:%M:%S)"

    if ! question="$(read_question "$GT" "$repo")"; then
      warn "skipping $repo: no usable question in $GT"
      skipped=$((skipped + 1))
      continue
    fi

    rc=0
    bash "$RUNNER" "$OUT/repos/$repo" "$tier" "$REPS" "$question" || rc=$?
    if ((rc != 0)); then
      warn "offload-eval-3arm.sh exited with status $rc for $repo; continuing"
    fi
  done

  # Arithmetic expansion strips the left padding some wc implementations emit.
  runs="$(wc -l < "$RESULTS")" || runs=0
  runs=$((runs))

  printf '\n###### MATRIX DONE -> %s (%d runs). Judge + summarize with:\n' "$RESULTS" "$runs"
  # %q keeps the suggested commands copy-paste safe for paths with whitespace.
  printf ' node %q --results %q --truth %q --out %q\n' \
    "$HERE/offload-eval-judge.mjs" "$RESULTS" "$GT" "$OUT/judged.jsonl"
  printf ' node %q %q\n' "$HERE/offload-eval-summarize.mjs" "$OUT/judged.jsonl"

  if ((skipped > 0)); then
    warn "$skipped repo(s) skipped; the matrix is incomplete"
    exit 1
  fi