offload-eval-frontload-matrix.sh 1.6 KB

12345678910111213141516171819202122232425
  1. #!/usr/bin/env bash
  2. # Run the FRONTLOAD arm across all 4 tiers (n reps), then judge + merge with the existing
  3. # matrix (offload/raw/nocg in $OUT/judged.jsonl, if present) + emit a combined summary.
  4. # Env: REPS (default 3) AGENT_EVAL_OUT=<scratch dir>
  5. set -uo pipefail
  6. HERE="$(cd "$(dirname "$0")" && pwd)"
  7. OUT="${AGENT_EVAL_OUT:-/tmp/cg-offload-eval}"
  8. GT="$HERE/offload-eval-ground-truth.json"
  9. REPS="${REPS:-3}"
  10. export RESULTS="$OUT/results-fl.jsonl"
  11. : > "$RESULTS"; rm -f "$OUT/runs/hook-debug.log"
  12. for repo in mtkruto postybirb shapeshift trezor; do
  13. case "$repo" in mtkruto) tier=small;; postybirb) tier=medium;; shapeshift) tier=complex;; trezor) tier=large;; esac
  14. Q=$(node -e "console.log(JSON.parse(require('fs').readFileSync(process.argv[1],'utf8'))[process.argv[2]].question)" "$GT" "$repo")
  15. echo ""; echo "### $repo ($tier) $(date +%H:%M:%S)"
  16. bash "$HERE/offload-eval-frontload.sh" "$OUT/repos/$repo" "$tier" "$REPS" "$Q"
  17. done
  18. echo ""
  19. echo "frontload: $(wc -l < "$RESULTS") runs | hook injections: $(grep -c INJECTED "$OUT/runs/hook-debug.log" 2>/dev/null) | errors: $(grep -c ERROR "$OUT/runs/hook-debug.log" 2>/dev/null)"
  20. echo "=== JUDGE frontload ==="
  21. node "$HERE/offload-eval-judge.mjs" --results "$RESULTS" --truth "$GT" --out "$OUT/judged-fl.jsonl" --concurrency 4 2>&1 | tail -4
  22. if [ -f "$OUT/judged.jsonl" ]; then cat "$OUT/judged.jsonl" "$OUT/judged-fl.jsonl" > "$OUT/judged-all.jsonl"; else cp "$OUT/judged-fl.jsonl" "$OUT/judged-all.jsonl"; fi
  23. echo "=== COMBINED SUMMARY ==="
  24. node "$HERE/offload-eval-summarize.mjs" "$OUT/judged-all.jsonl"
  25. echo "###### FRONTLOAD MATRIX DONE"