chore: optimize CI eval PR comment — aggregate all suites, update-not-duplicate

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Garry Tan
2026-03-23 05:21:43 -07:00
parent 4e5f71c294
commit 9195b671e8

View File

@@ -10,7 +10,7 @@ concurrency:
jobs: jobs:
evals: evals:
runs-on: ubicloud-standard-2 runs-on: ubicloud-standard-2
timeout-minutes: 30 timeout-minutes: 45
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
with: with:
@@ -65,29 +65,68 @@ jobs:
env: env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: | run: |
# Aggregate results across ALL eval suites (not just the latest file).
# Files whose name contains "_partial" are excluded; `ls -t` keeps newest first.
# `grep -v` exits 1 when it selects no lines. GitHub Actions runs `run:` steps
# with errexit (`bash -e`) unless the workflow overrides `shell:`, so without
# `|| true` an empty result set would abort the step right here instead of
# reaching the graceful "no results" exit below.
RESULTS=$(ls -t ~/.gstack-dev/evals/*.json 2>/dev/null | grep -v _partial || true)
if [ -z "$RESULTS" ]; then
  echo "No eval results found"
  exit 0
fi
# Add two decimal amounts with bc and print the sum.
#   $1, $2 - decimal numbers
# bc omits the leading zero for magnitudes below 1 (".05"), which would show up
# as "$.05" in the PR comment, so the zero is restored here.
add_cost() {
  local sum
  sum=$(echo "$1 + $2" | bc)
  case "$sum" in
    .*) sum="0${sum}" ;;
  esac
  printf '%s\n' "$sum"
}

# Running totals across every suite, plus one Markdown table row per suite.
TOTAL=0; PASSED=0; FAILED=0; COST=0
SUITE_LINES=""
# $RESULTS holds one path per line; it is intentionally left unquoted so the
# loop visits each file (paths are assumed to contain no whitespace).
for f in $RESULTS; do
  T=$(jq -r '.total_tests // 0' "$f")
  P=$(jq -r '.passed // 0' "$f")
  F=$(jq -r '.failed // 0' "$f")
  C=$(jq -r '.total_cost_usd // 0' "$f")
  TIER=$(jq -r '.tier // "unknown"' "$f")
  # Skip files that recorded no tests so they do not add an empty table row.
  [ "$T" -eq 0 ] && continue
  TOTAL=$((TOTAL + T))
  PASSED=$((PASSED + P))
  FAILED=$((FAILED + F))
  COST=$(add_cost "$COST" "$C")
  STATUS_ICON="✅"
  [ "$F" -gt 0 ] && STATUS_ICON="❌"
  # Rows end in a literal "\n"; it is expanded when the comment body is built.
  SUITE_LINES="${SUITE_LINES}| ${TIER} | ${P}/${T} | ${STATUS_ICON} | \$${C} |\n"
done
# Overall verdict: any failed test in any suite marks the whole run as failed.
if [ "${FAILED:-0}" -gt 0 ]; then
  STATUS="❌ FAIL"
else
  STATUS="✅ PASS"
fi
# The body must keep starting with "## E2E Evals": the comment-update step finds
# the previous comment by that prefix. Blank lines separate the heading, the
# summary line and the table so each is parsed as its own Markdown block.
# printf '%b' expands the literal "\n" row terminators stored in SUITE_LINES.
BODY="## E2E Evals: ${STATUS}

**${PASSED}/${TOTAL}** tests passed | **\$${COST}** total cost

| Suite | Result | Status | Cost |
|-------|--------|--------|------|
$(printf '%b' "$SUITE_LINES")"
# Print one "- ❌ <name>: <exit_reason>" bullet per failed test, for every
# result file in $RESULTS that reports at least one failure.
list_failures() {
  local f n
  for f in $RESULTS; do
    n=$(jq -r '.failed // 0' "$f")
    [ "$n" -eq 0 ] && continue
    # `(.tests // [])` tolerates a result file that has no tests array.
    jq -r '(.tests // [])[] | select(.passed == false) | "- ❌ \(.name): \(.exit_reason // "unknown")"' "$f"
  done
}

if [ "${FAILED:-0}" -gt 0 ]; then
  # Captured verbatim (no `echo -e`), so backslashes inside test names or exit
  # reasons are not turned into escape sequences.
  FAILURES=$(list_failures)
  BODY="${BODY}
### Failures
${FAILURES}"
fi
# Footer with runner details. The dollar sign is escaped so the shell does not
# expand `$0` (the script path) inside the double-quoted string. The blank
# lines keep `---` a thematic break instead of a setext underline that would
# turn the preceding line into a heading.
BODY="${BODY}

---

*Runner: ubicloud-standard-2 (\$0.0008/min) | Concurrency: 40*"
# Update the existing comment or create a new one (prevents duplicates on re-runs).
# --paginate walks every page of the comment listing; without it only the first
# page is returned, so on a PR with many comments the earlier eval comment
# would be missed and a duplicate posted. The --jq filter runs on each page and
# `tail -1` keeps the most recent matching comment id.
COMMENT_ID=$(gh api --paginate \
  "repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments" \
  --jq '.[] | select(.body | startswith("## E2E Evals")) | .id' | tail -1)
if [ -n "$COMMENT_ID" ]; then
  gh api "repos/${{ github.repository }}/issues/comments/${COMMENT_ID}" \
    -X PATCH -f body="$BODY"
else
  gh pr comment ${{ github.event.pull_request.number }} --body "$BODY"
fi