mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-18 02:22:04 +08:00
chore: optimize CI eval PR comment — aggregate all suites, update-not-duplicate
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
69
.github/workflows/evals.yml
vendored
69
.github/workflows/evals.yml
vendored
@@ -10,7 +10,7 @@ concurrency:
|
|||||||
jobs:
|
jobs:
|
||||||
evals:
|
evals:
|
||||||
runs-on: ubicloud-standard-2
|
runs-on: ubicloud-standard-2
|
||||||
timeout-minutes: 30
|
timeout-minutes: 45
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
with:
|
with:
|
||||||
@@ -65,29 +65,68 @@ jobs:
|
|||||||
env:
|
env:
|
||||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||||
run: |
|
run: |
|
||||||
RESULT=$(ls -t ~/.gstack-dev/evals/*.json 2>/dev/null | grep -v _partial | head -1)
|
# Aggregate results across ALL eval suites (not just the latest file)
|
||||||
if [ -z "$RESULT" ]; then
|
RESULTS=$(ls -t ~/.gstack-dev/evals/*.json 2>/dev/null | grep -v _partial)
|
||||||
|
if [ -z "$RESULTS" ]; then
|
||||||
echo "No eval results found"
|
echo "No eval results found"
|
||||||
exit 0
|
exit 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
TOTAL=$(jq .total_tests "$RESULT")
|
TOTAL=0; PASSED=0; FAILED=0; COST=0
|
||||||
PASSED=$(jq .passed "$RESULT")
|
SUITE_LINES=""
|
||||||
FAILED=$(jq .failed "$RESULT")
|
for f in $RESULTS; do
|
||||||
COST=$(jq .total_cost_usd "$RESULT")
|
T=$(jq -r '.total_tests // 0' "$f")
|
||||||
WALL=$(jq '.wall_clock_ms // 0 | . / 1000 | floor' "$RESULT")
|
P=$(jq -r '.passed // 0' "$f")
|
||||||
|
F=$(jq -r '.failed // 0' "$f")
|
||||||
|
C=$(jq -r '.total_cost_usd // 0' "$f")
|
||||||
|
TIER=$(jq -r '.tier // "unknown"' "$f")
|
||||||
|
[ "$T" -eq 0 ] && continue
|
||||||
|
TOTAL=$((TOTAL + T))
|
||||||
|
PASSED=$((PASSED + P))
|
||||||
|
FAILED=$((FAILED + F))
|
||||||
|
COST=$(echo "$COST + $C" | bc)
|
||||||
|
STATUS_ICON="✅"
|
||||||
|
[ "$F" -gt 0 ] && STATUS_ICON="❌"
|
||||||
|
SUITE_LINES="${SUITE_LINES}| ${TIER} | ${P}/${T} | ${STATUS_ICON} | \$${C} |\n"
|
||||||
|
done
|
||||||
|
|
||||||
STATUS="pass"
|
STATUS="✅ PASS"
|
||||||
[ "$FAILED" -gt 0 ] && STATUS="FAIL"
|
[ "$FAILED" -gt 0 ] && STATUS="❌ FAIL"
|
||||||
|
|
||||||
BODY="**E2E Evals:** ${STATUS} ${PASSED}/${TOTAL} passed | \$${COST} | ${WALL}s wall clock"
|
BODY="## E2E Evals: ${STATUS}
|
||||||
|
|
||||||
|
**${PASSED}/${TOTAL}** tests passed | **\$${COST}** total cost
|
||||||
|
|
||||||
|
| Suite | Result | Status | Cost |
|
||||||
|
|-------|--------|--------|------|
|
||||||
|
$(echo -e "$SUITE_LINES")"
|
||||||
|
|
||||||
if [ "$FAILED" -gt 0 ]; then
|
if [ "$FAILED" -gt 0 ]; then
|
||||||
FAILURES=$(jq -r '.tests[] | select(.passed == false) | "- FAIL \(.name): \(.exit_reason // "unknown")"' "$RESULT")
|
FAILURES=""
|
||||||
|
for f in $RESULTS; do
|
||||||
|
F=$(jq -r '.failed // 0' "$f")
|
||||||
|
[ "$F" -eq 0 ] && continue
|
||||||
|
FAILS=$(jq -r '.tests[] | select(.passed == false) | "- ❌ \(.name): \(.exit_reason // "unknown")"' "$f")
|
||||||
|
FAILURES="${FAILURES}${FAILS}\n"
|
||||||
|
done
|
||||||
BODY="${BODY}
|
BODY="${BODY}
|
||||||
|
|
||||||
Failures:
|
### Failures
|
||||||
${FAILURES}"
|
$(echo -e "$FAILURES")"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
gh pr comment ${{ github.event.pull_request.number }} --body "$BODY"
|
BODY="${BODY}
|
||||||
|
|
||||||
|
---
|
||||||
|
*Runner: ubicloud-standard-2 ($0.0008/min) | Concurrency: 40*"
|
||||||
|
|
||||||
|
# Update existing comment or create new one (prevents duplicates on re-runs)
|
||||||
|
COMMENT_ID=$(gh api repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/comments \
|
||||||
|
--jq '.[] | select(.body | startswith("## E2E Evals")) | .id' | tail -1)
|
||||||
|
|
||||||
|
if [ -n "$COMMENT_ID" ]; then
|
||||||
|
gh api repos/${{ github.repository }}/issues/comments/$COMMENT_ID \
|
||||||
|
-X PATCH -f body="$BODY"
|
||||||
|
else
|
||||||
|
gh pr comment ${{ github.event.pull_request.number }} --body "$BODY"
|
||||||
|
fi
|
||||||
|
|||||||
Reference in New Issue
Block a user