itrun.sh 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120
  #!/usr/bin/env bash
  # Drive an INTERACTIVE Claude Code session in tmux, send a prompt, wait for the
  # agent to finish, then print the tool-call breakdown from the session logs.
  #
  # Why interactive (not `claude -p`): headless print-mode picks the
  # general-purpose subagent, while real interactive sessions delegate to the
  # Explore subagent (or drive codegraph from the main thread). Only the
  # interactive TUI reproduces the behavior users actually see. (Idle-detection
  # technique borrowed from devpit's WaitForIdle.)
  #
  # Usage: itrun.sh <repo-path> <label> "<prompt>"
  # Output dir: $AGENT_EVAL_OUT (default /tmp/agent-eval)
  # Extra CLI flags: $CLAUDE_EXTRA_ARGS (appended to the `claude` command line)
  # Requires: tmux 3.0+, a logged-in `claude` CLI, codegraph MCP configured.
  # Unset variables are errors and a pipeline fails if any stage fails. -e is
  # not enabled: commands such as the bare `tmux kill-session` used for cleanup
  # return non-zero in the normal case and must not abort the run.
  set -uo pipefail

  # Validate the positional arguments up front so a missing one produces a usage
  # line instead of bash's terse "unbound variable" abort under `set -u`.
  if [ "$#" -lt 3 ]; then
    echo "usage: ${0##*/} <repo-path> <label> \"<prompt>\"" >&2
    exit 2
  fi
  REPO="$1"    # repository the agent is launched in
  LABEL="$2"   # tag used in the tmux session name and the output file name
  PROMPT="$3"  # text typed into the interactive input box

  # ':' and '.' are separators in tmux target syntax (session:window.pane), so a
  # label containing them would yield a session that `-t "$SESSION"` cannot
  # address. Map them to '_' for the session name only; the output file keeps
  # the label verbatim.
  SESSION="cgt_${LABEL//[.:]/_}"

  # Where the captured pane text is written (created if missing).
  OUT_DIR="${AGENT_EVAL_OUT:-/tmp/agent-eval}"
  mkdir -p "$OUT_DIR"
  OUT="$OUT_DIR/itrun-${LABEL}.txt"

  # Absolute directory of this script, used to locate sibling helper scripts.
  HERE="$(cd "$(dirname "$0")" && pwd)"
  # Print the current text of the session's pane to stdout: the visible screen
  # plus up to 40 lines of scrollback above it (-S -40). Reads $SESSION.
  cap() {
    tmux capture-pane -p -t "$SESSION" -S -40
  }
  # Build the command line that is typed into the fresh pane. $REPO is passed
  # through printf %q so a path containing spaces or shell metacharacters
  # survives being re-parsed by the shell running inside tmux (assumes that
  # shell accepts bash-style backslash quoting). $CLAUDE_EXTRA_ARGS is appended
  # verbatim so it can carry several flags.
  claude_launch_cmd() {
    printf 'cd %q && claude --dangerously-skip-permissions %s' \
      "$REPO" "${CLAUDE_EXTRA_ARGS:-}"
  }

  # Drop any stale session left over from an earlier run with the same label;
  # the error printed when none exists is expected and suppressed.
  tmux kill-session -t "$SESSION" 2>/dev/null
  # Wide pane so the TUI doesn't hard-wrap tool lines.
  tmux new-session -d -s "$SESSION" -x 230 -y 60
  tmux send-keys -t "$SESSION" "$(claude_launch_cmd)" Enter
  # Poll the pane every 0.5s (120 polls, about 60s) until the ❯ glyph shows up,
  # meaning claude has drawn its UI. The glyph is already on the welcome screen
  # seconds before the input box accepts keystrokes, so seeing it is necessary
  # but NOT sufficient; the later type-and-verify step is what proves the input
  # is live.
  ready=0
  for ((ui_poll = 0; ui_poll < 120; ui_poll++)); do
    if cap | grep -q "❯"; then
      ready=1
      break
    fi
    sleep 0.5
  done

  # No UI within the budget: show the pane for diagnosis, clean up, and fail.
  if [ "$ready" != 1 ]; then
    echo "claude never drew its UI"
    cap
    tmux kill-session -t "$SESSION" 2>/dev/null
    exit 1
  fi
  # Dismiss the per-folder "Is this a project you trust?" dialog when present
  # (shown the first time claude opens a given repo). Option 1 ("Yes, I trust
  # this folder") is pre-selected, so a plain Enter accepts it. The dialog also
  # contains ❯, so it has to be gone before the prompt is typed or the
  # keystrokes would land on the menu. At most 20 Enter presses, 1s apart.
  trust_presses=0
  while [ "$trust_presses" -lt 20 ] && cap | grep -q "trust this folder"; do
    tmux send-keys -t "$SESSION" Enter
    sleep 1
    trust_presses=$((trust_presses + 1))
  done
  # Type-and-verify: send the prompt, confirm a distinctive chunk of it actually
  # landed in the input box, and retry if it didn't. This handles the early-❯
  # race where the welcome screen shows the prompt glyph but MCP init is still
  # eating keys. Up to 30 attempts, roughly 2s each.
  needle="${PROMPT:0:24}"   # leading slice of the prompt to look for in the pane
  typed=0
  for ((type_try = 1; type_try <= 30; type_try++)); do
    # -l sends the text literally (no key-name lookup). The `--` stops option
    # parsing so a prompt that begins with '-' is not read as send-keys flags.
    tmux send-keys -l -t "$SESSION" -- "$PROMPT"
    sleep 1
    # -F: fixed-string match; `--` for the same leading-'-' reason as above.
    if cap | grep -Fq -- "$needle"; then
      typed=1
      break
    fi
    # Clear whatever partial text may have landed, then retry.
    tmux send-keys -t "$SESSION" C-u
    sleep 1
  done

  # The prompt never showed up: dump the pane for diagnosis, clean up, fail.
  if [ "$typed" != 1 ]; then
    echo "prompt never landed in the input box"
    cap
    tmux kill-session -t "$SESSION" 2>/dev/null
    exit 1
  fi

  # Short pause, then submit the prompt.
  sleep 0.5
  tmux send-keys -t "$SESSION" Enter
  # Extended-regex alternatives that each indicate "the agent is working".
  # The last entry is the robust one: the spinner's elapsed time in parentheses,
  # which EVERY working state shows, both the pre-stream thinking phase
  # "(8s · thinking with max effort)" and the streaming phase
  # "(24s · ↑ 2.5k tokens · …)", and it survives the 32s→"1m 3s" rollover.
  # The others (interrupt hint, token arrows, "Initializing") are
  # belt-and-braces, since some TUI versions/states show one but not the others.
  busy_signals=(
    'esc to interrupt'
    '↓ [0-9]'
    '↑ [0-9]'
    'Initializing'
    '\(([0-9]+m )?[0-9]+s ·'
  )
  # Join the alternatives with '|' into the single pattern handed to grep -E.
  BUSY_RE="$(IFS='|'; printf '%s' "${busy_signals[*]}")"
  # Wait for work to START: poll every 0.5s (120 polls, about 60s) until any
  # busy indicator from BUSY_RE appears in the pane.
  started=0
  start_polls_left=120
  until [ "$start_polls_left" -eq 0 ]; do
    start_polls_left=$((start_polls_left - 1))
    if cap | grep -qE "$BUSY_RE"; then
      started=1
      break
    fi
    sleep 0.5
  done

  # If the agent never became busy, fail loudly rather than silently reporting
  # an empty run: dump the pane, clean up, exit non-zero.
  if [ "$started" != 1 ]; then
    echo "agent never started working"
    cap
    tmux kill-session -t "$SESSION" 2>/dev/null
    exit 1
  fi
  # Poll for idle. CRITICAL: Opus 4.8 (extended thinking) renders NO spinner /
  # "esc to interrupt" / timer while it STREAMS its final answer — those appear
  # only during the thinking + tool-use phases ("✻ Marinating… (32s · ↓ 1.3k
  # tokens · thinking with max effort)"). So BUSY_RE reads "not busy" for the
  # whole 10-30s answer stream, and any short not-busy threshold kills the run
  # mid-answer (the truncation bug). "Done" is therefore detected by CONTENT
  # STABILITY, not by a spinner string: while the agent streams, the captured
  # pane changes every poll, so stability never accrues; it accrues only once
  # the agent has finished and the static "✻ Brewed for 1m 9s" summary is all
  # that is left. A BUSY_RE match still hard-resets stability (covers
  # thinking/tool-use/live-timer, where text can briefly sit still).
  # Content-stability is model-agnostic — it survives spinner re-wordings.
  STABLE_NEEDED=16   # consecutive unchanged polls required (~8s at 0.5s/poll)
  prev=""            # whitespace-normalized pane text from the previous poll
  stable=0           # current run of unchanged, non-busy polls
  idle=0             # set to 1 only when the stability threshold is reached
  for ((idle_poll = 0; idle_poll < 2400; idle_poll++)); do # up to ~20 min
    pane="$(cap)"
    # Collapse every whitespace run to one space so redraws that only shift
    # spacing do not count as a content change.
    sig="$(printf '%s' "$pane" | tr -s '[:space:]' ' ')"
    if printf '%s' "$pane" | grep -qE "$BUSY_RE"; then
      stable=0 # thinking / tool use / live timer → busy
    elif [ -n "$sig" ] && [ "$sig" = "$prev" ] && printf '%s' "$pane" | grep -q "❯"; then
      # Non-empty pane, identical to the last poll, with the input glyph shown.
      stable=$((stable + 1))
      if [ "$stable" -ge "$STABLE_NEEDED" ]; then
        idle=1
        break
      fi
    else
      stable=0 # answer still streaming → pane changing
    fi
    prev="$sig"
    sleep 0.5
  done

  # Running out of polls used to be indistinguishable from a clean finish. The
  # capture that follows is still taken (best effort), but say so on stderr so
  # a truncated run is not mistaken for a complete one.
  if [ "$idle" != 1 ]; then
    echo "warning: agent still not idle after ~20 min; captured output may be truncated" >&2
  fi
  # Give the TUI a moment to settle, then dump the pane including its whole
  # scrollback (-S - starts at the beginning of history) into the output file.
  sleep 1
  tmux capture-pane -p -t "$SESSION" -S - > "$OUT"
  captured_lines="$(wc -l < "$OUT")"
  echo "captured ${captured_lines} lines -> $OUT"

  # Last "Done (...)" or "<Word> for [Nm ]Ns" summary found in the capture
  # (e.g. "Brewed for 1m 9s"); prints nothing when there is none.
  grep -oE "Done \([^)]*\)|[A-Z][a-z]+ for ([0-9]+m )?[0-9]+s" "$OUT" \
    | tail -n 1

  # Last "<n>[k]/<n>M" figure in the capture, prefixed with "Context "
  # (presumably the context-window usage shown by the TUI).
  grep -oE "[0-9.]+k?/[0-9.]+M" "$OUT" \
    | tail -n 1 \
    | sed 's/^/Context /'

  # The session is no longer needed once its text is on disk.
  tmux kill-session -t "$SESSION" 2>/dev/null

  # Clean tool breakdown from the session logs (main + subagents). Best effort:
  # the parser's stderr is hidden and its failure does not fail this script.
  node "$HERE/parse-session.mjs" "$REPO" 2>/dev/null || true