2 settimane fa · ebae044a43
--- a/scripts/agent-eval/ab-adoption.sh
+++ b/scripts/agent-eval/ab-adoption.sh
@@ -0,0 +1,91 @@
 
				+#!/usr/bin/env bash
			
 
				+# Does the agent PICK codegraph_node to read a file, vs the built-in Read tool?
			
 
				+# Build A/B: NEW build (HEAD, codegraph_node has Read parity) vs BASELINE build
			
 
				+# (a ref where it doesn't), BOTH codegraph-attached + pre-warmed, same task. The
			
 
				+# metric is tool CHOICE: Read calls vs codegraph_node[file] calls per run.
			
 
				+#
			
 
				+# Usage: ab-adoption.sh <indexed-repo> "<task>" [runs-per-arm] [baseline-ref]
			
 
				+# Env: AGENT_EVAL_OUT (default: /tmp/ab-adoption)
			
 
				set -uo pipefail

				# Positional arguments; see the usage line in the header.
				TARGET="${1:?usage: ab-adoption.sh <indexed-repo> \"<task>\" [runs] [baseline-ref]}"
				TASK="${2:?task required}"
				RUNS="${3:-2}"
				BASE_REF="${4:-HEAD~1}"
				# The engine checkout is two directories above this script; BIN is its built CLI.
				ENGINE="$(cd "$(dirname "$0")/../.." && pwd)"
				BIN="$ENGINE/dist/bin/codegraph.js"
				OUT="${AGENT_EVAL_OUT:-/tmp/ab-adoption}"

				# Preflight checks. Diagnostics go to stderr so stdout carries only the report.
				command -v claude >/dev/null || { echo "claude CLI not on PATH" >&2; exit 1; }
				[[ "$RUNS" =~ ^[1-9][0-9]*$ ]] || { echo "runs-per-arm must be a positive integer, got: $RUNS" >&2; exit 1; }
				[ -n "$ENGINE" ] || { echo "cannot locate the engine checkout from $0" >&2; exit 1; }
				[ -d "$TARGET/.codegraph" ] || { echo "target not indexed: run 'codegraph init $TARGET' first" >&2; exit 1; }
				# Tracked sources get overwritten later in this script, so refuse a dirty tree.
				if ! git -C "$ENGINE" diff --quiet || ! git -C "$ENGINE" diff --cached --quiet; then
				  echo "engine has uncommitted changes — commit/stash first" >&2
				  exit 1
				fi
				# Resolve the baseline explicitly: git's stderr is silenced on the diff below,
				# so a mistyped ref would otherwise be reported as "no src/ changes".
				git -C "$ENGINE" rev-parse --verify --quiet "$BASE_REF^{commit}" >/dev/null \
				  || { echo "baseline ref not found in engine repo: $BASE_REF" >&2; exit 1; }
				# Newline-separated list of src/ paths that differ between baseline and HEAD.
				CHANGED=$(git -C "$ENGINE" diff --name-only "$BASE_REF" HEAD -- src 2>/dev/null)
				[ -n "$CHANGED" ] || { echo "no src/ changes between $BASE_REF and HEAD" >&2; exit 1; }
			
 
				+
			
 
				# Exit handler: kill any MCP daemons serving paths under $OUT, restore the
				# engine's changed source files to HEAD, and rebuild the engine.
				# Globals: OUT, ENGINE, CHANGED (newline-separated path list) — all read.
				cleanup() {
				  local -a changed_files=()
				  local path
				  pkill -9 -f "serve --mcp --path $OUT/" 2>/dev/null
				  # Split CHANGED on newlines only, so a path containing spaces or glob
				  # characters reaches git as exactly one argument.
				  while IFS= read -r path; do
				    [[ -n "$path" ]] && changed_files+=("$path")
				  done <<<"${CHANGED:-}"
				  if (( ${#changed_files[@]} > 0 )); then
				    git -C "$ENGINE" checkout HEAD -- "${changed_files[@]}" 2>/dev/null
				  fi
				  ( cd "$ENGINE" && npm run build >/dev/null 2>&1 )
				}
			
 
				# Install the exit handler, make sure the output directory exists, then print
				# the run banner: target, runs per arm, baseline ref, changed files, task.
				trap cleanup EXIT
				mkdir -p "$OUT"
				printf '###### target=%s  runs/arm=%s  baseline=%s\n' "$TARGET" "$RUNS" "$BASE_REF"
				# Changed paths are shown on one line, each followed by a space.
				printf '###### changed: %s \n' "${CHANGED//$'\n'/ }"
				printf '###### task=%s\n\n' "$TASK"
			
 
				+
			
 
				# prewarm DIR — (re)start a background MCP daemon for DIR and wait for its
				# socket file to appear.
				# Globals: BIN (read).
				# Returns: 0 once DIR/.codegraph/daemon.sock exists, 1 after ~15 s without it.
				prewarm() {
				  local dir="$1"
				  local sock="$dir/.codegraph/daemon.sock"
				  local tries
				  pkill -9 -f "serve --mcp --path $dir" 2>/dev/null
				  # A SIGKILLed process cannot unlink its socket file; remove any leftover so
				  # the wait below only succeeds on a socket created by the new daemon.
				  rm -f -- "$sock"
				  CODEGRAPH_DAEMON_IDLE_TIMEOUT_MS=1800000 node "$BIN" serve --mcp --path "$dir" </dev/null >/dev/null 2>&1 &
				  # Poll every 100 ms.
				  for ((tries = 0; tries <= 150; tries++)); do
				    [[ -e "$sock" ]] && return 0
				    sleep 0.1
				  done
				  return 1
				}
			
 
				+
			
 
				# count LOG — print one summary line of tool choice for a stream-json run log:
				# Read calls, codegraph_node calls split by whether the input carries a
				# `symbol`, other codegraph MCP calls, and how many codegraph tools the
				# session's init event listed.
				count() {
				  node -e '
				    const { readFileSync } = require("fs");
				    const tally = { read: 0, file: 0, symbol: 0, other: 0 };
				    let exposed = "?";
				    // Map one tool_use block to the counter it belongs to (null = not counted).
				    const bucketOf = (use) => {
				      if (use.name === "Read") return "read";
				      if (use.name === "mcp__codegraph__codegraph_node") {
				        return (use.input && use.input.symbol) ? "symbol" : "file";
				      }
				      return /mcp__codegraph__/.test(use.name) ? "other" : null;
				    };
				    for (const raw of readFileSync(process.argv[1], "utf8").split("\n")) {
				      if (!raw) continue;
				      try {
				        const evt = JSON.parse(raw);
				        if (evt.type === "system" && evt.subtype === "init") {
				          exposed = (evt.tools || []).filter((t) => /codegraph/.test(t)).length;
				        }
				        const content = evt.message?.content || [];
				        const parts = Array.isArray(content) ? content : [];
				        for (const part of parts) {
				          if (part.type !== "tool_use") continue;
				          const bucket = bucketOf(part);
				          if (bucket) tally[bucket] += 1;
				        }
				      } catch {
				        // Malformed or unexpected line: ignore the rest of it.
				      }
				    }
				    console.log(`    Read=${tally.read}  codegraph_node[file]=${tally.file}  codegraph_node[symbol]=${tally.symbol}  other_cg=${tally.other}  (cg exposed=${exposed})`);
				  ' "$1"
				}
			
 
				+
			
 
				# run_arm LABEL N — run the task N times against whatever engine build is
				# currently at $BIN. Each run gets a fresh copy of $TARGET, a fresh index, its
				# own MCP config and a pre-warmed daemon; the stream-json log is written to
				# $OUT/run-LABEL-I.jsonl and summarised with count.
				# Globals: OUT, TARGET, BIN, TASK (read).
				run_arm() { # label, N
				  local label="$1" n="$2"
				  local cfg="$OUT/mcp-$label.json"
				  local i tgt log
				  for ((i = 1; i <= n; i++)); do
				    tgt="$OUT/t-$label-$i"
				    log="$OUT/run-$label-$i.jsonl"
				    rm -rf "$tgt"
				    # Setup failures are reported but do not stop the arm.
				    rsync -a --exclude node_modules --exclude .git --exclude dist --exclude .codegraph "$TARGET/" "$tgt/" \
				      || echo "[$label] run $i: warning: copying $TARGET reported errors" >&2
				    node "$BIN" init "$tgt" >/dev/null 2>&1 \
				      || echo "[$label] run $i: warning: 'codegraph init' failed for $tgt" >&2
				    printf '{"mcpServers":{"codegraph":{"command":"env","args":["CODEGRAPH_WASM_RELAUNCHED=1","node","%s","serve","--mcp","--path","%s"]}}}' "$BIN" "$tgt" > "$cfg"
				    prewarm "$tgt" \
				      || echo "[$label] run $i: warning: daemon socket did not appear — run is not pre-warmed" >&2
				    echo "----- [$label] run $i -----"
				    ( cd "$tgt" && claude -p "$TASK" \
				        --output-format stream-json --verbose --permission-mode bypassPermissions \
				        --model opus --max-budget-usd 4 --strict-mcp-config --mcp-config "$cfg" \
				        </dev/null > "$log" 2>"$OUT/run-$label-$i.err" )
				    count "$log"
				    pkill -9 -f "serve --mcp --path $tgt" 2>/dev/null
				  done
				  echo
				}
			
 
				+
			
 
				# Arm 1: build the engine as it stands at HEAD and run the task against it.
				echo "== NEW build (HEAD: codegraph_node has Read parity) =="
				if ! ( cd "$ENGINE" && npm run build >/dev/null 2>&1 ); then
				  echo "engine build failed at HEAD — aborting" >&2
				  exit 1
				fi
				echo "built"
				run_arm new "$RUNS"

				# Arm 2: swap the changed source files for their baseline versions, rebuild,
				# and run the same task. Both steps are checked: if either fails, dist/ still
				# holds the NEW build and the baseline numbers would be meaningless.
				echo "== BASELINE build ($BASE_REF) =="
				# Split CHANGED on newlines only so each path is passed to git as one argument.
				_ab_paths=()
				while IFS= read -r _ab_path; do
				  [[ -n "$_ab_path" ]] && _ab_paths+=("$_ab_path")
				done <<<"$CHANGED"
				if (( ${#_ab_paths[@]} == 0 )) || ! git -C "$ENGINE" checkout "$BASE_REF" -- "${_ab_paths[@]}"; then
				  echo "could not check out baseline sources from $BASE_REF — baseline arm not run" >&2
				  exit 1
				fi
				if ! ( cd "$ENGINE" && npm run build >/dev/null 2>&1 ); then
				  echo "engine build failed at $BASE_REF — baseline arm not run" >&2
				  exit 1
				fi
				echo "built"
				run_arm baseline "$RUNS"

				echo "###### DONE — compare [new] vs [baseline]: does codegraph_node[file] rise / Read fall? Logs: $OUT"
			
--- a/scripts/agent-eval/ab-impl.sh
+++ b/scripts/agent-eval/ab-impl.sh
@@ -0,0 +1,78 @@
 
				+#!/usr/bin/env bash
			
 
				+# Sufficiency A/B for an IMPLEMENTATION task (the agent edits): when it uses
			
 
				+# codegraph (explore/node) to understand before editing, does it still Read? Like
			
 
				+# ab-sufficiency.sh but copies+indexes a FRESH target per run (the agent mutates
			
 
				+# it), so runs don't see each other's edits.
			
 
				+#
			
 
				+# WITH codegraph (pre-warmed) vs WITHOUT (empty MCP), N runs each. Reports
			
 
				+# explore/node vs Read/Grep + the files Read, and whether the build still passes.
			
 
				+#
			
 
				+# Usage: ab-impl.sh <indexed-repo> "<task>" [runs] [build-cmd]
			
 
				+# Env: AGENT_EVAL_OUT (default: /tmp/ab-impl)
			
 
				set -uo pipefail

				# Positional arguments; see the usage line in the header.
				REPO="${1:?usage: ab-impl.sh <indexed-repo> \"<task>\" [runs] [build-cmd]}"
				Q="${2:?task required}"
				RUNS="${3:-2}"
				BUILD_CMD="${4:-}"
				# The engine checkout is two directories above this script; BIN is its built CLI.
				ENGINE="$(cd "$(dirname "$0")/../.." && pwd)"
				BIN="$ENGINE/dist/bin/codegraph.js"
				OUT="${AGENT_EVAL_OUT:-/tmp/ab-impl}"

				# Preflight checks. Diagnostics go to stderr so stdout carries only the report.
				command -v claude >/dev/null || { echo "claude CLI not on PATH" >&2; exit 1; }
				[[ "$RUNS" =~ ^[1-9][0-9]*$ ]] || { echo "runs must be a positive integer, got: $RUNS" >&2; exit 1; }
				[ -d "$REPO/.codegraph" ] || { echo "no .codegraph index at $REPO" >&2; exit 1; }
				# Exit handler: kill any MCP daemons serving paths under $OUT.
				cleanup(){ pkill -9 -f "serve --mcp --path $OUT/" 2>/dev/null; }
				trap cleanup EXIT
				mkdir -p "$OUT"
				# Stop on a failed build instead of evaluating a stale or missing CLI.
				if ( cd "$ENGINE" && npm run build >/dev/null 2>&1 ); then
				  echo "built engine"
				else
				  echo "engine build failed — aborting" >&2
				  exit 1
				fi
				echo "###### repo=$REPO  runs/arm=$RUNS"
				echo "###### task=$Q"; echo
				# MCP config with no servers, copied for the WITHOUT arm.
				echo '{"mcpServers":{}}' > "$OUT/mcp-empty.json"
			
 
				+
			
 
				# prewarm DIR — (re)start a background MCP daemon for DIR and wait for its
				# socket file to appear.
				# Globals: BIN (read).
				# Returns: 0 once DIR/.codegraph/daemon.sock exists, 1 after ~15 s without it.
				prewarm(){
				  local dir="$1"
				  local sock="$dir/.codegraph/daemon.sock"
				  local tries
				  pkill -9 -f "serve --mcp --path $dir" 2>/dev/null
				  # A SIGKILLed process cannot unlink its socket file; remove any leftover so
				  # the wait below only succeeds on a socket created by the new daemon.
				  rm -f -- "$sock"
				  CODEGRAPH_DAEMON_IDLE_TIMEOUT_MS=1800000 node "$BIN" serve --mcp --path "$dir" </dev/null >/dev/null 2>&1 &
				  # Poll every 100 ms.
				  for ((tries = 0; tries <= 150; tries++)); do
				    [[ -e "$sock" ]] && return 0
				    sleep 0.1
				  done
				  return 1
				}
			
 
				+
			
 
				# analyze LOG — print one summary line for a stream-json run log: codegraph
				# explore/node/other calls, Read calls with the basenames read, Grep calls,
				# Edit/Write calls, and how many codegraph tools the init event listed.
				analyze(){
				  node -e '
				    const { readFileSync } = require("fs");
				    const hits = { explore: 0, nodeSym: 0, nodeFile: 0, otherCg: 0, grep: 0, edit: 0 };
				    const readFiles = [];
				    let exposed = "?";
				    // Book one tool_use block under the first category it matches.
				    const record = (use) => {
				      const name = use.name;
				      if (name === "mcp__codegraph__codegraph_explore") { hits.explore += 1; return; }
				      if (name === "mcp__codegraph__codegraph_node") {
				        if (use.input && use.input.symbol) hits.nodeSym += 1; else hits.nodeFile += 1;
				        return;
				      }
				      if (/mcp__codegraph__/.test(name)) { hits.otherCg += 1; return; }
				      if (name === "Read") { readFiles.push((use.input?.file_path || "").split("/").pop()); return; }
				      if (name === "Grep") { hits.grep += 1; return; }
				      if (name === "Edit" || name === "Write") hits.edit += 1;
				    };
				    for (const raw of readFileSync(process.argv[1], "utf8").split("\n")) {
				      if (!raw) continue;
				      try {
				        const evt = JSON.parse(raw);
				        if (evt.type === "system" && evt.subtype === "init") {
				          exposed = (evt.tools || []).filter((t) => /codegraph/.test(t)).length;
				        }
				        for (const part of (evt.message?.content || [])) {
				          if (part.type !== "tool_use") continue;
				          record(part);
				        }
				      } catch {
				        // Malformed or unexpected line: ignore the rest of it.
				      }
				    }
				    const readList = readFiles.length ? " (" + readFiles.join(", ") + ")" : "";
				    console.log(`    explore=${hits.explore} node[sym]=${hits.nodeSym} node[file]=${hits.nodeFile} other_cg=${hits.otherCg} | Read=${readFiles.length}${readList} Grep=${hits.grep} Edit=${hits.edit}  [cg exposed=${exposed}]`);
				  ' "$1"
				}
			
 
				+
			
 
				# run LABEL WITH_CG — execute $RUNS runs of the task for one arm. Every run
				# works in a fresh, freshly indexed copy of $REPO. WITH_CG=1 attaches a
				# pre-warmed codegraph MCP server; anything else uses the empty MCP config.
				# After each run the log is summarised with analyze and, when BUILD_CMD is
				# set, the build verdict for the edited copy is printed.
				# Globals: RUNS, OUT, REPO, BIN, Q, BUILD_CMD (read).
				run(){ # label, withCodegraph(0/1)
				  local label="$1" wcg="$2"
				  local cfg="$OUT/mcp-$label.json"
				  local i tgt
				  for ((i = 1; i <= RUNS; i++)); do
				    tgt="$OUT/t-$label-$i"
				    rm -rf "$tgt"
				    # Setup failures are reported but do not stop the arm.
				    rsync -a --exclude node_modules --exclude .git --exclude dist --exclude .codegraph "$REPO/" "$tgt/" \
				      || echo "[$label] run $i: warning: copying $REPO reported errors" >&2
				    node "$BIN" init "$tgt" >/dev/null 2>&1 \
				      || echo "[$label] run $i: warning: 'codegraph init' failed for $tgt" >&2
				    if [ "$wcg" = "1" ]; then
				      printf '{"mcpServers":{"codegraph":{"command":"env","args":["CODEGRAPH_WASM_RELAUNCHED=1","node","%s","serve","--mcp","--path","%s"]}}}' "$BIN" "$tgt" > "$cfg"
				      prewarm "$tgt" \
				        || echo "[$label] run $i: warning: daemon socket did not appear — run is not pre-warmed" >&2
				    else
				      cp "$OUT/mcp-empty.json" "$cfg"
				    fi
				    ( cd "$tgt" && claude -p "$Q" --output-format stream-json --verbose \
				        --permission-mode bypassPermissions --model opus --max-budget-usd 4 \
				        --strict-mcp-config --mcp-config "$cfg" </dev/null > "$OUT/$label-$i.jsonl" 2>"$OUT/$label-$i.err" )
				    echo "[$label] run $i:"; analyze "$OUT/$label-$i.jsonl"
				    if [ -n "$BUILD_CMD" ]; then
				      # BUILD_CMD is an operator-supplied command line, evaluated on purpose
				      # inside the run's copy; never feed it untrusted input.
				      if ( cd "$tgt" && eval "$BUILD_CMD" ) >/dev/null 2>&1; then
				        echo "      build: PASS"
				      else
				        echo "      build: FAIL"
				      fi
				    fi
				    pkill -9 -f "serve --mcp --path $tgt" 2>/dev/null
				  done
				  echo
				}
			
 
				+
			
 
				# Arm 1: codegraph MCP server attached.
				printf '%s\n' "== WITH codegraph =="
				run with 1
				# Arm 2: empty MCP config.
				printf '%s\n' "== WITHOUT (Read/Grep only) =="
				run without 0
				printf '###### DONE: %s\n' "$OUT"
			
--- a/scripts/agent-eval/ab-sufficiency.sh
+++ b/scripts/agent-eval/ab-sufficiency.sh
@@ -0,0 +1,78 @@
 
				+#!/usr/bin/env bash
			
 
				+# Sufficiency A/B: on a real understanding/flow question, WHEN the agent uses
			
 
				+# codegraph (explore/node), does it still Read? Premise under test: explore/node
			
 
				+# return source WITH line numbers, so a Read should not be needed.
			
 
				+#
			
 
				+# WITH codegraph (pre-warmed daemon, reliable nested attach) vs WITHOUT (empty
			
 
				+# MCP, Read/Grep only), N runs each, on a throwaway copy of the repo. Reports
			
 
				+# explore/node vs Read/Grep, and LISTS the files Read in the WITH arm so a true
			
 
				+# sufficiency gap (an indexed source file) is distinguishable from out-of-scope
			
 
				+# (configs, docs, a file codegraph didn't index).
			
 
				+#
			
 
				+# Usage: ab-sufficiency.sh <indexed-repo> "<question>" [runs-per-arm]
			
 
				+# Env: AGENT_EVAL_OUT (default: /tmp/ab-sufficiency)
			
 
				set -uo pipefail

				# Positional arguments; see the usage line in the header.
				REPO="${1:?usage: ab-sufficiency.sh <indexed-repo> \"<question>\" [runs]}"
				Q="${2:?question required}"
				RUNS="${3:-2}"
				# The engine checkout is two directories above this script; BIN is its built CLI.
				ENGINE="$(cd "$(dirname "$0")/../.." && pwd)"
				BIN="$ENGINE/dist/bin/codegraph.js"
				OUT="${AGENT_EVAL_OUT:-/tmp/ab-sufficiency}"
				# One throwaway copy of REPO, shared by every run of both arms.
				TGT="$OUT/target"

				# Preflight checks. Diagnostics go to stderr so stdout carries only the report.
				command -v claude >/dev/null || { echo "claude CLI not on PATH" >&2; exit 1; }
				[[ "$RUNS" =~ ^[1-9][0-9]*$ ]] || { echo "runs must be a positive integer, got: $RUNS" >&2; exit 1; }
				[ -d "$REPO/.codegraph" ] || { echo "no .codegraph index at $REPO" >&2; exit 1; }
				# Exit handler: kill the MCP daemon serving the throwaway copy.
				cleanup(){ pkill -9 -f "serve --mcp --path $TGT" 2>/dev/null; }
				trap cleanup EXIT
				mkdir -p "$OUT"
				# Stop on a failed build instead of evaluating a stale or missing CLI.
				if ( cd "$ENGINE" && npm run build >/dev/null 2>&1 ); then
				  echo "built"
				else
				  echo "engine build failed — aborting" >&2
				  exit 1
				fi

				# Throwaway copy + fresh index (the agent works here; a read-only question won't
				# edit, but isolate anyway). Excludes the source repo's index/build/vcs.
				rm -rf "$TGT"
				rsync -a --exclude node_modules --exclude .git --exclude dist --exclude .codegraph "$REPO/" "$TGT/"
				if node "$BIN" init "$TGT" >/dev/null 2>&1; then
				  # `status` is given no path, so run it from inside the copy; otherwise the
				  # count describes whatever directory the script was started from.
				  # NOTE(review): assumes `status` resolves the project from the working
				  # directory — confirm against the CLI.
				  _indexed="$( ( cd "$TGT" && node "$BIN" status --json ) 2>/dev/null | node -e 'let s="";process.stdin.on("data",d=>s+=d).on("end",()=>{try{console.log(JSON.parse(s).fileCount+" files")}catch{console.log("?")}})' 2>/dev/null )"
				  echo "indexed copy (${_indexed:-?})"
				else
				  echo "warning: 'codegraph init' failed for $TGT" >&2
				fi

				echo "###### repo=$REPO  runs/arm=$RUNS"
				echo "###### Q=$Q"; echo
				# MCP configs for the two arms: no servers, and codegraph serving the copy.
				echo '{"mcpServers":{}}' > "$OUT/mcp-empty.json"
				printf '{"mcpServers":{"codegraph":{"command":"env","args":["CODEGRAPH_WASM_RELAUNCHED=1","node","%s","serve","--mcp","--path","%s"]}}}' "$BIN" "$TGT" > "$OUT/mcp-cg.json"
			
 
				+
			
 
				# prewarm — (re)start a background MCP daemon for $TGT and wait for its
				# socket file to appear.
				# Globals: TGT, BIN (read).
				# Returns: 0 once $TGT/.codegraph/daemon.sock exists, 1 after ~15 s without it.
				prewarm(){
				  local sock="$TGT/.codegraph/daemon.sock"
				  local tries
				  pkill -9 -f "serve --mcp --path $TGT" 2>/dev/null
				  # $TGT is reused across runs and a SIGKILLed daemon cannot unlink its
				  # socket file; remove the leftover so the wait below only succeeds on a
				  # socket created by the new daemon.
				  rm -f -- "$sock"
				  CODEGRAPH_DAEMON_IDLE_TIMEOUT_MS=1800000 node "$BIN" serve --mcp --path "$TGT" </dev/null >/dev/null 2>&1 &
				  # Poll every 100 ms.
				  for ((tries = 0; tries <= 150; tries++)); do
				    [[ -e "$sock" ]] && return 0
				    sleep 0.1
				  done
				  return 1
				}
			
 
				+
			
 
				# analyze LOG — print one summary line for a stream-json run log: codegraph
				# explore/node/other calls, Read calls with the basenames read, Grep calls,
				# and how many codegraph tools the init event listed.
				analyze(){
				  node -e '
				    const { readFileSync } = require("fs");
				    const seen = { explore: 0, nodeSym: 0, nodeFile: 0, otherCg: 0, grep: 0 };
				    const readFiles = [];
				    let exposed = "?";
				    // Book one tool_use block under the first category it matches.
				    const book = (use) => {
				      const name = use.name;
				      if (name === "mcp__codegraph__codegraph_explore") { seen.explore += 1; return; }
				      if (name === "mcp__codegraph__codegraph_node") {
				        if (use.input && use.input.symbol) seen.nodeSym += 1; else seen.nodeFile += 1;
				        return;
				      }
				      if (/mcp__codegraph__/.test(name)) { seen.otherCg += 1; return; }
				      if (name === "Read") { readFiles.push((use.input?.file_path || "").split("/").pop()); return; }
				      if (name === "Grep") seen.grep += 1;
				    };
				    for (const raw of readFileSync(process.argv[1], "utf8").split("\n")) {
				      if (!raw) continue;
				      try {
				        const evt = JSON.parse(raw);
				        if (evt.type === "system" && evt.subtype === "init") {
				          exposed = (evt.tools || []).filter((t) => /codegraph/.test(t)).length;
				        }
				        for (const part of (evt.message?.content || [])) {
				          if (part.type !== "tool_use") continue;
				          book(part);
				        }
				      } catch {
				        // Malformed or unexpected line: ignore the rest of it.
				      }
				    }
				    const readList = readFiles.length ? " (" + readFiles.join(", ") + ")" : "";
				    console.log(`    explore=${seen.explore} node[sym]=${seen.nodeSym} node[file]=${seen.nodeFile} other_cg=${seen.otherCg} | Read=${readFiles.length}${readList} Grep=${seen.grep}  [cg exposed=${exposed}]`);
				  ' "$1"
				}
			
 
				+
			
 
				# run LABEL CFG PREWARM — execute $RUNS runs of the question for one arm inside
				# $TGT using MCP config CFG. PREWARM=1 restarts the daemon before every run.
				# Each log goes to $OUT/LABEL-I.jsonl and is summarised with analyze.
				# Globals: RUNS, TGT, Q, OUT (read).
				run(){ # label, cfg, prewarm(0/1)
				  local label="$1" cfg="$2" pw="$3"
				  local i
				  for ((i = 1; i <= RUNS; i++)); do
				    # A prewarm timeout is reported but does not stop the arm.
				    if [ "$pw" = "1" ] && ! prewarm; then
				      echo "[$label] run $i: warning: daemon socket did not appear — run is not pre-warmed" >&2
				    fi
				    ( cd "$TGT" && claude -p "$Q" --output-format stream-json --verbose \
				        --permission-mode bypassPermissions --model opus --max-budget-usd 4 \
				        --strict-mcp-config --mcp-config "$cfg" </dev/null > "$OUT/$label-$i.jsonl" 2>"$OUT/$label-$i.err" )
				    echo "[$label] run $i:"; analyze "$OUT/$label-$i.jsonl"
				  done
				  echo
				}
			
 
				+
			
 
				# Arm 1: codegraph attached, daemon pre-warmed before every run.
				printf '%s\n' "== WITH codegraph (premise: explore/node used -> Read ~0) =="
				run with "$OUT/mcp-cg.json" 1
				# Arm 2: empty MCP config, no prewarm.
				printf '%s\n' "== WITHOUT (Read/Grep only — the contrast) =="
				run without "$OUT/mcp-empty.json" 0
				printf '%s\n' "###### DONE. In the WITH arm: are explore/node>0 and Read~0? Any Read of an INDEXED source file = sufficiency gap. Logs: $OUT"