4 天之前 · 7ddd3fa7eb
--- a/scripts/agent-eval/offload-eval-3arm.sh
+++ b/scripts/agent-eval/offload-eval-3arm.sh
@@ -0,0 +1,65 @@
 
				+#!/usr/bin/env bash
			
 
				+# 3-arm offload eval for ONE indexed repo + ONE question, n reps each.
			
 
				+#   ARM offload : codegraph attached, managed offload ON  (per-run AI usage log)
			
 
				+#   ARM raw     : codegraph attached, CODEGRAPH_OFFLOAD_DISABLE=1 (raw source)
			
 
				+#   ARM nocg    : no codegraph (empty MCP config) -> Read/Grep baseline
			
 
				+# All arms: claude -p sonnet --effort high. One JSON metrics line/run -> $RESULTS.
			
 
				+#
			
 
				+# Usage: offload-eval-3arm.sh <indexed-repo> <tier> <reps> "<question>"
			
 
				+# Env:   MODEL=sonnet EFFORT=high  RESULTS=<file>  AGENT_EVAL_OUT=<scratch dir>
			
 
				+set -uo pipefail
			
 
				+HERE="$(cd "$(dirname "$0")" && pwd)"
			
 
				+ENGINE="$(cd "$HERE/../.." && pwd)"
			
 
				+BIN="$ENGINE/dist/bin/codegraph.js"
			
 
				+OUT="${AGENT_EVAL_OUT:-/tmp/cg-offload-eval}"
			
 
				+TARGET="${1:?usage: offload-eval-3arm.sh <indexed-repo> <tier> <reps> \"<question>\"}"
			
 
				+TIER="${2:?tier}"; REPS="${3:?reps}"; Q="${4:?question}"
			
 
				+RUNS="$OUT/runs"
			
 
				+EXTRACT="$HERE/offload-eval-metrics.mjs"
			
 
				+RESULTS="${RESULTS:-$OUT/results.jsonl}"
			
 
				+REPO=$(basename "$TARGET")
			
 
				+mkdir -p "$RUNS"
			
 
				+command -v claude >/dev/null || { echo "no claude on PATH"; exit 1; }
			
 
				+[ -d "$TARGET/.codegraph" ] || { echo "not indexed: $TARGET (run offload-eval-setup.sh first)"; exit 1; }
			
 
				+# Physical path so pkill matches the daemon's real cmdline (macOS /tmp->/private/tmp symlink
			
 
				+# otherwise makes the kill miss the daemon, and the next arm connects to the SURVIVING daemon
			
 
				+# — contaminating the raw arm with offload).
			
 
				+TARGET=$(cd "$TARGET" && pwd -P)
			
 
				+
			
 
				#######################################
				# Restart the codegraph MCP daemon for one repo and wait for its socket.
				# Globals:   BIN (read) - path to the codegraph CLI entry point
				# Arguments: $1 - repo path; callers pass the physical path so the pkill
				#                 pattern matches the daemon's command line
				#            $2 - optional single VAR=value assignment added to the
				#                 daemon's environment (e.g. "FOO=bar")
				# Outputs:   "  daemon warm" or "  WARN daemon never bound" on stdout
				# Returns:   0 once the socket file exists, 1 if it never appeared
				#######################################
				prewarm() {
				  local repo_path="$1" extra_env="${2:-}"
				  local sock="$repo_path/.codegraph/daemon.sock"

				  # Hard-kill any daemon still serving this path and drop its socket file, so
				  # the poll below can only ever observe the socket of the fresh daemon.
				  pkill -9 -f "serve --mcp --path $repo_path" 2>/dev/null
				  rm -f "$sock" 2>/dev/null
				  sleep 0.6

				  # ${extra_env:+"$extra_env"} hands the assignment to env as ONE word (values
				  # containing spaces survive) and expands to nothing when no extra env is given.
				  env ${extra_env:+"$extra_env"} CODEGRAPH_DAEMON_IDLE_TIMEOUT_MS=1800000 \
				    node "$BIN" serve --mcp --path "$repo_path" </dev/null >/dev/null 2>&1 &

				  # Poll for the socket every 100 ms; the poller exits 1 after roughly 15 s.
				  if node -e 'const fs=require("fs");let n=0;const t=setInterval(()=>{if(fs.existsSync(process.argv[1]+"/.codegraph/daemon.sock")){clearInterval(t);process.exit(0)}if(n++>150){clearInterval(t);process.exit(1)}},100)' "$repo_path"; then
				    echo "  daemon warm"
				  else
				    echo "  WARN daemon never bound"
				    return 1
				  fi
				}
			
 
				+
			
 
				#######################################
				# Execute one eval rep for one arm and append its metrics line to $RESULTS.
				# Globals:   REPO, RUNS, TARGET, Q, TIER, EXTRACT, RESULTS (read);
				#            MODEL, EFFORT (read, default sonnet / high)
				# Arguments: $1 - arm name
				#            $2 - rep number
				#            $3 - MCP config file handed to claude
				#            $4 - offload usage log path, or "-" when the arm has none
				# Outputs:   transcript in $RUNS/<tag>.jsonl, stderr in $RUNS/<tag>.err,
				#            one JSON line appended to $RESULTS, one summary line on stdout
				#######################################
				run() {
				  local arm="$1" rep="$2" cfg="$3" usage="$4"
				  local tag="$REPO-$1-$2"
				  local -a claude_args=(
				    -p "$Q"
				    --output-format stream-json --verbose --permission-mode bypassPermissions
				    --model "${MODEL:-sonnet}" --effort "${EFFORT:-high}" --max-budget-usd 4
				    --strict-mcp-config --mcp-config "$cfg"
				  )

				  # Empty the usage log so it only holds the calls made during this rep.
				  if [ "$usage" != "-" ]; then
				    : > "$usage"
				  fi

				  # Run the agent from inside the repo; the subshell keeps the cd local.
				  (
				    cd "$TARGET" \
				      && claude "${claude_args[@]}" </dev/null > "$RUNS/$tag.jsonl" 2>"$RUNS/$tag.err"
				  )

				  # Fold transcript + usage log into a single metrics line.
				  node "$EXTRACT" \
				    --run "$RUNS/$tag.jsonl" --usage "$usage" \
				    --arm "$arm" --rep "$rep" --repo "$REPO" --tier "$TIER" --q "$Q" \
				    >> "$RESULTS"

				  # Echo a one-line digest of the metrics line that was just appended.
				  node -e 'const o=JSON.parse(require("fs").readFileSync(process.argv[1],"utf8").trim().split("\n").pop());console.log(`  [${o.arm} #${o.rep}] ${o.durationSec}s | main $${o.costUsdMain} ${o.tokBillable} tok | read=${o.read} grep=${o.grep} explore=${o.explore} offload=${o.offloadFired} | AI ${o.ai.calls}call/${o.ai.totalTokens}tok/$${o.ai.costUsd.toFixed(4)} | ok=${o.ok}`)' "$RESULTS"
				}
			
 
				+
			
 
				+CFG_OFF="$RUNS/mcp-offload-$REPO.json"; CFG_RAW="$RUNS/mcp-raw-$REPO.json"; CFG_NOCG="$RUNS/mcp-nocg.json"
			
 
				+USAGE="$RUNS/$REPO-usage.jsonl"
			
 
				+printf '{"mcpServers":{"codegraph":{"command":"env","args":["CODEGRAPH_WASM_RELAUNCHED=1","CODEGRAPH_OFFLOAD_USAGE_LOG=%s","node","%s","serve","--mcp","--path","%s"]}}}' "$USAGE" "$BIN" "$TARGET" > "$CFG_OFF"
			
 
				+printf '{"mcpServers":{"codegraph":{"command":"env","args":["CODEGRAPH_WASM_RELAUNCHED=1","CODEGRAPH_OFFLOAD_DISABLE=1","node","%s","serve","--mcp","--path","%s"]}}}' "$BIN" "$TARGET" > "$CFG_RAW"
			
 
				+printf '{"mcpServers":{}}' > "$CFG_NOCG"
			
 
				+
			
 
				+echo "###### repo=$REPO tier=$TIER reps=$REPS model=${MODEL:-sonnet}/${EFFORT:-high}"
			
 
				+echo "###### Q=$Q"
			
 
				+echo "== ARM offload =="; prewarm "$TARGET" "CODEGRAPH_OFFLOAD_USAGE_LOG=$USAGE"
			
 
				+for r in $(seq 1 "$REPS"); do run offload "$r" "$CFG_OFF" "$USAGE"; done
			
 
				+pkill -9 -f "serve --mcp --path $TARGET" 2>/dev/null; rm -f "$TARGET/.codegraph/daemon.sock" 2>/dev/null; sleep 1
			
 
				+echo "== ARM raw =="; prewarm "$TARGET" "CODEGRAPH_OFFLOAD_DISABLE=1"
			
 
				+for r in $(seq 1 "$REPS"); do run raw "$r" "$CFG_RAW" "-"; done
			
 
				+pkill -9 -f "serve --mcp --path $TARGET" 2>/dev/null; rm -f "$TARGET/.codegraph/daemon.sock" 2>/dev/null; sleep 1
			
 
				+echo "== ARM nocg =="
			
 
				+for r in $(seq 1 "$REPS"); do run nocg "$r" "$CFG_NOCG" "-"; done
			
 
				+echo "###### DONE $REPO"
			
--- a/scripts/agent-eval/offload-eval-frontload-matrix.sh
+++ b/scripts/agent-eval/offload-eval-frontload-matrix.sh
@@ -0,0 +1,25 @@
 
				+#!/usr/bin/env bash
			
 
				+# Run the FRONTLOAD arm across all 4 tiers (n reps), then judge + merge with the existing
			
 
				+# matrix (offload/raw/nocg in $OUT/judged.jsonl, if present) + emit a combined summary.
			
 
				+# Env: REPS (default 3)  AGENT_EVAL_OUT=<scratch dir>
			
 
				+set -uo pipefail
			
 
				+HERE="$(cd "$(dirname "$0")" && pwd)"
			
 
				+OUT="${AGENT_EVAL_OUT:-/tmp/cg-offload-eval}"
			
 
				+GT="$HERE/offload-eval-ground-truth.json"
			
 
				+REPS="${REPS:-3}"
			
 
				+export RESULTS="$OUT/results-fl.jsonl"
			
 
				+: > "$RESULTS"; rm -f "$OUT/runs/hook-debug.log"
			
 
				+for repo in mtkruto postybirb shapeshift trezor; do
			
 
				+  case "$repo" in mtkruto) tier=small;; postybirb) tier=medium;; shapeshift) tier=complex;; trezor) tier=large;; esac
			
 
				+  Q=$(node -e "console.log(JSON.parse(require('fs').readFileSync(process.argv[1],'utf8'))[process.argv[2]].question)" "$GT" "$repo")
			
 
				+  echo ""; echo "### $repo ($tier)  $(date +%H:%M:%S)"
			
 
				+  bash "$HERE/offload-eval-frontload.sh" "$OUT/repos/$repo" "$tier" "$REPS" "$Q"
			
 
				+done
			
 
				+echo ""
			
 
				+echo "frontload: $(wc -l < "$RESULTS") runs | hook injections: $(grep -c INJECTED "$OUT/runs/hook-debug.log" 2>/dev/null) | errors: $(grep -c ERROR "$OUT/runs/hook-debug.log" 2>/dev/null)"
			
 
				+echo "=== JUDGE frontload ==="
			
 
				+node "$HERE/offload-eval-judge.mjs" --results "$RESULTS" --truth "$GT" --out "$OUT/judged-fl.jsonl" --concurrency 4 2>&1 | tail -4
			
 
				+if [ -f "$OUT/judged.jsonl" ]; then cat "$OUT/judged.jsonl" "$OUT/judged-fl.jsonl" > "$OUT/judged-all.jsonl"; else cp "$OUT/judged-fl.jsonl" "$OUT/judged-all.jsonl"; fi
			
 
				+echo "=== COMBINED SUMMARY ==="
			
 
				+node "$HERE/offload-eval-summarize.mjs" "$OUT/judged-all.jsonl"
			
 
				+echo "###### FRONTLOAD MATRIX DONE"
			
--- a/scripts/agent-eval/offload-eval-frontload.sh
+++ b/scripts/agent-eval/offload-eval-frontload.sh
@@ -0,0 +1,47 @@
 
				+#!/usr/bin/env bash
			
 
				+# FRONTLOAD arm (approach 1): codegraph attached (offload-disabled) + the front-load
			
 
				+# UserPromptSubmit hook (offload-eval-hook.mjs), n reps, appended to $RESULTS. Compare against
			
 
				+# the matrix's raw/nocg baselines. Usage: offload-eval-frontload.sh <indexed-repo> <tier> <reps> "<Q>"
			
 
				+# Env: MODEL=sonnet EFFORT=high  RESULTS=<file>  AGENT_EVAL_OUT=<scratch dir>
			
 
				+set -uo pipefail
			
 
				+HERE="$(cd "$(dirname "$0")" && pwd)"
			
 
				+ENGINE="$(cd "$HERE/../.." && pwd)"
			
 
				+BIN="$ENGINE/dist/bin/codegraph.js"
			
 
				+OUT="${AGENT_EVAL_OUT:-/tmp/cg-offload-eval}"
			
 
				+TARGET="${1:?repo}"; TIER="${2:?tier}"; REPS="${3:?reps}"; Q="${4:?question}"
			
 
				+RUNS="$OUT/runs"
			
 
				+EXTRACT="$HERE/offload-eval-metrics.mjs"
			
 
				+RESULTS="${RESULTS:-$OUT/results-fl.jsonl}"
			
 
				+REPO=$(basename "$TARGET")
			
 
				+mkdir -p "$RUNS"
			
 
				+[ -d "$TARGET/.codegraph" ] || { echo "not indexed: $TARGET"; exit 1; }
			
 
				+TARGET=$(cd "$TARGET" && pwd -P)
			
 
				+
			
 
				+CFG="$RUNS/mcp-fl-$REPO.json"
			
 
				+printf '{"mcpServers":{"codegraph":{"command":"env","args":["CODEGRAPH_WASM_RELAUNCHED=1","CODEGRAPH_OFFLOAD_DISABLE=1","node","%s","serve","--mcp","--path","%s"]}}}' "$BIN" "$TARGET" > "$CFG"
			
 
				+# Generate the hook settings pointing at the persisted hook; enable its debug log so we can
			
 
				+# count injections (claude passes this env down to the spawned hook process).
			
 
				+HOOKCFG="$RUNS/frontload-settings.json"
			
 
				+printf '{"hooks":{"UserPromptSubmit":[{"hooks":[{"type":"command","command":"node %s/offload-eval-hook.mjs"}]}]}}' "$HERE" > "$HOOKCFG"
			
 
				+export CG_FRONTLOAD_DEBUG="$RUNS/hook-debug.log"
			
 
				+
			
 
				#######################################
				# Restart the codegraph MCP daemon (offload disabled) for one repo and wait
				# for its socket.
				# Globals:   BIN (read) - path to the codegraph CLI entry point
				# Arguments: $1 - repo path; the caller passes the physical path so the
				#                 pkill pattern matches the daemon's command line
				# Outputs:   "  daemon warm" or "  WARN no daemon" on stdout
				# Returns:   0 once the socket file exists, 1 if it never appeared
				#######################################
				prewarm() {
				  local repo_path="$1"
				  local sock="$repo_path/.codegraph/daemon.sock"

				  # Hard-kill any daemon still serving this path and drop its socket file, so
				  # the poll below can only ever observe the socket of the fresh daemon.
				  pkill -9 -f "serve --mcp --path $repo_path" 2>/dev/null
				  rm -f "$sock" 2>/dev/null
				  sleep 0.6

				  env CODEGRAPH_OFFLOAD_DISABLE=1 CODEGRAPH_DAEMON_IDLE_TIMEOUT_MS=1800000 \
				    node "$BIN" serve --mcp --path "$repo_path" </dev/null >/dev/null 2>&1 &

				  # Poll for the socket every 100 ms; the poller exits 1 after roughly 15 s.
				  if node -e 'const fs=require("fs");let n=0;const t=setInterval(()=>{if(fs.existsSync(process.argv[1]+"/.codegraph/daemon.sock")){clearInterval(t);process.exit(0)}if(n++>150){clearInterval(t);process.exit(1)}},100)' "$repo_path"; then
				    echo "  daemon warm"
				  else
				    echo "  WARN no daemon"
				    return 1
				  fi
				}
			
 
				+
			
 
				+echo "###### FRONTLOAD repo=$REPO tier=$TIER reps=$REPS"
			
 
				+prewarm "$TARGET"
			
 
				+for r in $(seq 1 "$REPS"); do
			
 
				+  tag="$REPO-frontload-$r"
			
 
				+  ( cd "$TARGET" && claude -p "$Q" --output-format stream-json --verbose --permission-mode bypassPermissions \
			
 
				+      --model "${MODEL:-sonnet}" --effort "${EFFORT:-high}" --max-budget-usd 4 \
			
 
				+      --strict-mcp-config --mcp-config "$CFG" --settings "$HOOKCFG" \
			
 
				+      </dev/null > "$RUNS/$tag.jsonl" 2>"$RUNS/$tag.err" )
			
 
				+  node "$EXTRACT" --run "$RUNS/$tag.jsonl" --usage "-" --arm frontload --rep "$r" --repo "$REPO" --tier "$TIER" --q "$Q" >> "$RESULTS"
			
 
				+  node -e 'const o=JSON.parse(require("fs").readFileSync(process.argv[1],"utf8").trim().split("\n").pop());console.log(`  [frontload #${o.rep}] ${o.durationSec}s | main $${o.costUsdMain} ${o.tokBillable}tok | read=${o.read} grep=${o.grep} agentExplore=${o.explore} | ok=${o.ok}`)' "$RESULTS"
			
 
				+done
			
 
				+pkill -9 -f "serve --mcp --path $TARGET" 2>/dev/null; rm -f "$TARGET/.codegraph/daemon.sock" 2>/dev/null
			
 
				+echo "###### FRONTLOAD DONE $REPO (cumulative hook injections: $(grep -c INJECTED "$CG_FRONTLOAD_DEBUG" 2>/dev/null))"
			
--- a/scripts/agent-eval/offload-eval-ground-truth.json
+++ b/scripts/agent-eval/offload-eval-ground-truth.json
--- a/scripts/agent-eval/offload-eval-hook.mjs
+++ b/scripts/agent-eval/offload-eval-hook.mjs
@@ -0,0 +1,84 @@
 
				+#!/usr/bin/env node
			
 
				+// UserPromptSubmit hook — APPROACH 1: additive context-injection.
			
 
				+// Front-loads codegraph's structural answer for flow/impact/"how/where" prompts so the
			
 
				+// agent's reflex grep/read has nothing left to find. Strictly additive (never blocks),
			
 
				+// gated to structural prompts (no cost otherwise), and uses RAW explore (offload disabled)
			
 
				+// so the injected context is accurate — never the (currently low-fidelity) synthesis.
			
 
				+//
			
 
				+// Reads {prompt, cwd} as JSON on stdin; prints the explore result to stdout (which Claude
			
 
				+// Code injects into the agent's context). Any failure -> silent exit 0 (degradable).
			
 
				+import { pathToFileURL, fileURLToPath } from 'node:url';
			
 
				+import { resolve, join, dirname } from 'node:path';
			
 
				+import { existsSync, readFileSync, appendFileSync } from 'node:fs';
			
 
				+
			
 
				+// Resolve the engine repo from this script's own location (scripts/agent-eval/ -> ../..),
			
 
				+// overridable with CG_ENGINE. The hook ships inside the repo, so it finds its own dist.
			
 
				+const HERE = dirname(fileURLToPath(import.meta.url));
			
 
				+const ENGINE = process.env.CG_ENGINE || resolve(HERE, '..', '..');
			
 
				+const BUDGET = Number(process.env.CG_FRONTLOAD_BUDGET || 16000);
			
 
				+
			
 
				+// Debug log only when CG_FRONTLOAD_DEBUG is set to a file path (the harness points it at a
			
 
				+// log to count injections); off by default so the shipped hook writes nothing extra.
			
 
				+const DBG = process.env.CG_FRONTLOAD_DEBUG;
			
 
				+const dbg = (m) => { if (!DBG) return; try { appendFileSync(DBG, `[${new Date().toISOString()}] ${m}\n`); } catch { /* ignore */ } };
			
 
				+
			
 
				+let input = {};
			
 
				+try { input = JSON.parse(readFileSync(0, 'utf8')); } catch (e) { dbg('stdin parse fail: ' + e.message); }
			
 
				+const prompt = String(input.prompt || '');
			
 
				+const cwd = String(input.cwd || process.cwd());
			
 
				+dbg(`invoked: promptLen=${prompt.length} cwd=${cwd}`);
			
 
				+
			
 
				+// Gate: only structural / flow / impact / where-how questions. Cheap regex; silent no-op
			
 
				+// otherwise so non-structural prompts ("fix this typo") cost nothing.
			
 
				+const STRUCTURAL = /\b(how|where|trace|flow|path|reach(es|ed)?|call(s|ed|er|ers|ee)?|depend|impact|affect|wire[ds]?|connect|implement|architect|structure|breaks?|what calls|why does)\b/i;
			
 
				+if (!prompt || !STRUCTURAL.test(prompt)) { dbg('gate: non-structural, no-op'); process.exit(0); }
			
 
				+dbg('gate: structural PASS');
			
 
				+
			
 
				+// Find the index: cwd, then walk up a few levels.
			
 
				+let root = cwd, found = null;
			
 
				+for (let i = 0; i < 6 && root; i++) {
			
 
				+  if (existsSync(join(root, '.codegraph'))) { found = root; break; }
			
 
				+  const parent = resolve(root, '..'); if (parent === root) break; root = parent;
			
 
				+}
			
 
				+if (!found) { dbg(`no .codegraph found from cwd=${cwd}`); process.exit(0); }
			
 
				+dbg(`found index at ${found}`);
			
 
				+
			
 
				+try {
			
 
				+  process.env.CODEGRAPH_OFFLOAD_DISABLE = '1'; // raw, accurate — never the unfixed offload
			
 
				+  process.env.CODEGRAPH_TELEMETRY = '0'; process.env.DO_NOT_TRACK = '1';
			
 
				+  const load = async (rel) => import(pathToFileURL(resolve(ENGINE, rel)).href);
			
 
				+  const idx = await load('dist/index.js');
			
 
				+  const tools = await load('dist/mcp/tools.js');
			
 
				+  const CodeGraph = idx.default?.default ?? idx.default ?? idx.CodeGraph;
			
 
				+  const ToolHandler = tools.ToolHandler ?? tools.default?.ToolHandler;
			
 
				+  if (typeof CodeGraph?.openSync !== 'function' || typeof ToolHandler !== 'function') process.exit(0);
			
 
				+
			
 
				+  // Retry once on a transient busy/locked index (the hook's openSync can race a
			
 
				+  // freshly-warming daemon on the first prompt of a session).
			
 
				+  let text = '';
			
 
				+  for (let attempt = 1; attempt <= 2; attempt++) {
			
 
				+    try {
			
 
				+      const cg = CodeGraph.openSync(found);
			
 
				+      const h = new ToolHandler(cg);
			
 
				+      const res = await h.execute('codegraph_explore', { query: prompt });
			
 
				+      text = res?.content?.[0]?.text ?? '';
			
 
				+      try { cg.close?.(); } catch { /* ignore */ }
			
 
				+      dbg(`explore attempt ${attempt} returned ${text.length} chars`);
			
 
				+      break;
			
 
				+    } catch (e) {
			
 
				+      dbg(`explore attempt ${attempt} failed: ${e?.message || e}`);
			
 
				+      if (attempt === 2) throw e;
			
 
				+      await new Promise((r) => setTimeout(r, 800));
			
 
				+    }
			
 
				+  }
			
 
				+  if (!text.trim()) { dbg('empty explore result, no-op'); process.exit(0); }
			
 
				+  if (text.length > BUDGET) text = text.slice(0, BUDGET) + '\n…[front-load truncated to budget]';
			
 
				+
			
 
				+  process.stdout.write(
			
 
				+    `## CodeGraph structural context (auto-retrieved for this question)\n` +
			
 
				+    `The code graph was queried for your question; the relevant symbols, source, and call flow are below. ` +
			
 
				+    `Treat the quoted source as already read. If you need more, call codegraph_explore with specific symbol names rather than grepping or reading files.\n\n` +
			
 
				+    text + '\n'
			
 
				+  );
			
 
				+  dbg(`INJECTED ${text.length} chars`);
			
 
				+} catch (e) { dbg('ERROR: ' + (e?.stack || e?.message || e)); process.exit(0); } // degradable
			
--- a/scripts/agent-eval/offload-eval-judge.mjs
+++ b/scripts/agent-eval/offload-eval-judge.mjs
@@ -0,0 +1,103 @@
 
				+#!/usr/bin/env node
			
 
				+// Accuracy judge. For each run in results.jsonl:
			
 
				+//   - end-to-end: agent finalAnswer vs verified ground truth (all arms)
			
 
				+//   - fidelity:   offload synthesized answer vs ground truth (offload arm only)
			
 
				+// Judge = claude -p sonnet --effort high, no tools, run from a neutral cwd,
			
 
				+// JSON-only verdicts. Writes judged.jsonl (one line per run, verdicts merged).
			
 
				+//
			
 
				+// Usage: offload-eval-judge.mjs --results <f> --truth <f> --out <f> [--concurrency 4]
			
 
				+import { readFileSync, writeFileSync, existsSync } from 'fs';
			
 
				+import { execFile } from 'child_process';
			
 
				+
			
 
				+const A = {};
			
 
				+for (let i = 2; i < process.argv.length; i += 2) A[process.argv[i].replace(/^--/, '')] = process.argv[i + 1];
			
 
				+const results = readFileSync(A.results, 'utf8').split('\n').filter(Boolean).map(l => JSON.parse(l));
			
 
				+const truth = JSON.parse(readFileSync(A.truth, 'utf8'));
			
 
				+const OUT = A.out || '/tmp/cg-offload-eval/judged.jsonl';
			
 
				+const CONC = Number(A.concurrency || 4);
			
 
				+
			
 
				// Promise for one empty scratch directory shared by every judge call; created on
				// first use and removed again when the process exits.
				let judgeCwd;
				function neutralJudgeCwd() {
				  judgeCwd ??= (async () => {
				    const [fs, os, path] = await Promise.all([import('node:fs'), import('node:os'), import('node:path')]);
				    const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'cg-judge-'));
				    process.once('exit', () => { try { fs.rmSync(dir, { recursive: true, force: true }); } catch { /* best effort */ } });
				    return dir;
				  })();
				  return judgeCwd;
				}

				/**
				 * Ask the judge model for one verdict.
				 *
				 * Runs `claude -p` with an empty MCP config and extracts the first `{...}` span
				 * from stdout. Never rejects: every failure is reported as an
				 * `{ verdict: 'error', score: null, note }` object.
				 *
				 * @param {string} prompt full judge prompt
				 * @returns {Promise<object>} the parsed verdict JSON, or an error verdict
				 */
				async function askJudge(prompt) {
				  let cwd;
				  try { cwd = await neutralJudgeCwd(); }
				  catch (e) { return { verdict: 'error', score: null, note: ('no scratch dir: ' + (e?.message || e)).slice(0, 80) }; }
				  return new Promise((resolve) => {
				    execFile('claude', ['-p', prompt, '--model', 'sonnet', '--effort', 'high',
				      '--max-budget-usd', '0.5', '--strict-mcp-config', '--mcp-config', '{"mcpServers":{}}'],
				      // Run from a freshly created EMPTY dir so the judge can't "cheat" by reading
				      // source. $AGENT_EVAL_OUT is not neutral (the eval clones live in its repos/
				      // subdir), and neither is /tmp, which is the parent of the default scratch dir.
				      { cwd, maxBuffer: 1 << 24, timeout: 120000 },
				      (err, stdout) => {
				        const raw = (stdout || '').trim();
				        const m = raw.match(/\{[\s\S]*\}/);
				        if (!m) return resolve({ verdict: 'error', score: null, note: (err ? 'exec ' + err.message : 'no json').slice(0, 80) });
				        try { resolve(JSON.parse(m[0])); } catch { resolve({ verdict: 'error', score: null, note: 'parse fail' }); }
				      });
				  });
				}
			
 
				+
			
 
				+const e2ePrompt = (gt, ans) => `You are scoring whether an AI coding agent correctly answered a code-flow question about a repository. Judge ONLY against the verified ground truth. Do NOT use any tools.
			
 
				+
			
 
				+QUESTION: ${gt.question}
			
 
				+
			
 
				+VERIFIED GROUND TRUTH (the actual call path + files):
			
 
				+${gt.truth}
			
 
				+
			
 
				+AGENT'S ANSWER:
			
 
				+${ans || '(empty)'}
			
 
				+
			
 
				+Score how correct the agent's answer is vs the ground truth. A "pass" means it identifies the core mechanism and the major hops with the right files/symbols and makes no materially wrong claim. "partial" = right area but misses major hops or has notable errors. "fail" = wrong layer, fabricated, or misses the mechanism.
			
 
				+Output ONLY minified JSON, no prose, no code fences:
			
 
				+{"verdict":"pass|partial|fail","score":<0-100>,"missedHops":["..."],"wrongClaims":["..."],"note":"<=20 words"}`;
			
 
				+
			
 
				+const fidPrompt = (gt, ans) => `You are scoring the FIDELITY of a machine-synthesized code-exploration answer against verified ground truth. The synthesized answer claims to trace a flow and cite file:line locations. Do NOT use any tools.
			
 
				+
			
 
				+QUESTION: ${gt.question}
			
 
				+
			
 
				+VERIFIED GROUND TRUTH (the actual call path + files):
			
 
				+${gt.truth}
			
 
				+
			
 
				+SYNTHESIZED ANSWER (to score):
			
 
				+${ans || '(empty)'}
			
 
				+
			
 
				+Judge: (1) is the traced call path correct vs ground truth? (2) are the cited files/symbols correct (not fabricated)? (3) if it gave a "Coverage:" verdict, was that verdict honest about what it actually covered? A confident WRONG trace is the worst outcome — penalize it harder than an honest "partial/not found".
			
 
				+Output ONLY minified JSON, no prose, no code fences:
			
 
				+{"verdict":"pass|partial|fail","score":<0-100>,"fabrication":<true|false>,"coverageHonest":<true|false>,"missedHops":["..."],"note":"<=20 words"}`;
			
 
				+
			
 
				+// Build the job list
			
 
				+const jobs = [];
			
 
				+for (const r of results) {
			
 
				+  const gt = truth[r.repo];
			
 
				+  if (!gt) { r._nojudge = true; continue; }
			
 
				+  jobs.push({ r, kind: 'e2e', prompt: e2ePrompt(gt, r.finalAnswer) });
			
 
				+  if (r.arm === 'offload' && Array.isArray(r.offloadAnswers))
			
 
				+    r.offloadAnswers.forEach((ans, i) => { if (ans && ans.trim()) jobs.push({ r, kind: 'fid', idx: i, prompt: fidPrompt(gt, ans) }); });
			
 
				+}
			
 
				+console.error(`judging ${jobs.length} verdicts across ${results.length} runs (concurrency ${CONC})...`);
			
 
				+
			
 
				// Count of finished judge calls across all workers (progress log only).
				let done = 0;

				/**
				 * Drain the shared job queue: take the next job, await its verdict, attach it
				 * to the job's result row, and log progress to stderr.
				 *
				 * @param {Array<object>} queue shared, mutable list of pending jobs
				 */
				async function worker(queue) {
				  while (queue.length > 0) {
				    const job = queue.shift();
				    const { r: row, kind } = job;
				    const verdict = await askJudge(job.prompt);

				    if (kind === 'e2e') {
				      row.e2e = verdict;
				    } else {
				      // fidelity verdicts are collected per run and aggregated later
				      row._fid ??= [];
				      row._fid.push(verdict);
				    }

				    done += 1;
				    const scoreSuffix = verdict.score != null ? ' ' + verdict.score : '';
				    console.error(`  [${done}/${jobs.length}] ${row.repo}/${row.arm}#${row.rep} ${kind}: ${verdict.verdict}${scoreSuffix}`);
				  }
				}
			
 
				+const q = [...jobs];
			
 
				+await Promise.all(Array.from({ length: CONC }, () => worker(q)));
			
 
				+
			
 
				+// Aggregate per-answer fidelity verdicts into one fidelity object per offload run.
			
 
				+const medOf = (a) => { a = [...a].sort((x, y) => x - y); return a.length ? (a.length % 2 ? a[(a.length - 1) / 2] : (a[a.length / 2 - 1] + a[a.length / 2]) / 2) : null; };
			
 
				+for (const r of results) {
			
 
				+  if (r._fid?.length) {
			
 
				+    const scores = r._fid.map(v => v.score).filter(x => x != null);
			
 
				+    r.fidelity = {
			
 
				+      n: r._fid.length, scores,
			
 
				+      max: scores.length ? Math.max(...scores) : null,
			
 
				+      min: scores.length ? Math.min(...scores) : null,
			
 
				+      median: medOf(scores),
			
 
				+      anyFabrication: r._fid.some(v => v.fabrication === true),
			
 
				+      allCoverageHonest: r._fid.every(v => v.coverageHonest !== false),
			
 
				+      verdicts: r._fid.map(v => v.verdict),
			
 
				+    };
			
 
				+  }
			
 
				+  delete r._fid;
			
 
				+}
			
 
				+writeFileSync(OUT, results.map(r => JSON.stringify(r)).join('\n') + '\n');
			
 
				+console.error(`wrote ${OUT}`);
			
--- a/scripts/agent-eval/offload-eval-matrix.sh
+++ b/scripts/agent-eval/offload-eval-matrix.sh
@@ -0,0 +1,20 @@
 
				+#!/usr/bin/env bash
			
 
				+# Drive the 3-arm campaign (offload/raw/nocg) across all 4 tiers, n reps each, into one
			
 
				+# results.jsonl. Reads the canonical question per repo from offload-eval-ground-truth.json.
			
 
				+# Env: REPS (default 3)  AGENT_EVAL_OUT=<scratch dir>
			
 
				+set -uo pipefail
			
 
				+HERE="$(cd "$(dirname "$0")" && pwd)"
			
 
				+OUT="${AGENT_EVAL_OUT:-/tmp/cg-offload-eval}"
			
 
				+GT="$HERE/offload-eval-ground-truth.json"
			
 
				+REPS="${REPS:-3}"
			
 
				+export RESULTS="$OUT/results.jsonl"
			
 
				+: > "$RESULTS"
			
 
				+for repo in mtkruto postybirb shapeshift trezor; do
			
 
				+  case "$repo" in mtkruto) tier=small;; postybirb) tier=medium;; shapeshift) tier=complex;; trezor) tier=large;; esac
			
 
				+  Q=$(node -e "console.log(JSON.parse(require('fs').readFileSync(process.argv[1],'utf8'))[process.argv[2]].question)" "$GT" "$repo")
			
 
				+  echo ""; echo "### $repo ($tier)  $(date +%H:%M:%S)"
			
 
				+  bash "$HERE/offload-eval-3arm.sh" "$OUT/repos/$repo" "$tier" "$REPS" "$Q"
			
 
				+done
			
 
				+echo ""; echo "###### MATRIX DONE -> $RESULTS ($(wc -l < "$RESULTS") runs).  Judge + summarize with:"
			
 
				+echo "  node $HERE/offload-eval-judge.mjs --results $RESULTS --truth $GT --out $OUT/judged.jsonl"
			
 
				+echo "  node $HERE/offload-eval-summarize.mjs $OUT/judged.jsonl"
			
--- a/scripts/agent-eval/offload-eval-metrics.mjs
+++ b/scripts/agent-eval/offload-eval-metrics.mjs
@@ -0,0 +1,90 @@
 
				+#!/usr/bin/env node
			
 
				+// Extract one eval run's metrics from its Claude stream-json transcript + the
			
 
				+// offload usage sidecar log, emit ONE merged JSON line.
			
 
				+//
			
 
				+// Usage: offload-eval-metrics.mjs --run <run.jsonl> --usage <usage.jsonl|-> \
			
 
				+//          --arm <a> --rep <n> --repo <r> --tier <t> --q <question>
			
 
				+import { readFileSync, existsSync } from 'fs';
			
 
				+
			
 
				+const args = {};
			
 
				+for (let i = 2; i < process.argv.length; i += 2) args[process.argv[i].replace(/^--/, '')] = process.argv[i + 1];
			
 
				+
			
 
				+const runFile = args.run;
			
 
				+const lines = existsSync(runFile) ? readFileSync(runFile, 'utf8').split('\n').filter(Boolean) : [];
			
 
				+
			
 
				+const toolCounts = {};
			
 
				+let result = null;
			
 
				+const tok = { gen: 0, fresh: 0, cached: 0 };
			
 
				+const offloadAnswers = [];
			
 
				+let exploreResults = 0; // tool_results from explore (offload or raw)
			
 
				+let lastAssistantText = '';
			
 
				+
			
 
				+for (const line of lines) {
			
 
				+  let ev; try { ev = JSON.parse(line); } catch { continue; }
			
 
				+
			
 
				+  // per-turn token usage (authoritative token measure; result.usage is last-turn only)
			
 
				+  const u = ev.message?.usage;
			
 
				+  if (u) {
			
 
				+    tok.gen += u.output_tokens || 0;
			
 
				+    tok.fresh += (u.input_tokens || 0) + (u.cache_creation_input_tokens || 0);
			
 
				+    tok.cached += u.cache_read_input_tokens || 0;
			
 
				+  }
			
 
				+
			
 
				+  if (ev.type === 'assistant' && Array.isArray(ev.message?.content)) {
			
 
				+    for (const b of ev.message.content) {
			
 
				+      if (b.type === 'tool_use') toolCounts[b.name] = (toolCounts[b.name] || 0) + 1;
			
 
				+      if (b.type === 'text' && b.text?.trim()) lastAssistantText = b.text.trim();
			
 
				+    }
			
 
				+  }
			
 
				+  // tool_results arrive in user messages
			
 
				+  if (ev.type === 'user' && Array.isArray(ev.message?.content)) {
			
 
				+    for (const b of ev.message.content) {
			
 
				+      if (b.type !== 'tool_result') continue;
			
 
				+      const text = Array.isArray(b.content)
			
 
				+        ? b.content.map(c => (typeof c === 'string' ? c : c.text || '')).join('')
			
 
				+        : (typeof b.content === 'string' ? b.content : '');
			
 
				+      if (/Synthesized by CodeGraph/.test(text)) { offloadAnswers.push(text); exploreResults++; }
			
 
				+      else if (/Found \d+ symbols? across|## Exploration:/.test(text)) exploreResults++;
			
 
				+    }
			
 
				+  }
			
 
				+  if (ev.type === 'result') result = ev;
			
 
				+}
			
 
				+
			
 
				+// offload usage sidecar (CodeGraph AI tokens + cost) — one JSON line per offload call
			
 
				+const ai = { calls: 0, promptTokens: 0, completionTokens: 0, totalTokens: 0, credits: 0, costUsd: 0, ms: 0 };
			
 
				+if (args.usage && args.usage !== '-' && existsSync(args.usage)) {
			
 
				+  for (const line of readFileSync(args.usage, 'utf8').split('\n').filter(Boolean)) {
			
 
				+    let e; try { e = JSON.parse(line); } catch { continue; }
			
 
				+    ai.calls++;
			
 
				+    ai.promptTokens += e.promptTokens || 0;
			
 
				+    ai.completionTokens += e.completionTokens || 0;
			
 
				+    ai.totalTokens += e.totalTokens || 0;
			
 
				+    ai.credits += e.creditsCharged || 0;
			
 
				+    ai.costUsd += e.costUsd || 0;
			
 
				+    ai.ms += e.ms || 0;
			
 
				+  }
			
 
				+}
			
 
				+
			
 
				+// front-load hook fired iff its injected header appears in the transcript
			
 
				+const frontload = lines.some(l => l.includes('auto-retrieved for this question'));
			
 
				+const get = (n) => toolCounts[n] || 0;
			
 
				+const read = get('Read');
			
 
				+const grep = get('Grep') + get('Bash') + get('Glob');
			
 
				+const explore = get('mcp__codegraph__codegraph_explore');
			
 
				+const cgAny = Object.keys(toolCounts).filter(k => /mcp__codegraph__/.test(k)).reduce((s, k) => s + toolCounts[k], 0);
			
 
				+
			
 
				+const out = {
			
 
				+  repo: args.repo, tier: args.tier, arm: args.arm, rep: Number(args.rep), question: args.q,
			
 
				+  ok: result?.subtype === 'success',
			
 
				+  durationSec: result ? +(result.duration_ms / 1000).toFixed(1) : null,
			
 
				+  numTurns: result?.num_turns ?? null,
			
 
				+  costUsdMain: result ? +(result.total_cost_usd || 0).toFixed(4) : null,
			
 
				+  tokGen: tok.gen, tokFresh: tok.fresh, tokCached: tok.cached, tokBillable: tok.gen + tok.fresh,
			
 
				+  read, grep, explore, cgAny, frontload,
			
 
				+  offloadFired: offloadAnswers.length,
			
 
				+  ai,
			
 
				+  // text payloads for the accuracy judge (kept separate; large)
			
 
				+  finalAnswer: (result?.result || lastAssistantText || '').slice(0, 8000),
			
 
				+  offloadAnswers: offloadAnswers.map(a => a.slice(0, 6000)),
			
 
				+};
			
 
				+process.stdout.write(JSON.stringify(out) + '\n');
			
--- a/scripts/agent-eval/offload-eval-setup.sh
+++ b/scripts/agent-eval/offload-eval-setup.sh
@@ -0,0 +1,24 @@
 
				+#!/usr/bin/env bash
			
 
				+# Clone + index the 4 "not-trained-on" eval repos into $AGENT_EVAL_OUT/repos. These were
			
 
				+# selected via a no-tools memory-probe gate (Sonnet cannot answer their flow questions from
			
 
				+# memory — so the no-codegraph baseline is honest). Env: AGENT_EVAL_OUT=<scratch dir>
			
 
				+set -uo pipefail
			
 
				+HERE="$(cd "$(dirname "$0")" && pwd)"
			
 
				+ENGINE="$(cd "$HERE/../.." && pwd)"
			
 
				+BIN="$ENGINE/dist/bin/codegraph.js"
			
 
				+OUT="${AGENT_EVAL_OUT:-/tmp/cg-offload-eval}"
			
 
				+ROOT="$OUT/repos"; mkdir -p "$ROOT"
			
 
				+export CODEGRAPH_TELEMETRY=0 DO_NOT_TRACK=1
			
 
				+[ -f "$BIN" ] || { echo "engine not built: run 'npm run build' in $ENGINE first"; exit 1; }
			
 
				+
			
 
				#######################################
				# Shallow-clone one eval repo into $ROOT/<name> and build its codegraph index.
				# Globals:   ROOT (read) - directory holding the clones
				#            BIN  (read) - path to the codegraph CLI entry point
				# Arguments: $1 - git URL
				#            $2 - directory name under $ROOT
				# Outputs:   progress lines plus the last two indexer summary lines on stdout
				# Returns:   0 on success, 1 if the clone or the indexer failed,
				#            2 on missing arguments
				#######################################
				clone_index() {
				  # Refuse empty arguments: with an empty name the cleanup below would be
				  # `rm -rf "$ROOT/"` and wipe every previously cloned repo.
				  if [[ -z "${1:-}" || -z "${2:-}" ]]; then
				    echo "usage: clone_index <url> <name>" >&2
				    return 2
				  fi
				  local url="$1" name="$2"
				  local dest="${ROOT:?}/$name"
				  local -a stage_status

				  echo "=== $name: clone ==="
				  rm -rf -- "$dest"
				  git clone --quiet --depth 1 -- "$url" "$dest" || { echo "  clone FAILED"; return 1; }

				  echo "=== $name: index ==="
				  node "$BIN" init "$dest" 2>&1 | grep -iE 'indexed|nodes|edges|error' | tail -2
				  # Judge success by the indexer's own exit code: the pipeline status also
				  # reflects grep (no matching line) and would hide or mis-report a failure.
				  stage_status=("${PIPESTATUS[@]}")
				  if (( stage_status[0] != 0 )); then
				    echo "  index FAILED (exit ${stage_status[0]})"
				    return 1
				  fi
				}
			
 
				+clone_index https://github.com/MTKruto/MTKruto.git mtkruto          # small  (~322 TS)
			
 
				+clone_index https://github.com/mvdicarlo/postybirb-plus.git postybirb  # medium (~608 TS)
			
 
				+clone_index https://github.com/shapeshift/web.git shapeshift        # complex (~3.2k TS, 35-pkg monorepo)
			
 
				+clone_index https://github.com/trezor/trezor-suite.git trezor       # large  (~8k TS monorepo)
			
 
				+echo "###### SETUP DONE -> $ROOT"
			
--- a/scripts/agent-eval/offload-eval-summarize.mjs
+++ b/scripts/agent-eval/offload-eval-summarize.mjs
@@ -0,0 +1,68 @@
 
				+#!/usr/bin/env node
			
 
				+// Aggregate judged.jsonl (or results.jsonl) into a per-repo, per-arm report:
			
 
				+// time, main tokens/cost, AI tokens/cost, total cost, tool mix, accuracy.
			
 
				+// Usage: offload-eval-summarize.mjs <judged-or-results.jsonl>
			
 
				+import { readFileSync } from 'fs';
			
 
				// Load the JSONL file named on the command line: one JSON object per non-blank line.
				// Exits with status 2 and a usage message when no path is given. A malformed line is
				// re-thrown with the file name and 1-based line number so it can be found quickly.
				const inputPath = process.argv[2];
				if (!inputPath) {
				  console.error('Usage: offload-eval-summarize.mjs <judged-or-results.jsonl>');
				  process.exit(2);
				}
				const rows = readFileSync(inputPath, 'utf8')
				  .split('\n')
				  // Trim so CRLF endings and whitespace-only lines do not reach JSON.parse.
				  .map((text, i) => ({ text: text.trim(), lineNo: i + 1 }))
				  .filter(({ text }) => text)
				  .map(({ text, lineNo }) => {
				    try {
				      return JSON.parse(text);
				    } catch (err) {
				      throw new Error(`${inputPath}:${lineNo}: invalid JSON (${err.message})`);
				    }
				  });
			
 
				+
			
 
				// Median of the non-null entries of xs (mean of the two middle values when the count
				// is even); null when nothing remains after dropping null/undefined.
				const med = (xs) => {
				  const sorted = xs.filter((v) => v != null).sort((lo, hi) => lo - hi);
				  const count = sorted.length;
				  if (count === 0) return null;
				  const mid = Math.floor(count / 2);
				  if (count % 2 === 1) return sorted[mid];
				  return (sorted[mid - 1] + sorted[mid]) / 2;
				};
			
 
				// "min–max" over the non-null entries of xs, or an em dash when there are none.
				const rng = (xs) => {
				  const present = xs.filter((v) => v != null);
				  if (present.length === 0) return '—';
				  const lo = Math.min(...present);
				  const hi = Math.max(...present);
				  return `${lo}–${hi}`;
				};
			
 
				// Fixed-decimal formatters with 2, 3 and 4 places; null/undefined renders as an em dash.
				const fixedOrDash = (places) => (x) => {
				  if (x == null) return '—';
				  return (+x).toFixed(places);
				};
				const d2 = fixedOrDash(2);
				const d3 = fixedOrDash(3);
				const d4 = fixedOrDash(4);
			
 
				+
			
 
				// Display order of the arms; the report loops iterate this list, so an arm that is
				// not named here is never printed.
				const ARM_ORDER = ['frontload', 'offload', 'raw', 'nocg'];
				// byRepo[repo][arm] -> that repo/arm's rows, in input order.
				const byRepo = {};
				for (const row of rows) {
				  const arms = (byRepo[row.repo] ??= {});
				  const bucket = (arms[row.arm] ??= []);
				  bucket.push(row);
				}
			
 
				+
			
 
				// Count how many rows carry each known verdict under row[field].verdict.
				//   rs    - result rows
				//   field - name of the judged sub-object on each row (the report passes 'e2e')
				// Returns { pass, partial, fail, error }. Rows without a verdict, or with a verdict
				// outside those four, are not counted.
				const verdictTally = (rs, field) => {
				  const t = { pass: 0, partial: 0, fail: 0, error: 0 };
				  for (const r of rs) {
				    const v = r[field]?.verdict;
				    // Own-property check: a bare `in` also matches inherited keys such as "toString"
				    // or "constructor" and would then write NaN entries into the tally.
				    if (Object.prototype.hasOwnProperty.call(t, v)) t[v]++;
				  }
				  return t;
				};
			
 
				+
			
 
				// Per-repo report: a banner, one row of medians per arm (in ARM_ORDER), then one
				// min–max line per arm so run-to-run variance is visible next to the medians.
				for (const [repo, arms] of Object.entries(byRepo)) {
				  // The tier label comes from the first row of the first arm recorded for this repo.
				  const tier = Object.values(arms)[0][0].tier;
				  const rule = '='.repeat(78);
				  console.log(`\n${rule}\n${repo}  [${tier}]\n${rule}`);

				  const headCols = [
				    ['time(s)', 9], ['mainCost', 9], ['aiCost', 8], ['totCost', 8], ['mainTok', 8],
				    ['aiTok', 7], ['rd', 3], ['gr', 3], ['exp', 3], ['off', 3],
				  ].map(([label, width]) => label.padStart(width)).join(' ');
				  console.log(`${'arm'.padEnd(9)} n  ${headCols}  e2e(P/p/F)  fidScore`);

				  const shown = ARM_ORDER.filter((arm) => arms[arm]);

				  for (const arm of shown) {
				    const rs = arms[arm];
				    const medOf = (pick) => med(rs.map(pick));
				    const cell = (value, width) => String(value).padStart(width);
				    const usd = (value, width) => ('$' + d3(value)).padStart(width);
				    const kilo = (value, width) => (Math.round(value / 1000) + 'k').padStart(width);

				    const mainCost = medOf((r) => r.costUsdMain);
				    const aiCost = medOf((r) => r.ai?.costUsd ?? 0);
				    // Total is the sum of the two medians, not the median of per-run totals.
				    const totCost = (mainCost ?? 0) + (aiCost ?? 0);
				    const e2e = verdictTally(rs, 'e2e');
				    const e2eScore = medOf((r) => r.e2e?.score);

				    // Fidelity scores and the fabrication flag are only read for the offload arm.
				    const isOffload = arm === 'offload';
				    const fid = isOffload ? med(rs.flatMap((r) => r.fidelity?.scores ?? [])) : null;
				    const fab = isOffload && rs.some((r) => r.fidelity?.anyFabrication);

				    const columns = [
				      cell(medOf((r) => r.durationSec), 9),
				      usd(mainCost, 9),
				      usd(aiCost, 8),
				      usd(totCost, 8),
				      kilo(medOf((r) => r.tokBillable), 8),
				      kilo(medOf((r) => r.ai?.totalTokens ?? 0), 7),
				      cell(medOf((r) => r.read), 3),
				      cell(medOf((r) => r.grep), 3),
				      cell(medOf((r) => r.explore), 3),
				      cell(medOf((r) => r.offloadFired), 3),
				    ].join(' ');
				    const verdicts = [e2e.pass, e2e.partial, e2e.fail].join('/').padStart(9);
				    const e2eNote = e2eScore != null ? 'e2e=' + e2eScore : '';
				    let fidNote = '';
				    if (fid != null) fidNote = 'fid=' + fid + (fab ? ' FAB!' : '');

				    console.log(`${arm.padEnd(9)} ${rs.length}  ${columns}  ${verdicts}  ${e2eNote} ${fidNote}`);
				  }

				  // Min–max spread per arm for time, main cost and the tool-call counts.
				  for (const arm of shown) {
				    const rs = arms[arm];
				    const span = (pick) => rng(rs.map(pick));
				    const parts = [
				      `time ${span((r) => r.durationSec)}s`,
				      `mainCost $${span((r) => r.costUsdMain)}`,
				      `read ${span((r) => r.read)}`,
				      `explore ${span((r) => r.explore)}`,
				      `offloadFired ${span((r) => r.offloadFired)}`,
				    ];
				    console.log(`   ${arm} ranges: ${parts.join(' · ')}`);
				  }
				}
			
 
				+
			
 
				// Cross-repo table: one row per repo/arm with median time, summed median costs,
				// median reads, e2e pass rate and (offload arm only) median fidelity.
				// NOTE(review): the banner says "then averaged", but no cross-repo average is
				// computed here; confirm whether an average row is missing or the label is stale.
				{
				  const bar = '='.repeat(78);
				  console.log(`\n${bar}\nCROSS-REPO SUMMARY (medians per repo, then averaged)\n${bar}`);

				  const headings = [
				    'repo'.padEnd(12),
				    'arm'.padEnd(8),
				    'time'.padStart(7),
				    'totCost'.padStart(8),
				    'read'.padStart(5),
				    'e2e pass%'.padStart(9),
				    'fid'.padStart(5),
				  ];
				  console.log(headings.join(' '));

				  for (const [repo, arms] of Object.entries(byRepo)) {
				    for (const arm of ARM_ORDER) {
				      const rs = arms[arm];
				      if (!rs) continue;
				      const medOf = (pick) => med(rs.map(pick));

				      // Pass rate is over all runs of the arm, judged or not.
				      const passPct = Math.round((100 * verdictTally(rs, 'e2e').pass) / rs.length);
				      const totCost = (medOf((r) => r.costUsdMain) ?? 0) + (medOf((r) => r.ai?.costUsd ?? 0) ?? 0);
				      const fid = arm === 'offload' ? med(rs.flatMap((r) => r.fidelity?.scores ?? [])) : null;

				      const cells = [
				        repo.padEnd(12),
				        arm.padEnd(8),
				        (medOf((r) => r.durationSec) + 's').padStart(7),
				        ('$' + d3(totCost)).padStart(8),
				        String(medOf((r) => r.read)).padStart(5),
				        (passPct + '%').padStart(9),
				        String(fid ?? '—').padStart(5),
				      ];
				      console.log(cells.join(' '));
				    }
				  }
				  console.log('');
				}
			
--- a/scripts/agent-eval/offload-eval.md
+++ b/scripts/agent-eval/offload-eval.md
@@ -0,0 +1,76 @@
 
				+# CodeGraph AI offload — accuracy & adoption eval harness
			
 
				+
			
 
				+Measures the managed **offload** (`codegraph_explore` → reasoning model synthesis) and the
			
 
				+**front-load hook** (approach 1) against plain codegraph and no-codegraph, across repo sizes,
			
 
				+on **time · main-session tokens/cost · CodeGraph-AI tokens/cost · accuracy**.
			
 
				+
			
 
				+All agent arms run `claude -p --model sonnet --effort high` (the deliberate floor model — an
			
 
				+affordance that lands on Sonnet generalizes up). Everything writes to a scratch dir
			
 
				+(`AGENT_EVAL_OUT`, default `/tmp/cg-offload-eval`); nothing here is shipped to users.
			
 
				+
			
 
				+## Repos (selected via a memory-probe gate — NOT trained on)
			
 
				+
			
 
				+Famous repos (express, excalidraw, n8n, …) are useless for *accuracy* evals: Sonnet answers their
			
 
				+flow questions from memory, so the no-codegraph baseline is dishonest. These four passed a no-tools
			
 
				+probe (Sonnet could not name their real flow internals) and are cloned fresh by `offload-eval-setup.sh`:
			
 
				+
			
 
				+| tier | repo | ~src files | canonical flow |
			
 
				+|---|---|---|---|
			
 
				+| small | MTKruto/MTKruto | 322 TS | `sendMessage` → invoke → TL serialize → transport |
			
 
				+| medium | mvdicarlo/postybirb-plus | 608 TS | submission → queue → per-website `.post()` |
			
 
				+| complex | shapeshift/web | 3.2k TS (35-pkg monorepo) | swap → swapper registry → concrete swapper |
			
 
				+| large | trezor/trezor-suite | 8k TS monorepo | send-form → sign thunk → `@trezor/connect` |
			
 
				+
			
 
				+Verified ground-truth flows (the judge's reference) live in `offload-eval-ground-truth.json`.
			
 
				+
			
 
				+## Arms
			
 
				+
			
 
				+- **offload** — codegraph + managed offload ON (requires `codegraph login`); records AI tokens/credits via `CODEGRAPH_OFFLOAD_USAGE_LOG`.
			
 
				+- **raw** — codegraph, `CODEGRAPH_OFFLOAD_DISABLE=1` (returns raw source).
			
 
				+- **nocg** — empty MCP config; Read/Grep baseline.
			
 
				+- **frontload** — codegraph (offload-disabled) + a `UserPromptSubmit` hook (`offload-eval-hook.mjs`) that runs raw explore on the prompt and injects the result into context (approach 1).
			
 
				+
			
 
				+## Run it
			
 
				+
			
 
				+```bash
			
 
				+npm run build                       # the harness shells out to dist/
			
 
				+codegraph login                     # only needed for the offload arm
			
 
				+export AGENT_EVAL_OUT=/tmp/cg-offload-eval
			
 
				+
			
 
				+bash scripts/agent-eval/offload-eval-setup.sh            # clone + index the 4 repos
			
 
				+bash scripts/agent-eval/offload-eval-matrix.sh           # 3 arms × 4 tiers × REPS (default 3)
			
 
				+node scripts/agent-eval/offload-eval-judge.mjs \
			
 
				+     --results $AGENT_EVAL_OUT/results.jsonl \
			
 
				+     --truth  scripts/agent-eval/offload-eval-ground-truth.json \
			
 
				+     --out    $AGENT_EVAL_OUT/judged.jsonl
			
 
				+node scripts/agent-eval/offload-eval-summarize.mjs $AGENT_EVAL_OUT/judged.jsonl
			
 
				+
			
 
				+bash scripts/agent-eval/offload-eval-frontload-matrix.sh # frontload arm + judge + merged summary
			
 
				+```
			
 
				+
			
 
				+Single repo: `offload-eval-3arm.sh <indexed-repo> <tier> <reps> "<question>"` (or `offload-eval-frontload.sh` with the same arguments).
			
 
				+
			
 
				+## Files
			
 
				+
			
 
				+- `offload-eval-setup.sh` — clone + index the 4 repos.
			
 
				+- `offload-eval-3arm.sh` / `-frontload.sh` — one repo, the arms.
			
 
				+- `offload-eval-matrix.sh` / `-frontload-matrix.sh` — drive all 4 tiers.
			
 
				+- `offload-eval-hook.mjs` — the front-load `UserPromptSubmit` hook (resolves its own engine; `CG_FRONTLOAD_DEBUG=<path>` to log injections; `CG_FRONTLOAD_BUDGET` to cap injected chars).
			
 
				+- `offload-eval-metrics.mjs` — one run's stream-json + usage log → one JSON metrics line.
			
 
				+- `offload-eval-judge.mjs` — Sonnet judge: end-to-end (agent final vs ground truth) + per-answer offload fidelity.
			
 
				+- `offload-eval-summarize.mjs` — per-tier, per-arm table + cross-repo roll-up.
			
 
				+- `offload-eval-ground-truth.json` — source-verified canonical flows.
			
 
				+
			
 
				+## Findings (2026-06, n=3 — direction consistent, magnitudes noisy)
			
 
				+
			
 
				+- **Raw codegraph is the efficiency win** — accuracy on par with nocg, with fewer reads, faster runs, and no AI cost.
			
 
				+- **The offload is the least-accurate arm in all 4 tiers** — synthesized fidelity 12–27/100 with
			
 
				+  fabrication in 3 of the 4 tiers (e.g. invented website services; traced `ClientPlain`/`SessionPlain` instead of
			
 
				+  the real encrypted path). Its speed/cost win is narrow (medium-only) and inversely correlated with
			
 
				+  accuracy. **Use raw until offload fidelity is fixed.**
			
 
				+- **The front-load hook SOLVES adoption** — reads → 0–1 in every tier (incl. large, where the agent
			
 
				+  otherwise read 12–24 files); fired 12/12, 0 errors. Wins on medium/complex (100% pass). But it
			
 
				+  **regresses small/large to partial** — it suppresses the reads that compensate for explore's gaps at
			
 
				+  **dynamic boundaries** (async queues, redux thunks, facade/factory indirection).
			
 
				+- **Master lever for BOTH:** explore's dynamic-dispatch coverage. Fix it → front-load is complete
			
 
				+  everywhere and the offload has the full flow to synthesize.