docs: add evaluator CI failure scenario (#1826)

2026-05-13 16:13:03 +08:00 · 2026-05-12 17:44:00 -04:00
parent 863519eecf
commit cd90c84c32
8 changed files with 274 additions and 6 deletions
--- a/examples/evaluator-rag-prototype/ci-failure-diagnosis/candidate-playbook.md
+++ b/examples/evaluator-rag-prototype/ci-failure-diagnosis/candidate-playbook.md
@@ -0,0 +1,46 @@
+# CI Failure Diagnosis Playbook
+
+Candidate id: `log-backed-minimal-fix`
+
+Use this playbook when a PR, maintainer branch, or release-readiness branch has
+one or more red GitHub Actions checks.
+
+## Accepted Path
+
+1. Capture PR and branch context:
+   - `gh pr view <pr-number> --json files,statusCheckRollup,headRefName,baseRefName`
+   - `gh run view <run-id> --json jobs`
+2. Fetch the failed log evidence:
+   - `gh run view <run-id> --log-failed`
+3. Record the failing job, step, OS, Node/Python/Rust version, package manager,
+   and shortest useful error excerpt.
+4. Compare the failing step to the PR changed files.
+5. Search current docs, tests, and prior PRs for a known matching failure mode.
+6. Promote the smallest fix path only when it includes a local reproduction or
+   regression command.
+7. After a separate implementation branch exists, rerun the focused local gate,
+   then wait for the full GitHub Actions matrix before merge.
+
+## Rejected Path
+
+Do not rerun CI until a transient green result appears unless the original
+failure and the reason it is safe to ignore have been recorded first.
+
+Do not weaken tests, skip matrix legs, or broaden the patch to unrelated files
+just to make the check pass.
+
+Do not claim release readiness from a branch with required checks still red.
+
+## Minimum Validation
+
+- `gh run view <run-id> --log-failed`
+- Focused local command matching the failing surface, such as:
+  - `node tests/<matching-test>.js`
+  - `npm run harness:audit -- --format json`
+  - `npm run observability:ready`
+  - `cargo test`
+- `git diff --check`
+- Full required GitHub Actions matrix before merge
+
+Record the failed-log excerpt and the chosen regression command in the
+maintainer PR body or handoff before merging the fix.
--- a/examples/evaluator-rag-prototype/ci-failure-diagnosis/report.json
+++ b/examples/evaluator-rag-prototype/ci-failure-diagnosis/report.json
@@ -0,0 +1,35 @@
+{
+  "schema_version": "ecc.evaluator-rag.report.v1",
+  "scenario_id": "ci-failure-diagnosis",
+  "run_id": "2026-05-12-ci-failure-diagnosis-prototype",
+  "result": "prototype_passed",
+  "read_only": true,
+  "scores": {
+    "failure_evidence": 0.92,
+    "scope_control": 0.9,
+    "regression_specificity": 0.86,
+    "matrix_safety": 1,
+    "publication_safety": 1
+  },
+  "findings": [
+    {
+      "id": "log-first-required",
+      "severity": "warning",
+      "summary": "A CI fix candidate must start from the exact failed job, step, platform, runtime, package manager, and log excerpt rather than from a generic rerun."
+    },
+    {
+      "id": "changed-file-scope-needed",
+      "severity": "info",
+      "summary": "Changed-file context should narrow the fix to the surface that can affect the failing step, especially in a broad OS/runtime matrix."
+    },
+    {
+      "id": "regression-gate-needed",
+      "severity": "warning",
+      "summary": "A promoted fix playbook must name a local reproduction or regression command before the branch is allowed to merge."
+    }
+  ],
+  "recommended_next_action": {
+    "candidate_id": "log-backed-minimal-fix",
+    "action": "Whenever a PR check goes red, use the promoted CI failure diagnosis playbook before implementing fixes or rerunning CI."
+  }
+}
--- a/examples/evaluator-rag-prototype/ci-failure-diagnosis/scenario.json
+++ b/examples/evaluator-rag-prototype/ci-failure-diagnosis/scenario.json
@@ -0,0 +1,57 @@
+{
+  "schema_version": "ecc.evaluator-rag.scenario.v1",
+  "scenario_id": "ci-failure-diagnosis",
+  "title": "Diagnose CI failures from captured logs before proposing fixes",
+  "mode": "read_only_prototype",
+  "objective": "Given a failed CI run on a PR or maintainer branch, retrieve the exact failing job, captured log excerpt, changed-file context, and prior known-fix evidence before promoting a fix playbook.",
+  "sources": [
+    {
+      "kind": "repo_doc",
+      "path": "docs/ECC-2.0-GA-ROADMAP.md",
+      "purpose": "Records ECC-Tools CI failure-mode predictive follow-ups and the evaluator/RAG corpus expansion need"
+    },
+    {
+      "kind": "repo_doc",
+      "path": "docs/architecture/evaluator-rag-prototype.md",
+      "purpose": "Defines the artifact contract and promotion rules for evaluator/RAG scenarios"
+    },
+    {
+      "kind": "github_actions",
+      "command": "gh run view <run-id> --log-failed",
+      "purpose": "Primary evidence for the failing job, failing step, and deterministic error text"
+    },
+    {
+      "kind": "github_pr",
+      "command": "gh pr view <pr-number> --json files,statusCheckRollup,headRefName,baseRefName",
+      "purpose": "Changed-file and check-rollup context for scoping the fix"
+    },
+    {
+      "kind": "repo_test",
+      "command": "node tests/run-all.js",
+      "purpose": "Local regression gate after a candidate fix is implemented outside the read-only evaluator run"
+    }
+  ],
+  "retrieval_questions": [
+    "Which job, step, platform, runtime, and package manager failed?",
+    "What is the smallest failing log excerpt that explains the failure?",
+    "Which changed files are plausibly connected to the failing step?",
+    "Is there a prior known-fix, troubleshooting note, or fixture that matches this failure mode?",
+    "Which local command reproduces or guards the failure before a fix can merge?"
+  ],
+  "forbidden_actions": [
+    "rerunning CI until it passes without diagnosing the failure",
+    "pushing speculative fixes without a captured failing log excerpt",
+    "editing unrelated files to make the matrix green",
+    "weakening or deleting tests to silence a failure",
+    "merging or publishing while required checks are red",
+    "creating release tags or posting announcements from this evaluator run"
+  ],
+  "acceptance_gates": [
+    "failing job and step are named",
+    "captured log excerpt is linked or summarized",
+    "changed-file context is compared to the failing step",
+    "known-fix or no-known-fix status is recorded",
+    "local reproduction or regression command is named",
+    "at least one rerun-only candidate is rejected"
+  ]
+}
--- a/examples/evaluator-rag-prototype/ci-failure-diagnosis/trace.json
+++ b/examples/evaluator-rag-prototype/ci-failure-diagnosis/trace.json
@@ -0,0 +1,45 @@
+{
+  "schema_version": "ecc.evaluator-rag.trace.v1",
+  "scenario_id": "ci-failure-diagnosis",
+  "run_id": "2026-05-12-ci-failure-diagnosis-prototype",
+  "read_only": true,
+  "events": [
+    {
+      "phase": "observation",
+      "summary": "A PR or maintainer branch has a red GitHub Actions matrix. The evaluator records status without rerunning CI, merging, or editing code.",
+      "evidence": [
+        "gh pr view <pr-number> --json statusCheckRollup,files",
+        "gh run view <run-id> --json jobs"
+      ]
+    },
+    {
+      "phase": "retrieval",
+      "summary": "Retrieved failed-job logs, changed-file context, current roadmap CI failure-mode requirements, and existing local regression commands.",
+      "evidence": [
+        "gh run view <run-id> --log-failed",
+        "docs/ECC-2.0-GA-ROADMAP.md",
+        "tests/run-all.js"
+      ]
+    },
+    {
+      "phase": "proposal",
+      "summary": "Generated two candidate playbooks: a log-backed minimal fix with regression coverage, and a rerun-only approach that assumes the failure is a CI flake without evidence.",
+      "candidate_ids": [
+        "log-backed-minimal-fix",
+        "rerun-only-green-wait"
+      ]
+    },
+    {
+      "phase": "verification",
+      "summary": "Accepted the log-backed minimal fix because it names failing evidence, scope, and validation. Rejected rerun-only waiting because it does not explain the failure or preserve a regression guard.",
+      "evidence": [
+        "examples/evaluator-rag-prototype/ci-failure-diagnosis/verifier-result.json"
+      ]
+    },
+    {
+      "phase": "promotion",
+      "summary": "Promoted only the read-only CI triage playbook. The evaluator does not push a fix, rerun CI, merge, publish, or weaken checks.",
+      "promoted_candidate_id": "log-backed-minimal-fix"
+    }
+  ]
+}
--- a/examples/evaluator-rag-prototype/ci-failure-diagnosis/verifier-result.json
+++ b/examples/evaluator-rag-prototype/ci-failure-diagnosis/verifier-result.json
@@ -0,0 +1,35 @@
+{
+  "schema_version": "ecc.evaluator-rag.verifier.v1",
+  "scenario_id": "ci-failure-diagnosis",
+  "run_id": "2026-05-12-ci-failure-diagnosis-prototype",
+  "read_only": true,
+  "candidates": [
+    {
+      "candidate_id": "log-backed-minimal-fix",
+      "decision": "accepted",
+      "score": 0.93,
+      "reasons": [
+        "requires failed job, step, platform, runtime, package manager, and log evidence",
+        "compares changed files to the failing surface before proposing a fix",
+        "names a focused local reproduction or regression command",
+        "keeps required checks intact",
+        "keeps merge, release, package, plugin, billing, and announcement actions approval-gated"
+      ],
+      "rollback": "Revert the future implementation PR or restore the original failing test fixture; no code is changed by this read-only playbook."
+    },
+    {
+      "candidate_id": "rerun-only-green-wait",
+      "decision": "rejected",
+      "score": 0.17,
+      "reasons": [
+        "does not preserve the failing log excerpt",
+        "does not identify job, step, platform, runtime, or package manager",
+        "does not compare failure surface to changed files",
+        "does not add or name a regression gate",
+        "risks merging a flaky or still-unexplained CI failure"
+      ],
+      "rollback": "Do not treat this as a fix; restart diagnosis from captured failed logs and changed-file context."
+    }
+  ],
+  "promoted_candidate_id": "log-backed-minimal-fix"
+}