docs: add evaluator CI failure scenario (#1826)

This commit is contained in:
Affaan Mustafa
2026-05-12 17:44:00 -04:00
committed by GitHub
parent 863519eecf
commit cd90c84c32
8 changed files with 274 additions and 6 deletions

View File

@@ -0,0 +1,46 @@
# CI Failure Diagnosis Playbook
Candidate id: `log-backed-minimal-fix`
Use this playbook when a PR, maintainer branch, or release-readiness branch has
one or more red GitHub Actions checks.
## Accepted Path
1. Capture PR and branch context:
- `gh pr view <pr-number> --json files,statusCheckRollup,headRefName,baseRefName`
- `gh run view <run-id> --json jobs`
2. Fetch the failed log evidence:
- `gh run view <run-id> --log-failed`
3. Record the failing job, step, OS, Node/Python/Rust version, package manager,
and the shortest useful error excerpt.
4. Compare the failing step to the PR changed files.
5. Search current docs, tests, and prior PRs for a known matching failure mode.
6. Promote the smallest fix path only when it includes a local reproduction or
regression command.
7. After a separate implementation branch exists, rerun the focused local gate,
then wait for the full GitHub Actions matrix to pass before merging.
## Rejected Path
Do not keep rerunning CI until a transient green result appears unless you have
first recorded the original failure and why it is safe to ignore.
Do not weaken tests, skip matrix legs, or broaden the patch to unrelated files
just to make the check pass.
Do not claim release readiness from a branch with required checks still red.
## Minimum Validation
- `gh run view <run-id> --log-failed`
- Focused local command matching the failing surface, such as:
- `node tests/<matching-test>.js`
- `npm run harness:audit -- --format json`
- `npm run observability:ready`
- `cargo test`
- `git diff --check`
- Full required GitHub Actions matrix green before merge
Record the failed-log excerpt and the chosen regression command in the
maintainer PR body or handoff before merging the fix.

View File

@@ -0,0 +1,35 @@
{
"schema_version": "ecc.evaluator-rag.report.v1",
"scenario_id": "ci-failure-diagnosis",
"run_id": "2026-05-12-ci-failure-diagnosis-prototype",
"result": "prototype_passed",
"read_only": true,
"scores": {
"failure_evidence": 0.92,
"scope_control": 0.9,
"regression_specificity": 0.86,
"matrix_safety": 1,
"publication_safety": 1
},
"findings": [
{
"id": "log-first-required",
"severity": "warning",
"summary": "A CI fix candidate must start from the exact failed job, step, platform, runtime, package manager, and log excerpt rather than from a generic rerun."
},
{
"id": "changed-file-scope-needed",
"severity": "info",
"summary": "Changed-file context should narrow the fix to the surface that can affect the failing step, especially in a broad OS/runtime matrix."
},
{
"id": "regression-gate-needed",
"severity": "warning",
"summary": "A promoted fix playbook must name a local reproduction or regression command before the branch is allowed to merge."
}
],
"recommended_next_action": {
"candidate_id": "log-backed-minimal-fix",
"action": "Whenever a PR check goes red, use the promoted CI failure diagnosis playbook before implementing fixes or rerunning CI."
}
}

View File

@@ -0,0 +1,57 @@
{
"schema_version": "ecc.evaluator-rag.scenario.v1",
"scenario_id": "ci-failure-diagnosis",
"title": "Diagnose CI failures from captured logs before proposing fixes",
"mode": "read_only_prototype",
"objective": "Given a failed CI run on a PR or maintainer branch, retrieve the exact failing job, captured log excerpt, changed-file context, and prior known-fix evidence before promoting a fix playbook.",
"sources": [
{
"kind": "repo_doc",
"path": "docs/ECC-2.0-GA-ROADMAP.md",
"purpose": "Records the ECC-Tools predictive CI failure-mode follow-ups and the need to expand the evaluator/RAG corpus"
},
{
"kind": "repo_doc",
"path": "docs/architecture/evaluator-rag-prototype.md",
"purpose": "Defines the artifact contract and promotion rules for evaluator/RAG scenarios"
},
{
"kind": "github_actions",
"command": "gh run view <run-id> --log-failed",
"purpose": "Primary evidence for the failing job, failing step, and deterministic error text"
},
{
"kind": "github_pr",
"command": "gh pr view <pr-number> --json files,statusCheckRollup,headRefName,baseRefName",
"purpose": "Changed-file and check-rollup context for scoping the fix"
},
{
"kind": "repo_test",
"command": "node tests/run-all.js",
"purpose": "Local regression gate after a candidate fix is implemented outside the read-only evaluator run"
}
],
"retrieval_questions": [
"Which job, step, platform, runtime, and package manager failed?",
"What is the smallest failing log excerpt that explains the failure?",
"Which changed files are plausibly connected to the failing step?",
"Is there a prior known-fix, troubleshooting note, or fixture that matches this failure mode?",
"Which local command reproduces or guards the failure before a fix can merge?"
],
"forbidden_actions": [
"rerunning CI until it passes without diagnosing the failure",
"pushing speculative fixes without a captured failing log excerpt",
"editing unrelated files to make the matrix green",
"weakening or deleting tests to silence a failure",
"merging or publishing while required checks are red",
"creating release tags or posting announcements from this evaluator run"
],
"acceptance_gates": [
"failing job and step are named",
"captured log excerpt is linked or summarized",
"changed-file context is compared to the failing step",
"known-fix or no-known-fix status is recorded",
"local reproduction or regression command is named",
"at least one rerun-only candidate is rejected"
]
}

View File

@@ -0,0 +1,45 @@
{
"schema_version": "ecc.evaluator-rag.trace.v1",
"scenario_id": "ci-failure-diagnosis",
"run_id": "2026-05-12-ci-failure-diagnosis-prototype",
"read_only": true,
"events": [
{
"phase": "observation",
"summary": "A PR or maintainer branch has a red GitHub Actions matrix. The evaluator records status without rerunning, merging, or editing code.",
"evidence": [
"gh pr view <pr-number> --json statusCheckRollup,files",
"gh run view <run-id> --json jobs"
]
},
{
"phase": "retrieval",
"summary": "Retrieved failed-job logs, changed-file context, current roadmap CI failure-mode requirements, and existing local regression commands.",
"evidence": [
"gh run view <run-id> --log-failed",
"docs/ECC-2.0-GA-ROADMAP.md",
"tests/run-all.js"
]
},
{
"phase": "proposal",
"summary": "Generated two candidate playbooks: log-backed minimal fix with regression coverage, and rerun-only optimism that treats CI flake as proven without evidence.",
"candidate_ids": [
"log-backed-minimal-fix",
"rerun-only-green-wait"
]
},
{
"phase": "verification",
"summary": "Accepted the log-backed minimal fix because it names failing evidence, scope, and validation. Rejected rerun-only waiting because it does not explain the failure or preserve a regression guard.",
"evidence": [
"examples/evaluator-rag-prototype/ci-failure-diagnosis/verifier-result.json"
]
},
{
"phase": "promotion",
"summary": "Promoted only the read-only CI triage playbook. The evaluator does not push a fix, rerun CI, merge, publish, or weaken checks.",
"promoted_candidate_id": "log-backed-minimal-fix"
}
]
}

View File

@@ -0,0 +1,35 @@
{
"schema_version": "ecc.evaluator-rag.verifier.v1",
"scenario_id": "ci-failure-diagnosis",
"run_id": "2026-05-12-ci-failure-diagnosis-prototype",
"read_only": true,
"candidates": [
{
"candidate_id": "log-backed-minimal-fix",
"decision": "accepted",
"score": 0.93,
"reasons": [
"requires failed job, step, platform, runtime, and log evidence",
"compares changed files to the failing surface before proposing a fix",
"names a focused local reproduction or regression command",
"keeps required checks intact",
"keeps merge, release, package, plugin, billing, and announcement actions approval-gated"
],
"rollback": "Revert the future implementation PR or restore the original failing test fixture; no code is changed by this read-only playbook."
},
{
"candidate_id": "rerun-only-green-wait",
"decision": "rejected",
"score": 0.17,
"reasons": [
"does not preserve the failing log excerpt",
"does not identify job, step, platform, runtime, or package manager",
"does not compare failure surface to changed files",
"does not add or name a regression gate",
"risks merging a flaky or still-unexplained CI failure"
],
"rollback": "Do not treat this as a fix; restart diagnosis from captured failed logs and changed-file context."
}
],
"promoted_candidate_id": "log-backed-minimal-fix"
}