From cd90c84c3266216c26af98c36eed9a2cd96dfb78 Mon Sep 17 00:00:00 2001 From: Affaan Mustafa Date: Tue, 12 May 2026 17:44:00 -0400 Subject: [PATCH] docs: add evaluator CI failure scenario (#1826) --- docs/ECC-2.0-GA-ROADMAP.md | 9 +-- docs/architecture/evaluator-rag-prototype.md | 7 ++- .../candidate-playbook.md | 46 +++++++++++++++ .../ci-failure-diagnosis/report.json | 35 ++++++++++++ .../ci-failure-diagnosis/scenario.json | 57 +++++++++++++++++++ .../ci-failure-diagnosis/trace.json | 45 +++++++++++++++ .../ci-failure-diagnosis/verifier-result.json | 35 ++++++++++++ tests/docs/evaluator-rag-prototype.test.js | 46 +++++++++++++++ 8 files changed, 274 insertions(+), 6 deletions(-) create mode 100644 examples/evaluator-rag-prototype/ci-failure-diagnosis/candidate-playbook.md create mode 100644 examples/evaluator-rag-prototype/ci-failure-diagnosis/report.json create mode 100644 examples/evaluator-rag-prototype/ci-failure-diagnosis/scenario.json create mode 100644 examples/evaluator-rag-prototype/ci-failure-diagnosis/trace.json create mode 100644 examples/evaluator-rag-prototype/ci-failure-diagnosis/verifier-result.json diff --git a/docs/ECC-2.0-GA-ROADMAP.md b/docs/ECC-2.0-GA-ROADMAP.md index 289b6644..3033f07f 100644 --- a/docs/ECC-2.0-GA-ROADMAP.md +++ b/docs/ECC-2.0-GA-ROADMAP.md @@ -56,9 +56,10 @@ As of 2026-05-12: `2.0.0-rc.1`. - `docs/architecture/evaluator-rag-prototype.md` and `examples/evaluator-rag-prototype/` define the first read-only - self-improving harness prototype: scenario spec, trace, report, candidate - playbook, verifier result, accepted maintainer-salvage candidate, and - rejected blind-translation candidate. + self-improving harness prototype: scenario specs, traces, reports, + candidate playbooks, verifier results, accepted maintainer-salvage, + billing-readiness, and CI-failure-diagnosis candidates, plus rejected + unsafe candidates. 
- The npm package surface now excludes Python bytecode/cache artifacts through package `files` negation rules and a publish-surface regression test. - `docs/legacy-artifact-inventory.md` records that no `_legacy-documents-*` @@ -199,7 +200,7 @@ is not complete unless the evidence column exists and has been freshly verified. | AgentShield enterprise iteration | Policy gates, SARIF, packs, provenance, corpus, HTML reports, exception lifecycle audit | PRs #53, #55-#62 landed with test evidence | Needs PDF/export decision or next enterprise signal | | ECC Tools next-level app | Billing audit, PR checks, deep analyzer, sync backlog | PRs #26-#39 landed with test evidence | Needs capacity-backed Linear rollout / broader evaluator corpus | | GitGuardian/Dependabot/CodeRabbit-style checks | Non-blocking taxonomy and deterministic follow-up checks | ECC-Tools risk taxonomy check plus follow-up signals landed, including Skill Quality, Deep Analyzer Evidence, Analyzer Corpus Evidence, RAG/Evaluator Evidence, and PR Review/Salvage Evidence | Partially complete | -| Harness-agnostic learning system | Audit, adapter matrix, observability, traces, promotion loop | Audit/adapters/observability gates plus `docs/architecture/evaluator-rag-prototype.md` and `examples/evaluator-rag-prototype/` define read-only stale-salvage and billing-readiness scenarios with trace, report, playbook, and verifier result artifacts | Needs broader evaluator corpus | +| Harness-agnostic learning system | Audit, adapter matrix, observability, traces, promotion loop | Audit/adapters/observability gates plus `docs/architecture/evaluator-rag-prototype.md` and `examples/evaluator-rag-prototype/` define read-only stale-salvage, billing-readiness, and CI-failure-diagnosis scenarios with trace, report, playbook, and verifier result artifacts | Needs broader evaluator corpus | | Linear roadmap is detailed | Linear project status plus repo mirror | Repo mirror exists; issue creation was retried on 2026-05-12 and 
remains blocked by the workspace free issue limit | Needs recurring status updates after each merge batch | | Flow separation and progress tracking | Flow lanes with owner artifacts and update cadence | This roadmap defines lanes below | Active | | Realtime Linear sync | Project updates while issue limit is blocked; issues later | ECC-Tools #39 implements opt-in Linear API sync for deferred follow-up backlog items | Needs workspace capacity/config rollout | diff --git a/docs/architecture/evaluator-rag-prototype.md b/docs/architecture/evaluator-rag-prototype.md index 7e20e10d..60bb7a32 100644 --- a/docs/architecture/evaluator-rag-prototype.md +++ b/docs/architecture/evaluator-rag-prototype.md @@ -10,7 +10,9 @@ The fixture set lives in It started with the May 2026 stale-PR cleanup and salvage lane because that lane has real inputs, real accepted work, and real rejected work. The corpus now also includes a billing/Marketplace readiness scenario so launch copy cannot -treat dry-run release evidence or roadmap intent as live billing state. +treat dry-run release evidence or roadmap intent as live billing state. A +CI-failure diagnosis scenario adds the log-first workflow needed before an +agent proposes fixes for red checks. ## Reference Pressure @@ -96,6 +98,8 @@ Current corpus: maintainer-owned branches with attribution and validation. - `billing-marketplace-readiness`: verifies billing, App, and Marketplace launch claims before public copy says they are live. +- `ci-failure-diagnosis`: requires failed-job logs, changed-file scope, and a + named regression command before a CI fix playbook can be promoted. ## ECC Tools Mapping @@ -129,6 +133,5 @@ A candidate can be promoted only when: The next evaluator/RAG corpus should add: -- a CI-failure diagnosis scenario with captured logs and a known fix; - a harness-config quality scenario covering MCP/plugin/hook drift; - an AgentShield policy exception scenario with SARIF and report evidence. 
diff --git a/examples/evaluator-rag-prototype/ci-failure-diagnosis/candidate-playbook.md b/examples/evaluator-rag-prototype/ci-failure-diagnosis/candidate-playbook.md new file mode 100644 index 00000000..af7e206e --- /dev/null +++ b/examples/evaluator-rag-prototype/ci-failure-diagnosis/candidate-playbook.md @@ -0,0 +1,46 @@ +# CI Failure Diagnosis Playbook + +Candidate id: `log-backed-minimal-fix` + +Use this playbook when a PR, maintainer branch, or release-readiness branch has +one or more red GitHub Actions checks. + +## Accepted Path + +1. Capture PR and branch context: + - `gh pr view --json files,statusCheckRollup,headRefName,baseRefName` + - `gh run view --json jobs` +2. Fetch the failed log evidence: + - `gh run view --log-failed` +3. Record the failing job, step, OS, Node/Python/Rust version, package manager, + and shortest useful error excerpt. +4. Compare the failing step to the PR changed files. +5. Search current docs, tests, and prior PRs for a known matching failure mode. +6. Promote the smallest fix path only when it includes a local reproduction or + regression command. +7. After a separate implementation branch exists, rerun the focused local gate, + then wait for the full GitHub Actions matrix before merge. + +## Rejected Path + +Do not keep rerunning CI until a transient green result appears without +recording the original failure and why it is safe to ignore. + +Do not weaken tests, skip matrix legs, or broaden the patch to unrelated files +just to make the check pass. + +Do not claim release readiness from a branch with required checks still red. 
+ +## Minimum Validation + +- `gh run view --log-failed` +- Focused local command matching the failing surface, such as: + - `node tests/path/to/focused.test.js` + - `npm run harness:audit -- --format json` + - `npm run observability:ready` + - `cargo test` +- `git diff --check` +- Full required GitHub Actions matrix before merge + +Record the failed-log excerpt and the chosen regression command in the +maintainer PR body or handoff before merging the fix. diff --git a/examples/evaluator-rag-prototype/ci-failure-diagnosis/report.json b/examples/evaluator-rag-prototype/ci-failure-diagnosis/report.json new file mode 100644 index 00000000..ce0683ca --- /dev/null +++ b/examples/evaluator-rag-prototype/ci-failure-diagnosis/report.json @@ -0,0 +1,35 @@ +{ + "schema_version": "ecc.evaluator-rag.report.v1", + "scenario_id": "ci-failure-diagnosis", + "run_id": "2026-05-12-ci-failure-diagnosis-prototype", + "result": "prototype_passed", + "read_only": true, + "scores": { + "failure_evidence": 0.92, + "scope_control": 0.9, + "regression_specificity": 0.86, + "matrix_safety": 1, + "publication_safety": 1 + }, + "findings": [ + { + "id": "log-first-required", + "severity": "warning", + "summary": "A CI fix candidate must start from the exact failed job, step, platform, runtime, package manager, and log excerpt rather than from a generic rerun." + }, + { + "id": "changed-file-scope-needed", + "severity": "info", + "summary": "Changed-file context should narrow the fix to the surface that can affect the failing step, especially in a broad OS/runtime matrix." + }, + { + "id": "regression-gate-needed", + "severity": "warning", + "summary": "A promoted fix playbook must name a local reproduction or regression command before the branch is allowed to merge." + } + ], + "recommended_next_action": { + "candidate_id": "log-backed-minimal-fix", + "action": "Use the promoted CI failure diagnosis playbook whenever a PR check goes red before implementing or rerunning fixes." 
+ } +} diff --git a/examples/evaluator-rag-prototype/ci-failure-diagnosis/scenario.json b/examples/evaluator-rag-prototype/ci-failure-diagnosis/scenario.json new file mode 100644 index 00000000..da33a512 --- /dev/null +++ b/examples/evaluator-rag-prototype/ci-failure-diagnosis/scenario.json @@ -0,0 +1,57 @@ +{ + "schema_version": "ecc.evaluator-rag.scenario.v1", + "scenario_id": "ci-failure-diagnosis", + "title": "Diagnose CI failures from captured logs before proposing fixes", + "mode": "read_only_prototype", + "objective": "Given a failed CI run on a PR or maintainer branch, retrieve the exact failing job, captured log excerpt, changed-file context, and prior known-fix evidence before promoting a fix playbook.", + "sources": [ + { + "kind": "repo_doc", + "path": "docs/ECC-2.0-GA-ROADMAP.md", + "purpose": "Records ECC-Tools CI failure-mode predictive follow-ups and the evaluator/RAG corpus expansion need" + }, + { + "kind": "repo_doc", + "path": "docs/architecture/evaluator-rag-prototype.md", + "purpose": "Defines the artifact contract and promotion rules for evaluator/RAG scenarios" + }, + { + "kind": "github_actions", + "command": "gh run view --log-failed", + "purpose": "Primary evidence for the failing job, failing step, and deterministic error text" + }, + { + "kind": "github_pr", + "command": "gh pr view --json files,statusCheckRollup,headRefName,baseRefName", + "purpose": "Changed-file and check-rollup context for scoping the fix" + }, + { + "kind": "repo_test", + "command": "node tests/run-all.js", + "purpose": "Local regression gate after a candidate fix is implemented outside the read-only evaluator run" + } + ], + "retrieval_questions": [ + "Which job, step, platform, runtime, and package manager failed?", + "What is the smallest failing log excerpt that explains the failure?", + "Which changed files are plausibly connected to the failing step?", + "Is there a prior known-fix, troubleshooting note, or fixture that matches this failure mode?", + "Which 
local command reproduces or guards the failure before a fix can merge?" + ], + "forbidden_actions": [ + "rerunning CI until it passes without diagnosing the failure", + "pushing speculative fixes without a captured failing log excerpt", + "editing unrelated files to make the matrix green", + "weakening or deleting tests to silence a failure", + "merging or publishing while required checks are red", + "creating release tags or posting announcements from this evaluator run" + ], + "acceptance_gates": [ + "failing job and step are named", + "captured log excerpt is linked or summarized", + "changed-file context is compared to the failing step", + "known-fix or no-known-fix status is recorded", + "local reproduction or regression command is named", + "at least one rerun-only candidate is rejected" + ] +} diff --git a/examples/evaluator-rag-prototype/ci-failure-diagnosis/trace.json b/examples/evaluator-rag-prototype/ci-failure-diagnosis/trace.json new file mode 100644 index 00000000..2f2880da --- /dev/null +++ b/examples/evaluator-rag-prototype/ci-failure-diagnosis/trace.json @@ -0,0 +1,45 @@ +{ + "schema_version": "ecc.evaluator-rag.trace.v1", + "scenario_id": "ci-failure-diagnosis", + "run_id": "2026-05-12-ci-failure-diagnosis-prototype", + "read_only": true, + "events": [ + { + "phase": "observation", + "summary": "A PR or maintainer branch has a red GitHub Actions matrix. 
The evaluator records status without rerunning, merging, or editing code.", + "evidence": [ + "gh pr view --json statusCheckRollup,files", + "gh run view --json jobs" + ] + }, + { + "phase": "retrieval", + "summary": "Retrieved failed-job logs, changed-file context, current roadmap CI failure-mode requirements, and existing local regression commands.", + "evidence": [ + "gh run view --log-failed", + "docs/ECC-2.0-GA-ROADMAP.md", + "tests/run-all.js" + ] + }, + { + "phase": "proposal", + "summary": "Generated two candidate playbooks: log-backed minimal fix with regression coverage, and rerun-only optimism that treats CI flake as proven without evidence.", + "candidate_ids": [ + "log-backed-minimal-fix", + "rerun-only-green-wait" + ] + }, + { + "phase": "verification", + "summary": "Accepted the log-backed minimal fix because it names failing evidence, scope, and validation. Rejected rerun-only waiting because it does not explain the failure or preserve a regression guard.", + "evidence": [ + "examples/evaluator-rag-prototype/ci-failure-diagnosis/verifier-result.json" + ] + }, + { + "phase": "promotion", + "summary": "Promoted only the read-only CI triage playbook. 
The evaluator does not push a fix, rerun CI, merge, publish, or weaken checks.", + "promoted_candidate_id": "log-backed-minimal-fix" + } + ] +} diff --git a/examples/evaluator-rag-prototype/ci-failure-diagnosis/verifier-result.json b/examples/evaluator-rag-prototype/ci-failure-diagnosis/verifier-result.json new file mode 100644 index 00000000..a9ebdba0 --- /dev/null +++ b/examples/evaluator-rag-prototype/ci-failure-diagnosis/verifier-result.json @@ -0,0 +1,35 @@ +{ + "schema_version": "ecc.evaluator-rag.verifier.v1", + "scenario_id": "ci-failure-diagnosis", + "run_id": "2026-05-12-ci-failure-diagnosis-prototype", + "read_only": true, + "candidates": [ + { + "candidate_id": "log-backed-minimal-fix", + "decision": "accepted", + "score": 0.93, + "reasons": [ + "requires failed job, step, platform, runtime, and log evidence", + "compares changed files to the failing surface before proposing a fix", + "names a focused local reproduction or regression command", + "keeps required checks intact", + "keeps merge, release, package, plugin, billing, and announcement actions approval-gated" + ], + "rollback": "Revert the future implementation PR or restore the original failing test fixture; no code is changed by this read-only playbook." + }, + { + "candidate_id": "rerun-only-green-wait", + "decision": "rejected", + "score": 0.17, + "reasons": [ + "does not preserve the failing log excerpt", + "does not identify job, step, platform, runtime, or package manager", + "does not compare failure surface to changed files", + "does not add or name a regression gate", + "risks merging a flaky or still-unexplained CI failure" + ], + "rollback": "Do not treat this as a fix; restart diagnosis from captured failed logs and changed-file context." 
+ } + ], + "promoted_candidate_id": "log-backed-minimal-fix" +} diff --git a/tests/docs/evaluator-rag-prototype.test.js b/tests/docs/evaluator-rag-prototype.test.js index dbed6d26..7c853131 100644 --- a/tests/docs/evaluator-rag-prototype.test.js +++ b/tests/docs/evaluator-rag-prototype.test.js @@ -175,6 +175,52 @@ test('billing readiness scenario rejects launch copy overclaims', () => { assert.ok(playbook.includes('https://github.com/marketplace/ecc-tools')); }); +test('ci failure diagnosis scenario rejects rerun-only fixes', () => { + const scenario = readFixtureJson('ci-failure-diagnosis/scenario.json'); + const trace = readFixtureJson('ci-failure-diagnosis/trace.json'); + const report = readFixtureJson('ci-failure-diagnosis/report.json'); + const verifier = readFixtureJson('ci-failure-diagnosis/verifier-result.json'); + const playbook = read('examples/evaluator-rag-prototype/ci-failure-diagnosis/candidate-playbook.md'); + + assert.strictEqual(scenario.scenario_id, 'ci-failure-diagnosis'); + assert.strictEqual(trace.scenario_id, scenario.scenario_id); + assert.strictEqual(report.scenario_id, scenario.scenario_id); + assert.strictEqual(verifier.scenario_id, scenario.scenario_id); + assert.strictEqual(trace.read_only, true); + assert.strictEqual(report.read_only, true); + assert.strictEqual(verifier.read_only, true); + + for (const blocked of [ + 'rerunning CI until it passes without diagnosing the failure', + 'pushing speculative fixes without a captured failing log excerpt', + 'weakening or deleting tests to silence a failure', + 'merging or publishing while required checks are red' + ]) { + assert.ok(scenario.forbidden_actions.includes(blocked), `Missing CI forbidden action: ${blocked}`); + } + + for (const required of [ + 'failing job and step are named', + 'captured log excerpt is linked or summarized', + 'changed-file context is compared to the failing step', + 'local reproduction or regression command is named' + ]) { + 
assert.ok(scenario.acceptance_gates.includes(required), `Missing CI acceptance gate: ${required}`); + } + + const accepted = verifier.candidates.find(candidate => candidate.candidate_id === 'log-backed-minimal-fix'); + const rejected = verifier.candidates.find(candidate => candidate.candidate_id === 'rerun-only-green-wait'); + + assert.ok(accepted, 'Missing accepted log-backed CI candidate'); + assert.ok(rejected, 'Missing rejected rerun-only CI candidate'); + assert.strictEqual(accepted.decision, 'accepted'); + assert.strictEqual(rejected.decision, 'rejected'); + assert.strictEqual(verifier.promoted_candidate_id, accepted.candidate_id); + assert.ok(rejected.reasons.join('\n').includes('failing log excerpt')); + assert.ok(playbook.includes('gh run view --log-failed')); + assert.ok(playbook.includes('Full required GitHub Actions matrix before merge')); +}); + if (failed > 0) { console.log(`\nFailed: ${failed}`); process.exit(1);