From 37c27a60fdafb1f18da7a23a35f9fef7ad6d9b76 Mon Sep 17 00:00:00 2001 From: Affaan Mustafa Date: Tue, 12 May 2026 18:43:28 -0400 Subject: [PATCH] docs: add deep-analyzer evaluator scenario --- docs/ECC-2.0-GA-ROADMAP.md | 10 ++-- docs/architecture/evaluator-rag-prototype.md | 15 +++-- .../candidate-playbook.md | 60 +++++++++++++++++++ .../deep-analyzer-evidence/report.json | 35 +++++++++++ .../deep-analyzer-evidence/scenario.json | 57 ++++++++++++++++++ .../deep-analyzer-evidence/trace.json | 45 ++++++++++++++ .../verifier-result.json | 35 +++++++++++ tests/docs/evaluator-rag-prototype.test.js | 52 +++++++++++++++- 8 files changed, 297 insertions(+), 12 deletions(-) create mode 100644 examples/evaluator-rag-prototype/deep-analyzer-evidence/candidate-playbook.md create mode 100644 examples/evaluator-rag-prototype/deep-analyzer-evidence/report.json create mode 100644 examples/evaluator-rag-prototype/deep-analyzer-evidence/scenario.json create mode 100644 examples/evaluator-rag-prototype/deep-analyzer-evidence/trace.json create mode 100644 examples/evaluator-rag-prototype/deep-analyzer-evidence/verifier-result.json diff --git a/docs/ECC-2.0-GA-ROADMAP.md b/docs/ECC-2.0-GA-ROADMAP.md index 97954c34..899d652c 100644 --- a/docs/ECC-2.0-GA-ROADMAP.md +++ b/docs/ECC-2.0-GA-ROADMAP.md @@ -201,7 +201,7 @@ is not complete unless the evidence column exists and has been freshly verified. 
| AgentShield enterprise iteration | Policy gates, SARIF, packs, provenance, corpus, HTML reports, exception lifecycle audit | PRs #53, #55-#62 landed with test evidence | Needs PDF/export decision or next enterprise signal | | ECC Tools next-level app | Billing audit, PR checks, deep analyzer, sync backlog | PRs #26-#39 landed with test evidence | Needs capacity-backed Linear rollout / broader evaluator corpus | | GitGuardian/Dependabot/CodeRabbit-style checks | Non-blocking taxonomy and deterministic follow-up checks | ECC-Tools risk taxonomy check plus follow-up signals landed, including Skill Quality, Deep Analyzer Evidence, Analyzer Corpus Evidence, RAG/Evaluator Evidence, and PR Review/Salvage Evidence | Partially complete | -| Harness-agnostic learning system | Audit, adapter matrix, observability, traces, promotion loop | Audit/adapters/observability gates plus `docs/architecture/evaluator-rag-prototype.md` and `examples/evaluator-rag-prototype/` define read-only stale-salvage, billing-readiness, CI-failure-diagnosis, harness-config-quality, AgentShield policy-exception, and skill-quality evidence scenarios with trace, report, playbook, and verifier result artifacts | Needs deep-analyzer corpus | +| Harness-agnostic learning system | Audit, adapter matrix, observability, traces, promotion loop | Audit/adapters/observability gates plus `docs/architecture/evaluator-rag-prototype.md` and `examples/evaluator-rag-prototype/` define read-only stale-salvage, billing-readiness, CI-failure-diagnosis, harness-config-quality, AgentShield policy-exception, skill-quality evidence, and deep-analyzer evidence scenarios with trace, report, playbook, and verifier result artifacts | Local corpus complete; hosted integration remains future | | Linear roadmap is detailed | Linear project status plus repo mirror | Repo mirror exists; issue creation was retried on 2026-05-12 and remains blocked by the workspace free issue limit | Needs recurring status updates after each merge 
batch | | Flow separation and progress tracking | Flow lanes with owner artifacts and update cadence | This roadmap defines lanes below | Active | | Realtime Linear sync | Project updates while issue limit is blocked; issues later | ECC-Tools #39 implements opt-in Linear API sync for deferred follow-up backlog items | Needs workspace capacity/config rollout | @@ -220,7 +220,7 @@ back to the repo evidence and merge commits. | Queue hygiene and salvage | GitHub PR/issue state, salvage ledger | Append ledger entries for any future stale closures | Every cleanup batch | | Release and publication | rc.1 release docs, publication readiness doc | Naming matrix and plugin submission/contact checklist | Before any tag | | Harness OS core | Audit, adapter matrix, observability docs, `ecc2/` | HUD/session-control acceptance spec | Weekly until GA | -| Evaluation and RAG | Reference-set validation, harness audit, traces | Read-only evaluator/RAG prototype plus stale-salvage, billing-readiness, CI-failure-diagnosis, harness-config-quality, AgentShield policy-exception, and skill-quality evidence fixtures | Expand to deep-analyzer evidence scenario | +| Evaluation and RAG | Reference-set validation, harness audit, traces | Read-only evaluator/RAG prototype plus stale-salvage, billing-readiness, CI-failure-diagnosis, harness-config-quality, AgentShield policy-exception, skill-quality evidence, and deep-analyzer evidence fixtures | Use as fixture contract before hosted retrieval/check-run automation | | AgentShield enterprise | AgentShield PR evidence and roadmap notes | PDF-export decision or next enterprise signal | After value decision | | ECC Tools app | ECC-Tools PR evidence, billing audit, risk taxonomy | Capacity-backed Linear rollout or broader evaluator/RAG corpus slice | Next implementation batch | | Linear progress | Linear project status updates and this mirror | Status update with queue/evidence/missing gates | Every significant merge batch | @@ -419,6 +419,6 @@ 
Acceptance: executive report, corpus benchmark output, and exception lifecycle audit. 2. Enable/configure the merged Linear backlog sync path after workspace issue capacity clears or the Linear workspace is upgraded. -3. Expand the evaluator/RAG corpus beyond stale-salvage, billing, CI, - harness-config, AgentShield policy-exception, and skill-quality evidence - prototypes toward deep-analyzer evidence scenarios. +3. Consume the local evaluator/RAG corpus from ECC Tools before adding hosted + retrieval, vector storage, model-backed judging, or automated check-run + promotion. diff --git a/docs/architecture/evaluator-rag-prototype.md b/docs/architecture/evaluator-rag-prototype.md index 48b684cc..4543e578 100644 --- a/docs/architecture/evaluator-rag-prototype.md +++ b/docs/architecture/evaluator-rag-prototype.md @@ -19,7 +19,9 @@ exception scenario gates security exceptions on SARIF/report evidence, owner fields, expiry state, and remediation-versus-exception decisions. A skill-quality evidence scenario requires observed failure or feedback evidence, working examples, reference-set gaps, and validation commands before a skill -amendment can be promoted. +amendment can be promoted. A deep-analyzer evidence scenario requires analyzer +corpus cases, expected-output comparisons, and risk-taxonomy proof before +repository or commit-analysis behavior can change. ## Reference Pressure @@ -116,6 +118,9 @@ Current corpus: - `skill-quality-evidence`: requires focused skill scope, observed failure or user-feedback evidence, examples/reference-set coverage, validation commands, and publication safety before a skill amendment can be promoted. +- `deep-analyzer-evidence`: requires maintained analyzer corpus cases, + expected-output comparisons, representative repository/commit histories, and + regression commands before deep-analysis behavior can be promoted. 
## ECC Tools Mapping @@ -147,7 +152,7 @@ A candidate can be promoted only when: ## Next Expansion -The next evaluator/RAG corpus should add: - -- a deep-analyzer evidence scenario with maintained reference sets and rejected - low-evidence candidates. +The local evaluator/RAG corpus now covers the current evidence buckets. Future +work should consume these fixtures from ECC Tools before adding hosted +retrieval, vector storage, model-backed judging, or automated check-run +promotion. diff --git a/examples/evaluator-rag-prototype/deep-analyzer-evidence/candidate-playbook.md b/examples/evaluator-rag-prototype/deep-analyzer-evidence/candidate-playbook.md new file mode 100644 index 00000000..86eaf499 --- /dev/null +++ b/examples/evaluator-rag-prototype/deep-analyzer-evidence/candidate-playbook.md @@ -0,0 +1,60 @@ +# Deep Analyzer Evidence Playbook + +Candidate id: `corpus-backed-analyzer-change` + +Use this playbook when a PR changes repository analysis, commit analysis, +architecture classification, workflow detection, pattern detection, or +deep-analysis risk-taxonomy behavior. + +## Accepted Path + +1. Name the changed analyzer surface and source file. +2. Retrieve the Deep Analyzer Evidence contract from `../ECC-Tools/README.md` + and the follow-up logic in `../ECC-Tools/src/lib/analyzer.ts`. +3. Match the change to maintained corpus or reference evidence: + - `../ECC-Tools/src/analyzers/fixtures/deep-analyzer-corpus.ts` + - `../ECC-Tools/src/analyzers/deep-analyzer-corpus.test.ts` + - `../ECC-Tools/src/lib/analyzer.compare.test.ts` +4. Compare expected outputs for the affected behavior: + - folder type; + - module organization; + - test location; + - primary language; + - commit message type; + - detected workflow names. +5. Add or update analyzer corpus, expected-output snapshots, fixtures, + benchmarks, golden cases, evals, or reference sets for the same changed + surface. +6. 
Run the relevant validation gate from `../ECC-Tools/`: + - `npm test -- src/analyzers/deep-analyzer-corpus.test.ts src/lib/analyzer.compare.test.ts` + - `npm run typecheck` + - `npm run lint` +7. Record the corpus case, expected-output comparison, validation output, and + rollback notes in the maintainer PR body or handoff. + +## Rejected Path + +Do not promote analyzer threshold, classification, or risk-taxonomy changes +without corpus, snapshot, fixture, benchmark, golden, eval, or reference-set +evidence. + +Do not suppress the `Deep Analyzer Evidence` PR-risk bucket just because the +change is small. Suppress it only when co-located evidence covers the same +analyzer surface. + +Do not rely only on broad manual review notes. Analyzer changes need +representative repository shapes or commit-history cases with expected outputs. + +Do not post PR comments, create check runs, sync Linear, publish packages, edit +plugins, or create release artifacts from the evaluator run. + +## Minimum Validation + +- `npm test -- src/analyzers/deep-analyzer-corpus.test.ts src/lib/analyzer.compare.test.ts` +- `npm run typecheck` +- `npm run lint` +- `git diff --check` +- Markdown lint when docs or playbooks are touched + +Preserve source attribution for analyzer evidence and include rollback guidance +for the future maintainer PR. 
diff --git a/examples/evaluator-rag-prototype/deep-analyzer-evidence/report.json b/examples/evaluator-rag-prototype/deep-analyzer-evidence/report.json new file mode 100644 index 00000000..ee373ebc --- /dev/null +++ b/examples/evaluator-rag-prototype/deep-analyzer-evidence/report.json @@ -0,0 +1,35 @@ +{ + "schema_version": "ecc.evaluator-rag.report.v1", + "scenario_id": "deep-analyzer-evidence", + "run_id": "2026-05-12-deep-analyzer-evidence-prototype", + "result": "prototype_passed", + "read_only": true, + "scores": { + "corpus_retrieval": 0.95, + "expected_output_comparison": 0.91, + "representative_case_coverage": 0.89, + "taxonomy_gap_safety": 0.93, + "publication_safety": 1 + }, + "findings": [ + { + "id": "corpus-required", + "severity": "warning", + "summary": "Deep-analysis behavior changes need maintained corpus, snapshot, fixture, benchmark, golden, eval, or reference-set evidence before promotion." + }, + { + "id": "expected-output-required", + "severity": "warning", + "summary": "Analyzer changes should compare expected folder type, module organization, test location, primary language, commit pattern, or workflow outputs." + }, + { + "id": "read-only-routing", + "severity": "info", + "summary": "The evaluator can recommend a maintainer PR but cannot itself post PR comments, create check runs, sync Linear, publish packages, edit plugins, or take release actions." + } + ], + "recommended_next_action": { + "candidate_id": "corpus-backed-analyzer-change", + "action": "Use the promoted deep-analyzer evidence playbook for PRs that change repository, commit, architecture, workflow, pattern, or risk-taxonomy analysis behavior."
+ } +} diff --git a/examples/evaluator-rag-prototype/deep-analyzer-evidence/scenario.json b/examples/evaluator-rag-prototype/deep-analyzer-evidence/scenario.json new file mode 100644 index 00000000..e2fb06a2 --- /dev/null +++ b/examples/evaluator-rag-prototype/deep-analyzer-evidence/scenario.json @@ -0,0 +1,57 @@ +{ + "schema_version": "ecc.evaluator-rag.scenario.v1", + "scenario_id": "deep-analyzer-evidence", + "title": "Require analyzer corpus evidence before promoting deep-analysis changes", + "mode": "read_only_prototype", + "objective": "Given a change to repository, commit, architecture, pattern, or deep-analysis logic, retrieve maintained analyzer corpus evidence and expected-output comparisons before promoting analyzer behavior or risk-taxonomy changes.", + "sources": [ + { + "kind": "sibling_repo_doc", + "path": "../ECC-Tools/README.md", + "purpose": "Public description of deep-analyzer predictive follow-ups and the Deep Analyzer Evidence PR-risk bucket" + }, + { + "kind": "sibling_repo_source", + "path": "../ECC-Tools/src/lib/analyzer.ts", + "purpose": "Predictive follow-up logic that flags analyzer changes without corpus, snapshot, fixture, or benchmark evidence" + }, + { + "kind": "sibling_repo_source", + "path": "../ECC-Tools/src/lib/pr-risk-taxonomy.ts", + "purpose": "Non-blocking PR-risk taxonomy bucket for deep-analyzer evidence" + }, + { + "kind": "sibling_repo_fixture", + "path": "../ECC-Tools/src/analyzers/fixtures/deep-analyzer-corpus.ts", + "purpose": "Maintained corpus cases for representative repository shapes, commit histories, and expected analyzer outputs" + }, + { + "kind": "sibling_repo_test", + "command": "npm test -- src/analyzers/deep-analyzer-corpus.test.ts src/lib/analyzer.compare.test.ts", + "purpose": "Regression evidence for analyzer corpus outputs and deep-analyzer follow-up generation" + } + ], + "retrieval_questions": [ + "Which analyzer surface changed: repository structure, architecture, code style, commit messages, workflow 
detection, pattern detection, or risk taxonomy?", + "Which maintained corpus case or reference set covers the same analyzer behavior?", + "Do expected outputs compare folder type, module organization, test location, primary language, commit type, and workflow names?", + "Does the PR add analyzer corpus, snapshot, fixture, benchmark, golden, eval, or reference-set evidence alongside analyzer code changes?", + "Does the evaluator keep PR comments, check runs, Linear sync, package changes, and publication actions out of the read-only pass?" + ], + "forbidden_actions": [ + "promoting repository, commit, architecture, or deep-analysis changes without analyzer corpus evidence", + "suppressing the Deep Analyzer Evidence risk bucket without co-located corpus, snapshot, fixture, or benchmark evidence", + "changing analyzer thresholds or classifications without expected-output comparison", + "relying only on broad manual review notes instead of representative repository and commit-history cases", + "posting PR comments, check runs, or Linear sync updates from this read-only evaluator run", + "changing package, plugin, release, or publication state from this evaluator run" + ], + "acceptance_gates": [ + "changed analyzer surface is named", + "maintained corpus or reference-set path is included", + "expected analyzer outputs are compared", + "representative repository shape or commit history is described", + "regression command is named", + "at least one no-corpus analyzer change is rejected" + ] +} diff --git a/examples/evaluator-rag-prototype/deep-analyzer-evidence/trace.json b/examples/evaluator-rag-prototype/deep-analyzer-evidence/trace.json new file mode 100644 index 00000000..d6411786 --- /dev/null +++ b/examples/evaluator-rag-prototype/deep-analyzer-evidence/trace.json @@ -0,0 +1,45 @@ +{ + "schema_version": "ecc.evaluator-rag.trace.v1", + "scenario_id": "deep-analyzer-evidence", + "run_id": "2026-05-12-deep-analyzer-evidence-prototype", + "read_only": true, + "events": 
[ + { + "phase": "observation", + "summary": "A deep-analysis PR changes repository, commit, architecture, workflow, pattern, or risk-taxonomy behavior. The evaluator records the touched analyzer surface and remains read-only.", + "evidence": [ + "../ECC-Tools/src/lib/analyzer.ts", + "../ECC-Tools/src/lib/pr-risk-taxonomy.ts" + ] + }, + { + "phase": "retrieval", + "summary": "Retrieved the maintained analyzer corpus, corpus regression test, and follow-up tests that distinguish corpus-backed analyzer changes from no-evidence analyzer rewrites.", + "evidence": [ + "../ECC-Tools/src/analyzers/fixtures/deep-analyzer-corpus.ts", + "../ECC-Tools/src/analyzers/deep-analyzer-corpus.test.ts", + "../ECC-Tools/src/lib/analyzer.compare.test.ts" + ] + }, + { + "phase": "proposal", + "summary": "Generated two candidate playbooks: corpus-backed analyzer change, and threshold-only analyzer rewrite without expected-output evidence.", + "candidate_ids": [ + "corpus-backed-analyzer-change", + "threshold-only-analyzer-rewrite" + ] + }, + { + "phase": "verification", + "summary": "Accepted the corpus-backed analyzer change because it names representative repository/commit cases and expected-output comparisons. Rejected the threshold-only rewrite because it lacks corpus or benchmark evidence.", + "evidence": [ + "examples/evaluator-rag-prototype/deep-analyzer-evidence/verifier-result.json" + ] + }, + { + "phase": "promotion", + "summary": "Promoted only the read-only deep-analyzer evidence playbook. 
Future analyzer edits must move through maintainer PRs with corpus evidence, regression commands, and rollback notes.", + "promoted_candidate_id": "corpus-backed-analyzer-change" + } + ] +} diff --git a/examples/evaluator-rag-prototype/deep-analyzer-evidence/verifier-result.json b/examples/evaluator-rag-prototype/deep-analyzer-evidence/verifier-result.json new file mode 100644 index 00000000..ef3d7d83 --- /dev/null +++ b/examples/evaluator-rag-prototype/deep-analyzer-evidence/verifier-result.json @@ -0,0 +1,35 @@ +{ + "schema_version": "ecc.evaluator-rag.verifier.v1", + "scenario_id": "deep-analyzer-evidence", + "run_id": "2026-05-12-deep-analyzer-evidence-prototype", + "read_only": true, + "candidates": [ + { + "candidate_id": "corpus-backed-analyzer-change", + "decision": "accepted", + "score": 0.92, + "reasons": [ + "names the changed analyzer surface and matching maintained corpus case", + "compares expected analyzer outputs for representative repository and commit-history inputs", + "keeps Deep Analyzer Evidence taxonomy behavior tied to co-located corpus or benchmark evidence", + "names the regression command that exercises corpus and follow-up behavior", + "keeps PR comments, check runs, Linear sync, and publication actions out of the evaluator run" + ], + "rollback": "Revert the future analyzer PR and restore the prior corpus expectations; no hosted check-run, Linear, package, or publication state changes in this read-only playbook." 
+ }, + { + "candidate_id": "threshold-only-analyzer-rewrite", + "decision": "rejected", + "score": 0.13, + "reasons": [ + "changes analyzer thresholds without corpus evidence", + "does not compare expected outputs against representative repository or commit-history cases", + "does not update analyzer corpus, snapshot, fixture, benchmark, golden, eval, or reference-set artifacts", + "would suppress Deep Analyzer Evidence risk without proof", + "does not name a regression command" + ], + "rollback": "Do not promote this analyzer rewrite; restart from maintained corpus inputs, expected-output snapshots, and a focused maintainer PR." + } + ], + "promoted_candidate_id": "corpus-backed-analyzer-change" +} diff --git a/tests/docs/evaluator-rag-prototype.test.js b/tests/docs/evaluator-rag-prototype.test.js index 4655990c..52459379 100644 --- a/tests/docs/evaluator-rag-prototype.test.js +++ b/tests/docs/evaluator-rag-prototype.test.js @@ -130,12 +130,12 @@ test('candidate playbook preserves stale-salvage operating rules', () => { } }); -test('roadmap points to the evaluator RAG prototype and keeps broader corpus work open', () => { +test('roadmap points to the evaluator RAG prototype and keeps hosted integration open', () => { const roadmap = read('docs/ECC-2.0-GA-ROADMAP.md'); assert.ok(roadmap.includes('docs/architecture/evaluator-rag-prototype.md')); assert.ok(roadmap.includes('examples/evaluator-rag-prototype/')); - assert.ok(roadmap.includes('Needs deep-analyzer corpus')); + assert.ok(roadmap.includes('Local corpus complete; hosted integration remains future')); }); test('billing readiness scenario rejects launch copy overclaims', () => { @@ -361,6 +361,54 @@ test('skill quality evidence scenario rejects vague rewrites', () => { assert.ok(playbook.includes('observed skill-run failure')); }); +test('deep analyzer evidence scenario rejects no-corpus analyzer changes', () => { + const scenario = readFixtureJson('deep-analyzer-evidence/scenario.json'); + const trace = 
readFixtureJson('deep-analyzer-evidence/trace.json'); + const report = readFixtureJson('deep-analyzer-evidence/report.json'); + const verifier = readFixtureJson('deep-analyzer-evidence/verifier-result.json'); + const playbook = read('examples/evaluator-rag-prototype/deep-analyzer-evidence/candidate-playbook.md'); + + assert.strictEqual(scenario.scenario_id, 'deep-analyzer-evidence'); + assert.strictEqual(trace.scenario_id, scenario.scenario_id); + assert.strictEqual(report.scenario_id, scenario.scenario_id); + assert.strictEqual(verifier.scenario_id, scenario.scenario_id); + assert.strictEqual(trace.read_only, true); + assert.strictEqual(report.read_only, true); + assert.strictEqual(verifier.read_only, true); + + for (const blocked of [ + 'promoting repository, commit, architecture, or deep-analysis changes without analyzer corpus evidence', + 'suppressing the Deep Analyzer Evidence risk bucket without co-located corpus, snapshot, fixture, or benchmark evidence', + 'changing analyzer thresholds or classifications without expected-output comparison', + 'posting PR comments, check runs, or Linear sync updates from this read-only evaluator run' + ]) { + assert.ok(scenario.forbidden_actions.includes(blocked), `Missing deep-analyzer forbidden action: ${blocked}`); + } + + for (const required of [ + 'changed analyzer surface is named', + 'maintained corpus or reference-set path is included', + 'expected analyzer outputs are compared', + 'representative repository shape or commit history is described', + 'regression command is named' + ]) { + assert.ok(scenario.acceptance_gates.includes(required), `Missing deep-analyzer acceptance gate: ${required}`); + } + + const accepted = verifier.candidates.find(candidate => candidate.candidate_id === 'corpus-backed-analyzer-change'); + const rejected = verifier.candidates.find(candidate => candidate.candidate_id === 'threshold-only-analyzer-rewrite'); + + assert.ok(accepted, 'Missing accepted deep-analyzer candidate'); + 
assert.ok(rejected, 'Missing rejected threshold-only analyzer candidate'); + assert.strictEqual(accepted.decision, 'accepted'); + assert.strictEqual(rejected.decision, 'rejected'); + assert.strictEqual(verifier.promoted_candidate_id, accepted.candidate_id); + assert.ok(rejected.reasons.join('\n').includes('does not compare expected outputs')); + assert.ok(playbook.includes('../ECC-Tools/src/analyzers/fixtures/deep-analyzer-corpus.ts')); + assert.ok(playbook.includes('npm test -- src/analyzers/deep-analyzer-corpus.test.ts src/lib/analyzer.compare.test.ts')); + assert.ok(playbook.includes('Deep Analyzer Evidence')); +}); + if (failed > 0) { console.log(`\nFailed: ${failed}`); process.exit(1);