docs: add deep-analyzer evaluator scenario

2026-05-13 16:13:03 +08:00 · 2026-05-12 18:43:28 -04:00
parent 337ced0828
commit 37c27a60fd
8 changed files with 297 additions and 12 deletions
--- a/examples/evaluator-rag-prototype/deep-analyzer-evidence/candidate-playbook.md
+++ b/examples/evaluator-rag-prototype/deep-analyzer-evidence/candidate-playbook.md
@@ -0,0 +1,60 @@
+# Deep Analyzer Evidence Playbook
+
+Candidate id: `corpus-backed-analyzer-change`
+
+Use this playbook when a PR changes repository analysis, commit analysis,
+architecture classification, workflow detection, pattern detection, or
+deep-analysis risk-taxonomy behavior.
+
+## Accepted Path
+
+1. Name the changed analyzer surface and source file.
+2. Retrieve the Deep Analyzer Evidence contract from `../ECC-Tools/README.md`
+   and the follow-up logic in `../ECC-Tools/src/lib/analyzer.ts`.
+3. Match the change to maintained corpus or reference evidence:
+   - `../ECC-Tools/src/analyzers/fixtures/deep-analyzer-corpus.ts`
+   - `../ECC-Tools/src/analyzers/deep-analyzer-corpus.test.ts`
+   - `../ECC-Tools/src/lib/analyzer.compare.test.ts`
+4. Compare expected outputs for the affected behavior:
+   - folder type;
+   - module organization;
+   - test location;
+   - primary language;
+   - commit message type;
+   - detected workflow names.
+5. Add or update analyzer corpus, expected-output snapshots, fixtures,
+   benchmarks, golden cases, evals, or reference sets for the same changed
+   surface.
+6. Run the relevant validation gates from `../ECC-Tools/`:
+   - `npm test -- src/analyzers/deep-analyzer-corpus.test.ts src/lib/analyzer.compare.test.ts`
+   - `npm run typecheck`
+   - `npm run lint`
+7. Record the corpus case, expected-output comparison, validation output, and
+   rollback notes in the maintainer PR body or handoff.
+
+## Rejected Path
+
+Do not promote analyzer threshold, classification, or risk-taxonomy changes
+without corpus, snapshot, fixture, benchmark, golden, eval, or reference-set
+evidence.
+
+Do not suppress the `Deep Analyzer Evidence` PR-risk bucket just because the
+change is small. Suppress it only when co-located evidence covers the same
+analyzer surface.
+
+Do not rely only on broad manual review notes. Analyzer changes need
+representative repository shapes or commit-history cases with expected outputs.
+
+Do not post PR comments, create check runs, sync Linear, publish packages, edit
+plugins, or create release artifacts from the evaluator run.
+
+## Minimum Validation
+
+- `npm test -- src/analyzers/deep-analyzer-corpus.test.ts src/lib/analyzer.compare.test.ts`
+- `npm run typecheck`
+- `npm run lint`
+- `git diff --check`
+- Markdown lint when docs or playbooks are touched
+
+Preserve source attribution for analyzer evidence and include rollback guidance
+for the future maintainer PR.
--- a/examples/evaluator-rag-prototype/deep-analyzer-evidence/report.json
+++ b/examples/evaluator-rag-prototype/deep-analyzer-evidence/report.json
@@ -0,0 +1,35 @@
+{
+  "schema_version": "ecc.evaluator-rag.report.v1",
+  "scenario_id": "deep-analyzer-evidence",
+  "run_id": "2026-05-12-deep-analyzer-evidence-prototype",
+  "result": "prototype_passed",
+  "read_only": true,
+  "scores": {
+    "corpus_retrieval": 0.95,
+    "expected_output_comparison": 0.91,
+    "representative_case_coverage": 0.89,
+    "taxonomy_gap_safety": 0.93,
+    "publication_safety": 1
+  },
+  "findings": [
+    {
+      "id": "corpus-required",
+      "severity": "warning",
+      "summary": "Deep-analysis behavior changes need maintained corpus, snapshot, fixture, benchmark, golden, eval, or reference-set evidence before promotion."
+    },
+    {
+      "id": "expected-output-required",
+      "severity": "warning",
+      "summary": "Analyzer changes should compare expected folder type, module organization, test location, primary language, commit message type, or detected workflow names."
+    },
+    {
+      "id": "read-only-routing",
+      "severity": "info",
+      "summary": "The evaluator can recommend a maintainer PR but cannot itself post PR comments, create check runs, sync Linear, publish packages, edit plugins, or create release artifacts."
+    }
+  ],
+  "recommended_next_action": {
+    "candidate_id": "corpus-backed-analyzer-change",
+    "action": "Use the promoted deep-analyzer evidence playbook for PRs that change repository, commit, architecture, workflow, pattern, or risk-taxonomy analysis behavior."
+  }
+}
--- a/examples/evaluator-rag-prototype/deep-analyzer-evidence/scenario.json
+++ b/examples/evaluator-rag-prototype/deep-analyzer-evidence/scenario.json
@@ -0,0 +1,57 @@
+{
+  "schema_version": "ecc.evaluator-rag.scenario.v1",
+  "scenario_id": "deep-analyzer-evidence",
+  "title": "Require analyzer corpus evidence before promoting deep-analysis changes",
+  "mode": "read_only_prototype",
+  "objective": "Given a change to repository, commit, architecture, workflow, pattern, or deep-analysis logic, retrieve maintained analyzer corpus evidence and expected-output comparisons before promoting analyzer behavior or risk-taxonomy changes.",
+  "sources": [
+    {
+      "kind": "sibling_repo_doc",
+      "path": "../ECC-Tools/README.md",
+      "purpose": "Public description of deep-analyzer predictive follow-ups and the Deep Analyzer Evidence PR-risk bucket"
+    },
+    {
+      "kind": "sibling_repo_source",
+      "path": "../ECC-Tools/src/lib/analyzer.ts",
+      "purpose": "Predictive follow-up logic that flags analyzer changes without corpus, snapshot, fixture, or benchmark evidence"
+    },
+    {
+      "kind": "sibling_repo_source",
+      "path": "../ECC-Tools/src/lib/pr-risk-taxonomy.ts",
+      "purpose": "Non-blocking PR-risk taxonomy bucket for deep-analyzer evidence"
+    },
+    {
+      "kind": "sibling_repo_fixture",
+      "path": "../ECC-Tools/src/analyzers/fixtures/deep-analyzer-corpus.ts",
+      "purpose": "Maintained corpus cases for representative repository shapes, commit histories, and expected analyzer outputs"
+    },
+    {
+      "kind": "sibling_repo_test",
+      "command": "npm test -- src/analyzers/deep-analyzer-corpus.test.ts src/lib/analyzer.compare.test.ts",
+      "purpose": "Regression evidence for analyzer corpus outputs and deep-analyzer follow-up generation"
+    }
+  ],
+  "retrieval_questions": [
+    "Which analyzer surface changed: repository structure, architecture, code style, commit messages, workflow detection, pattern detection, or risk taxonomy?",
+    "Which maintained corpus case or reference set covers the same analyzer behavior?",
+    "Do expected outputs compare folder type, module organization, test location, primary language, commit type, and workflow names?",
+    "Does the PR add analyzer corpus, snapshot, fixture, benchmark, golden, eval, or reference-set evidence alongside analyzer code changes?",
+    "Does the evaluator keep PR comments, check runs, Linear sync, package changes, and publication actions out of the read-only pass?"
+  ],
+  "forbidden_actions": [
+    "promoting repository, commit, architecture, or deep-analysis changes without analyzer corpus evidence",
+    "suppressing the Deep Analyzer Evidence risk bucket without co-located corpus, snapshot, fixture, or benchmark evidence",
+    "changing analyzer thresholds or classifications without expected-output comparison",
+    "relying only on broad manual review notes instead of representative repository and commit-history cases",
+    "posting PR comments, creating check runs, or syncing Linear from this read-only evaluator run",
+    "changing package, plugin, release, or publication state from this evaluator run"
+  ],
+  "acceptance_gates": [
+    "changed analyzer surface is named",
+    "maintained corpus or reference-set path is included",
+    "expected analyzer outputs are compared",
+    "representative repository shape or commit history is described",
+    "regression command is named",
+    "at least one no-corpus analyzer change is rejected"
+  ]
+}
--- a/examples/evaluator-rag-prototype/deep-analyzer-evidence/trace.json
+++ b/examples/evaluator-rag-prototype/deep-analyzer-evidence/trace.json
@@ -0,0 +1,45 @@
+{
+  "schema_version": "ecc.evaluator-rag.trace.v1",
+  "scenario_id": "deep-analyzer-evidence",
+  "run_id": "2026-05-12-deep-analyzer-evidence-prototype",
+  "read_only": true,
+  "events": [
+    {
+      "phase": "observation",
+      "summary": "A deep-analysis PR changes repository, commit, architecture, workflow, pattern, or risk-taxonomy behavior. The evaluator records the touched analyzer surface and remains read-only.",
+      "evidence": [
+        "../ECC-Tools/src/lib/analyzer.ts",
+        "../ECC-Tools/src/lib/pr-risk-taxonomy.ts"
+      ]
+    },
+    {
+      "phase": "retrieval",
+      "summary": "Retrieved the maintained analyzer corpus, corpus regression test, and follow-up tests that distinguish corpus-backed analyzer changes from no-evidence analyzer rewrites.",
+      "evidence": [
+        "../ECC-Tools/src/analyzers/fixtures/deep-analyzer-corpus.ts",
+        "../ECC-Tools/src/analyzers/deep-analyzer-corpus.test.ts",
+        "../ECC-Tools/src/lib/analyzer.compare.test.ts"
+      ]
+    },
+    {
+      "phase": "proposal",
+      "summary": "Generated two candidate playbooks: corpus-backed analyzer change, and threshold-only analyzer rewrite without expected-output evidence.",
+      "candidate_ids": [
+        "corpus-backed-analyzer-change",
+        "threshold-only-analyzer-rewrite"
+      ]
+    },
+    {
+      "phase": "verification",
+      "summary": "Accepted the corpus-backed analyzer change because it names representative repository/commit cases and expected-output comparisons. Rejected the threshold-only rewrite because it lacks corpus or benchmark evidence.",
+      "evidence": [
+        "examples/evaluator-rag-prototype/deep-analyzer-evidence/verifier-result.json"
+      ]
+    },
+    {
+      "phase": "promotion",
+      "summary": "Promoted only the read-only deep-analyzer evidence playbook. Future analyzer edits must move through maintainer PRs with corpus evidence, regression commands, and rollback notes.",
+      "promoted_candidate_id": "corpus-backed-analyzer-change"
+    }
+  ]
+}
--- a/examples/evaluator-rag-prototype/deep-analyzer-evidence/verifier-result.json
+++ b/examples/evaluator-rag-prototype/deep-analyzer-evidence/verifier-result.json
@@ -0,0 +1,35 @@
+{
+  "schema_version": "ecc.evaluator-rag.verifier.v1",
+  "scenario_id": "deep-analyzer-evidence",
+  "run_id": "2026-05-12-deep-analyzer-evidence-prototype",
+  "read_only": true,
+  "candidates": [
+    {
+      "candidate_id": "corpus-backed-analyzer-change",
+      "decision": "accepted",
+      "score": 0.92,
+      "reasons": [
+        "names the changed analyzer surface and matching maintained corpus case",
+        "compares expected analyzer outputs for representative repository and commit-history inputs",
+        "keeps Deep Analyzer Evidence taxonomy behavior tied to co-located corpus or benchmark evidence",
+        "names the regression command that exercises corpus and follow-up behavior",
+        "keeps PR comments, check runs, Linear sync, and publication actions out of the evaluator run"
+      ],
+      "rollback": "Revert the future analyzer PR and restore the prior corpus expectations; this read-only playbook changes no hosted check-run, Linear, package, or publication state."
+    },
+    {
+      "candidate_id": "threshold-only-analyzer-rewrite",
+      "decision": "rejected",
+      "score": 0.13,
+      "reasons": [
+        "changes analyzer thresholds without corpus evidence",
+        "does not compare expected outputs against representative repository or commit-history cases",
+        "does not update analyzer corpus, snapshot, fixture, benchmark, golden, eval, or reference-set artifacts",
+        "would suppress Deep Analyzer Evidence risk without proof",
+        "does not name a regression command"
+      ],
+      "rollback": "Do not promote this analyzer rewrite; restart from maintained corpus inputs, expected-output snapshots, and a focused maintainer PR."
+    }
+  ],
+  "promoted_candidate_id": "corpus-backed-analyzer-change"
+}