docs: add skill-quality evaluator scenario

2026-05-13 16:13:03 +08:00 · 2026-05-12 18:24:41 -04:00
parent b25d4770f5
commit 337ced0828
8 changed files with 291 additions and 8 deletions
--- a/examples/evaluator-rag-prototype/skill-quality-evidence/candidate-playbook.md
+++ b/examples/evaluator-rag-prototype/skill-quality-evidence/candidate-playbook.md
@@ -0,0 +1,57 @@
+# Skill Quality Evidence Playbook
+
+Candidate id: `evidence-backed-skill-amendment`
+
+Use this playbook when a PR or follow-up proposes adding, rewriting, or
+amending a skill, agent, command, or rule guidance surface.
+
+## Accepted Path
+
+1. Name the changed guidance surface and source file.
+2. Retrieve the quality contract from `docs/SKILL-DEVELOPMENT-GUIDE.md`.
+3. Compare the proposed change to nearby focused examples under `skills/*/SKILL.md`.
+4. Record the evidence source that justifies the change:
+   - observed skill-run failure;
+   - user feedback;
+   - repeated review finding;
+   - reference-set gap;
+   - failing example or regression test.
+5. Keep the scope narrow. One skill should cover one domain, workflow, or
+   reusable pattern.
+6. Add or update examples only when they can be validated.
+7. Run the relevant validation gates:
+   - `node scripts/ci/validate-skills.js`
+   - `node tests/lib/skill-improvement.test.js`
+   - `node tests/lib/skill-evolution.test.js`
+   - `npm run catalog:check`
+   - language-specific example commands such as `npx tsc --noEmit`,
+     `python -m py_compile`, or `go build` when examples are touched.
+8. Record validation output, source attribution, and rollback notes in the
+   maintainer PR body or handoff.
+
+## Rejected Path
+
+Do not promote a vague skill rewrite just because the prose "sounds better";
+require observed failure evidence, examples, or a reference set.
+
+Do not merge multi-domain catch-all skills that duplicate focused skills or make
+activation less predictable.
+
+Do not copy private operator context, secrets, tokens, personal paths, customer
+data, or unpublished release claims into skills.
+
+Do not update package manifests, plugin manifests, catalogs, release notes, or
+publication state from the evaluator run.
+
+## Minimum Validation
+
+- `node scripts/ci/validate-skills.js`
+- `npm run catalog:check` when catalog/package-visible skill surfaces change
+- Focused skill-improvement or skill-evolution regression test when amendment
+  behavior changes
+- Language-specific compile/lint checks for touched examples
+- `git diff --check`
+- Markdown lint when docs or playbooks are touched
+
+Preserve source attribution for contributed skill material and include rollback
+guidance for the future maintainer PR.
--- a/examples/evaluator-rag-prototype/skill-quality-evidence/report.json
+++ b/examples/evaluator-rag-prototype/skill-quality-evidence/report.json
@@ -0,0 +1,35 @@
+{
+  "schema_version": "ecc.evaluator-rag.report.v1",
+  "scenario_id": "skill-quality-evidence",
+  "run_id": "2026-05-12-skill-quality-evidence-prototype",
+  "result": "prototype_passed",
+  "read_only": true,
+  "scores": {
+    "skill_contract_retrieval": 0.94,
+    "observed_failure_evidence": 0.88,
+    "example_quality": 0.9,
+    "validation_specificity": 0.93,
+    "publication_safety": 1
+  },
+  "findings": [
+    {
+      "id": "examples-required",
+      "severity": "warning",
+      "summary": "Skill-quality changes need working examples or regression evidence; prose-only rewrites are not enough for promotion."
+    },
+    {
+      "id": "observation-source-required",
+      "severity": "warning",
+      "summary": "Skill amendments should cite observed failure, user feedback, or a reference-set gap rather than broad style preference."
+    },
+    {
+      "id": "publication-stays-blocked",
+      "severity": "info",
+      "summary": "The evaluator can recommend a maintainer PR, but it cannot update package, plugin, catalog, or publication state itself."
+    }
+  ],
+  "recommended_next_action": {
+    "candidate_id": "evidence-backed-skill-amendment",
+    "action": "Use the promoted skill-quality playbook for PRs that add, rewrite, or amend skills, agents, commands, or rules guidance."
+  }
+}
--- a/examples/evaluator-rag-prototype/skill-quality-evidence/scenario.json
+++ b/examples/evaluator-rag-prototype/skill-quality-evidence/scenario.json
@@ -0,0 +1,57 @@
+{
+  "schema_version": "ecc.evaluator-rag.scenario.v1",
+  "scenario_id": "skill-quality-evidence",
+  "title": "Require examples and validation before promoting skill guidance changes",
+  "mode": "read_only_prototype",
+  "objective": "Given a change to skills, agents, commands, or rules guidance, retrieve the skill development contract and observed skill-run evidence before promoting an amendment or new skill-quality recommendation.",
+  "sources": [
+    {
+      "kind": "repo_doc",
+      "path": "docs/SKILL-DEVELOPMENT-GUIDE.md",
+      "purpose": "Public skill quality contract for frontmatter, focused scope, examples, testing, and submission evidence"
+    },
+    {
+      "kind": "repo_source",
+      "path": "scripts/ci/validate-skills.js",
+      "purpose": "Curated skill structure and frontmatter validation gate"
+    },
+    {
+      "kind": "repo_source",
+      "path": "scripts/lib/skill-improvement/",
+      "purpose": "Observation, health, amendment, and evaluation helpers for evidence-backed skill evolution"
+    },
+    {
+      "kind": "repo_test",
+      "command": "node tests/lib/skill-improvement.test.js",
+      "purpose": "Regression coverage for observation-backed skill amendment and evaluation scaffolds"
+    },
+    {
+      "kind": "repo_test",
+      "command": "node scripts/ci/validate-skills.js",
+      "purpose": "Skill structure validation before catalog or package changes merge"
+    }
+  ],
+  "retrieval_questions": [
+    "Which skill, agent, command, or rule surface changed?",
+    "Does the change preserve focused scope, clear activation text, and working examples?",
+    "Which validation command proves frontmatter, catalog, example, or behavior quality?",
+    "Does observed failure or user feedback justify the amendment?",
+    "Does the candidate avoid private context, secrets, personal paths, and publication actions?"
+  ],
+  "forbidden_actions": [
+    "promoting a skill rewrite without examples, validation, or observed failure evidence",
+    "adding broad multi-domain skills that duplicate existing focused skills",
+    "shipping code examples that are uncompiled, untested, or disconnected from the skill guidance",
+    "copying private operator context, secrets, tokens, or personal paths into skills",
+    "changing package, plugin, catalog, or publication state from this evaluator run",
+    "claiming a skill-quality improvement without a reference set or regression command"
+  ],
+  "acceptance_gates": [
+    "changed skill or guidance surface is named",
+    "source evidence includes the skill development guide or current skill examples",
+    "observed failure, user feedback, or reference-set gap is recorded",
+    "validation command is named",
+    "example or regression evidence is attached",
+    "at least one vague no-evidence rewrite is rejected"
+  ]
+}
--- a/examples/evaluator-rag-prototype/skill-quality-evidence/trace.json
+++ b/examples/evaluator-rag-prototype/skill-quality-evidence/trace.json
@@ -0,0 +1,46 @@
+{
+  "schema_version": "ecc.evaluator-rag.trace.v1",
+  "scenario_id": "skill-quality-evidence",
+  "run_id": "2026-05-12-skill-quality-evidence-prototype",
+  "read_only": true,
+  "events": [
+    {
+      "phase": "observation",
+      "summary": "A skill or guidance PR proposes updated instructions. The evaluator records the changed surface and stays read-only; it does not edit skills, package manifests, catalogs, or publication state.",
+      "evidence": [
+        "docs/SKILL-DEVELOPMENT-GUIDE.md",
+        "scripts/ci/validate-skills.js"
+      ]
+    },
+    {
+      "phase": "retrieval",
+      "summary": "Retrieved the skill quality contract, existing focused skill examples, observation-backed amendment helpers, and validation commands for skill structure and regression evidence.",
+      "evidence": [
+        "node scripts/ci/validate-skills.js",
+        "node tests/lib/skill-improvement.test.js",
+        "node tests/lib/skill-evolution.test.js",
+        "npm run catalog:check"
+      ]
+    },
+    {
+      "phase": "proposal",
+      "summary": "Generated two candidate playbooks: an evidence-backed skill amendment and a broad rewrite with no examples or validation.",
+      "candidate_ids": [
+        "evidence-backed-skill-amendment",
+        "vague-skill-rewrite"
+      ]
+    },
+    {
+      "phase": "verification",
+      "summary": "Accepted the evidence-backed amendment because it names observed failure evidence, examples, and validation commands. Rejected the vague rewrite because it lacks a reference set and testable examples.",
+      "evidence": [
+        "examples/evaluator-rag-prototype/skill-quality-evidence/verifier-result.json"
+      ]
+    },
+    {
+      "phase": "promotion",
+      "summary": "Promoted only the read-only skill-quality evidence playbook. Future skill edits must move through maintainer PRs with source attribution, validation, and rollback notes.",
+      "promoted_candidate_id": "evidence-backed-skill-amendment"
+    }
+  ]
+}
--- a/examples/evaluator-rag-prototype/skill-quality-evidence/verifier-result.json
+++ b/examples/evaluator-rag-prototype/skill-quality-evidence/verifier-result.json
@@ -0,0 +1,35 @@
+{
+  "schema_version": "ecc.evaluator-rag.verifier.v1",
+  "scenario_id": "skill-quality-evidence",
+  "run_id": "2026-05-12-skill-quality-evidence-prototype",
+  "read_only": true,
+  "candidates": [
+    {
+      "candidate_id": "evidence-backed-skill-amendment",
+      "decision": "accepted",
+      "score": 0.91,
+      "reasons": [
+        "retrieves the skill development guide and existing focused skill examples",
+        "records observed failure, user feedback, or reference-set gap before proposing an amendment",
+        "names validation commands for skill structure, examples, catalog consistency, and regression behavior",
+        "keeps package, plugin, catalog, and publication actions out of the evaluator run",
+        "includes rollback guidance for reverting the future maintainer PR"
+      ],
+      "rollback": "Revert the future skill-amendment PR and restore the prior SKILL.md content; this read-only playbook changes no installed user skill or publication surface."
+    },
+    {
+      "candidate_id": "vague-skill-rewrite",
+      "decision": "rejected",
+      "score": 0.14,
+      "reasons": [
+        "does not name observed failure evidence or user feedback",
+        "rewrites broad skill guidance without focused scope",
+        "does not include working examples or a reference set",
+        "does not name a regression command",
+        "risks changing catalog or publication state from evaluator output"
+      ],
+      "rollback": "Do not promote this rewrite; restart from observed skill-run evidence, example validation, and a focused maintainer PR."
+    }
+  ],
+  "promoted_candidate_id": "evidence-backed-skill-amendment"
+}