docs: add skill-quality evaluator scenario

This commit is contained in:
Affaan Mustafa
2026-05-12 18:24:41 -04:00
committed by Affaan Mustafa
parent b25d4770f5
commit 337ced0828
8 changed files with 291 additions and 8 deletions

View File

@@ -0,0 +1,57 @@
# Skill Quality Evidence Playbook
Candidate id: `evidence-backed-skill-amendment`
Use this playbook when a PR or follow-up proposes adding, rewriting, or
amending a guidance surface: a skill, agent, command, or rule.
## Accepted Path
1. Name the changed guidance surface and source file.
2. Retrieve the quality contract from `docs/SKILL-DEVELOPMENT-GUIDE.md`.
3. Compare the proposed change to nearby focused examples under `skills/*/SKILL.md`.
4. Record the evidence source that justifies the change:
- observed skill-run failure;
- user feedback;
- repeated review finding;
- reference-set gap;
- failing example or regression test.
5. Keep the scope narrow. One skill should cover one domain, workflow, or
reusable pattern.
6. Add or update examples only when they can be validated.
7. Run the relevant validation gates:
- `node scripts/ci/validate-skills.js`
- `node tests/lib/skill-improvement.test.js`
- `node tests/lib/skill-evolution.test.js`
- `npm run catalog:check`
- language-specific example commands such as `npx tsc --noEmit`,
`python -m py_compile`, or `go build` when examples are touched.
8. Record validation output, source attribution, and rollback notes in the
maintainer PR body or handoff.
## Rejected Path
Do not promote a vague skill rewrite merely because the prose "sounds better"
when it lacks observed failure evidence, examples, or a reference set.
Do not merge multi-domain catch-all skills that duplicate focused skills or make
activation less predictable.
Do not copy private operator context, secrets, tokens, personal paths, customer
data, or unpublished release claims into skills.
Do not update package manifests, plugin manifests, catalogs, release notes, or
publication state from the evaluator run.
## Minimum Validation
- `node scripts/ci/validate-skills.js`
- `npm run catalog:check` when catalog/package-visible skill surfaces change
- Focused skill-improvement or skill-evolution regression test when amendment
behavior changes
- Language-specific compile/lint checks for touched examples
- `git diff --check`
- Markdown lint when docs or playbooks are touched
Preserve source attribution for contributed skill material and include rollback
guidance for the future maintainer PR.

View File

@@ -0,0 +1,35 @@
{
"schema_version": "ecc.evaluator-rag.report.v1",
"scenario_id": "skill-quality-evidence",
"run_id": "2026-05-12-skill-quality-evidence-prototype",
"result": "prototype_passed",
"read_only": true,
"scores": {
"skill_contract_retrieval": 0.94,
"observed_failure_evidence": 0.88,
"example_quality": 0.9,
"validation_specificity": 0.93,
"publication_safety": 1
},
"findings": [
{
"id": "examples-required",
"severity": "warning",
"summary": "Skill-quality changes need working examples or regression evidence; prose-only rewrites are not enough for promotion."
},
{
"id": "observation-source-required",
"severity": "warning",
"summary": "Skill amendments should cite observed failure, user feedback, or a reference-set gap rather than broad style preference."
},
{
"id": "publication-stays-blocked",
"severity": "info",
"summary": "The evaluator can recommend a maintainer PR, but it cannot update package, plugin, catalog, or publication state itself."
}
],
"recommended_next_action": {
"candidate_id": "evidence-backed-skill-amendment",
"action": "Use the promoted skill-quality playbook for PRs that add, rewrite, or amend skills, agents, commands, or rules guidance."
}
}

View File

@@ -0,0 +1,57 @@
{
"schema_version": "ecc.evaluator-rag.scenario.v1",
"scenario_id": "skill-quality-evidence",
"title": "Require examples and validation before promoting skill guidance changes",
"mode": "read_only_prototype",
"objective": "Given a change to skills, agents, commands, or rules guidance, retrieve the skill development contract and observed skill-run evidence before promoting an amendment or new skill-quality recommendation.",
"sources": [
{
"kind": "repo_doc",
"path": "docs/SKILL-DEVELOPMENT-GUIDE.md",
"purpose": "Public skill quality contract for frontmatter, focused scope, examples, testing, and submission evidence"
},
{
"kind": "repo_source",
"path": "scripts/ci/validate-skills.js",
"purpose": "Curated skill structure and frontmatter validation gate"
},
{
"kind": "repo_source",
"path": "scripts/lib/skill-improvement/",
"purpose": "Observation, health, amendment, and evaluation helpers for evidence-backed skill evolution"
},
{
"kind": "repo_test",
"command": "node tests/lib/skill-improvement.test.js",
"purpose": "Regression coverage for observation-backed skill amendment and evaluation scaffolds"
},
{
"kind": "repo_test",
"command": "node scripts/ci/validate-skills.js",
"purpose": "Skill structure validation before catalog or package changes merge"
}
],
"retrieval_questions": [
"Which skill, agent, command, or rule surface changed?",
"Does the change preserve focused scope, clear activation text, and working examples?",
"Which validation command proves frontmatter, catalog, example, or behavior quality?",
"Does observed failure or user feedback justify the amendment?",
"Does the candidate avoid private context, secrets, personal paths, and publication actions?"
],
"forbidden_actions": [
"promoting a skill rewrite without examples, validation, or observed failure evidence",
"adding broad multi-domain skills that duplicate existing focused skills",
"shipping code examples that are uncompiled, untested, or disconnected from the skill guidance",
"copying private operator context, secrets, tokens, or personal paths into skills",
"changing package, plugin, catalog, or publication state from this evaluator run",
"claiming a skill-quality improvement without a reference set or regression command"
],
"acceptance_gates": [
"changed skill or guidance surface is named",
"source evidence includes the skill development guide or current skill examples",
"observed failure, user feedback, or reference-set gap is recorded",
"validation command is named",
"example or regression evidence is attached",
"at least one vague no-evidence rewrite is rejected"
]
}

View File

@@ -0,0 +1,46 @@
{
"schema_version": "ecc.evaluator-rag.trace.v1",
"scenario_id": "skill-quality-evidence",
"run_id": "2026-05-12-skill-quality-evidence-prototype",
"read_only": true,
"events": [
{
"phase": "observation",
"summary": "A skill or guidance PR proposes updated instructions. The evaluator records the changed surface and stays read-only; it does not edit skills, package manifests, catalogs, or publication state.",
"evidence": [
"docs/SKILL-DEVELOPMENT-GUIDE.md",
"scripts/ci/validate-skills.js"
]
},
{
"phase": "retrieval",
"summary": "Retrieved the skill quality contract, existing focused skill examples, observation-backed amendment helpers, and validation commands for skill structure and regression evidence.",
"evidence": [
"node scripts/ci/validate-skills.js",
"node tests/lib/skill-improvement.test.js",
"node tests/lib/skill-evolution.test.js",
"npm run catalog:check"
]
},
{
"phase": "proposal",
"summary": "Generated two candidate playbooks: an evidence-backed skill amendment and a broad rewrite with no examples or validation.",
"candidate_ids": [
"evidence-backed-skill-amendment",
"vague-skill-rewrite"
]
},
{
"phase": "verification",
"summary": "Accepted the evidence-backed amendment because it names observed failure evidence, examples, and validation commands. Rejected the vague rewrite because it lacks a reference set and testable examples.",
"evidence": [
"examples/evaluator-rag-prototype/skill-quality-evidence/verifier-result.json"
]
},
{
"phase": "promotion",
"summary": "Promoted only the read-only skill-quality evidence playbook. Future skill edits must move through maintainer PRs with source attribution, validation, and rollback notes.",
"promoted_candidate_id": "evidence-backed-skill-amendment"
}
]
}

View File

@@ -0,0 +1,35 @@
{
"schema_version": "ecc.evaluator-rag.verifier.v1",
"scenario_id": "skill-quality-evidence",
"run_id": "2026-05-12-skill-quality-evidence-prototype",
"read_only": true,
"candidates": [
{
"candidate_id": "evidence-backed-skill-amendment",
"decision": "accepted",
"score": 0.91,
"reasons": [
"retrieves the skill development guide and existing focused skill examples",
"records observed failure, user feedback, or reference-set gap before proposing an amendment",
"names validation commands for skill structure, examples, catalog consistency, and regression behavior",
"keeps package, plugin, catalog, and publication actions out of the evaluator run",
"includes rollback guidance for reverting the future maintainer PR"
],
"rollback": "Revert the future skill-amendment PR and restore the prior SKILL.md content; no installed user skill or publication surface changes in this read-only playbook."
},
{
"candidate_id": "vague-skill-rewrite",
"decision": "rejected",
"score": 0.14,
"reasons": [
"does not name observed failure evidence or user feedback",
"rewrites broad skill guidance without focused scope",
"does not include working examples or a reference set",
"does not name a regression command",
"risks changing catalog or publication state from evaluator output"
],
"rollback": "Do not promote this rewrite; restart from observed skill-run evidence, example validation, and a focused maintainer PR."
}
],
"promoted_candidate_id": "evidence-backed-skill-amendment"
}