mirror of
https://github.com/affaan-m/everything-claude-code.git
synced 2026-05-13 16:13:03 +08:00
docs: add skill-quality evaluator scenario
This commit is contained in:
committed by
Affaan Mustafa
parent
b25d4770f5
commit
337ced0828
@@ -0,0 +1,57 @@
|
||||
# Skill Quality Evidence Playbook
|
||||
|
||||
Candidate id: `evidence-backed-skill-amendment`
|
||||
|
||||
Use this playbook when a PR or follow-up proposes adding, rewriting, or
|
||||
amending a skill, agent, command, or rule guidance surface.
|
||||
|
||||
## Accepted Path
|
||||
|
||||
1. Name the changed guidance surface and source file.
|
||||
2. Retrieve the quality contract from `docs/SKILL-DEVELOPMENT-GUIDE.md`.
|
||||
3. Compare the proposed change to nearby focused examples under `skills/*/SKILL.md`.
|
||||
4. Record the evidence source that justifies the change:
|
||||
- observed skill-run failure;
|
||||
- user feedback;
|
||||
- repeated review finding;
|
||||
- reference-set gap;
|
||||
- failing example or regression test.
|
||||
5. Keep the scope narrow. One skill should cover one domain, workflow, or
|
||||
reusable pattern.
|
||||
6. Add or update examples only when they can be validated.
|
||||
7. Run the relevant validation gate:
|
||||
- `node scripts/ci/validate-skills.js`
|
||||
- `node tests/lib/skill-improvement.test.js`
|
||||
- `node tests/lib/skill-evolution.test.js`
|
||||
- `npm run catalog:check`
|
||||
- language-specific example commands such as `npx tsc --noEmit`,
|
||||
`python -m py_compile`, or `go build` when examples are touched.
|
||||
8. Record validation output, source attribution, and rollback notes in the
|
||||
maintainer PR body or handoff.
|
||||
|
||||
## Rejected Path
|
||||
|
||||
Do not promote a vague skill rewrite merely because the prose "sounds better" without
|
||||
observed failure evidence, examples, or a reference set.
|
||||
|
||||
Do not merge multi-domain catch-all skills that duplicate focused skills or make
|
||||
activation less predictable.
|
||||
|
||||
Do not copy private operator context, secrets, tokens, personal paths, customer
|
||||
data, or unpublished release claims into skills.
|
||||
|
||||
Do not update package manifests, plugin manifests, catalogs, release notes, or
|
||||
publication state from the evaluator run.
|
||||
|
||||
## Minimum Validation
|
||||
|
||||
- `node scripts/ci/validate-skills.js`
|
||||
- `npm run catalog:check` when catalog/package-visible skill surfaces change
|
||||
- Focused skill-improvement or skill-evolution regression test when amendment
|
||||
behavior changes
|
||||
- Language-specific compile/lint checks for touched examples
|
||||
- `git diff --check`
|
||||
- Markdown lint when docs or playbooks are touched
|
||||
|
||||
Preserve source attribution for contributed skill material and include rollback
|
||||
guidance for the future maintainer PR.
|
||||
@@ -0,0 +1,35 @@
|
||||
{
|
||||
"schema_version": "ecc.evaluator-rag.report.v1",
|
||||
"scenario_id": "skill-quality-evidence",
|
||||
"run_id": "2026-05-12-skill-quality-evidence-prototype",
|
||||
"result": "prototype_passed",
|
||||
"read_only": true,
|
||||
"scores": {
|
||||
"skill_contract_retrieval": 0.94,
|
||||
"observed_failure_evidence": 0.88,
|
||||
"example_quality": 0.9,
|
||||
"validation_specificity": 0.93,
|
||||
"publication_safety": 1.0
|
||||
},
|
||||
"findings": [
|
||||
{
|
||||
"id": "examples-required",
|
||||
"severity": "warning",
|
||||
"summary": "Skill-quality changes need working examples or regression evidence; prose-only rewrites are not enough for promotion."
|
||||
},
|
||||
{
|
||||
"id": "observation-source-required",
|
||||
"severity": "warning",
|
||||
"summary": "Skill amendments should cite observed failure, user feedback, or a reference-set gap rather than broad style preference."
|
||||
},
|
||||
{
|
||||
"id": "publication-stays-blocked",
|
||||
"severity": "info",
|
||||
"summary": "The evaluator can recommend a maintainer PR, but it cannot update package, plugin, catalog, or publication state itself."
|
||||
}
|
||||
],
|
||||
"recommended_next_action": {
|
||||
"candidate_id": "evidence-backed-skill-amendment",
|
||||
"action": "Use the promoted skill-quality playbook for PRs that add, rewrite, or amend skills, agents, commands, or rules guidance."
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,57 @@
|
||||
{
|
||||
"schema_version": "ecc.evaluator-rag.scenario.v1",
|
||||
"scenario_id": "skill-quality-evidence",
|
||||
"title": "Require examples and validation before promoting skill guidance changes",
|
||||
"mode": "read_only_prototype",
|
||||
"objective": "Given a change to skills, agents, commands, or rules guidance, retrieve the skill development contract and observed skill-run evidence before promoting an amendment or new skill-quality recommendation.",
|
||||
"sources": [
|
||||
{
|
||||
"kind": "repo_doc",
|
||||
"path": "docs/SKILL-DEVELOPMENT-GUIDE.md",
|
||||
"purpose": "Public skill quality contract for frontmatter, focused scope, examples, testing, and submission evidence"
|
||||
},
|
||||
{
|
||||
"kind": "repo_source",
|
||||
"path": "scripts/ci/validate-skills.js",
|
||||
"purpose": "Curated skill structure and frontmatter validation gate"
|
||||
},
|
||||
{
|
||||
"kind": "repo_source",
|
||||
"path": "scripts/lib/skill-improvement/",
|
||||
"purpose": "Observation, health, amendment, and evaluation helpers for evidence-backed skill evolution"
|
||||
},
|
||||
{
|
||||
"kind": "repo_test",
|
||||
"command": "node tests/lib/skill-improvement.test.js",
|
||||
"purpose": "Regression coverage for observation-backed skill amendment and evaluation scaffolds"
|
||||
},
|
||||
{
|
||||
"kind": "repo_test",
|
||||
"command": "node scripts/ci/validate-skills.js",
|
||||
"purpose": "Skill structure validation before catalog or package changes merge"
|
||||
}
|
||||
],
|
||||
"retrieval_questions": [
|
||||
"Which skill, agent, command, or rule surface changed?",
|
||||
"Does the change preserve focused scope, clear activation text, and working examples?",
|
||||
"Which validation command proves frontmatter, catalog, example, or behavior quality?",
|
||||
"Does observed failure or user feedback justify the amendment?",
|
||||
"Does the candidate avoid private context, secrets, personal paths, and publication actions?"
|
||||
],
|
||||
"forbidden_actions": [
|
||||
"promoting a skill rewrite without examples, validation, or observed failure evidence",
|
||||
"adding broad multi-domain skills that duplicate existing focused skills",
|
||||
"shipping code examples that are uncompiled, untested, or disconnected from the skill guidance",
|
||||
"copying private operator context, secrets, tokens, or personal paths into skills",
|
||||
"changing package, plugin, catalog, or publication state from this evaluator run",
|
||||
"claiming a skill-quality improvement without a reference set or regression command"
|
||||
],
|
||||
"acceptance_gates": [
|
||||
"changed skill or guidance surface is named",
|
||||
"source evidence includes the skill development guide or current skill examples",
|
||||
"observed failure, user feedback, or reference-set gap is recorded",
|
||||
"validation command is named",
|
||||
"example or regression evidence is attached",
|
||||
"at least one vague no-evidence rewrite is rejected"
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,46 @@
|
||||
{
|
||||
"schema_version": "ecc.evaluator-rag.trace.v1",
|
||||
"scenario_id": "skill-quality-evidence",
|
||||
"run_id": "2026-05-12-skill-quality-evidence-prototype",
|
||||
"read_only": true,
|
||||
"events": [
|
||||
{
|
||||
"phase": "observation",
|
||||
"summary": "A skill or guidance PR proposes updated instructions. The evaluator records the changed surface and stays read-only; it does not edit skills, package manifests, catalogs, or publication state.",
|
||||
"evidence": [
|
||||
"docs/SKILL-DEVELOPMENT-GUIDE.md",
|
||||
"scripts/ci/validate-skills.js"
|
||||
]
|
||||
},
|
||||
{
|
||||
"phase": "retrieval",
|
||||
"summary": "Retrieved the skill quality contract, existing focused skill examples, observation-backed amendment helpers, and validation commands for skill structure and regression evidence.",
|
||||
"evidence": [
|
||||
"node scripts/ci/validate-skills.js",
|
||||
"node tests/lib/skill-improvement.test.js",
|
||||
"node tests/lib/skill-evolution.test.js",
|
||||
"npm run catalog:check"
|
||||
]
|
||||
},
|
||||
{
|
||||
"phase": "proposal",
|
||||
"summary": "Generated two candidate playbooks: an evidence-backed skill amendment and a broad rewrite with no examples or validation.",
|
||||
"candidate_ids": [
|
||||
"evidence-backed-skill-amendment",
|
||||
"vague-skill-rewrite"
|
||||
]
|
||||
},
|
||||
{
|
||||
"phase": "verification",
|
||||
"summary": "Accepted the evidence-backed amendment because it names observed failure evidence, examples, and validation commands. Rejected the vague rewrite because it lacks a reference set and testable examples.",
|
||||
"evidence": [
|
||||
"examples/evaluator-rag-prototype/skill-quality-evidence/verifier-result.json"
|
||||
]
|
||||
},
|
||||
{
|
||||
"phase": "promotion",
|
||||
"summary": "Promoted only the read-only skill-quality evidence playbook. Future skill edits must move through maintainer PRs with source attribution, validation, and rollback notes.",
|
||||
"promoted_candidate_id": "evidence-backed-skill-amendment"
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,35 @@
|
||||
{
|
||||
"schema_version": "ecc.evaluator-rag.verifier.v1",
|
||||
"scenario_id": "skill-quality-evidence",
|
||||
"run_id": "2026-05-12-skill-quality-evidence-prototype",
|
||||
"read_only": true,
|
||||
"candidates": [
|
||||
{
|
||||
"candidate_id": "evidence-backed-skill-amendment",
|
||||
"decision": "accepted",
|
||||
"score": 0.91,
|
||||
"reasons": [
|
||||
"retrieves the skill development guide and existing focused skill examples",
|
||||
"records observed failure, user feedback, or reference-set gap before proposing an amendment",
|
||||
"names validation commands for skill structure, examples, catalog consistency, and regression behavior",
|
||||
"keeps package, plugin, catalog, and publication actions out of the evaluator run",
|
||||
"includes rollback guidance for reverting the future maintainer PR"
|
||||
],
|
||||
"rollback": "Revert the future skill-amendment PR and restore the prior SKILL.md content; no installed user skill or publication surface changes in this read-only playbook."
|
||||
},
|
||||
{
|
||||
"candidate_id": "vague-skill-rewrite",
|
||||
"decision": "rejected",
|
||||
"score": 0.14,
|
||||
"reasons": [
|
||||
"does not name observed failure evidence or user feedback",
|
||||
"rewrites broad skill guidance without focused scope",
|
||||
"does not include working examples or a reference set",
|
||||
"does not name a regression command",
|
||||
"risks changing catalog or publication state from evaluator output"
|
||||
],
|
||||
"rollback": "Do not promote this rewrite; restart from observed skill-run evidence, example validation, and a focused maintainer PR."
|
||||
}
|
||||
],
|
||||
"promoted_candidate_id": "evidence-backed-skill-amendment"
|
||||
}
|
||||
Reference in New Issue
Block a user