From 337ced0828587b31a985ab430c4ed92dcd915c5c Mon Sep 17 00:00:00 2001 From: Affaan Mustafa Date: Tue, 12 May 2026 18:24:41 -0400 Subject: [PATCH] docs: add skill-quality evaluator scenario --- docs/ECC-2.0-GA-ROADMAP.md | 8 +-- docs/architecture/evaluator-rag-prototype.md | 12 +++- .../candidate-playbook.md | 57 +++++++++++++++++++ .../skill-quality-evidence/report.json | 35 ++++++++++++ .../skill-quality-evidence/scenario.json | 57 +++++++++++++++++++ .../skill-quality-evidence/trace.json | 46 +++++++++++++++ .../verifier-result.json | 35 ++++++++++++ tests/docs/evaluator-rag-prototype.test.js | 49 +++++++++++++++- 8 files changed, 291 insertions(+), 8 deletions(-) create mode 100644 examples/evaluator-rag-prototype/skill-quality-evidence/candidate-playbook.md create mode 100644 examples/evaluator-rag-prototype/skill-quality-evidence/report.json create mode 100644 examples/evaluator-rag-prototype/skill-quality-evidence/scenario.json create mode 100644 examples/evaluator-rag-prototype/skill-quality-evidence/trace.json create mode 100644 examples/evaluator-rag-prototype/skill-quality-evidence/verifier-result.json diff --git a/docs/ECC-2.0-GA-ROADMAP.md b/docs/ECC-2.0-GA-ROADMAP.md index cb39bb3c..97954c34 100644 --- a/docs/ECC-2.0-GA-ROADMAP.md +++ b/docs/ECC-2.0-GA-ROADMAP.md @@ -201,7 +201,7 @@ is not complete unless the evidence column exists and has been freshly verified. | AgentShield enterprise iteration | Policy gates, SARIF, packs, provenance, corpus, HTML reports, exception lifecycle audit | PRs #53, #55-#62 landed with test evidence | Needs PDF/export decision or next enterprise signal | | ECC Tools next-level app | Billing audit, PR checks, deep analyzer, sync backlog | PRs #26-#39 landed with test evidence | Needs capacity-backed Linear rollout / broader evaluator corpus | | GitGuardian/Dependabot/CodeRabbit-style checks | Non-blocking taxonomy and deterministic follow-up checks | ECC-Tools risk taxonomy check plus follow-up signals landed, including Skill Quality, Deep Analyzer Evidence, Analyzer Corpus Evidence, RAG/Evaluator Evidence, and PR Review/Salvage Evidence | Partially complete | -| Harness-agnostic learning system | Audit, adapter matrix, observability, traces, promotion loop | Audit/adapters/observability gates plus `docs/architecture/evaluator-rag-prototype.md` and `examples/evaluator-rag-prototype/` define read-only stale-salvage, billing-readiness, CI-failure-diagnosis, harness-config-quality, and AgentShield policy-exception scenarios with trace, report, playbook, and verifier result artifacts | Needs skill-quality and deep-analyzer corpus | +| Harness-agnostic learning system | Audit, adapter matrix, observability, traces, promotion loop | Audit/adapters/observability gates plus `docs/architecture/evaluator-rag-prototype.md` and `examples/evaluator-rag-prototype/` define read-only stale-salvage, billing-readiness, CI-failure-diagnosis, harness-config-quality, AgentShield policy-exception, and skill-quality evidence scenarios with trace, report, playbook, and verifier result artifacts | Needs deep-analyzer corpus | | Linear roadmap is detailed | Linear project status plus repo mirror | Repo mirror exists; issue creation was retried on 2026-05-12 and remains blocked by the workspace free issue limit | Needs recurring status updates after each merge batch | | Flow separation and progress tracking | Flow lanes with owner artifacts and update cadence | This roadmap defines lanes below | Active | | Realtime Linear sync | Project updates while issue limit is blocked; issues later | ECC-Tools #39 implements opt-in Linear API sync for deferred follow-up backlog items | Needs workspace capacity/config rollout | @@ -220,7 +220,7 @@ back to the repo evidence and merge commits. | Queue hygiene and salvage | GitHub PR/issue state, salvage ledger | Append ledger entries for any future stale closures | Every cleanup batch | | Release and publication | rc.1 release docs, publication readiness doc | Naming matrix and plugin submission/contact checklist | Before any tag | | Harness OS core | Audit, adapter matrix, observability docs, `ecc2/` | HUD/session-control acceptance spec | Weekly until GA | -| Evaluation and RAG | Reference-set validation, harness audit, traces | Read-only evaluator/RAG prototype plus stale-salvage, billing-readiness, CI-failure-diagnosis, harness-config-quality, and AgentShield policy-exception fixtures | Expand to skill-quality or deep-analyzer evidence scenario | +| Evaluation and RAG | Reference-set validation, harness audit, traces | Read-only evaluator/RAG prototype plus stale-salvage, billing-readiness, CI-failure-diagnosis, harness-config-quality, AgentShield policy-exception, and skill-quality evidence fixtures | Expand to deep-analyzer evidence scenario | | AgentShield enterprise | AgentShield PR evidence and roadmap notes | PDF-export decision or next enterprise signal | After value decision | | ECC Tools app | ECC-Tools PR evidence, billing audit, risk taxonomy | Capacity-backed Linear rollout or broader evaluator/RAG corpus slice | Next implementation batch | | Linear progress | Linear project status updates and this mirror | Status update with queue/evidence/missing gates | Every significant merge batch | @@ -420,5 +420,5 @@ Acceptance: 2. Enable/configure the merged Linear backlog sync path after workspace issue capacity clears or the Linear workspace is upgraded. 3. Expand the evaluator/RAG corpus beyond stale-salvage, billing, CI, - harness-config, and AgentShield policy-exception prototypes toward - skill-quality and deep-analyzer evidence scenarios. + harness-config, AgentShield policy-exception, and skill-quality evidence + prototypes toward deep-analyzer evidence scenarios. diff --git a/docs/architecture/evaluator-rag-prototype.md b/docs/architecture/evaluator-rag-prototype.md index d55013d7..48b684cc 100644 --- a/docs/architecture/evaluator-rag-prototype.md +++ b/docs/architecture/evaluator-rag-prototype.md @@ -16,7 +16,10 @@ agent proposes fixes for red checks. A harness-config quality scenario keeps MCP, plugin, hook, command, agent, and adapter recommendations tied to the adapter matrix before they mutate setup guidance. An AgentShield policy exception scenario gates security exceptions on SARIF/report evidence, owner -fields, expiry state, and remediation-versus-exception decisions. +fields, expiry state, and remediation-versus-exception decisions. A +skill-quality evidence scenario requires observed failure or feedback evidence, +working examples, reference-set gaps, and validation commands before a skill +amendment can be promoted. ## Reference Pressure @@ -110,6 +113,9 @@ Current corpus: - `agentshield-policy-exception`: requires AgentShield SARIF or report evidence, policy-pack source, owner/ticket/scope/expiry fields, and expired exception enforcement before a policy exception can be promoted. +- `skill-quality-evidence`: requires focused skill scope, observed failure or + user-feedback evidence, examples/reference-set coverage, validation commands, + and publication safety before a skill amendment can be promoted. ## ECC Tools Mapping @@ -143,5 +149,5 @@ A candidate can be promoted only when: The next evaluator/RAG corpus should add: -- skill-quality or deep-analyzer evidence scenarios with maintained reference - sets and rejected low-evidence candidates. +- a deep-analyzer evidence scenario with maintained reference sets and rejected + low-evidence candidates. diff --git a/examples/evaluator-rag-prototype/skill-quality-evidence/candidate-playbook.md b/examples/evaluator-rag-prototype/skill-quality-evidence/candidate-playbook.md new file mode 100644 index 00000000..0668e95a --- /dev/null +++ b/examples/evaluator-rag-prototype/skill-quality-evidence/candidate-playbook.md @@ -0,0 +1,57 @@ +# Skill Quality Evidence Playbook + +Candidate id: `evidence-backed-skill-amendment` + +Use this playbook when a PR or follow-up proposes adding, rewriting, or +amending a skill, agent, command, or rule guidance surface. + +## Accepted Path + +1. Name the changed guidance surface and source file. +2. Retrieve the quality contract from `docs/SKILL-DEVELOPMENT-GUIDE.md`. +3. Compare the proposed change to nearby focused examples under `skills/*/SKILL.md`. +4. Record the evidence source that justifies the change: + - observed skill-run failure; + - user feedback; + - repeated review finding; + - reference-set gap; + - failing example or regression test. +5. Keep the scope narrow. One skill should cover one domain, workflow, or + reusable pattern. +6. Add or update examples only when they can be validated. +7. Run the relevant validation gate: + - `node scripts/ci/validate-skills.js` + - `node tests/lib/skill-improvement.test.js` + - `node tests/lib/skill-evolution.test.js` + - `npm run catalog:check` + - language-specific example commands such as `npx tsc --noEmit`, + `python -m py_compile`, or `go build` when examples are touched. +8. Record validation output, source attribution, and rollback notes in the + maintainer PR body or handoff. + +## Rejected Path + +Do not promote a vague skill rewrite because the prose "sounds better" without +observed failure evidence, examples, or a reference set. + +Do not merge multi-domain catch-all skills that duplicate focused skills or make +activation less predictable. + +Do not copy private operator context, secrets, tokens, personal paths, customer +data, or unpublished release claims into skills. + +Do not update package manifests, plugin manifests, catalogs, release notes, or +publication state from the evaluator run. + +## Minimum Validation + +- `node scripts/ci/validate-skills.js` +- `npm run catalog:check` when catalog/package-visible skill surfaces change +- Focused skill-improvement or skill-evolution regression test when amendment + behavior changes +- Language-specific compile/lint checks for touched examples +- `git diff --check` +- Markdown lint when docs or playbooks are touched + +Preserve source attribution for contributed skill material and include rollback +guidance for the future maintainer PR. diff --git a/examples/evaluator-rag-prototype/skill-quality-evidence/report.json b/examples/evaluator-rag-prototype/skill-quality-evidence/report.json new file mode 100644 index 00000000..692195f2 --- /dev/null +++ b/examples/evaluator-rag-prototype/skill-quality-evidence/report.json @@ -0,0 +1,35 @@ +{ + "schema_version": "ecc.evaluator-rag.report.v1", + "scenario_id": "skill-quality-evidence", + "run_id": "2026-05-12-skill-quality-evidence-prototype", + "result": "prototype_passed", + "read_only": true, + "scores": { + "skill_contract_retrieval": 0.94, + "observed_failure_evidence": 0.88, + "example_quality": 0.9, + "validation_specificity": 0.93, + "publication_safety": 1 + }, + "findings": [ + { + "id": "examples-required", + "severity": "warning", + "summary": "Skill-quality changes need working examples or regression evidence; prose-only rewrites are not enough for promotion." + }, + { + "id": "observation-source-required", + "severity": "warning", + "summary": "Skill amendments should cite observed failure, user feedback, or a reference-set gap rather than broad style preference." + }, + { + "id": "publication-stays-blocked", + "severity": "info", + "summary": "The evaluator can recommend a maintainer PR, but it cannot update package, plugin, catalog, or publication state itself." + } + ], + "recommended_next_action": { + "candidate_id": "evidence-backed-skill-amendment", + "action": "Use the promoted skill-quality playbook for PRs that add, rewrite, or amend skills, agents, commands, or rules guidance." + } +} diff --git a/examples/evaluator-rag-prototype/skill-quality-evidence/scenario.json b/examples/evaluator-rag-prototype/skill-quality-evidence/scenario.json new file mode 100644 index 00000000..54a1c14d --- /dev/null +++ b/examples/evaluator-rag-prototype/skill-quality-evidence/scenario.json @@ -0,0 +1,57 @@ +{ + "schema_version": "ecc.evaluator-rag.scenario.v1", + "scenario_id": "skill-quality-evidence", + "title": "Require examples and validation before promoting skill guidance changes", + "mode": "read_only_prototype", + "objective": "Given a change to skills, agents, commands, or rules guidance, retrieve the skill development contract and observed skill-run evidence before promoting an amendment or new skill-quality recommendation.", + "sources": [ + { + "kind": "repo_doc", + "path": "docs/SKILL-DEVELOPMENT-GUIDE.md", + "purpose": "Public skill quality contract for frontmatter, focused scope, examples, testing, and submission evidence" + }, + { + "kind": "repo_source", + "path": "scripts/ci/validate-skills.js", + "purpose": "Curated skill structure and frontmatter validation gate" + }, + { + "kind": "repo_source", + "path": "scripts/lib/skill-improvement/", + "purpose": "Observation, health, amendment, and evaluation helpers for evidence-backed skill evolution" + }, + { + "kind": "repo_test", + "command": "node tests/lib/skill-improvement.test.js", + "purpose": "Regression coverage for observation-backed skill amendment and evaluation scaffolds" + }, + { + "kind": "repo_test", + "command": "node scripts/ci/validate-skills.js", + "purpose": "Skill structure validation before catalog or package changes merge" + } + ], + "retrieval_questions": [ + "Which skill, agent, command, or rule surface changed?", + "Does the change preserve focused scope, clear activation text, and working examples?", + "Which validation command proves frontmatter, catalog, example, or behavior quality?", + "Does observed failure or user feedback justify the amendment?", + "Does the candidate avoid private context, secrets, personal paths, and publication actions?" + ], + "forbidden_actions": [ + "promoting a skill rewrite without examples, validation, or observed failure evidence", + "adding broad multi-domain skills that duplicate existing focused skills", + "shipping code examples that are uncompiled, untested, or disconnected from the skill guidance", + "copying private operator context, secrets, tokens, or personal paths into skills", + "changing package, plugin, catalog, or publication state from this evaluator run", + "claiming a skill-quality improvement without a reference set or regression command" + ], + "acceptance_gates": [ + "changed skill or guidance surface is named", + "source evidence includes the skill development guide or current skill examples", + "observed failure, user feedback, or reference-set gap is recorded", + "validation command is named", + "example or regression evidence is attached", + "at least one vague no-evidence rewrite is rejected" + ] +} diff --git a/examples/evaluator-rag-prototype/skill-quality-evidence/trace.json b/examples/evaluator-rag-prototype/skill-quality-evidence/trace.json new file mode 100644 index 00000000..d906c096 --- /dev/null +++ b/examples/evaluator-rag-prototype/skill-quality-evidence/trace.json @@ -0,0 +1,46 @@ +{ + "schema_version": "ecc.evaluator-rag.trace.v1", + "scenario_id": "skill-quality-evidence", + "run_id": "2026-05-12-skill-quality-evidence-prototype", + "read_only": true, + "events": [ + { + "phase": "observation", + "summary": "A skill or guidance PR proposes updated instructions. The evaluator records the changed surface and stays read-only; it does not edit skills, package manifests, catalogs, or publication state.", + "evidence": [ + "docs/SKILL-DEVELOPMENT-GUIDE.md", + "scripts/ci/validate-skills.js" + ] + }, + { + "phase": "retrieval", + "summary": "Retrieved the skill quality contract, existing focused skill examples, observation-backed amendment helpers, and validation commands for skill structure and regression evidence.", + "evidence": [ + "node scripts/ci/validate-skills.js", + "node tests/lib/skill-improvement.test.js", + "node tests/lib/skill-evolution.test.js", + "npm run catalog:check" + ] + }, + { + "phase": "proposal", + "summary": "Generated two candidate playbooks: evidence-backed skill amendment, and broad rewrite with no examples or validation.", + "candidate_ids": [ + "evidence-backed-skill-amendment", + "vague-skill-rewrite" + ] + }, + { + "phase": "verification", + "summary": "Accepted the evidence-backed amendment because it names observed failure evidence, examples, and validation commands. Rejected the vague rewrite because it lacks a reference set and testable examples.", + "evidence": [ + "examples/evaluator-rag-prototype/skill-quality-evidence/verifier-result.json" + ] + }, + { + "phase": "promotion", + "summary": "Promoted only the read-only skill-quality evidence playbook. Future skill edits must move through maintainer PRs with source attribution, validation, and rollback notes.", + "promoted_candidate_id": "evidence-backed-skill-amendment" + } + ] +} diff --git a/examples/evaluator-rag-prototype/skill-quality-evidence/verifier-result.json b/examples/evaluator-rag-prototype/skill-quality-evidence/verifier-result.json new file mode 100644 index 00000000..8837c60b --- /dev/null +++ b/examples/evaluator-rag-prototype/skill-quality-evidence/verifier-result.json @@ -0,0 +1,35 @@ +{ + "schema_version": "ecc.evaluator-rag.verifier.v1", + "scenario_id": "skill-quality-evidence", + "run_id": "2026-05-12-skill-quality-evidence-prototype", + "read_only": true, + "candidates": [ + { + "candidate_id": "evidence-backed-skill-amendment", + "decision": "accepted", + "score": 0.91, + "reasons": [ + "retrieves the skill development guide and existing focused skill examples", + "records observed failure, user feedback, or reference-set gap before proposing an amendment", + "names validation commands for skill structure, examples, catalog consistency, and regression behavior", + "keeps package, plugin, catalog, and publication actions out of the evaluator run", + "includes rollback guidance for reverting the future maintainer PR" + ], + "rollback": "Revert the future skill-amendment PR and restore the prior SKILL.md content; no installed user skill or publication surface changes in this read-only playbook." + }, + { + "candidate_id": "vague-skill-rewrite", + "decision": "rejected", + "score": 0.14, + "reasons": [ + "does not name observed failure evidence or user feedback", + "rewrites broad skill guidance without focused scope", + "does not include working examples or a reference set", + "does not name a regression command", + "risks changing catalog or publication state from evaluator output" + ], + "rollback": "Do not promote this rewrite; restart from observed skill-run evidence, example validation, and a focused maintainer PR." + } + ], + "promoted_candidate_id": "evidence-backed-skill-amendment" +} diff --git a/tests/docs/evaluator-rag-prototype.test.js b/tests/docs/evaluator-rag-prototype.test.js index 6ecc6c6a..4655990c 100644 --- a/tests/docs/evaluator-rag-prototype.test.js +++ b/tests/docs/evaluator-rag-prototype.test.js @@ -135,7 +135,7 @@ test('roadmap points to the evaluator RAG prototype and keeps broader corpus wor assert.ok(roadmap.includes('docs/architecture/evaluator-rag-prototype.md')); assert.ok(roadmap.includes('examples/evaluator-rag-prototype/')); - assert.ok(roadmap.includes('Needs skill-quality and deep-analyzer corpus')); + assert.ok(roadmap.includes('Needs deep-analyzer corpus')); }); test('billing readiness scenario rejects launch copy overclaims', () => { @@ -314,6 +314,53 @@ test('AgentShield policy exception scenario rejects blanket suppression', () => assert.ok(playbook.includes('npx ecc-agentshield scan --format json')); }); +test('skill quality evidence scenario rejects vague rewrites', () => { + const scenario = readFixtureJson('skill-quality-evidence/scenario.json'); + const trace = readFixtureJson('skill-quality-evidence/trace.json'); + const report = readFixtureJson('skill-quality-evidence/report.json'); + const verifier = readFixtureJson('skill-quality-evidence/verifier-result.json'); + const playbook = read('examples/evaluator-rag-prototype/skill-quality-evidence/candidate-playbook.md'); + + assert.strictEqual(scenario.scenario_id, 'skill-quality-evidence'); + assert.strictEqual(trace.scenario_id, scenario.scenario_id); + assert.strictEqual(report.scenario_id, scenario.scenario_id); + assert.strictEqual(verifier.scenario_id, scenario.scenario_id); + assert.strictEqual(trace.read_only, true); + assert.strictEqual(report.read_only, true); + assert.strictEqual(verifier.read_only, true); + + for (const blocked of [ + 'promoting a skill rewrite without examples, validation, or observed failure evidence', + 'adding broad multi-domain skills that duplicate existing focused skills', + 'copying private operator context, secrets, tokens, or personal paths into skills', + 'claiming a skill-quality improvement without a reference set or regression command' + ]) { + assert.ok(scenario.forbidden_actions.includes(blocked), `Missing skill-quality forbidden action: ${blocked}`); + } + + for (const required of [ + 'changed skill or guidance surface is named', + 'observed failure, user feedback, or reference-set gap is recorded', + 'validation command is named', + 'example or regression evidence is attached' + ]) { + assert.ok(scenario.acceptance_gates.includes(required), `Missing skill-quality acceptance gate: ${required}`); + } + + const accepted = verifier.candidates.find(candidate => candidate.candidate_id === 'evidence-backed-skill-amendment'); + const rejected = verifier.candidates.find(candidate => candidate.candidate_id === 'vague-skill-rewrite'); + + assert.ok(accepted, 'Missing accepted skill-quality candidate'); + assert.ok(rejected, 'Missing rejected vague rewrite candidate'); + assert.strictEqual(accepted.decision, 'accepted'); + assert.strictEqual(rejected.decision, 'rejected'); + assert.strictEqual(verifier.promoted_candidate_id, accepted.candidate_id); + assert.ok(rejected.reasons.join('\n').includes('does not include working examples')); + assert.ok(playbook.includes('docs/SKILL-DEVELOPMENT-GUIDE.md')); + assert.ok(playbook.includes('node scripts/ci/validate-skills.js')); + assert.ok(playbook.includes('observed skill-run failure')); +}); + if (failed > 0) { console.log(`\nFailed: ${failed}`); process.exit(1);