mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-17 01:31:26 +08:00
Merge branch 'main' into garrytan/team-supabase-store
Brings in 55 commits from main (v0.12.x–v0.13.5.0): Factory Droid compat, prompt injection defense, user sovereignty, security audit, design binary, skill namespacing, modular resolvers, Chrome sidebar, and more.

Conflict resolution:
- .agents/ SKILL.md files: deleted (main moved to .factory/)
- 8 .tmpl templates: accepted main (new features: CDP mode, design tools, global retro, parallelization, distribution checks, plan audits)
- scripts/gen-skill-docs.ts: accepted main's modular resolver refactor
- test/helpers/session-runner.ts: accepted main + layered back CostEntry tracking from team branch
- Generated SKILL.md files: regenerated via bun run gen:skill-docs
- Updated tests to match main's gstack-slug output (2 lines, no PROJECTS_DIR) and review log mechanism (gstack-review-log, not $BRANCH.jsonl)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -74,7 +74,7 @@ function describeIfSelected(name: string, testNames: string[], fn: () => void) {
|
||||
/** Skip an individual test if not selected (for multi-test describe blocks). */
|
||||
function testIfSelected(testName: string, fn: () => Promise<void>, timeout: number) {
|
||||
const shouldRun = selectedTests === null || selectedTests.includes(testName);
|
||||
(shouldRun ? test : test.skip)(testName, fn, timeout);
|
||||
(shouldRun ? test.concurrent : test.skip)(testName, fn, timeout);
|
||||
}
|
||||
|
||||
describeIfSelected('LLM-as-judge quality evals', [
|
||||
@@ -91,11 +91,14 @@ describeIfSelected('LLM-as-judge quality evals', [
|
||||
const { result: scores, meta } = await judge('command reference table', section);
|
||||
console.log('Command reference scores:', JSON.stringify(scores, null, 2), meta.cached ? '(cached)' : '');
|
||||
|
||||
// Completeness threshold is 3 (not 4) — the command reference table is
|
||||
// intentionally terse (quick-reference format). The judge consistently scores
|
||||
// completeness=3 because detailed argument docs live in per-command sections.
|
||||
evalCollector?.addTest({
|
||||
name: 'command reference table',
|
||||
suite: 'LLM-as-judge quality evals',
|
||||
tier: 'llm-judge',
|
||||
passed: scores.clarity >= 4 && scores.completeness >= 4 && scores.actionability >= 4,
|
||||
passed: scores.clarity >= 4 && scores.completeness >= 3 && scores.actionability >= 4,
|
||||
duration_ms: Date.now() - t0,
|
||||
cost_usd: judgeCost(meta),
|
||||
judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
|
||||
@@ -104,7 +107,7 @@ describeIfSelected('LLM-as-judge quality evals', [
|
||||
});
|
||||
|
||||
expect(scores.clarity).toBeGreaterThanOrEqual(4);
|
||||
expect(scores.completeness).toBeGreaterThanOrEqual(4);
|
||||
expect(scores.completeness).toBeGreaterThanOrEqual(3);
|
||||
expect(scores.actionability).toBeGreaterThanOrEqual(4);
|
||||
}, 30_000);
|
||||
|
||||
@@ -790,6 +793,69 @@ describeIfSelected('Other skill evals', [
|
||||
}, 30_000);
|
||||
});
|
||||
|
||||
// Voice directive eval — tests that the voice section produces the right tone
|
||||
describeIfSelected('Voice directive eval', ['voice directive tone'], () => {
|
||||
testIfSelected('voice directive tone', async () => {
|
||||
const t0 = Date.now();
|
||||
// Read a tier 2+ skill to get the full voice directive in context
|
||||
const content = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8');
|
||||
const voiceStart = content.indexOf('## Voice');
|
||||
if (voiceStart === -1) {
|
||||
throw new Error('Voice section not found in review/SKILL.md. Was preamble.ts regenerated?');
|
||||
}
|
||||
const voiceEnd = content.indexOf('\n## ', voiceStart + 1);
|
||||
const voiceSection = content.slice(voiceStart, voiceEnd > 0 ? voiceEnd : voiceStart + 3000);
|
||||
|
||||
const result = await callJudge<{
|
||||
directness: number;
|
||||
concreteness: number;
|
||||
avoids_corporate: number;
|
||||
avoids_ai_vocabulary: number;
|
||||
connects_user_outcomes: number;
|
||||
reasoning: string;
|
||||
}>(`You are evaluating a voice directive for an AI coding assistant framework called GStack.
|
||||
Score each dimension 1-5 where 5 is excellent:
|
||||
|
||||
1. directness: Does it instruct the agent to be direct, lead with the point, take positions?
|
||||
2. concreteness: Does it instruct the agent to name specific files, commands, line numbers, real numbers?
|
||||
3. avoids_corporate: Does it explicitly ban corporate/formal/academic tone and provide alternatives?
|
||||
4. avoids_ai_vocabulary: Does it ban AI-tell words and phrases with specific lists?
|
||||
5. connects_user_outcomes: Does it instruct the agent to connect technical work to real user experience?
|
||||
|
||||
Return JSON only:
|
||||
{"directness": N, "concreteness": N, "avoids_corporate": N, "avoids_ai_vocabulary": N, "connects_user_outcomes": N, "reasoning": "..."}
|
||||
|
||||
THE VOICE DIRECTIVE:
|
||||
${voiceSection}`);
|
||||
|
||||
console.log('Voice directive scores:', JSON.stringify(result, null, 2));
|
||||
|
||||
evalCollector?.addTest({
|
||||
name: 'voice directive tone',
|
||||
suite: 'Voice directive eval',
|
||||
tier: 'llm-judge',
|
||||
passed: result.directness >= 4 && result.concreteness >= 4 && result.avoids_corporate >= 4
|
||||
&& result.avoids_ai_vocabulary >= 4 && result.connects_user_outcomes >= 4,
|
||||
duration_ms: Date.now() - t0,
|
||||
cost_usd: 0.02,
|
||||
judge_scores: {
|
||||
directness: result.directness,
|
||||
concreteness: result.concreteness,
|
||||
avoids_corporate: result.avoids_corporate,
|
||||
avoids_ai_vocabulary: result.avoids_ai_vocabulary,
|
||||
connects_user_outcomes: result.connects_user_outcomes,
|
||||
},
|
||||
judge_reasoning: result.reasoning,
|
||||
});
|
||||
|
||||
expect(result.directness).toBeGreaterThanOrEqual(4);
|
||||
expect(result.concreteness).toBeGreaterThanOrEqual(4);
|
||||
expect(result.avoids_corporate).toBeGreaterThanOrEqual(4);
|
||||
expect(result.avoids_ai_vocabulary).toBeGreaterThanOrEqual(4);
|
||||
expect(result.connects_user_outcomes).toBeGreaterThanOrEqual(4);
|
||||
}, 30_000);
|
||||
});
|
||||
|
||||
// Module-level afterAll — finalize eval collector after all tests complete
|
||||
afterAll(async () => {
|
||||
if (evalCollector) {
|
||||
|
||||
Reference in New Issue
Block a user