Merge branch 'main' into garrytan/team-supabase-store

Brings in 55 commits from main (v0.12.x–v0.13.5.0): Factory Droid compat,
prompt injection defense, user sovereignty, security audit, design binary,
skill namespacing, modular resolvers, Chrome sidebar, and more.

Conflict resolution:
- .agents/ SKILL.md files: deleted (main moved them to .factory/)
- 8 .tmpl templates: accepted main's versions (new features: CDP mode, design
  tools, global retro, parallelization, distribution checks, plan audits)
- scripts/gen-skill-docs.ts: accepted main's modular resolver refactor
- test/helpers/session-runner.ts: accepted main's version, then layered back
  CostEntry tracking from the team branch
- Generated SKILL.md files: regenerated via bun run gen:skill-docs
- Updated tests to match main's gstack-slug output (2 lines, no PROJECTS_DIR)
  and review log mechanism (gstack-review-log, not $BRANCH.jsonl); the slug
  assertions are sketched below
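
A minimal sketch of the updated slug assertions; output and the surrounding
names are hypothetical stand-ins, not the repo's actual test code:

// Hypothetical: assumes output holds the captured stdout of gstack-slug.
const lines = output.trim().split('\n');
expect(lines).toHaveLength(2);                // main now prints exactly 2 lines
expect(output).not.toContain('PROJECTS_DIR'); // the PROJECTS_DIR line was dropped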

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Garry Tan
2026-03-29 15:12:12 -07:00
267 changed files with 60292 additions and 12207 deletions


@@ -74,7 +74,7 @@ function describeIfSelected(name: string, testNames: string[], fn: () => void) {
/** Skip an individual test if not selected (for multi-test describe blocks). */
function testIfSelected(testName: string, fn: () => Promise<void>, timeout: number) {
const shouldRun = selectedTests === null || selectedTests.includes(testName);
-(shouldRun ? test : test.skip)(testName, fn, timeout);
+(shouldRun ? test.concurrent : test.skip)(testName, fn, timeout);
}
describeIfSelected('LLM-as-judge quality evals', [
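
The switch from test to test.concurrent lets selected evals run in parallel,
consistent with the parallelization work this merge pulls in from main. The
hunk shows only how selectedTests is consumed; a minimal sketch of how it
might be populated (an assumption, since the real source is outside this
hunk) would read:

// Assumed plumbing, not shown in this diff: null means "run everything",
// while a comma-separated env var (hypothetical name EVAL_TESTS) picks a subset.
const selectedTests: string[] | null =
  process.env.EVAL_TESTS ? process.env.EVAL_TESTS.split(',').map((s) => s.trim()) : null;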
@@ -91,11 +91,14 @@ describeIfSelected('LLM-as-judge quality evals', [
const { result: scores, meta } = await judge('command reference table', section);
console.log('Command reference scores:', JSON.stringify(scores, null, 2), meta.cached ? '(cached)' : '');
+// Completeness threshold is 3 (not 4) — the command reference table is
+// intentionally terse (quick-reference format). The judge consistently scores
+// completeness=3 because detailed argument docs live in per-command sections.
evalCollector?.addTest({
name: 'command reference table',
suite: 'LLM-as-judge quality evals',
tier: 'llm-judge',
-passed: scores.clarity >= 4 && scores.completeness >= 4 && scores.actionability >= 4,
+passed: scores.clarity >= 4 && scores.completeness >= 3 && scores.actionability >= 4,
duration_ms: Date.now() - t0,
cost_usd: judgeCost(meta),
judge_scores: { clarity: scores.clarity, completeness: scores.completeness, actionability: scores.actionability },
@@ -104,7 +107,7 @@ describeIfSelected('LLM-as-judge quality evals', [
});
expect(scores.clarity).toBeGreaterThanOrEqual(4);
-expect(scores.completeness).toBeGreaterThanOrEqual(4);
+expect(scores.completeness).toBeGreaterThanOrEqual(3);
expect(scores.actionability).toBeGreaterThanOrEqual(4);
}, 30_000);
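
For context on the cost bookkeeping above: judge() returns scores plus meta,
the log line checks meta.cached, and judgeCost(meta) feeds the cost_usd field.
A hedged sketch of the shapes those call sites imply; the interface, field
names, and prices here are assumptions, not this repo's actual code:

// Sketch only: the real judge/judgeCost live elsewhere in this repo.
interface JudgeMeta { cached: boolean; inputTokens: number; outputTokens: number }

function judgeCost(meta: JudgeMeta): number {
  if (meta.cached) return 0; // a cache hit re-reads a stored judgment, so no new spend
  const IN_USD_PER_MTOK = 3;   // illustrative input price, not the project's
  const OUT_USD_PER_MTOK = 15; // illustrative output price, not the project's
  return (meta.inputTokens * IN_USD_PER_MTOK + meta.outputTokens * OUT_USD_PER_MTOK) / 1_000_000;
}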
@@ -790,6 +793,69 @@ describeIfSelected('Other skill evals', [
}, 30_000);
});
// Voice directive eval — tests that the voice section produces the right tone
describeIfSelected('Voice directive eval', ['voice directive tone'], () => {
testIfSelected('voice directive tone', async () => {
const t0 = Date.now();
// Read a tier 2+ skill to get the full voice directive in context
const content = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8');
const voiceStart = content.indexOf('## Voice');
if (voiceStart === -1) {
throw new Error('Voice section not found in review/SKILL.md. Was preamble.ts regenerated?');
}
const voiceEnd = content.indexOf('\n## ', voiceStart + 1);
const voiceSection = content.slice(voiceStart, voiceEnd > 0 ? voiceEnd : voiceStart + 3000);
const result = await callJudge<{
directness: number;
concreteness: number;
avoids_corporate: number;
avoids_ai_vocabulary: number;
connects_user_outcomes: number;
reasoning: string;
}>(`You are evaluating a voice directive for an AI coding assistant framework called GStack.
Score each dimension 1-5 where 5 is excellent:
1. directness: Does it instruct the agent to be direct, lead with the point, take positions?
2. concreteness: Does it instruct the agent to name specific files, commands, line numbers, real numbers?
3. avoids_corporate: Does it explicitly ban corporate/formal/academic tone and provide alternatives?
4. avoids_ai_vocabulary: Does it ban AI-tell words and phrases with specific lists?
5. connects_user_outcomes: Does it instruct the agent to connect technical work to real user experience?
Return JSON only:
{"directness": N, "concreteness": N, "avoids_corporate": N, "avoids_ai_vocabulary": N, "connects_user_outcomes": N, "reasoning": "..."}
THE VOICE DIRECTIVE:
${voiceSection}`);
console.log('Voice directive scores:', JSON.stringify(result, null, 2));
evalCollector?.addTest({
name: 'voice directive tone',
suite: 'Voice directive eval',
tier: 'llm-judge',
passed: result.directness >= 4 && result.concreteness >= 4 && result.avoids_corporate >= 4
&& result.avoids_ai_vocabulary >= 4 && result.connects_user_outcomes >= 4,
duration_ms: Date.now() - t0,
cost_usd: 0.02,
judge_scores: {
directness: result.directness,
concreteness: result.concreteness,
avoids_corporate: result.avoids_corporate,
avoids_ai_vocabulary: result.avoids_ai_vocabulary,
connects_user_outcomes: result.connects_user_outcomes,
},
judge_reasoning: result.reasoning,
});
expect(result.directness).toBeGreaterThanOrEqual(4);
expect(result.concreteness).toBeGreaterThanOrEqual(4);
expect(result.avoids_corporate).toBeGreaterThanOrEqual(4);
expect(result.avoids_ai_vocabulary).toBeGreaterThanOrEqual(4);
expect(result.connects_user_outcomes).toBeGreaterThanOrEqual(4);
}, 30_000);
});
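
The call site above pins down the shape of callJudge<T>: one prompt string in,
a Promise of the caller's score type out, parsed from a JSON-only reply. A
minimal sketch consistent with that signature; the endpoint and the
fence-stripping step are assumptions, and the repo's implementation may differ:

// Assumed implementation sketch, inferred from the call site above.
async function callJudge<T>(prompt: string): Promise<T> {
  const res = await fetch('https://api.example.com/v1/judge', { // placeholder endpoint
    method: 'POST',
    headers: { 'content-type': 'application/json' },
    body: JSON.stringify({ prompt }),
  });
  const raw = await res.text();
  // Models sometimes wrap JSON in markdown fences; strip them before parsing.
  const cleaned = raw.replace(/^```(?:json)?\s*/, '').replace(/\s*```$/, '').trim();
  return JSON.parse(cleaned) as T;
}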
// Module-level afterAll — finalize eval collector after all tests complete
afterAll(async () => {
if (evalCollector) {