mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-21 12:18:24 +08:00
chore: upgrade eval judge to Sonnet 4.6, update changelog
Switch LLM-as-judge evals from Haiku to Sonnet 4.6 for more stable, nuanced scoring. Add changelog entry for all eval improvements.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
16
CHANGELOG.md
16
CHANGELOG.md
@@ -1,5 +1,21 @@
|
|||||||
# Changelog
|
# Changelog
|
||||||
|
|
||||||
|
## Unreleased — 2026-03-14
|
||||||
|
|
||||||
|
### Changed
|
||||||
|
- Enriched 14 command descriptions with specific arg formats, valid values, error behavior, and return types
|
||||||
|
- Fixed `header` usage from `<name> <value>` to `<name>:<value>` (matching actual implementation)
|
||||||
|
- Added `cookie` usage syntax: `cookie <name>=<value>`
|
||||||
|
- Enriched 4 snapshot flag descriptions with defaults, output paths, and behavior details
|
||||||
|
- Snapshot flags section now shows long flag names (`-i / --interactive`) alongside short
|
||||||
|
- Added ref numbering explanation and output format example to snapshot docs
|
||||||
|
- Replaced hand-maintained server.ts help text with auto-generated `generateHelpText()` from COMMAND_DESCRIPTIONS
|
||||||
|
- Upgraded LLM eval judge from Haiku to Sonnet 4.6 for more stable scoring
|
||||||
|
|
||||||
|
### Added
|
||||||
|
- Usage string consistency test: cross-checks `Usage:` patterns in implementation against COMMAND_DESCRIPTIONS
|
||||||
|
- Pipe guard test: ensures no command description contains `|` (would break markdown tables)
|
||||||
|
|
||||||
## 0.3.3 — 2026-03-13
|
## 0.3.3 — 2026-03-13
|
||||||
|
|
||||||
### Added
|
### Added
|
||||||
|
|||||||
@@ -7,7 +7,7 @@
|
|||||||
* Requires: ANTHROPIC_API_KEY env var
|
* Requires: ANTHROPIC_API_KEY env var
|
||||||
* Run: ANTHROPIC_API_KEY=sk-... bun test test/skill-llm-eval.test.ts
|
* Run: ANTHROPIC_API_KEY=sk-... bun test test/skill-llm-eval.test.ts
|
||||||
*
|
*
|
||||||
* Cost: ~$0.01-0.03 per run (haiku)
|
* Cost: ~$0.05-0.15 per run (sonnet)
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import { describe, test, expect } from 'bun:test';
|
import { describe, test, expect } from 'bun:test';
|
||||||
@@ -30,7 +30,7 @@ async function judge(section: string, prompt: string): Promise<JudgeScore> {
|
|||||||
const client = new Anthropic();
|
const client = new Anthropic();
|
||||||
|
|
||||||
const response = await client.messages.create({
|
const response = await client.messages.create({
|
||||||
model: 'claude-haiku-4-5-20251001',
|
model: 'claude-sonnet-4-6',
|
||||||
max_tokens: 1024,
|
max_tokens: 1024,
|
||||||
messages: [{
|
messages: [{
|
||||||
role: 'user',
|
role: 'user',
|
||||||
@@ -158,7 +158,7 @@ describeEval('LLM-as-judge quality evals', () => {
|
|||||||
|
|
||||||
const client = new Anthropic();
|
const client = new Anthropic();
|
||||||
const response = await client.messages.create({
|
const response = await client.messages.create({
|
||||||
model: 'claude-haiku-4-5-20251001',
|
model: 'claude-sonnet-4-6',
|
||||||
max_tokens: 1024,
|
max_tokens: 1024,
|
||||||
messages: [{
|
messages: [{
|
||||||
role: 'user',
|
role: 'user',
|
||||||
|
|||||||
Reference in New Issue
Block a user