mirror of
https://github.com/garrytan/gstack.git
synced 2026-05-21 20:28:24 +08:00
feat: add eval format validation, tier selection, cost tracking
- lib/eval-format.ts: StandardEvalResult interfaces, validateEvalResult(), normalizeFromLegacy/normalizeToLegacy round-trip converters - lib/eval-tier.ts: EvalTier type, resolveTier/resolveJudgeTier from env, tierToModel mapping, TIER_ALIASES (haiku→fast, sonnet→standard, opus→full) - lib/eval-cost.ts: MODEL_PRICING (last verified 2025-05-01), computeCosts(), formatCostDashboard(), aggregateCosts(), fallback for unknown models - 42 tests across 3 test files Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
94
test/lib-eval-tier.test.ts
Normal file
94
test/lib-eval-tier.test.ts
Normal file
@@ -0,0 +1,94 @@
|
||||
/**
|
||||
* Tests for lib/eval-tier.ts — model tier selection.
|
||||
*/
|
||||
|
||||
import { describe, test, expect, beforeEach, afterEach } from 'bun:test';
|
||||
import { resolveTier, resolveJudgeTier, tierToModel, TIER_ALIASES } from '../lib/eval-tier';
|
||||
|
||||
describe('lib/eval-tier', () => {
  // Values of the two tier env vars as they were when the suite was defined.
  // Several tests below overwrite or delete them, so they are put back after
  // every test.
  const envSnapshot = {
    EVAL_TIER: process.env.EVAL_TIER,
    EVAL_JUDGE_TIER: process.env.EVAL_JUDGE_TIER,
  };

  afterEach(() => {
    // Restore in declaration order: EVAL_TIER first, then EVAL_JUDGE_TIER.
    for (const [name, saved] of Object.entries(envSnapshot)) {
      if (saved !== undefined) {
        process.env[name] = saved;
      } else {
        // The variable was not set originally, so remove it rather than
        // assigning to it.
        delete process.env[name];
      }
    }
  });

  describe('resolveTier', () => {
    test('defaults to standard when unset', () => {
      delete process.env.EVAL_TIER;

      const resolved = resolveTier();

      expect(resolved).toBe('standard');
    });

    test('resolves tier names directly', () => {
      process.env.EVAL_TIER = 'fast';
      const fromFast = resolveTier();
      expect(fromFast).toBe('fast');

      process.env.EVAL_TIER = 'full';
      const fromFull = resolveTier();
      expect(fromFull).toBe('full');
    });

    test('resolves model aliases', () => {
      process.env.EVAL_TIER = 'haiku';
      const fromHaiku = resolveTier();
      expect(fromHaiku).toBe('fast');

      process.env.EVAL_TIER = 'sonnet';
      const fromSonnet = resolveTier();
      expect(fromSonnet).toBe('standard');

      process.env.EVAL_TIER = 'opus';
      const fromOpus = resolveTier();
      expect(fromOpus).toBe('full');
    });

    test('is case-insensitive', () => {
      // Upper-case alias.
      process.env.EVAL_TIER = 'HAIKU';
      const fromUpperAlias = resolveTier();
      expect(fromUpperAlias).toBe('fast');

      // Mixed-case tier name.
      process.env.EVAL_TIER = 'Full';
      const fromMixedName = resolveTier();
      expect(fromMixedName).toBe('full');
    });

    test('defaults to standard for unknown value', () => {
      process.env.EVAL_TIER = 'gpt-4';

      const resolved = resolveTier();

      expect(resolved).toBe('standard');
    });
  });

  describe('resolveJudgeTier', () => {
    test('falls back to EVAL_TIER when EVAL_JUDGE_TIER unset', () => {
      delete process.env.EVAL_JUDGE_TIER;
      process.env.EVAL_TIER = 'fast';

      const judgeTier = resolveJudgeTier();

      expect(judgeTier).toBe('fast');
    });

    test('uses EVAL_JUDGE_TIER when set', () => {
      process.env.EVAL_TIER = 'fast';
      process.env.EVAL_JUDGE_TIER = 'full';

      const judgeTier = resolveJudgeTier();

      expect(judgeTier).toBe('full');
    });

    test('resolves aliases for judge tier', () => {
      process.env.EVAL_JUDGE_TIER = 'opus';

      const judgeTier = resolveJudgeTier();

      expect(judgeTier).toBe('full');
    });
  });

  describe('tierToModel', () => {
    test('maps fast to haiku', () => {
      const model = tierToModel('fast');
      expect(model).toBe('claude-haiku-4-5');
    });

    test('maps standard to sonnet', () => {
      const model = tierToModel('standard');
      expect(model).toBe('claude-sonnet-4-6');
    });

    test('maps full to opus', () => {
      const model = tierToModel('full');
      expect(model).toBe('claude-opus-4-6');
    });
  });

  describe('TIER_ALIASES', () => {
    test('contains expected aliases', () => {
      const { haiku, sonnet, opus } = TIER_ALIASES;

      expect(haiku).toBe('fast');
      expect(sonnet).toBe('standard');
      expect(opus).toBe('full');
    });
  });
});
|
||||
Reference in New Issue
Block a user