haiany
/
codegraph
cermin dari https://github.com/colbymchenry/codegraph.git


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081
							import { describe, it, expect } from 'vitest';
import {
  splitIdentifierSegments,
  extractProseCandidates,
  normalizeProseWord,
  segmentLookupVariants,
} from '../src/search/identifier-segments';

// Specs for splitIdentifierSegments. Every case pairs a symbol or file name
// with the exact list of lowercase segments the splitter must return.
describe('splitIdentifierSegments — symbol names → prose words', () => {
  it('splits camelCase / PascalCase at humps', () => {
    const cases = [
      { identifier: 'OrderStateMachine', segments: ['order', 'state', 'machine'] },
      { identifier: 'userId', segments: ['user', 'id'] },
    ];

    for (const { identifier, segments } of cases) {
      expect(splitIdentifierSegments(identifier)).toEqual(segments);
    }
  });

  it('handles acronym runs — HTML stays one segment', () => {
    // The acronym may sit in the middle of the name or lead it.
    const cases = [
      { identifier: 'parseHTMLDocument', segments: ['parse', 'html', 'document'] },
      { identifier: 'HTMLParser', segments: ['html', 'parser'] },
    ];

    for (const { identifier, segments } of cases) {
      expect(splitIdentifierSegments(identifier)).toEqual(segments);
    }
  });

  it('keeps digits glued to their word', () => {
    // Trailing digits belong to the preceding word, including after an acronym.
    const cases = [
      { identifier: 'base64Encode', segments: ['base64', 'encode'] },
      { identifier: 'parseHTML5Doc', segments: ['parse', 'html5', 'doc'] },
    ];

    for (const { identifier, segments } of cases) {
      expect(splitIdentifierSegments(identifier)).toEqual(segments);
    }
  });

  it('splits snake_case, kebab-case, and dotted file names', () => {
    // Underscores, dots, and hyphens all act as separators; output is lowercased.
    const cases = [
      { identifier: 'snake_case_name', segments: ['snake', 'case', 'name'] },
      { identifier: 'MAX_RETRY_COUNT', segments: ['max', 'retry', 'count'] },
      { identifier: 'checkout.service.ts', segments: ['checkout', 'service', 'ts'] },
      { identifier: 'state-machine', segments: ['state', 'machine'] },
    ];

    for (const { identifier, segments } of cases) {
      expect(splitIdentifierSegments(identifier)).toEqual(segments);
    }
  });

  it('drops sub-minimum and digit-only fragments, dedupes', () => {
    const cases = [
      // A single letter is too short to keep.
      { identifier: 'x', segments: [] },
      // Pure digits carry no word.
      { identifier: '42', segments: [] },
      // Repeated segments are reported once.
      { identifier: 'getData_getData', segments: ['get', 'data'] },
    ];

    for (const { identifier, segments } of cases) {
      expect(splitIdentifierSegments(identifier)).toEqual(segments);
    }
  });
});

// Specs for extractProseCandidates (plus one normalizeProseWord check):
// free-form prompt text goes in, the list of lookup words comes out.
describe('extractProseCandidates — prompt prose → lookup words', () => {
  it('keeps content words, drops short function words, in any Latin language', () => {
    const question = 'comment marche la state machine des commandes ?';
    const kept = ['comment', 'marche', 'state', 'machine', 'commandes'];

    expect(extractProseCandidates(question)).toEqual(kept);
  });

  it('strips diacritics so loanwords meet ASCII identifier segments', () => {
    const accented = extractProseCandidates('la résolution des références');
    expect(accented).toEqual(['resolution', 'references']);

    // Single-word normalization also lowercases and removes the umlaut.
    const folded = normalizeProseWord('Übersicht');
    expect(folded).toBe('ubersicht');
  });

  it("splits on apostrophes — l'architecture keeps the noun", () => {
    const question = "explique l'architecture du module de stock";
    const kept = ['explique', 'architecture', 'module', 'stock'];

    expect(extractProseCandidates(question)).toEqual(kept);
  });

  it('caps candidates and skips unsegmented-script sentence runs', () => {
    // 25 distinct words (distinctworda … distinctwordy); only 16 are expected back.
    const many = 'abcdefghijklmnopqrstuvwxy'
      .split('')
      .map((letter) => `distinctword${letter}`)
      .join(' ');
    expect(extractProseCandidates(many)).toHaveLength(16);

    // A CJK sentence without spaces forms a single very long run; it exceeds
    // the length ceiling and is skipped.
    const longCjkSentence = '請解釋一下這個訂單狀態機的整體運作流程與架構設計方式';
    expect(extractProseCandidates(longCjkSentence)).toEqual([]);

    // A short CJK run is returned as-is: there is no script filter here, and
    // the graph verification tier is what rejects it later (identifiers are
    // almost never CJK).
    const shortCjkRun = '修复这个拼写错误';
    expect(extractProseCandidates(shortCjkRun)).toEqual(['修复这个拼写错误']);
  });

  it('drops digit-only and sub-4-char words', () => {
    // Every token here is either shorter than four characters or all digits.
    const candidates = extractProseCandidates('fix the bug in v2 at 1234');

    expect(candidates).toEqual([]);
  });
});

// Specs for segmentLookupVariants: the variants returned for a word must
// include its singular form, without shortening already-minimal words.
describe('segmentLookupVariants — light plural folding', () => {
  it('folds trailing s/es so plurals hit singular segments', () => {
    const cases = [
      { plural: 'services', singular: 'service' },
      { plural: 'machines', singular: 'machine' },
    ];

    for (const { plural, singular } of cases) {
      expect(segmentLookupVariants(plural)).toContain(singular);
    }
  });

  it('never strips a word below the minimum', () => {
    // 'bus' ends in "s" but must come back unchanged as the only variant.
    const variants = segmentLookupVariants('bus');

    expect(variants).toEqual(['bus']);
  });
});