identifier-segments.test.ts 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081
  1. import { describe, it, expect } from 'vitest';
  2. import {
  3. splitIdentifierSegments,
  4. extractProseCandidates,
  5. normalizeProseWord,
  6. segmentLookupVariants,
  7. } from '../src/search/identifier-segments';
  // Suite for splitIdentifierSegments: each test feeds identifiers to the
  // splitter and compares the returned segment list against the expected
  // lowercase words.
  describe('splitIdentifierSegments — symbol names → prose words', () => {
    /**
     * Asserts, in order, that every `[identifier, expectedSegments]` pair
     * round-trips through splitIdentifierSegments with a deep-equal result.
     */
    const expectSegments = (
      cases: ReadonlyArray<readonly [string, readonly string[]]>,
    ): void => {
      for (const [identifier, expectedSegments] of cases) {
        expect(splitIdentifierSegments(identifier)).toEqual(expectedSegments);
      }
    };

    it('splits camelCase / PascalCase at humps', () => {
      expectSegments([
        ['OrderStateMachine', ['order', 'state', 'machine']],
        ['userId', ['user', 'id']],
      ]);
    });

    it('handles acronym runs — HTML stays one segment', () => {
      // The uppercase run "HTML" is expected as the single segment 'html',
      // whether it appears mid-identifier or at the start.
      expectSegments([
        ['parseHTMLDocument', ['parse', 'html', 'document']],
        ['HTMLParser', ['html', 'parser']],
      ]);
    });

    it('keeps digits glued to their word', () => {
      expectSegments([
        ['base64Encode', ['base64', 'encode']],
        ['parseHTML5Doc', ['parse', 'html5', 'doc']],
      ]);
    });

    it('splits snake_case, kebab-case, and dotted file names', () => {
      // Underscore, dot and hyphen separators are all covered here.
      expectSegments([
        ['snake_case_name', ['snake', 'case', 'name']],
        ['MAX_RETRY_COUNT', ['max', 'retry', 'count']],
        ['checkout.service.ts', ['checkout', 'service', 'ts']],
        ['state-machine', ['state', 'machine']],
      ]);
    });

    it('drops sub-minimum and digit-only fragments, dedupes', () => {
      expectSegments([
        ['x', []], // single character: nothing survives
        ['42', []], // digits only: nothing survives
        ['getData_getData', ['get', 'data']], // repeated segments reported once
      ]);
    });
  });
  // Suite for extractProseCandidates (plus one normalizeProseWord check):
  // each test passes a natural-language prompt and compares the returned
  // candidate word list against the expected one.
  describe('extractProseCandidates — prompt prose → lookup words', () => {
    it('keeps content words, drops short function words, in any Latin language', () => {
      const candidates = extractProseCandidates('comment marche la state machine des commandes ?');
      expect(candidates).toEqual(['comment', 'marche', 'state', 'machine', 'commandes']);
    });

    it('strips diacritics so loanwords meet ASCII identifier segments', () => {
      const candidates = extractProseCandidates('la résolution des références');
      expect(candidates).toEqual(['resolution', 'references']);

      const normalized = normalizeProseWord('Übersicht');
      expect(normalized).toBe('ubersicht');
    });

    it("splits on apostrophes — l'architecture keeps the noun", () => {
      const candidates = extractProseCandidates("explique l'architecture du module de stock");
      expect(candidates).toEqual(['explique', 'architecture', 'module', 'stock']);
    });

    it('caps candidates and skips unsegmented-script sentence runs', () => {
      // 25 distinct words, "distinctworda" … "distinctwordy", separated by spaces.
      const words: string[] = [];
      for (let offset = 0; offset < 25; offset += 1) {
        words.push(`distinctword${String.fromCharCode(97 + offset)}`);
      }
      const many = words.join(' ');
      expect(extractProseCandidates(many)).toHaveLength(16);

      // A CJK sentence written without spaces forms a single oversized run;
      // it exceeds the length ceiling and is skipped entirely.
      const longCjkRun = extractProseCandidates('請解釋一下這個訂單狀態機的整體運作流程與架構設計方式');
      expect(longCjkRun).toEqual([]);

      // A short CJK run is returned as a candidate because there is no script
      // filter; the graph verification tier is what rejects it (identifiers
      // are almost never CJK).
      const shortCjkRun = extractProseCandidates('修复这个拼写错误');
      expect(shortCjkRun).toEqual(['修复这个拼写错误']);
    });

    it('drops digit-only and sub-4-char words', () => {
      const candidates = extractProseCandidates('fix the bug in v2 at 1234');
      expect(candidates).toEqual([]);
    });
  });
  // Suite for segmentLookupVariants: checks which lookup variants are
  // returned for plural-looking words.
  describe('segmentLookupVariants — light plural folding', () => {
    it('folds trailing s/es so plurals hit singular segments', () => {
      // Each plural's variant list must include the paired singular form.
      const pluralToSingular: ReadonlyArray<readonly [string, string]> = [
        ['services', 'service'],
        ['machines', 'machine'],
      ];
      for (const [plural, singular] of pluralToSingular) {
        expect(segmentLookupVariants(plural)).toContain(singular);
      }
    });

    it('never strips a word below the minimum', () => {
      // 'bus' must come back as its own and only variant.
      const variants = segmentLookupVariants('bus');
      expect(variants).toEqual(['bus']);
    });
  });