1
0

segment-vocab.test.ts 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221
  1. import { describe, it, expect, beforeEach, afterEach } from 'vitest';
  2. import * as fs from 'node:fs';
  3. import * as path from 'node:path';
  4. import * as os from 'node:os';
  5. import { CodeGraph } from '../src';
  6. import { extractProseCandidates } from '../src/search/identifier-segments';
/**
 * The graph-derived gate behind the prompt hook's MEDIUM tier: symbol names
 * are segmented into the words a human uses for them in prose
 * (name_segment_vocab, populated on the node write path), and
 * CodeGraph.getSegmentMatches verifies prompt words against them with
 * co-occurrence / rarity rules. Precision comes from the repo's own naming
 * statistics — no keyword vocabulary involved.
 */
  15. describe('name-segment vocabulary + getSegmentMatches (graph-derived gate)', () => {
  16. let dir: string;
  17. let cg: CodeGraph;
  // Per-test fixture: build a throwaway repo under the OS temp dir, write the
  // three source files the assertions rely on, then index it into a CodeGraph.
  beforeEach(async () => {
    dir = fs.mkdtempSync(path.join(os.tmpdir(), 'segment-vocab-'));
    const srcDir = path.join(dir, 'src');
    fs.mkdirSync(srcDir, { recursive: true });

    // One class whose name carries the "state" and "machine" segments.
    const stateMachineSource = `export class OrderStateMachine {
transition(from: string, to: string): boolean { return from !== to; }
}
`;
    fs.writeFileSync(path.join(srcDir, 'state-machine.ts'), stateMachineSource);

    // Two "checkout" names (a cluster) plus loadConfig.
    const checkoutSource = `export class CheckoutService {
submitOrder(): void {}
}
export class CheckoutController {
handle(): void {}
}
export function loadConfig(): void {}
`;
    fs.writeFileSync(path.join(srcDir, 'checkout.ts'), checkoutSource);

    // Thirty distinct function names that all share the segment "data": a
    // ubiquitous segment that must NOT count as a single-word signal
    // (rarity ceiling).
    const noiseLines: string[] = [];
    for (let i = 0; i < 30; i += 1) {
      const letter = String.fromCharCode(65 + (i % 26));
      noiseLines.push(`export function dataLoader${letter}${i}(): number { return ${i}; }`);
    }
    const noiseSource = noiseLines.join('\n');

    // The measured false-positive shapes: a repo-rare segment that is an
    // English function word ("this") and a common-verb segment ("write").
    const fpBaitSource = `
export function resolveDeferredThisMemberRefs(): void {}
export function writeConfig(): void {}
`;
    fs.writeFileSync(path.join(srcDir, 'noise.ts'), `${noiseSource}${fpBaitSource}\n`);

    cg = await CodeGraph.init(dir, { silent: true });
    await cg.indexAll();
  });
  // Per-test teardown: release the graph, then delete the temp repo.
  // The directory removal sits in `finally` so that a throwing destroy()
  // cannot leave the fixture behind on disk; the destroy() error still
  // propagates to the test runner afterwards.
  afterEach(() => {
    try {
      cg.destroy();
    } finally {
      fs.rmSync(dir, { recursive: true, force: true });
    }
  });
  it('co-occurrence: two prose words on one name find it — the reported-prompt shape', () => {
    // A French prompt: none of its words would appear in a keyword list, yet
    // "state" and "machine" both land on one symbol name.
    const prompt = 'comment marche la state machine des commandes ?';
    const matches = cg.getSegmentMatches(extractProseCandidates(prompt));

    const matchedNames = matches.map(({ name }) => name);
    expect(matchedNames).toContain('OrderStateMachine');

    const stateMachine = matches.find(({ name }) => name === 'OrderStateMachine')!;
    expect(stateMachine.matchedWords).toEqual(['machine', 'state']);
    expect(stateMachine.filePath).toContain('state-machine.ts');
    expect(stateMachine.kind).not.toBe('file');
  });
  it('single rare word qualifies; ubiquitous and singleton words do not', () => {
    // "checkout" is shared by CheckoutService and CheckoutController: a
    // cluster, i.e. a concept this repo is about.
    const checkoutNames = cg.getSegmentMatches(['checkout']).map(({ name }) => name);
    expect(checkoutNames).toContain('CheckoutService');

    // "data" is carried by 30 names in the fixture: noise, not signal.
    const dataMatches = cg.getSegmentMatches(['data']);
    expect(dataMatches).toEqual([]);

    // "machine" occurs in exactly ONE name. A singleton is treated as prose
    // coincidence for a lone word (the "deploy to production" FP shape); it
    // remains reachable via co-occurrence ("state machine").
    const machineMatches = cg.getSegmentMatches(['machine']);
    expect(machineMatches).toEqual([]);
  });
  it('plural folding: "services" still meets the "service" segment', () => {
    // The prompt word is plural; the symbol's segment is singular.
    const checkoutService = cg
      .getSegmentMatches(['checkout', 'services'])
      .find(({ name }) => name === 'CheckoutService');

    expect(checkoutService).toBeDefined();
    // matchedWords reports the prompt's own spelling ("services").
    expect(checkoutService!.matchedWords).toEqual(['checkout', 'services']);
  });
  it('vocab rows are proposals — a name with no surviving node is never surfaced', () => {
    // Reach the internal query layer to plant a vocab row that has no backing
    // node (the state a file deletion would leave behind).
    const { queries } = cg as unknown as {
      queries: { insertNameSegmentsBatch(names: string[]): void };
    };
    queries.insertNameSegmentsBatch(['GhostSymbolMachine']);

    // The honesty gate must discard the orphan instead of reporting it.
    expect(cg.getSegmentMatches(['ghost', 'symbol'])).toEqual([]);
  });
  it('unrelated prose matches nothing', () => {
    // A prompt with no relation to the fixture's symbols must produce no hits.
    const words = extractProseCandidates('write a haiku about autumn leaves');
    expect(cg.getSegmentMatches(words)).toEqual([]);
  });
  it('English function/filler words are never single-word evidence — the measured FPs', () => {
    const matchesForPrompt = (prompt: string) =>
      cg.getSegmentMatches(extractProseCandidates(prompt));

    // "fix this typo": 'this' IS a (rare!) segment in the fixture, through
    // resolveDeferredThisMemberRefs; the stoplist keeps it out of candidates.
    expect(matchesForPrompt('fix this typo')).toEqual([]);

    // "write …": writeConfig exists, but 'write' is stoplisted prose.
    expect(matchesForPrompt('write something for the readme')).toEqual([]);

    // Engine-level backstop, independent of extraction: a sub-5-char single
    // word never fires the single-word tier, even when a caller passes it raw.
    expect(cg.getSegmentMatches(['this'])).toEqual([]);

    // The stoplist only removes thin single-word evidence; it does not hide
    // the symbol. "config" is not stoplisted, so it still reaches writeConfig.
    // NOTE(review): only 'config' is passed here, so this exercises the
    // single-word path (the fixture has both loadConfig and writeConfig)
    // rather than a two-word co-occurrence query.
    const configNames = cg.getSegmentMatches(['config']).map(({ name }) => name);
    expect(configNames).toContain('writeConfig');
  });
  it('sync heals an empty vocab over a populated graph (pre-vocab-table upgrade path)', async () => {
    const { queries } = cg as unknown as {
      queries: { clearNameSegmentVocab(): void; isNameSegmentVocabEmpty(): boolean };
    };

    // Simulate a database that predates the vocab table: nodes, no vocab rows.
    queries.clearNameSegmentVocab();
    expect(queries.isNameSegmentVocabEmpty()).toBe(true);

    await cg.sync();

    // After the sync the vocab is populated again and usable for matching.
    expect(queries.isNameSegmentVocabEmpty()).toBe(false);
    const healedNames = cg.getSegmentMatches(['state', 'machine']).map(({ name }) => name);
    expect(healedNames).toContain('OrderStateMachine');
  });
  it('heal covers UNCHANGED files even when the same sync also indexes changed ones', async () => {
    // Regression guard: emptiness has to be sampled at sync ENTRY. The sync's
    // own incremental writes add rows for the files it touches, so a check at
    // the end of the sync would see those rows, skip the backfill, and leave
    // every unchanged file's names unsegmented forever.
    const { queries } = cg as unknown as { queries: { clearNameSegmentVocab(): void } };
    queries.clearNameSegmentVocab();

    // Modify exactly one file so the sync has incremental work to do.
    const touchedPath = path.join(dir, 'src', 'state-machine.ts');
    const previousContent = fs.readFileSync(touchedPath, 'utf8');
    fs.writeFileSync(touchedPath, `${previousContent}\n// touched\n`);

    await cg.sync();

    // checkout.ts was NOT touched, so its names can only have come from the
    // backfill, not from the incremental write path.
    const checkoutNames = cg.getSegmentMatches(['checkout']).map(({ name }) => name);
    expect(checkoutNames).toContain('CheckoutService');
  });
  it('healSegmentVocabIfEmpty backfills WITHOUT a sync — the prompt-hook open path (#1142)', async () => {
    // The hook opens the graph without syncing. A database migrated from
    // before the vocab table existed starts with that table empty, and sync's
    // backfill never runs on this path, which would leave the MEDIUM tier
    // permanently dormant.
    const { queries } = cg as unknown as {
      queries: { clearNameSegmentVocab(): void; isNameSegmentVocabEmpty(): boolean };
    };
    queries.clearNameSegmentVocab();
    expect(queries.isNameSegmentVocabEmpty()).toBe(true);

    // Empty vocab: the heal call must backfill and report success.
    await expect(cg.healSegmentVocabIfEmpty()).resolves.toBe(true);
    expect(queries.isNameSegmentVocabEmpty()).toBe(false);
    const healedNames = cg.getSegmentMatches(['state', 'machine']).map(({ name }) => name);
    expect(healedNames).toContain('OrderStateMachine');

    // Populated vocab: the fast path (one SELECT) still reports usable.
    await expect(cg.healSegmentVocabIfEmpty()).resolves.toBe(true);
  });
  it('a rename through updateNode reaches the vocab — the framework post-extract path (#1141)', () => {
    // Framework resolvers rewrite node names after extraction (NestJS route
    // prefixing) via updateNode. The new name must become prose-searchable;
    // the old name's rows become orphans the honesty gate drops.
    const { queries } = cg as unknown as {
      queries: {
        getNodesByName(name: string): Array<Record<string, unknown>>;
        updateNode(node: Record<string, unknown>): void;
      };
    };

    // Fail loudly if the fixture node is missing. Spreading `undefined` would
    // otherwise hand updateNode a node made of only the two renamed fields
    // and surface as an unrelated failure further down.
    const [node] = queries.getNodesByName('OrderStateMachine');
    if (node === undefined) {
      throw new Error("fixture node 'OrderStateMachine' was not indexed");
    }

    queries.updateNode({ ...node, name: 'RenamedWorkflowEngine', qualifiedName: 'RenamedWorkflowEngine' });

    // The new name is reachable through its own segments...
    const renamedNames = cg.getSegmentMatches(['renamed', 'workflow']).map((m) => m.name);
    expect(renamedNames).toContain('RenamedWorkflowEngine');
    // ...and the old name's segments no longer surface anything.
    expect(cg.getSegmentMatches(['state', 'machine'])).toEqual([]);
  });
  it('a name that exists only as an import statement is never surfaced (#1144)', async () => {
    // Import nodes take their name from the module specifier, not from a
    // symbol. The write path no longer segments them, and even with legacy
    // vocab rows (a DB populated before that exclusion) the representative
    // picker must skip the name instead of surfacing the import line as a
    // matched symbol.
    const consumerPath = path.join(dir, 'src', 'consumer.ts');
    const consumerSource = `import { Thing } from 'external-unindexed-pkg';\nexport function useIt(): void {}\n`;
    fs.writeFileSync(consumerPath, consumerSource);
    await cg.sync();

    const importWords = ['external', 'unindexed'];
    expect(cg.getSegmentMatches(importWords)).toEqual([]);

    // Legacy rows: plant the vocab entries a pre-exclusion version wrote and
    // confirm the result is still empty.
    const { queries } = cg as unknown as {
      queries: { insertNameSegmentsBatch(names: string[]): void };
    };
    queries.insertNameSegmentsBatch(['external-unindexed-pkg']);
    expect(cg.getSegmentMatches(importWords)).toEqual([]);
  });
  it('co-occurrence counts distinct WORDS, not variants — plural pairs cannot pose as two words (#1146)', () => {
    const { queries } = cg as unknown as {
      queries: {
        insertNameSegmentsBatch(names: string[]): void;
        getSegmentCoOccurrence(
          variants: Array<{ segment: string; word: string }>,
          minWords: number,
          limit: number,
        ): Array<{ name: string; matches: number }>;
      };
    };

    // BillingServicesService contains BOTH the `services` and the `service`
    // segment: two variants of ONE prompt word. That must not reach minWords=2.
    queries.insertNameSegmentsBatch(['BillingServicesService']);

    // Two variants map back to the prompt word "services", one to "checkout".
    const promptVariants = [
      { segment: 'services', word: 'services' },
      { segment: 'service', word: 'services' },
      { segment: 'checkout', word: 'checkout' },
    ];
    const minWords = 2;
    const limit = 24;
    const names = queries
      .getSegmentCoOccurrence(promptVariants, minWords, limit)
      .map(({ name }) => name);

    // checkout + service(s): two real words, so it qualifies.
    expect(names).toContain('CheckoutService');
    // services + service: a single word seen twice, so it does not.
    expect(names).not.toContain('BillingServicesService');
  });
  205. });