segment-vocab.test.ts 6.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144
  1. import { describe, it, expect, beforeEach, afterEach } from 'vitest';
  2. import * as fs from 'node:fs';
  3. import * as path from 'node:path';
  4. import * as os from 'node:os';
  5. import { CodeGraph } from '../src';
  6. import { extractProseCandidates } from '../src/search/identifier-segments';
  7. /**
  8. * The graph-derived gate behind the prompt hook's MEDIUM tier: symbol names
  9. * are segmented into the words a human uses for them in prose
  10. * (name_segment_vocab, populated on the node write path), and
  11. * CodeGraph.getSegmentMatches verifies prompt words against them with
  12. * co-occurrence / rarity rules. Precision comes from the repo's own naming
  13. * statistics — no keyword vocabulary involved.
  14. */
  15. describe('name-segment vocabulary + getSegmentMatches (graph-derived gate)', () => {
  16. let dir: string;
  17. let cg: CodeGraph;
  // Fixture: a fresh temp repo holding three source files, fully indexed before
  // every test. File bodies are assembled from line arrays so the exact text
  // written to disk is explicit.
  beforeEach(async () => {
    dir = fs.mkdtempSync(path.join(os.tmpdir(), 'segment-vocab-'));
    const srcDir = path.join(dir, 'src');
    fs.mkdirSync(srcDir, { recursive: true });

    // A single class whose name carries the "state" and "machine" segments.
    const stateMachineSource = [
      'export class OrderStateMachine {',
      'transition(from: string, to: string): boolean { return from !== to; }',
      '}',
      '',
    ].join('\n');
    fs.writeFileSync(path.join(srcDir, 'state-machine.ts'), stateMachineSource);

    // Two "Checkout*" classes (so "checkout" clusters) plus loadConfig.
    const checkoutSource = [
      'export class CheckoutService {',
      'submitOrder(): void {}',
      '}',
      'export class CheckoutController {',
      'handle(): void {}',
      '}',
      'export function loadConfig(): void {}',
      '',
    ].join('\n');
    fs.writeFileSync(path.join(srcDir, 'checkout.ts'), checkoutSource);

    // 30 distinct names sharing the segment "data" — a ubiquitous segment that
    // must NOT qualify as a single-word signal (rarity ceiling).
    const noiseLines: string[] = [];
    for (let i = 0; i < 30; i += 1) {
      const letter = String.fromCharCode(65 + (i % 26));
      noiseLines.push(`export function dataLoader${letter}${i}(): number { return ${i}; }`);
    }

    // The measured-FP shapes: a repo-rare segment that is an English function
    // word ("this"), and a common-verb segment ("write").
    const fpBaitSource = [
      '',
      'export function resolveDeferredThisMemberRefs(): void {}',
      'export function writeConfig(): void {}',
      '',
    ].join('\n');
    fs.writeFileSync(path.join(srcDir, 'noise.ts'), noiseLines.join('\n') + fpBaitSource + '\n');

    cg = await CodeGraph.init(dir, { silent: true });
    await cg.indexAll();
  });
  // Per-test teardown: release the CodeGraph instance, then delete the temp repo.
  afterEach(() => {
    try {
      // `cg` is still unassigned if beforeEach threw before CodeGraph.init
      // resolved; calling destroy() on it would raise a TypeError that hides
      // the real setup failure.
      cg?.destroy();
    } finally {
      // Always remove the temp directory, even when destroy() throws, so a
      // failing test does not leave fixtures behind in the OS temp dir.
      if (dir) {
        fs.rmSync(dir, { recursive: true, force: true });
      }
    }
  });
  // Two prose words that both land on one symbol name are enough to surface it.
  it('co-occurrence: two prose words on one name find it — the reported-prompt shape', () => {
    // A French prompt: none of its words would appear in a keyword list, yet
    // "state" and "machine" both occur as segments of OrderStateMachine.
    const prompt = 'comment marche la state machine des commandes ?';
    const found = cg.getSegmentMatches(extractProseCandidates(prompt));

    const foundNames = found.map((m) => m.name);
    expect(foundNames).toContain('OrderStateMachine');

    const target = found.find((m) => m.name === 'OrderStateMachine')!;
    expect(target.matchedWords).toEqual(['machine', 'state']);
    expect(target.filePath).toContain('state-machine.ts');
    expect(target.kind).not.toBe('file');
  });
  // Single-word tier: only a clustered-but-not-ubiquitous segment counts.
  it('single rare word qualifies; ubiquitous and singleton words do not', () => {
    // "checkout" occurs in two names (Service + Controller): a concept this
    // repo is about.
    const checkoutNames = cg.getSegmentMatches(['checkout']).map((m) => m.name);
    expect(checkoutNames).toContain('CheckoutService');

    // "data" occurs in 30 names in the fixture: noise, not signal.
    const dataMatches = cg.getSegmentMatches(['data']);
    expect(dataMatches).toEqual([]);

    // "machine" occurs in exactly ONE name. Alone it is treated as prose
    // coincidence (the "deploy to production" FP shape); it stays reachable
    // through co-occurrence ("state machine").
    const machineMatches = cg.getSegmentMatches(['machine']);
    expect(machineMatches).toEqual([]);
  });
  // A plural prompt word must still match the singular segment; the reported
  // matchedWords keep the word as the caller passed it.
  it('plural folding: "services" still meets the "service" segment', () => {
    const promptWords = ['checkout', 'services'];
    const service = cg.getSegmentMatches(promptWords).find((m) => m.name === 'CheckoutService');

    expect(service).toBeDefined();
    expect(service!.matchedWords).toEqual(['checkout', 'services']);
  });
  // Vocabulary rows alone are not evidence: a matching name must still have a node.
  it('vocab rows are proposals — a name with no surviving node is never surfaced', () => {
    // Reach into the instance's `queries` object to plant an orphan row (the
    // state a file deletion would leave behind); the honesty gate must drop it.
    type VocabWriter = { insertNameSegmentsBatch(names: string[]): void };
    const internals = cg as unknown as { queries: VocabWriter };
    internals.queries.insertNameSegmentsBatch(['GhostSymbolMachine']);

    const surfaced = cg.getSegmentMatches(['ghost', 'symbol']);
    expect(surfaced).toEqual([]);
  });
  // A prompt with no connection to the fixture's names must yield no matches.
  it('unrelated prose matches nothing', () => {
    const candidates = extractProseCandidates('write a haiku about autumn leaves');
    const surfaced = cg.getSegmentMatches(candidates);
    expect(surfaced).toEqual([]);
  });
  // Guards the measured false positives: English function/filler words that
  // happen to be segments of real symbol names in the fixture.
  it('English function/filler words are never single-word evidence — the measured FPs', () => {
    const matchesForPrompt = (prompt: string) => cg.getSegmentMatches(extractProseCandidates(prompt));

    // "fix this typo" — 'this' IS a (rare!) segment here via
    // resolveDeferredThisMemberRefs; extraction is expected to keep it out of
    // the candidate list.
    expect(matchesForPrompt('fix this typo')).toEqual([]);

    // "write …" — writeConfig exists, but 'write' is treated as stoplisted prose.
    expect(matchesForPrompt('write something for the readme')).toEqual([]);

    // Engine-level backstop, independent of extraction: a sub-5-char single
    // word never fires the single-word tier even if a caller passes it raw.
    expect(cg.getSegmentMatches(['this'])).toEqual([]);

    // The stoplist only removes thin single-word evidence; "config" is not
    // stoplisted, so writeConfig is still surfaced for it.
    // NOTE(review): this passes 'config' alone, so it does not exercise
    // two-word co-occurrence on writeConfig — confirm whether
    // ['write', 'config'] was the intended input.
    const configNames = cg.getSegmentMatches(['config']).map((m) => m.name);
    expect(configNames).toContain('writeConfig');
  });
  // Upgrade path: a graph indexed before the vocab table existed has nodes but
  // no vocab rows; sync() must repopulate them.
  it('sync heals an empty vocab over a populated graph (pre-vocab-table upgrade path)', async () => {
    type VocabAdmin = { clearNameSegmentVocab(): void; isNameSegmentVocabEmpty(): boolean };
    const internals = cg as unknown as { queries: VocabAdmin };

    // Simulate the pre-upgrade state by wiping the vocabulary only.
    internals.queries.clearNameSegmentVocab();
    expect(internals.queries.isNameSegmentVocabEmpty()).toBe(true);

    await cg.sync();

    expect(internals.queries.isNameSegmentVocabEmpty()).toBe(false);
    const healedNames = cg.getSegmentMatches(['state', 'machine']).map((m) => m.name);
    expect(healedNames).toContain('OrderStateMachine');
  });
  // Regression guard for the backfill decision during a sync that also has
  // changed files to index.
  it('heal covers UNCHANGED files even when the same sync also indexes changed ones', async () => {
    // Emptiness must be captured at sync ENTRY: the sync's own incremental
    // writes add rows for the files it touches, so an end-of-sync emptiness
    // check would see those rows, skip the backfill, and leave every
    // unchanged file's names unsegmented forever.
    const internals = cg as unknown as { queries: { clearNameSegmentVocab(): void } };
    internals.queries.clearNameSegmentVocab();

    // Modify exactly one file so the sync has incremental work to do.
    const touchedFile = path.join(dir, 'src', 'state-machine.ts');
    const previousText = fs.readFileSync(touchedFile, 'utf8');
    fs.writeFileSync(touchedFile, previousText + '\n// touched\n');

    await cg.sync();

    // The touched file's names came from the incremental write path; the
    // UNTOUCHED checkout.ts names must come from the backfill.
    const checkoutNames = cg.getSegmentMatches(['checkout']).map((m) => m.name);
    expect(checkoutNames).toContain('CheckoutService');
  });
  132. });