| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144 |
- import { describe, it, expect, beforeEach, afterEach } from 'vitest';
- import * as fs from 'node:fs';
- import * as path from 'node:path';
- import * as os from 'node:os';
- import { CodeGraph } from '../src';
- import { extractProseCandidates } from '../src/search/identifier-segments';
- /**
- * The graph-derived gate behind the prompt hook's MEDIUM tier: symbol names
- * are segmented into the words a human uses for them in prose
- * (name_segment_vocab, populated on the node write path), and
- * CodeGraph.getSegmentMatches verifies prompt words against them with
- * co-occurrence / rarity rules. Precision comes from the repo's own naming
- * statistics — no keyword vocabulary involved.
- */
describe('name-segment vocabulary + getSegmentMatches (graph-derived gate)', () => {
  /** Internal query-layer methods these tests reach through the graph's `queries` member. */
  interface VocabQueries {
    insertNameSegmentsBatch(names: string[]): void;
    clearNameSegmentVocab(): void;
    isNameSegmentVocabEmpty(): boolean;
  }

  /** Whatever `getSegmentMatches` accepts as its word list. */
  type SegmentWords = Parameters<CodeGraph['getSegmentMatches']>[0];

  // Per-test fixture state. Both stay `undefined` between tests and whenever
  // setup fails before reaching the corresponding assignment, so teardown can
  // tell what actually needs cleaning up.
  let dir: string | undefined;
  let cg: CodeGraph | undefined;

  /** The indexed graph for the current test; throws if setup never produced one. */
  const graph = (): CodeGraph => {
    if (cg === undefined) {
      throw new Error('CodeGraph fixture is not initialised');
    }
    return cg;
  };

  /** Absolute path inside the current test's temporary repo. */
  const fixturePath = (...segments: string[]): string => {
    if (dir === undefined) {
      throw new Error('fixture directory is not initialised');
    }
    return path.join(dir, ...segments);
  };

  /** Single place for the cast that exposes the graph's internal query layer. */
  const vocabQueries = (): VocabQueries =>
    (graph() as unknown as { queries: VocabQueries }).queries;

  /** Names of every symbol `getSegmentMatches` returns for `words`. */
  const matchedNames = (words: SegmentWords): string[] =>
    graph()
      .getSegmentMatches(words)
      .map((m) => m.name);

  beforeEach(async () => {
    const root = fs.mkdtempSync(path.join(os.tmpdir(), 'segment-vocab-'));
    // Recorded immediately so teardown removes the directory even if a later
    // setup step throws.
    dir = root;
    fs.mkdirSync(path.join(root, 'src'), { recursive: true });

    fs.writeFileSync(
      path.join(root, 'src', 'state-machine.ts'),
      `export class OrderStateMachine {
  transition(from: string, to: string): boolean { return from !== to; }
}
`,
    );

    fs.writeFileSync(
      path.join(root, 'src', 'checkout.ts'),
      `export class CheckoutService {
  submitOrder(): void {}
}
export class CheckoutController {
  handle(): void {}
}
export function loadConfig(): void {}
`,
    );

    // 30 distinct names sharing the segment "data" — a ubiquitous segment that
    // must NOT qualify as a single-word signal (rarity ceiling).
    const noise = Array.from({ length: 30 }, (_, i) => {
      const suffix = `${String.fromCharCode(65 + (i % 26))}${i}`;
      return `export function dataLoader${suffix}(): number { return ${i}; }`;
    }).join('\n');

    // The measured false-positive shapes: a repo-rare segment that is an
    // English function word ("this"), and a common-verb segment ("write").
    const fpBait = `
export function resolveDeferredThisMemberRefs(): void {}
export function writeConfig(): void {}
`;
    fs.writeFileSync(path.join(root, 'src', 'noise.ts'), noise + fpBait + '\n');

    const instance = await CodeGraph.init(root, { silent: true });
    // Published before indexing so a failing indexAll() still gets destroyed.
    cg = instance;
    await instance.indexAll();
  });

  afterEach(() => {
    // Take ownership and reset first: a later test whose setup fails must not
    // see (and re-destroy) this test's instance or directory.
    const instance = cg;
    const root = dir;
    cg = undefined;
    dir = undefined;
    try {
      instance?.destroy();
    } finally {
      // Always remove the temp repo, even when destroy() throws.
      if (root !== undefined) {
        fs.rmSync(root, { recursive: true, force: true });
      }
    }
  });

  it('co-occurrence: two prose words on one name find it — the reported-prompt shape', () => {
    // The words a French prompt would produce: "comment marche la state
    // machine des commandes ?" — no keyword list knows any of them.
    const words = extractProseCandidates('comment marche la state machine des commandes ?');
    const matches = graph().getSegmentMatches(words);
    expect(matches.map((m) => m.name)).toContain('OrderStateMachine');

    const hit = matches.find((m) => m.name === 'OrderStateMachine');
    expect(hit).toBeDefined();
    expect(hit?.matchedWords).toEqual(['machine', 'state']);
    expect(hit?.filePath).toContain('state-machine.ts');
    expect(hit?.kind).not.toBe('file');
  });

  it('single rare word qualifies; ubiquitous and singleton words do not', () => {
    // "checkout" clusters (Service + Controller) — a concept this repo is about.
    expect(matchedNames(['checkout'])).toContain('CheckoutService');
    // "data" appears in 30 names here — noise, not signal.
    expect(graph().getSegmentMatches(['data'])).toEqual([]);
    // "machine" appears in exactly ONE name — a singleton is prose
    // coincidence for a single word (the "deploy to production" FP shape);
    // it stays reachable through co-occurrence ("state machine").
    expect(graph().getSegmentMatches(['machine'])).toEqual([]);
  });

  it('plural folding: "services" still meets the "service" segment', () => {
    const matches = graph().getSegmentMatches(['checkout', 'services']);
    const hit = matches.find((m) => m.name === 'CheckoutService');
    expect(hit).toBeDefined();
    // The match reports the prompt's own spelling ("services").
    expect(hit?.matchedWords).toEqual(['checkout', 'services']);
  });

  it('vocab rows are proposals — a name with no surviving node is never surfaced', () => {
    // Plant an orphan row (as file deletion would): the honesty gate must drop it.
    vocabQueries().insertNameSegmentsBatch(['GhostSymbolMachine']);
    expect(graph().getSegmentMatches(['ghost', 'symbol'])).toEqual([]);
  });

  it('unrelated prose matches nothing', () => {
    const words = extractProseCandidates('write a haiku about autumn leaves');
    expect(graph().getSegmentMatches(words)).toEqual([]);
  });

  it('English function/filler words are never single-word evidence — the measured FPs', () => {
    // "fix this typo" — 'this' IS a (rare!) segment here via
    // resolveDeferredThisMemberRefs; the stoplist keeps it out of candidates.
    expect(graph().getSegmentMatches(extractProseCandidates('fix this typo'))).toEqual([]);
    // "write …" — writeConfig exists; 'write' is stoplisted prose.
    expect(
      graph().getSegmentMatches(extractProseCandidates('write something for the readme')),
    ).toEqual([]);
    // Engine-level backstop, independent of extraction: a sub-5-char single
    // word never fires the single-word tier even if a caller passes it raw.
    expect(graph().getSegmentMatches(['this'])).toEqual([]);
    // Filtering "write" does not hide writeConfig itself: "config" is not
    // stoplisted, so it still surfaces the symbol.
    // NOTE(review): only 'config' is passed, so this exercises the single-word
    // path rather than co-occurrence of both halves ("write" + "config").
    // Consider adding a ['write', 'config'] case if co-occurrence is the intent.
    expect(matchedNames(['config'])).toContain('writeConfig');
  });

  it('sync heals an empty vocab over a populated graph (pre-vocab-table upgrade path)', async () => {
    const queries = vocabQueries();
    queries.clearNameSegmentVocab();
    expect(queries.isNameSegmentVocabEmpty()).toBe(true);

    await graph().sync();

    expect(queries.isNameSegmentVocabEmpty()).toBe(false);
    expect(matchedNames(['state', 'machine'])).toContain('OrderStateMachine');
  });

  it('heal covers UNCHANGED files even when the same sync also indexes changed ones', async () => {
    // Regression: emptiness must be captured at sync ENTRY — the sync's own
    // incremental writes populate rows for the files it touches, and an
    // end-of-sync emptiness check would see those rows and skip the backfill,
    // leaving every unchanged file's names unsegmented forever.
    vocabQueries().clearNameSegmentVocab();

    const touched = fixturePath('src', 'state-machine.ts');
    fs.writeFileSync(touched, fs.readFileSync(touched, 'utf8') + '\n// touched\n');

    await graph().sync();

    // The touched file's names came from the incremental write path; the
    // UNTOUCHED file's names must come from the backfill.
    expect(matchedNames(['checkout'])).toContain('CheckoutService');
  });
});
|