haiany
/
codegraph
mirror of https://github.com/colbymchenry/codegraph.git


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221
							import { describe, it, expect, beforeEach, afterEach } from 'vitest';
import * as fs from 'node:fs';
import * as path from 'node:path';
import * as os from 'node:os';
import { CodeGraph } from '../src';
import { extractProseCandidates } from '../src/search/identifier-segments';

/**
 * The graph-derived gate behind the prompt hook's MEDIUM tier: symbol names
 * are segmented into the words a human uses for them in prose
 * (name_segment_vocab, populated on the node write path), and
 * CodeGraph.getSegmentMatches verifies prompt words against them with
 * co-occurrence / rarity rules. Precision comes from the repo's own naming
 * statistics — no keyword vocabulary involved.
 */
describe('name-segment vocabulary + getSegmentMatches (graph-derived gate)', () => {
  let dir: string;
  let cg: CodeGraph;

  beforeEach(async () => {
    dir = fs.mkdtempSync(path.join(os.tmpdir(), 'segment-vocab-'));
    fs.mkdirSync(path.join(dir, 'src'), { recursive: true });
    fs.writeFileSync(
      path.join(dir, 'src', 'state-machine.ts'),
      `export class OrderStateMachine {
  transition(from: string, to: string): boolean { return from !== to; }
}
`,
    );
    fs.writeFileSync(
      path.join(dir, 'src', 'checkout.ts'),
      `export class CheckoutService {
  submitOrder(): void {}
}
export class CheckoutController {
  handle(): void {}
}
export function loadConfig(): void {}
`,
    );
    // 30 distinct names sharing the segment "data" — a ubiquitous segment that
    // must NOT qualify as a single-word signal (rarity ceiling).
    const noise = Array.from({ length: 30 }, (_, i) => {
      const suffix = `${String.fromCharCode(65 + (i % 26))}${i}`;
      return `export function dataLoader${suffix}(): number { return ${i}; }`;
    }).join('\n');
    // The measured-FP shapes: a repo-rare segment that is an English function
    // word ("this"), and a common-verb segment ("write").
    const fpBait = `
export function resolveDeferredThisMemberRefs(): void {}
export function writeConfig(): void {}
`;
    fs.writeFileSync(path.join(dir, 'src', 'noise.ts'), noise + fpBait + '\n');

    cg = await CodeGraph.init(dir, { silent: true });
    await cg.indexAll();
  });

  afterEach(() => {
    cg.destroy();
    fs.rmSync(dir, { recursive: true, force: true });
  });

  it('co-occurrence: two prose words on one name find it — the reported-prompt shape', () => {
    // The words a French prompt would produce: "comment marche la state
    // machine des commandes ?" — no keyword list knows any of them.
    const words = extractProseCandidates('comment marche la state machine des commandes ?');
    const matches = cg.getSegmentMatches(words);
    expect(matches.map((m) => m.name)).toContain('OrderStateMachine');
    const hit = matches.find((m) => m.name === 'OrderStateMachine')!;
    expect(hit.matchedWords).toEqual(['machine', 'state']);
    expect(hit.filePath).toContain('state-machine.ts');
    expect(hit.kind).not.toBe('file');
  });

  it('single rare word qualifies; ubiquitous and singleton words do not', () => {
    // "checkout" clusters (Service + Controller) — a concept this repo is about.
    expect(cg.getSegmentMatches(['checkout']).map((m) => m.name)).toContain('CheckoutService');
    // "data" appears in 30 names here — noise, not signal.
    expect(cg.getSegmentMatches(['data'])).toEqual([]);
    // "machine" appears in exactly ONE name — a singleton is prose
    // coincidence for a single word (the "deploy to production" FP shape);
    // it stays reachable through co-occurrence ("state machine").
    expect(cg.getSegmentMatches(['machine'])).toEqual([]);
  });

  it('plural folding: "services" still meets the "service" segment', () => {
    const matches = cg.getSegmentMatches(['checkout', 'services']);
    const hit = matches.find((m) => m.name === 'CheckoutService');
    expect(hit).toBeDefined();
    expect(hit!.matchedWords).toEqual(['checkout', 'services']);
  });

  it('vocab rows are proposals — a name with no surviving node is never surfaced', () => {
    // Plant an orphan row (as file deletion would): the honesty gate must drop it.
    const queries = (cg as unknown as { queries: { insertNameSegmentsBatch(names: string[]): void } }).queries;
    queries.insertNameSegmentsBatch(['GhostSymbolMachine']);
    const matches = cg.getSegmentMatches(['ghost', 'symbol']);
    expect(matches).toEqual([]);
  });

  it('unrelated prose matches nothing', () => {
    expect(cg.getSegmentMatches(extractProseCandidates('write a haiku about autumn leaves'))).toEqual([]);
  });

  it('English function/filler words are never single-word evidence — the measured FPs', () => {
    // "fix this typo" — 'this' IS a (rare!) segment here via
    // resolveDeferredThisMemberRefs; the stoplist keeps it out of candidates.
    expect(cg.getSegmentMatches(extractProseCandidates('fix this typo'))).toEqual([]);
    // "write …" — writeConfig exists; 'write' is stoplisted prose.
    expect(cg.getSegmentMatches(extractProseCandidates('write something for the readme'))).toEqual([]);
    // Engine-level backstop, independent of extraction: a sub-5-char single
    // word never fires the single-word tier even if a caller passes it raw.
    expect(cg.getSegmentMatches(['this'])).toEqual([]);
    // But the same segments remain reachable through CO-OCCURRENCE — the
    // stoplist only removes thin single-word evidence: naming both halves of
    // writeConfig via prose is still a match ("config" is not stoplisted).
    expect(cg.getSegmentMatches(['config']).map((m) => m.name)).toContain('writeConfig');
  });

  it('sync heals an empty vocab over a populated graph (pre-vocab-table upgrade path)', async () => {
    const queries = (cg as unknown as { queries: { clearNameSegmentVocab(): void; isNameSegmentVocabEmpty(): boolean } }).queries;
    queries.clearNameSegmentVocab();
    expect(queries.isNameSegmentVocabEmpty()).toBe(true);
    await cg.sync();
    expect(queries.isNameSegmentVocabEmpty()).toBe(false);
    expect(cg.getSegmentMatches(['state', 'machine']).map((m) => m.name)).toContain('OrderStateMachine');
  });

  it('heal covers UNCHANGED files even when the same sync also indexes changed ones', async () => {
    // Regression: emptiness must be captured at sync ENTRY — the sync's own
    // incremental writes populate rows for the files it touches, and an
    // end-of-sync emptiness check would see those rows and skip the backfill,
    // leaving every unchanged file's names unsegmented forever.
    const queries = (cg as unknown as { queries: { clearNameSegmentVocab(): void } }).queries;
    queries.clearNameSegmentVocab();
    const touched = path.join(dir, 'src', 'state-machine.ts');
    fs.writeFileSync(touched, fs.readFileSync(touched, 'utf8') + '\n// touched\n');
    await cg.sync();
    // The touched file's names came from the incremental write path; the
    // UNTOUCHED file's names must come from the backfill.
    expect(cg.getSegmentMatches(['checkout']).map((m) => m.name)).toContain('CheckoutService');
  });

  it('healSegmentVocabIfEmpty backfills WITHOUT a sync — the prompt-hook open path (#1142)', async () => {
    // The hook opens the graph without syncing, and a database migrated from
    // before the vocab table existed starts with it empty — sync's backfill
    // never runs on that path, leaving the MEDIUM tier permanently dormant.
    const queries = (cg as unknown as {
      queries: { clearNameSegmentVocab(): void; isNameSegmentVocabEmpty(): boolean };
    }).queries;
    queries.clearNameSegmentVocab();
    expect(queries.isNameSegmentVocabEmpty()).toBe(true);
    await expect(cg.healSegmentVocabIfEmpty()).resolves.toBe(true);
    expect(queries.isNameSegmentVocabEmpty()).toBe(false);
    expect(cg.getSegmentMatches(['state', 'machine']).map((m) => m.name)).toContain('OrderStateMachine');
    // Populated vocab: the fast path (one SELECT) still reports usable.
    await expect(cg.healSegmentVocabIfEmpty()).resolves.toBe(true);
  });

  it('a rename through updateNode reaches the vocab — the framework post-extract path (#1141)', () => {
    // Framework resolvers rewrite node names after extraction (NestJS route
    // prefixing) via updateNode. The new name must become prose-searchable;
    // the old name's rows become orphans the honesty gate drops.
    const queries = (cg as unknown as {
      queries: {
        getNodesByName(name: string): Array<Record<string, unknown>>;
        updateNode(node: Record<string, unknown>): void;
      };
    }).queries;
    const node = queries.getNodesByName('OrderStateMachine')[0]!;
    queries.updateNode({ ...node, name: 'RenamedWorkflowEngine', qualifiedName: 'RenamedWorkflowEngine' });
    expect(cg.getSegmentMatches(['renamed', 'workflow']).map((m) => m.name)).toContain('RenamedWorkflowEngine');
    expect(cg.getSegmentMatches(['state', 'machine'])).toEqual([]);
  });

  it('a name that exists only as an import statement is never surfaced (#1144)', async () => {
    // Import nodes are named after module specifiers, not symbols. The write
    // path no longer segments them; and even against legacy vocab rows (a DB
    // populated before that exclusion), the representative picker must skip
    // the name rather than surface the import line as a matched symbol.
    fs.writeFileSync(
      path.join(dir, 'src', 'consumer.ts'),
      `import { Thing } from 'external-unindexed-pkg';\nexport function useIt(): void {}\n`,
    );
    await cg.sync();
    expect(cg.getSegmentMatches(['external', 'unindexed'])).toEqual([]);
    // Legacy rows: plant the vocab entries a pre-exclusion version wrote.
    const queries = (cg as unknown as { queries: { insertNameSegmentsBatch(names: string[]): void } }).queries;
    queries.insertNameSegmentsBatch(['external-unindexed-pkg']);
    expect(cg.getSegmentMatches(['external', 'unindexed'])).toEqual([]);
  });

  it('co-occurrence counts distinct WORDS, not variants — plural pairs cannot pose as two words (#1146)', () => {
    const queries = (cg as unknown as {
      queries: {
        insertNameSegmentsBatch(names: string[]): void;
        getSegmentCoOccurrence(
          variants: Array<{ segment: string; word: string }>,
          minWords: number,
          limit: number,
        ): Array<{ name: string; matches: number }>;
      };
    }).queries;
    // BillingServicesService carries BOTH the `services` and `service`
    // segments — two variants of ONE prompt word. It must not meet minWords=2.
    queries.insertNameSegmentsBatch(['BillingServicesService']);
    const hits = queries.getSegmentCoOccurrence(
      [
        { segment: 'services', word: 'services' },
        { segment: 'service', word: 'services' },
        { segment: 'checkout', word: 'checkout' },
      ],
      2,
      24,
    );
    const names = hits.map((h) => h.name);
    expect(names).toContain('CheckoutService'); // checkout + service(s) — two real words
    expect(names).not.toContain('BillingServicesService'); // services + service — one word
  });
});