пре 1 дан · e699ee9686
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,11 @@ and adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
				 
			
 
				 ## [Unreleased]
			
 
				 
			
 
				+### New Features
			
 
				+
			
 
				+- The Claude Code context hook now recognizes prompts that describe code in plain words — in any language — by checking the prompt's words against the symbol names actually in your project's index. Asking about "the state machine des commandes" finds `OrderStateMachine` with no keyword involved. Confidence decides how much gets injected: structural questions and prompts naming a real symbol still get full context up front; a plain-words match gets a short pointer to the matching symbols so the agent queries them itself; everything else stays silent, exactly as before.
			
 
				+- Anonymous usage telemetry now counts how often the context hook injected context, offered a hint, or stayed silent — fixed counter names only; the prompt's content is never stored or sent. This makes the hook's accuracy measurable instead of guessed.
			
 
				+
			
 
				 ### Fixes
			
 
				 
			
 
				 - The automatic context hook for Claude Code now fires for structural questions asked in nearly thirty languages — French, Spanish, Portuguese, German, Italian, Dutch, Polish, Czech, Romanian, Hungarian, Greek, Swedish, Danish, Norwegian, Finnish, Russian, Ukrainian, Turkish, Indonesian, Vietnamese, Thai, Hindi, Arabic, Farsi, Hebrew, Japanese, Korean, and both simplified and traditional Chinese — instead of just English and simplified Chinese. Previously a natural question like "comment marche la state machine des commandes ?" injected nothing unless it happened to contain a code-shaped symbol name, making the hook look broken for non-English teams. English questions phrased with derived word forms ("explain the architecture…", "what are the dependencies…") now fire too, and prompts in any other language still fire when they name a symbol from the index. Thanks @anthonyle-roy-lgtm for the report. (#1126)
			
--- a/TELEMETRY.md
+++ b/TELEMETRY.md
@@ -52,7 +52,9 @@ And one of four events:
 
				 - **`usage_rollup`** — one line per day per tool: the tool or CLI command **name** (e.g.
			
 
				   `codegraph_explore`, `init`), how many times it ran, how many errored, and — for MCP
			
 
				   tools — the connecting agent's name and version from the MCP handshake (e.g.
			
 
				-  `Claude Code 2.1`).
			
 
				+  `Claude Code 2.1`). The Claude Code prompt hook also counts its **gate decision**
			
 
				+  (fired fully, fired as a hint, or did nothing — fixed counter names like
			
 
				+  `prompt-hook-gate-medium-segment`); the prompt's content is never stored or sent.
			
 
				 - **`uninstall`** — when `codegraph uninstall`/`uninit` runs: which agents were removed.
			
 
				 
			
 
				 Usage is **aggregated locally into daily totals** before anything is sent — there is no
			
--- a/__tests__/db-perf.test.ts
+++ b/__tests__/db-perf.test.ts
@@ -343,7 +343,7 @@ describe('migration v6: dedup edges + add identity index on upgrade (#1034)', ()
 
				     runMigrations(raw, 5);
			
 
				 
			
 
				     expect(count()).toBe(2); // duplicate collapsed, the distinct `calls` edge kept
			
 
				-    expect(getCurrentVersion(raw)).toBe(6);
			
 
				+    expect(getCurrentVersion(raw)).toBe(7);
			
 
				     const idx = raw
			
 
				       .prepare("SELECT name FROM sqlite_master WHERE type='index' AND name='idx_edges_identity'")
			
 
				       .get();
			
--- a/__tests__/foundation.test.ts
+++ b/__tests__/foundation.test.ts
@@ -370,7 +370,7 @@ describe('Database Connection', () => {
 
				 
			
 
				     const version = db.getSchemaVersion();
			
 
				     expect(version).not.toBeNull();
			
 
				-    expect(version?.version).toBe(6);
			
 
				+    expect(version?.version).toBe(7);
			
 
				 
			
 
				     db.close();
			
 
				   });
			
--- a/__tests__/identifier-segments.test.ts
+++ b/__tests__/identifier-segments.test.ts
@@ -0,0 +1,81 @@
 
				+import { describe, it, expect } from 'vitest';
			
 
				+import {
			
 
				+  splitIdentifierSegments,
			
 
				+  extractProseCandidates,
			
 
				+  normalizeProseWord,
			
 
				+  segmentLookupVariants,
			
 
				+} from '../src/search/identifier-segments';
			
 
				+
			
 
				+describe('splitIdentifierSegments — symbol names → prose words', () => {
			
 
				+  it('splits camelCase / PascalCase at humps', () => {
			
 
				+    expect(splitIdentifierSegments('OrderStateMachine')).toEqual(['order', 'state', 'machine']);
			
 
				+    expect(splitIdentifierSegments('userId')).toEqual(['user', 'id']);
			
 
				+  });
			
 
				+
			
 
				+  it('handles acronym runs — HTML stays one segment', () => {
			
 
				+    expect(splitIdentifierSegments('parseHTMLDocument')).toEqual(['parse', 'html', 'document']);
			
 
				+    expect(splitIdentifierSegments('HTMLParser')).toEqual(['html', 'parser']);
			
 
				+  });
			
 
				+
			
 
				+  it('keeps digits glued to their word', () => {
			
 
				+    expect(splitIdentifierSegments('base64Encode')).toEqual(['base64', 'encode']);
			
 
				+    expect(splitIdentifierSegments('parseHTML5Doc')).toEqual(['parse', 'html5', 'doc']);
			
 
				+  });
			
 
				+
			
 
				+  it('splits snake_case, kebab-case, and dotted file names', () => {
			
 
				+    expect(splitIdentifierSegments('snake_case_name')).toEqual(['snake', 'case', 'name']);
			
 
				+    expect(splitIdentifierSegments('MAX_RETRY_COUNT')).toEqual(['max', 'retry', 'count']);
			
 
				+    expect(splitIdentifierSegments('checkout.service.ts')).toEqual(['checkout', 'service', 'ts']);
			
 
				+    expect(splitIdentifierSegments('state-machine')).toEqual(['state', 'machine']);
			
 
				+  });
			
 
				+
			
 
				+  it('drops sub-minimum and digit-only fragments, dedupes', () => {
			
 
				+    expect(splitIdentifierSegments('x')).toEqual([]);
			
 
				+    expect(splitIdentifierSegments('42')).toEqual([]);
			
 
				+    expect(splitIdentifierSegments('getData_getData')).toEqual(['get', 'data']);
			
 
				+  });
			
 
				+});
			
 
				+
			
 
				+describe('extractProseCandidates — prompt prose → lookup words', () => {
			
 
				+  it('keeps content words, drops short function words, in any Latin language', () => {
			
 
				+    expect(extractProseCandidates('comment marche la state machine des commandes ?')).toEqual([
			
 
				+      'comment', 'marche', 'state', 'machine', 'commandes',
			
 
				+    ]);
			
 
				+  });
			
 
				+
			
 
				+  it('strips diacritics so loanwords meet ASCII identifier segments', () => {
			
 
				+    expect(extractProseCandidates('la résolution des références')).toEqual(['resolution', 'references']);
			
 
				+    expect(normalizeProseWord('Übersicht')).toBe('ubersicht');
			
 
				+  });
			
 
				+
			
 
				+  it("splits on apostrophes — l'architecture keeps the noun", () => {
			
 
				+    expect(extractProseCandidates("explique l'architecture du module de stock")).toEqual([
			
 
				+      'explique', 'architecture', 'module', 'stock',
			
 
				+    ]);
			
 
				+  });
			
 
				+
			
 
				+  it('caps candidates and skips unsegmented-script sentence runs', () => {
			
 
				+    const many = Array.from({ length: 25 }, (_, i) => `distinctword${String.fromCharCode(97 + i)}`).join(' ');
			
 
				+    expect(extractProseCandidates(many)).toHaveLength(16);
			
 
				+    // A no-spaces CJK sentence is one giant run — over the length ceiling, skipped.
			
 
				+    expect(extractProseCandidates('請解釋一下這個訂單狀態機的整體運作流程與架構設計方式')).toEqual([]);
			
 
				+    // Short CJK runs pass through as candidates — no script filter; the graph
			
 
				+    // verification tier rejects them (identifiers are almost never CJK).
			
 
				+    expect(extractProseCandidates('修复这个拼写错误')).toEqual(['修复这个拼写错误']);
			
 
				+  });
			
 
				+
			
 
				+  it('drops digit-only and sub-4-char words', () => {
			
 
				+    expect(extractProseCandidates('fix the bug in v2 at 1234')).toEqual([]);
			
 
				+  });
			
 
				+});
			
 
				+
			
 
				+describe('segmentLookupVariants — light plural folding', () => {
			
 
				+  it('folds trailing s/es so plurals hit singular segments', () => {
			
 
				+    expect(segmentLookupVariants('services')).toContain('service');
			
 
				+    expect(segmentLookupVariants('machines')).toContain('machine');
			
 
				+  });
			
 
				+
			
 
				+  it('never strips a word below the minimum', () => {
			
 
				+    expect(segmentLookupVariants('bus')).toEqual(['bus']);
			
 
				+  });
			
 
				+});
			
--- a/__tests__/pr19-improvements.test.ts
+++ b/__tests__/pr19-improvements.test.ts
@@ -299,7 +299,7 @@ describe('Best-Candidate Resolution', () => {
 
				 describe('Schema v2 Migration', () => {
			
 
				   it.skipIf(!HAS_SQLITE)('should have correct current schema version', async () => {
			
 
				     const { CURRENT_SCHEMA_VERSION } = await import('../src/db/migrations');
			
 
				-    expect(CURRENT_SCHEMA_VERSION).toBe(6);
			
 
				+    expect(CURRENT_SCHEMA_VERSION).toBe(7);
			
 
				   });
			
 
				 
			
 
				   it.skipIf(!HAS_SQLITE)('should have migration for version 2', async () => {
			
--- a/__tests__/segment-vocab.test.ts
+++ b/__tests__/segment-vocab.test.ts
@@ -0,0 +1,144 @@
 
				+import { describe, it, expect, beforeEach, afterEach } from 'vitest';
			
 
				+import * as fs from 'node:fs';
			
 
				+import * as path from 'node:path';
			
 
				+import * as os from 'node:os';
			
 
				+import { CodeGraph } from '../src';
			
 
				+import { extractProseCandidates } from '../src/search/identifier-segments';
			
 
				+
			
 
				+/**
			
 
				+ * The graph-derived gate behind the prompt hook's MEDIUM tier: symbol names
			
 
				+ * are segmented into the words a human uses for them in prose
			
 
				+ * (name_segment_vocab, populated on the node write path), and
			
 
				+ * CodeGraph.getSegmentMatches verifies prompt words against them with
			
 
				+ * co-occurrence / rarity rules. Precision comes from the repo's own naming
			
 
				+ * statistics — no keyword vocabulary involved.
			
 
				+ */
			
 
				+describe('name-segment vocabulary + getSegmentMatches (graph-derived gate)', () => {
			
 
				+  let dir: string;
			
 
				+  let cg: CodeGraph;
			
 
				+
			
 
				+  beforeEach(async () => {
			
 
				+    dir = fs.mkdtempSync(path.join(os.tmpdir(), 'segment-vocab-'));
			
 
				+    fs.mkdirSync(path.join(dir, 'src'), { recursive: true });
			
 
				+    fs.writeFileSync(
			
 
				+      path.join(dir, 'src', 'state-machine.ts'),
			
 
				+      `export class OrderStateMachine {
			
 
				+  transition(from: string, to: string): boolean { return from !== to; }
			
 
				+}
			
 
				+`,
			
 
				+    );
			
 
				+    fs.writeFileSync(
			
 
				+      path.join(dir, 'src', 'checkout.ts'),
			
 
				+      `export class CheckoutService {
			
 
				+  submitOrder(): void {}
			
 
				+}
			
 
				+export class CheckoutController {
			
 
				+  handle(): void {}
			
 
				+}
			
 
				+export function loadConfig(): void {}
			
 
				+`,
			
 
				+    );
			
 
				+    // 30 distinct names sharing the segment "data" — a ubiquitous segment that
			
 
				+    // must NOT qualify as a single-word signal (rarity ceiling).
			
 
				+    const noise = Array.from({ length: 30 }, (_, i) => {
			
 
				+      const suffix = `${String.fromCharCode(65 + (i % 26))}${i}`;
			
 
				+      return `export function dataLoader${suffix}(): number { return ${i}; }`;
			
 
				+    }).join('\n');
			
 
				+    // The measured-FP shapes: a repo-rare segment that is an English function
			
 
				+    // word ("this"), and a common-verb segment ("write").
			
 
				+    const fpBait = `
			
 
				+export function resolveDeferredThisMemberRefs(): void {}
			
 
				+export function writeConfig(): void {}
			
 
				+`;
			
 
				+    fs.writeFileSync(path.join(dir, 'src', 'noise.ts'), noise + fpBait + '\n');
			
 
				+
			
 
				+    cg = await CodeGraph.init(dir, { silent: true });
			
 
				+    await cg.indexAll();
			
 
				+  });
			
 
				+
			
 
				+  afterEach(() => {
			
 
				+    cg.destroy();
			
 
				+    fs.rmSync(dir, { recursive: true, force: true });
			
 
				+  });
			
 
				+
			
 
				+  it('co-occurrence: two prose words on one name find it — the reported-prompt shape', () => {
			
 
				+    // The words a French prompt would produce: "comment marche la state
			
 
				+    // machine des commandes ?" — no keyword list knows any of them.
			
 
				+    const words = extractProseCandidates('comment marche la state machine des commandes ?');
			
 
				+    const matches = cg.getSegmentMatches(words);
			
 
				+    expect(matches.map((m) => m.name)).toContain('OrderStateMachine');
			
 
				+    const hit = matches.find((m) => m.name === 'OrderStateMachine')!;
			
 
				+    expect(hit.matchedWords).toEqual(['machine', 'state']);
			
 
				+    expect(hit.filePath).toContain('state-machine.ts');
			
 
				+    expect(hit.kind).not.toBe('file');
			
 
				+  });
			
 
				+
			
 
				+  it('single rare word qualifies; ubiquitous and singleton words do not', () => {
			
 
				+    // "checkout" clusters (Service + Controller) — a concept this repo is about.
			
 
				+    expect(cg.getSegmentMatches(['checkout']).map((m) => m.name)).toContain('CheckoutService');
			
 
				+    // "data" appears in 30 names here — noise, not signal.
			
 
				+    expect(cg.getSegmentMatches(['data'])).toEqual([]);
			
 
				+    // "machine" appears in exactly ONE name — a singleton is prose
			
 
				+    // coincidence for a single word (the "deploy to production" FP shape);
			
 
				+    // it stays reachable through co-occurrence ("state machine").
			
 
				+    expect(cg.getSegmentMatches(['machine'])).toEqual([]);
			
 
				+  });
			
 
				+
			
 
				+  it('plural folding: "services" still meets the "service" segment', () => {
			
 
				+    const matches = cg.getSegmentMatches(['checkout', 'services']);
			
 
				+    const hit = matches.find((m) => m.name === 'CheckoutService');
			
 
				+    expect(hit).toBeDefined();
			
 
				+    expect(hit!.matchedWords).toEqual(['checkout', 'services']);
			
 
				+  });
			
 
				+
			
 
				+  it('vocab rows are proposals — a name with no surviving node is never surfaced', () => {
			
 
				+    // Plant an orphan row (as file deletion would): the honesty gate must drop it.
			
 
				+    const queries = (cg as unknown as { queries: { insertNameSegmentsBatch(names: string[]): void } }).queries;
			
 
				+    queries.insertNameSegmentsBatch(['GhostSymbolMachine']);
			
 
				+    const matches = cg.getSegmentMatches(['ghost', 'symbol']);
			
 
				+    expect(matches).toEqual([]);
			
 
				+  });
			
 
				+
			
 
				+  it('unrelated prose matches nothing', () => {
			
 
				+    expect(cg.getSegmentMatches(extractProseCandidates('write a haiku about autumn leaves'))).toEqual([]);
			
 
				+  });
			
 
				+
			
 
				+  it('English function/filler words are never single-word evidence — the measured FPs', () => {
			
 
				+    // "fix this typo" — 'this' IS a (rare!) segment here via
			
 
				+    // resolveDeferredThisMemberRefs; the stoplist keeps it out of candidates.
			
 
				+    expect(cg.getSegmentMatches(extractProseCandidates('fix this typo'))).toEqual([]);
			
 
				+    // "write …" — writeConfig exists; 'write' is stoplisted prose.
			
 
				+    expect(cg.getSegmentMatches(extractProseCandidates('write something for the readme'))).toEqual([]);
			
 
				+    // Engine-level backstop, independent of extraction: a sub-5-char single
			
 
				+    // word never fires the single-word tier even if a caller passes it raw.
			
 
				+    expect(cg.getSegmentMatches(['this'])).toEqual([]);
			
 
				+    // But a stoplisted word's symbols stay reachable through their OTHER
			
 
				+    // segments — the stoplist only removes thin single-word evidence: "config"
			
 
				+    // is not stoplisted and clusters (loadConfig + writeConfig), so it matches.
			
 
				+    expect(cg.getSegmentMatches(['config']).map((m) => m.name)).toContain('writeConfig');
			
 
				+  });
			
 
				+
			
 
				+  it('sync heals an empty vocab over a populated graph (pre-vocab-table upgrade path)', async () => {
			
 
				+    const queries = (cg as unknown as { queries: { clearNameSegmentVocab(): void; isNameSegmentVocabEmpty(): boolean } }).queries;
			
 
				+    queries.clearNameSegmentVocab();
			
 
				+    expect(queries.isNameSegmentVocabEmpty()).toBe(true);
			
 
				+    await cg.sync();
			
 
				+    expect(queries.isNameSegmentVocabEmpty()).toBe(false);
			
 
				+    expect(cg.getSegmentMatches(['state', 'machine']).map((m) => m.name)).toContain('OrderStateMachine');
			
 
				+  });
			
 
				+
			
 
				+  it('heal covers UNCHANGED files even when the same sync also indexes changed ones', async () => {
			
 
				+    // Regression: emptiness must be captured at sync ENTRY — the sync's own
			
 
				+    // incremental writes populate rows for the files it touches, and an
			
 
				+    // end-of-sync emptiness check would see those rows and skip the backfill,
			
 
				+    // leaving every unchanged file's names unsegmented forever.
			
 
				+    const queries = (cg as unknown as { queries: { clearNameSegmentVocab(): void } }).queries;
			
 
				+    queries.clearNameSegmentVocab();
			
 
				+    const touched = path.join(dir, 'src', 'state-machine.ts');
			
 
				+    fs.writeFileSync(touched, fs.readFileSync(touched, 'utf8') + '\n// touched\n');
			
 
				+    await cg.sync();
			
 
				+    // The touched file's names came from the incremental write path; the
			
 
				+    // UNTOUCHED file's names must come from the backfill.
			
 
				+    expect(cg.getSegmentMatches(['checkout']).map((m) => m.name)).toContain('CheckoutService');
			
 
				+  });
			
 
				+});
			
--- a/docs/design/telemetry.md
+++ b/docs/design/telemetry.md
@@ -72,6 +72,13 @@ Event types:
 
				   (e.g. `codegraph_explore`, `affected`), `count`, `error_count`, and for MCP:
			
 
				   `client_name`/`client_version` from the `initialize` handshake (`src/mcp/session.ts`
			
 
				   `case 'initialize'` — plumbing to add; currently unread).
			
 
				+  The prompt hook additionally rolls up its gate DECISION as `cli_command`
			
 
				+  counters named `prompt-hook-gate-<outcome>`, outcome ∈ `high-keyword` /
			
 
				+  `high-token` / `medium-segment` / `nudge-projects` / `noop-shape` /
			
 
				+  `noop-no-index` / `noop-unverified` — decision names only, never prompt
			
 
				+  content. This is the gate's measured recall/precision funnel: a rising
			
 
				+  `noop-*` share against the `high`/`medium` tiers is the signal that the
			
 
				+  gate (keyword table or segment matching) is missing real questions.
			
 
				 - **`uninstall`** — one per `uninstall`/`uninit` run (churn signal). Props: `targets`.
			
 
				 
			
 
				 Volume math: rollups mean monthly events ≈ active machines × active days × distinct
			
--- a/src/bin/codegraph.ts
+++ b/src/bin/codegraph.ts
@@ -27,6 +27,7 @@ import { Command } from 'commander';
 
				 import * as path from 'path';
			
 
				 import * as fs from 'fs';
			
 
				 import { getCodeGraphDir, isInitialized, unsafeIndexRootReason, findNearestCodeGraphRoot, planFrontload, hasStructuralKeyword, extractCodeTokens } from '../directory';
			
 
				+import { extractProseCandidates } from '../search/identifier-segments';
			
 
				 import { detectWorktreeIndexMismatch, worktreeMismatchWarning } from '../sync/worktree';
			
 
				 import { createShimmerProgress } from '../ui/shimmer-progress';
			
 
				 import { getGlyphs } from '../ui/glyphs';
			
@@ -1070,15 +1071,30 @@ program
 
				       try { input = JSON.parse(raw); } catch { return; }
			
 
				       const prompt = String(input.prompt || '');
			
 
				 
			
 
				-      // Gate: only structural / flow / impact / where-how prompts get context, so
			
 
				-      // every other prompt ("fix this typo") stays a zero-cost no-op. Language-aware
			
 
				-      // (multilingual keywords, plus code-shaped tokens) so it fires for non-English
			
 
				-      // prompts too (#994, #1126). A keyword fires on its own; a code-token is only a
			
 
				-      // CANDIDATE — verified against the graph below, so a tech brand ("JavaScript")
			
 
				-      // that looks like a symbol but isn't one here doesn't inject spurious context.
			
 
				+      // Gate telemetry: how often each tier fires vs. no-ops — counter names
			
 
				+      // only, NEVER prompt content (see TELEMETRY.md). This is the data that
			
 
				+      // turns "is the gate any good" from vibes into a measured recall rate.
			
 
				+      const gate = (outcome: string): void => {
			
 
				+        try { getTelemetry().recordUsage('cli_command', `prompt-hook-gate-${outcome}`, true); } catch { /* never break the hook */ }
			
 
				+      };
			
 
				+
			
 
				+      // Gate, tiered by confidence (#994, #1126):
			
 
				+      //   HIGH   — a structural keyword (any covered language), or a code-shaped
			
 
				+      //            token verified in the index → full explore injection.
			
 
				+      //   MEDIUM — no keyword/token, but prose words match indexed symbol-name
			
 
				+      //            SEGMENTS ("state machine" → OrderStateMachine, in any
			
 
				+      //            language): inject a short list of the matching symbols and
			
 
				+      //            let the AGENT write the explore query — the graph-derived
			
 
				+      //            tier, no vocabulary involved.
			
 
				+      //   silent — nothing verified. Every other prompt ("fix this typo")
			
 
				+      //            stays a zero-cost no-op.
			
 
				+      // Keywords fire on their own; a token or prose word is only a CANDIDATE
			
 
				+      // verified against the graph below, so a tech brand ("JavaScript") that
			
 
				+      // merely looks like code doesn't inject spurious context.
			
 
				       const keyworded = hasStructuralKeyword(prompt);
			
 
				       const codeTokens = keyworded ? [] : extractCodeTokens(prompt);
			
 
				-      if (!keyworded && codeTokens.length === 0) return;
			
 
				+      const proseWords = keyworded ? [] : extractProseCandidates(prompt);
			
 
				+      if (!keyworded && codeTokens.length === 0 && proseWords.length === 0) { gate('noop-shape'); return; }
			
 
				 
			
 
				       // Decide what to inject, shaped by WHERE the index(es) are: the nearest
			
 
				       // indexed ancestor of cwd, or — when cwd is an un-indexed workspace root
			
@@ -1088,7 +1104,7 @@ program
 
				       // root (it only walked up), so the validated adoption lever never fired
			
 
				       // exactly where the agent most needs it.
			
 
				       const plan = planFrontload(String(input.cwd || process.cwd()), prompt);
			
 
				-      if (!plan.exploreRoot && plan.nudgeProjects.length === 0) return; // nothing reachable — the agent's normal tools apply
			
 
				+      if (!plan.exploreRoot && plan.nudgeProjects.length === 0) { gate('noop-no-index'); return; } // nothing reachable — the agent's normal tools apply
			
 
				 
			
 
				       // A "pass projectPath" line for indexed sub-projects we did NOT front-load.
			
 
				       // Follow-up codegraph_explore calls against a sub-project (cwd isn't its
			
@@ -1100,31 +1116,55 @@ program
 
				         const { default: CodeGraph } = await loadCodeGraph();
			
 
				         const cg = await CodeGraph.open(plan.exploreRoot);
			
 
				         try {
			
 
				-          // Code-token-only prompt: require that at least one token is a REAL symbol
			
 
				-          // in THIS index before front-loading. Without it, a brand name or common
			
 
				-          // word that merely looks like code ("JavaScript", "GitHub") would run
			
 
				-          // explore and inject ~16KB of low-relevance context (issue #994 follow-up).
			
 
				-          // A keyword-bearing prompt skips this — the keyword is signal enough.
			
 
				-          if (!keyworded && !codeTokens.some((t) => cg.getNodesByName(t).length > 0)) return;
			
 
				-          const { ToolHandler } = await import('../mcp/tools');
			
 
				-          const handler = new ToolHandler(cg);
			
 
				-          const result = await handler.execute('codegraph_explore', { query: prompt });
			
 
				-          const text = result.content[0]?.text ?? '';
			
 
				-          if (!result.isError && text.trim()) {
			
 
				-            // Cap the injection so a large-repo explore can't flood the prompt.
			
 
				-            const MAX = 16000;
			
 
				-            const body = text.length > MAX ? `${text.slice(0, MAX)}\n…(truncated; call codegraph_explore for the rest)` : text;
			
 
				-            // For a front-loaded SUB-project, a follow-up explore needs its path.
			
 
				-            const more = plan.viaSubScan
			
 
				-              ? `call codegraph_explore with projectPath: "${plan.exploreRoot}" for more`
			
 
				-              : 'call codegraph_explore for more';
			
 
				-            const others = plan.nudgeProjects.length
			
 
				-              ? `\n${nudge(plan.nudgeProjects, 'Other indexed projects in this workspace — pass projectPath to query them:')}`
			
 
				-              : '';
			
 
				-            process.stdout.write(
			
 
				-              `<codegraph_context note="Structural context from CodeGraph for this prompt — treat returned source as already read; ${more}.">\n${body}${others}\n</codegraph_context>\n`,
			
 
				-            );
			
 
				+          const others = plan.nudgeProjects.length
			
 
				+            ? `\n${nudge(plan.nudgeProjects, 'Other indexed projects in this workspace — pass projectPath to query them:')}`
			
 
				+            : '';
			
 
				+
			
 
				+          // Tier decision against THIS index (issue #994 follow-up: candidates
			
 
				+          // must be real here — a brand name or prose about another domain
			
 
				+          // must not inject). Keyword-bearing prompts skip verification — the
			
 
				+          // keyword is signal enough.
			
 
				+          const tokenVerified = !keyworded && codeTokens.some((t) => cg.getNodesByName(t).length > 0);
			
 
				+          if (keyworded || tokenVerified) {
			
 
				+            const { ToolHandler } = await import('../mcp/tools');
			
 
				+            const handler = new ToolHandler(cg);
			
 
				+            const result = await handler.execute('codegraph_explore', { query: prompt });
			
 
				+            const text = result.content[0]?.text ?? '';
			
 
				+            if (!result.isError && text.trim()) {
			
 
				+              // Cap the injection so a large-repo explore can't flood the prompt.
			
 
				+              const MAX = 16000;
			
 
				+              const body = text.length > MAX ? `${text.slice(0, MAX)}\n…(truncated; call codegraph_explore for the rest)` : text;
			
 
				+              // For a front-loaded SUB-project, a follow-up explore needs its path.
			
 
				+              const more = plan.viaSubScan
			
 
				+                ? `call codegraph_explore with projectPath: "${plan.exploreRoot}" for more`
			
 
				+                : 'call codegraph_explore for more';
			
 
				+              process.stdout.write(
			
 
				+                `<codegraph_context note="Structural context from CodeGraph for this prompt — treat returned source as already read; ${more}.">\n${body}${others}\n</codegraph_context>\n`,
			
 
				+              );
			
 
				+            }
			
 
				+            gate(keyworded ? 'high-keyword' : 'high-token');
			
 
				+            return;
			
 
				           }
			
 
				+
			
 
				+          // MEDIUM: prose words → symbol-name segments, co-occurrence/rarity
			
 
				+          // scored, each hit re-verified to exist (see getSegmentMatches). The
			
 
				+          // payload names the symbols but does NOT run explore — the agent owns
			
 
				+          // the query where the hook's confidence is only "these are related".
			
 
				+          const related = cg.getSegmentMatches(proseWords);
			
 
				+          if (related.length === 0) { gate('noop-unverified'); return; }
			
 
				+          const lines = related
			
 
				+            .map((m) => `  - ${m.name} (${m.kind} — ${m.filePath}:${m.startLine})`)
			
 
				+            .join('\n');
			
 
				+          const exampleQuery = related.slice(0, 3).map((m) => m.name).join(' ');
			
 
				+          const projectHint = plan.viaSubScan ? ` with projectPath: "${plan.exploreRoot}"` : '';
			
 
				+          process.stdout.write(
			
 
				+            `<codegraph_context note="CodeGraph found indexed symbols matching this prompt — query the graph before searching files.">\n` +
			
 
				+            `This project's CodeGraph index contains symbols matching this request:\n${lines}\n` +
			
 
				+            `Call codegraph_explore ONCE${projectHint} with the relevant names in one query (e.g. "${exampleQuery}") ` +
			
 
				+            `to get their source, call paths, and blast radius — cheaper and more complete than Read/Grep.\n${others}` +
			
 
				+            `</codegraph_context>\n`,
			
 
				+          );
			
 
				+          gate('medium-segment');
			
 
				         } finally {
			
 
				           cg.destroy();
			
 
				         }
			
@@ -1136,6 +1176,7 @@ program
 
				           nudge(plan.nudgeProjects, "This workspace's CodeGraph indexes live in sub-projects. To use CodeGraph, call codegraph_explore with the projectPath of the relevant one:") +
			
 
				           `</codegraph_context>\n`,
			
 
				         );
			
 
				+        gate('nudge-projects');
			
 
				       }
			
 
				     } catch {
			
 
				       // Degradable by contract: never surface an error to the prompt pipeline.
			
--- a/src/db/migrations.ts
+++ b/src/db/migrations.ts
@@ -9,7 +9,7 @@ import { SqliteDatabase } from './sqlite-adapter';
 
				 /**
			
 
				  * Current schema version
			
 
				  */
			
 
				-export const CURRENT_SCHEMA_VERSION = 6;
			
 
				+export const CURRENT_SCHEMA_VERSION = 7;
			
 
				 
			
 
				 /**
			
 
				  * Migration definition
			
@@ -100,6 +100,25 @@ const migrations: Migration[] = [
 
				       `);
			
 
				     },
			
 
				   },
			
 
				+  {
			
 
				+    version: 7,
			
 
				+    description:
			
 
				+      'Add name_segment_vocab — prose-word → symbol-name lookup for the prompt hook’s graph-derived gate',
			
 
				+    up: (db) => {
			
 
				+      // DDL only — instant on any size database (the row-churn hazards of #1067
			
 
				+      // don't apply). The table starts EMPTY on migrated databases; `sync`
			
 
				+      // detects that over a populated graph and backfills batched+yielding
			
 
				+      // (CodeGraph.rebuildNameSegmentVocab), and any full index rebuilds it
			
 
				+      // from scratch. Keep the definition in lockstep with schema.sql.
			
 
				+      db.exec(`
			
 
				+        CREATE TABLE IF NOT EXISTS name_segment_vocab (
			
 
				+          segment TEXT NOT NULL,
			
 
				+          name TEXT NOT NULL,
			
 
				+          PRIMARY KEY (segment, name)
			
 
				+        ) WITHOUT ROWID;
			
 
				+      `);
			
 
				+    },
			
 
				+  },
			
 
				 ];
			
 
				 
			
 
				 /**
			
--- a/src/db/queries.ts
+++ b/src/db/queries.ts
@@ -21,6 +21,7 @@ import { safeJsonParse } from '../utils';
 
				 import { kindBonus, nameMatchBonus, scorePathRelevance } from '../search/query-utils';
			
 
				 import { parseQuery, boundedEditDistance } from '../search/query-parser';
			
 
				 import { isGeneratedFile } from '../extraction/generated-detection';
			
 
				+import { splitIdentifierSegments } from '../search/identifier-segments';
			
 
				 
			
 
				 /**
			
 
				  * Path-only heuristic for files that should not be candidates for
			
@@ -219,8 +220,16 @@ export class QueryBuilder {
 
				     getDominantFile?: SqliteStatement;
			
 
				     getTopRouteFile?: SqliteStatement;
			
 
				     getRoutingManifest?: SqliteStatement;
			
 
				+    insertNameSegment?: SqliteStatement;
			
 
				   } = {};
			
 
				 
			
 
				+  // Names whose segments were already written this session — skips re-splitting
			
 
				+  // and re-inserting for the same-named nodes that repeat across files ("get",
			
 
				+  // "render", …). Purely a write-path fast path; INSERT OR IGNORE is the
			
 
				+  // correctness backstop. Bounded so a pathological repo can't grow it forever.
			
 
				+  private segmentedNames: Set<string> = new Set();
			
 
				+  private static readonly MAX_SEGMENTED_NAMES = 65536;
			
 
				+
			
 
				   constructor(db: SqliteDatabase) {
			
 
				     this.db = db;
			
 
				   }
			
@@ -303,6 +312,30 @@ export class QueryBuilder {
 
				       returnType: node.returnType ?? null,
			
 
				       updatedAt: node.updatedAt ?? Date.now(),
			
 
				     });
			
 
				+
			
 
				+    // Segment vocabulary rides the same write path (and transaction) so it can
			
 
				+    // never drift ahead of the nodes it describes. Deletes intentionally leave
			
 
				+    // orphans behind — vocab rows are proposals re-verified against nodes
			
 
				+    // before use, and a full index clears the table at its start. File nodes
			
 
				+    // are excluded: a file's basename duplicates the symbols inside it
			
 
				+    // (state-machine.ts / OrderStateMachine), which double-counts every
			
 
				+    // concept and defeats the singleton-vs-cluster rarity statistics.
			
 
				+    if (node.kind !== 'file') this.insertNameSegments(node.name);
			
 
				+  }
			
 
				+
			
 
				+  /** Write `name`'s segments into name_segment_vocab (idempotent). */
			
 
				+  private insertNameSegments(name: string): void {
			
 
				+    if (this.segmentedNames.has(name)) return;
			
 
				+    if (this.segmentedNames.size >= QueryBuilder.MAX_SEGMENTED_NAMES) this.segmentedNames.clear();
			
 
				+    this.segmentedNames.add(name);
			
 
				+    if (!this.stmts.insertNameSegment) {
			
 
				+      this.stmts.insertNameSegment = this.db.prepare(
			
 
				+        'INSERT OR IGNORE INTO name_segment_vocab (segment, name) VALUES (?, ?)',
			
 
				+      );
			
 
				+    }
			
 
				+    for (const segment of splitIdentifierSegments(name)) {
			
 
				+      this.stmts.insertNameSegment.run(segment, name);
			
 
				+    }
			
 
				   }
			
 
				 
			
 
				   /**
			
@@ -409,6 +442,86 @@ export class QueryBuilder {
 
				     this.stmts.deleteNodesByFile.run(filePath);
			
 
				   }
			
 
				 
			
 
				+  // ===========================================================================
			
 
				+  // Name-segment vocabulary (prompt-hook graph-derived gate)
			
 
				+  // ===========================================================================
			
 
				+
			
 
				+  /** Wipe the segment vocabulary. A full index calls this at its start; the
			
 
				+   *  node write path repopulates it as files (re-)index, so the end state is
			
 
				+   *  exactly the current names with no orphan rows. */
			
 
				+  clearNameSegmentVocab(): void {
			
 
				+    this.db.exec('DELETE FROM name_segment_vocab');
			
 
				+    this.segmentedNames.clear();
			
 
				+  }
			
 
				+
			
 
				+  /** True when the vocab has no rows — an index built before the table existed.
			
 
				+   *  `sync` uses this to heal such databases (see CodeGraph.rebuildNameSegmentVocab). */
			
 
				+  isNameSegmentVocabEmpty(): boolean {
			
 
				+    const row = this.db.prepare('SELECT 1 FROM name_segment_vocab LIMIT 1').get();
			
 
				+    return row === undefined;
			
 
				+  }
			
 
				+
			
 
				+  /** One page of distinct non-file node names, for batched vocab rebuilds
			
 
				+   *  (file basenames are excluded from the vocab — see insertNode). */
			
 
				+  getDistinctNodeNames(limit: number, offset: number): string[] {
			
 
				+    const rows = this.db
			
 
				+      .prepare("SELECT DISTINCT name FROM nodes WHERE kind != 'file' ORDER BY name LIMIT ? OFFSET ?")
			
 
				+      .all(limit, offset) as Array<{ name: string }>;
			
 
				+    return rows.map((r) => r.name);
			
 
				+  }
			
 
				+
			
 
				+  /** Insert segments for a batch of names in one transaction (vocab heal path). */
			
 
				+  insertNameSegmentsBatch(names: string[]): void {
			
 
				+    this.db.transaction(() => {
			
 
				+      for (const name of names) this.insertNameSegments(name);
			
 
				+    })();
			
 
				+  }
			
 
				+
			
 
				+  /**
			
 
				+   * Names whose segments cover at least `minSegments` of the given segments —
			
 
				+   * the co-occurrence probe behind the prompt hook's medium tier: the words
			
 
				+   * "state" and "machine" both being segments of `OrderStateMachine` is strong
			
 
				+   * evidence the prompt names that symbol in prose. Ordered by coverage.
			
 
				+   */
			
 
				+  getSegmentCoOccurrence(segments: string[], minSegments: number, limit: number): Array<{ name: string; matches: number }> {
			
 
				+    if (segments.length === 0) return [];
			
 
				+    const placeholders = segments.map(() => '?').join(', ');
			
 
				+    const rows = this.db
			
 
				+      .prepare(
			
 
				+        `SELECT name, COUNT(DISTINCT segment) AS matches
			
 
				+         FROM name_segment_vocab
			
 
				+         WHERE segment IN (${placeholders})
			
 
				+         GROUP BY name
			
 
				+         HAVING matches >= ?
			
 
				+         ORDER BY matches DESC, length(name) ASC
			
 
				+         LIMIT ?`,
			
 
				+      )
			
 
				+      .all(...segments, minSegments, limit) as Array<{ name: string; matches: number }>;
			
 
				+    return rows;
			
 
				+  }
			
 
				+
			
 
				+  /** How many distinct names each segment appears in — the rarity signal that
			
 
				+   *  separates a discriminative word ("checkout") from a ubiquitous one ("state"). */
			
 
				+  getSegmentNameCounts(segments: string[]): Map<string, number> {
			
 
				+    if (segments.length === 0) return new Map();
			
 
				+    const placeholders = segments.map(() => '?').join(', ');
			
 
				+    const rows = this.db
			
 
				+      .prepare(
			
 
				+        `SELECT segment, COUNT(*) AS n FROM name_segment_vocab
			
 
				+         WHERE segment IN (${placeholders}) GROUP BY segment`,
			
 
				+      )
			
 
				+      .all(...segments) as Array<{ segment: string; n: number }>;
			
 
				+    return new Map(rows.map((r) => [r.segment, r.n]));
			
 
				+  }
			
 
				+
			
 
				+  /** Names containing the given segment (rare-single-word tier). */
			
 
				+  getNamesForSegment(segment: string, limit: number): string[] {
			
 
				+    const rows = this.db
			
 
				+      .prepare('SELECT name FROM name_segment_vocab WHERE segment = ? ORDER BY length(name) ASC LIMIT ?')
			
 
				+      .all(segment, limit) as Array<{ name: string }>;
			
 
				+    return rows.map((r) => r.name);
			
 
				+  }
			
 
				+
			
 
				   /**
			
 
				    * Get a node by ID
			
 
				    */
			
--- a/src/db/schema.sql
+++ b/src/db/schema.sql
@@ -123,6 +123,25 @@ CREATE TRIGGER IF NOT EXISTS nodes_au AFTER UPDATE ON nodes BEGIN
 
				     VALUES (NEW.rowid, NEW.id, NEW.name, NEW.qualified_name, NEW.docstring, NEW.signature);
			
 
				 END;
			
 
				 
			
 
				+-- Prose-word → symbol-name lookup for the prompt hook's graph-derived gate.
			
 
				+-- One row per (segment, name): segment is a lowercased word of a symbol name
			
 
				+-- ("OrderStateMachine" → order, state, machine — see identifier-segments.ts),
			
 
				+-- which lets natural-language prompt words be verified against the graph in
			
 
				+-- any language whose technical nouns are Latin script. File nodes are
			
 
				+-- excluded — a file's basename duplicates the symbols inside it and skews the
			
 
				+-- singleton-vs-cluster rarity statistics. FTS can't serve this lookup (its
			
 
				+-- tokenizer keeps camelCase names as single tokens), so segments are
			
 
				+-- materialized on the node write path.
			
 
				+-- Deletions leave orphan rows ON PURPOSE: rows are PROPOSALS, always
			
 
				+-- re-verified against nodes before being surfaced (CodeGraph.getSegmentMatches),
			
 
				+-- and a full index clears the table at its start. Populated lazily on old
			
 
				+-- databases (empty until the next index/sync heals it).
			
 
				+CREATE TABLE IF NOT EXISTS name_segment_vocab (
			
 
				+    segment TEXT NOT NULL,
			
 
				+    name TEXT NOT NULL,
			
 
				+    PRIMARY KEY (segment, name)
			
 
				+) WITHOUT ROWID;
			
 
				+
			
 
				 -- Edge indexes.
			
 
				 -- idx_edges_source / idx_edges_target are intentionally omitted —
			
 
				 -- the (source, kind) and (target, kind) composites below cover the
			
--- a/src/index.ts
+++ b/src/index.ts
@@ -15,6 +15,7 @@ import {
 
				   TraversalOptions,
			
 
				   SearchOptions,
			
 
				   SearchResult,
			
 
				+  SegmentMatch,
			
 
				   Context,
			
 
				   GraphStats,
			
 
				   TaskInput,
			
@@ -51,6 +52,8 @@ import { EXTRACTION_VERSION } from './extraction/extraction-version';
 
				 import { getCodeGraphDir } from './directory';
			
 
				 import { deriveProjectNameTokens } from './search/query-utils';
			
 
				 import { CodeGraphPackageVersion } from './mcp/version';
			
 
				+import { segmentLookupVariants, splitIdentifierSegments } from './search/identifier-segments';
			
 
				+import { createYielder } from './resolution/cooperative-yield';
			
 
				 
			
 
				 // Re-export types for consumers
			
 
				 export * from './types';
			
@@ -434,6 +437,10 @@ export class CodeGraph {
 
				       }
			
 
				       try {
			
 
				         const before = this.queries.getNodeAndEdgeCount();
			
 
				+        // Segment vocabulary starts empty and is repopulated by the node write
			
 
				+        // path as every file (re-)indexes below — so a full index is also the
			
 
				+        // orphan-cleanup pass for names deleted since the last one.
			
 
				+        try { this.queries.clearNameSegmentVocab(); } catch { /* vocab is advisory — never fail an index over it */ }
			
 
				         const result = await this.orchestrator.indexAll(options.onProgress, options.signal, options.verbose);
			
 
				 
			
 
				         // Re-detect frameworks now that the index is populated. The resolver
			
@@ -546,6 +553,14 @@ export class CodeGraph {
 
				         return { filesChecked: 0, filesAdded: 0, filesModified: 0, filesRemoved: 0, nodesUpdated: 0, durationMs: 0 };
			
 
				       }
			
 
				       try {
			
 
				+        // Captured BEFORE the sync runs: the sync's own incremental writes
			
 
				+        // populate vocab rows for the files it touches, so an end-of-sync
			
 
				+        // emptiness check would see "non-empty" and skip the backfill forever,
			
 
				+        // leaving every unchanged file's names unsegmented.
			
 
				+        const vocabWasEmpty = (() => {
			
 
				+          try { return this.queries.isNameSegmentVocabEmpty(); } catch { return false; }
			
 
				+        })();
			
 
				+
			
 
				         const result = await this.orchestrator.sync(options.onProgress);
			
 
				 
			
 
				         // Cross-file finalization (e.g. NestJS RouterModule prefixes). Run on
			
@@ -608,6 +623,18 @@ export class CodeGraph {
 
				           this.db.runMaintenance();
			
 
				         }
			
 
				 
			
 
				+        // Heal the segment vocabulary on indexes built before the table
			
 
				+        // existed (upgrade path): incremental writes above only cover changed
			
 
				+        // files, so a vocab that was empty when this sync STARTED means the
			
 
				+        // bulk was never segmented — backfill it (INSERT OR IGNORE, so the
			
 
				+        // rows the sync just wrote are fine). Batched + yielding — sync can
			
 
				+        // run on the daemon's liveness-watchdog thread (#850/#1091).
			
 
				+        try {
			
 
				+          if (vocabWasEmpty && this.queries.getNodeAndEdgeCount().nodes > 0) {
			
 
				+            await this.rebuildNameSegmentVocab();
			
 
				+          }
			
 
				+        } catch { /* vocab is advisory — never fail a sync over it */ }
			
 
				+
			
 
				         return result;
			
 
				       } finally {
			
 
				         this.fileLock.release();
			
@@ -881,6 +908,123 @@ export class CodeGraph {
 
				     return this.queries.searchNodes(query, options);
			
 
				   }
			
 
				 
			
 
				+  /**
			
 
				+   * Graph-derived prompt matching for the front-load hook's MEDIUM tier:
			
 
				+   * which indexed symbols do these prose words name? "state machine des
			
 
				+   * commandes" → `OrderStateMachine`, in any human language whose technical
			
 
				+   * nouns are Latin script — no keyword list involved.
			
 
				+   *
			
 
				+   * Precision comes from the repo's own naming statistics, not vocabulary:
			
 
				+   * - CO-OCCURRENCE: ≥2 words that are segments of the SAME name ("state" +
			
 
				+   *   "machine" → OrderStateMachine) is strong evidence and always qualifies.
			
 
				+   * - RARITY: a single matched word qualifies only when its segment is
			
 
				+   *   discriminative here (2 to {@link SEGMENT_RARITY_CEILING} distinct names) —
			
 
				+   *   "checkout" in a shop backend yes, "state" in a react app no.
			
 
				+   * Every candidate is re-verified against `nodes` before being returned
			
 
				+   * (vocab rows are proposals; deletions leave orphans by design), so a
			
 
				+   * returned symbol is guaranteed to exist right now.
			
 
				+   */
			
 
				+  getSegmentMatches(words: string[], limit: number = 6): SegmentMatch[] {
			
 
				+    if (words.length === 0) return [];
			
 
				+    // Variant → original word (plural folding), for coverage accounting.
			
 
				+    const variantToWord = new Map<string, string>();
			
 
				+    for (const word of words) {
			
 
				+      for (const variant of segmentLookupVariants(word)) {
			
 
				+        if (!variantToWord.has(variant)) variantToWord.set(variant, word);
			
 
				+      }
			
 
				+    }
			
 
				+    const variants = [...variantToWord.keys()];
			
 
				+
			
 
				+    // Tier A: co-occurrence. minSegments=2 counts VARIANTS, so fold a name's
			
 
				+    // matched variants back to distinct words before trusting the coverage.
			
 
				+    const candidates: Array<{ name: string; matchedWords: Set<string> }> = [];
			
 
				+    for (const hit of this.queries.getSegmentCoOccurrence(variants, 2, 24)) {
			
 
				+      const matched = this.wordsMatchingName(hit.name, variantToWord);
			
 
				+      if (matched.size >= 2) candidates.push({ name: hit.name, matchedWords: matched });
			
 
				+    }
			
 
				+
			
 
				+    // Tier B: single rare word. Only when co-occurrence found nothing — a
			
 
				+    // co-occurring name is categorically stronger evidence — and under
			
 
				+    // stricter rules, because one word is thin: the word must be ≥5 chars
			
 
				+    // (measured FPs: "this", "typo"); the segment must appear in AT LEAST TWO
			
 
				+    // names (a concept the codebase is about clusters across names —
			
 
				+    // CheckoutService/CheckoutController — while a prose coincidence is a
			
 
				+    // singleton: measured FP "deploy to PRODUCTION" → the one name
			
 
				+    // matchesNonProductionDir); and the candidate name must have ≥2 segments
			
 
				+    // (a bare common verb matching a bare function name — "write" → `write` —
			
 
				+    // is prose coincidence, not the user naming a symbol).
			
 
				+    if (candidates.length === 0) {
			
 
				+      const singleWordVariants = variants.filter((v) => variantToWord.get(v)!.length >= 5);
			
 
				+      const counts = this.queries.getSegmentNameCounts(singleWordVariants);
			
 
				+      const rare = [...counts.entries()]
			
 
				+        .filter(([, n]) => n >= 2 && n <= CodeGraph.SEGMENT_RARITY_CEILING)
			
 
				+        .sort((a, b) => a[1] - b[1])
			
 
				+        .slice(0, 2);
			
 
				+      for (const [variant] of rare) {
			
 
				+        const word = variantToWord.get(variant)!;
			
 
				+        for (const name of this.queries.getNamesForSegment(variant, 12)) {
			
 
				+          if (splitIdentifierSegments(name).length < 2) continue;
			
 
				+          candidates.push({ name, matchedWords: new Set([word]) });
			
 
				+        }
			
 
				+      }
			
 
				+    }
			
 
				+
			
 
				+    // Verify against nodes (the honesty gate) and pick a representative
			
 
				+    // definition per name — prefer a real symbol over a file/import node.
			
 
				+    const out: SegmentMatch[] = [];
			
 
				+    const seen = new Set<string>();
			
 
				+    candidates.sort((a, b) => b.matchedWords.size - a.matchedWords.size || a.name.length - b.name.length);
			
 
				+    for (const candidate of candidates) {
			
 
				+      if (out.length >= limit) break;
			
 
				+      if (seen.has(candidate.name)) continue;
			
 
				+      seen.add(candidate.name);
			
 
				+      const nodes = this.queries.getNodesByName(candidate.name);
			
 
				+      if (nodes.length === 0) continue; // orphaned vocab row — name no longer exists
			
 
				+      const rep = nodes.find((n) => n.kind !== 'file' && n.kind !== 'import') ?? nodes[0]!;
			
 
				+      out.push({
			
 
				+        name: candidate.name,
			
 
				+        kind: rep.kind,
			
 
				+        filePath: rep.filePath,
			
 
				+        startLine: rep.startLine ?? 0,
			
 
				+        matchedWords: [...candidate.matchedWords].sort(),
			
 
				+      });
			
 
				+    }
			
 
				+    return out;
			
 
				+  }
			
 
				+
			
 
				+  /** A single word ("state") can match hundreds of names in a big repo — that
			
 
				+   *  is noise, not signal. Ceiling for the single-word tier; co-occurrence is
			
 
				+   *  exempt because two words on one name is already discriminative. */
			
 
				+  private static readonly SEGMENT_RARITY_CEILING = 25;
			
 
				+
			
 
				+  /** Which of the prompt's original words match `name`'s segments (via
			
 
				+   *  variants). Segments are recomputed in JS — a name-keyed vocab lookup
			
 
				+   *  would scan the (segment, name) primary key. */
			
 
				+  private wordsMatchingName(name: string, variantToWord: Map<string, string>): Set<string> {
			
 
				+    const segments = new Set(splitIdentifierSegments(name));
			
 
				+    const matched = new Set<string>();
			
 
				+    for (const [variant, word] of variantToWord) {
			
 
				+      if (segments.has(variant)) matched.add(word);
			
 
				+    }
			
 
				+    return matched;
			
 
				+  }
			
 
				+
			
 
				+  /**
			
 
				+   * Rebuild the segment vocabulary from the current graph, batched and
			
 
				+   * yielding — the upgrade-heal path for indexes built before the vocab table
			
 
				+   * existed. Runs inside sync's mutex/lock (callers hold them).
			
 
				+   */
			
 
				+  private async rebuildNameSegmentVocab(): Promise<void> {
			
 
				+    const maybeYield = createYielder();
			
 
				+    const BATCH = 2000;
			
 
				+    for (let offset = 0; ; offset += BATCH) {
			
 
				+      const names = this.queries.getDistinctNodeNames(BATCH, offset);
			
 
				+      if (names.length === 0) break;
			
 
				+      this.queries.insertNameSegmentsBatch(names);
			
 
				+      await maybeYield();
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				   /**
			
 
				    * Normalized project-name tokens (go.mod / package.json / repo dir) used to
			
 
				    * down-weight the non-discriminative project name in search ranking (#720).
			
--- a/src/search/identifier-segments.ts
+++ b/src/search/identifier-segments.ts
@@ -0,0 +1,140 @@
 
				+/**
			
 
				+ * Identifier-segment utilities for the prompt hook's graph-derived gate
			
 
				+ * (name_segment_vocab): symbol names split into the words a human would use
			
 
				+ * for them in prose, and prompt prose normalized into candidate words to look
			
 
				+ * those segments up with.
			
 
				+ *
			
 
				+ * "OrderStateMachine" → order / state / machine — so the French prompt
			
 
				+ * "comment marche la state machine des commandes ?" (or any language's prose
			
 
				+ * naming the concept in Latin script) can be verified against the graph
			
 
				+ * without a keyword list ever knowing the words. The FTS index can't serve
			
 
				+ * this — its tokenizer keeps camelCase names as single tokens — which is why
			
 
				+ * segments are materialized at index time instead (see schema.sql,
			
 
				+ * name_segment_vocab).
			
 
				+ */
			
 
				+
			
 
				+/** Bounds keep degenerate identifiers (minified names, hashes) from bloating
			
 
				+ *  the vocab: segments outside them carry no prose signal anyway. */
			
 
				+const MIN_SEGMENT_CHARS = 2;
			
 
				+const MAX_SEGMENT_CHARS = 32;
			
 
				+const MAX_SEGMENTS_PER_NAME = 12;
			
 
				+
			
 
				+/**
			
 
				+ * Split a symbol or file name into lowercase word segments.
			
 
				+ *
			
 
				+ * Handles camelCase / PascalCase (inner lower→Upper), acronym runs
			
 
				+ * ("HTMLParser" → html/parser), snake_case / kebab-case / dotted file names
			
 
				+ * (non-alphanumerics separate), and keeps digits glued to their word
			
 
				+ * ("base64Encode" → base64/encode). Digit-only fragments are dropped.
			
 
				+ */
			
 
				+export function splitIdentifierSegments(name: string): string[] {
			
 
				+  if (!name) return [];
			
 
				+  const out = new Set<string>();
			
 
				+  for (const run of name.match(/[\p{L}\p{N}]+/gu) ?? []) {
			
 
				+    // Split before an Upper that follows lower/digit (camelCase hump), and
			
 
				+    // before the last Upper of an acronym run when a lowercase follows
			
 
				+    // ("HTMLParser" → HTML | Parser).
			
 
				+    const parts = run.split(/(?<=[\p{Ll}\p{N}])(?=\p{Lu})|(?<=\p{Lu})(?=\p{Lu}\p{Ll})/u);
			
 
				+    for (const part of parts) {
			
 
				+      if (out.size >= MAX_SEGMENTS_PER_NAME) return [...out];
			
 
				+      const seg = part.toLowerCase();
			
 
				+      if (seg.length < MIN_SEGMENT_CHARS || seg.length > MAX_SEGMENT_CHARS) continue;
			
 
				+      if (/^\p{N}+$/u.test(seg)) continue;
			
 
				+      out.add(seg);
			
 
				+    }
			
 
				+  }
			
 
				+  return [...out];
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * Normalize a prose word for segment lookup: lowercase + strip diacritics
			
 
				+ * (NFD, drop combining marks), so "références" matches the segment
			
 
				+ * "references" and "résolution" matches "resolution". Identifier segments are
			
 
				+ * overwhelmingly ASCII, so this is what buys Latin-script languages their
			
 
				+ * cross-lingual reach on loanwords.
			
 
				+ */
			
 
				+export function normalizeProseWord(word: string): string {
			
 
				+  return word.normalize('NFD').replace(/\p{M}+/gu, '').toLowerCase();
			
 
				+}
			
 
				+
			
 
				+/** Candidate cap: a prompt's first words carry its subject; scanning an essay
			
 
				+ *  buys nothing and the vocab lookup cost scales with this. */
			
 
				+const MAX_PROSE_CANDIDATES = 16;
			
 
				+const MIN_PROSE_CHARS = 4; // "the"/"des"/"une"/"fix" out; "auth"/"flow"/"path" in
			
 
				+const MAX_PROSE_CHARS = 24; // an unsegmented-script sentence is one giant run — skip it
			
 
				+
			
 
				+/**
			
 
				+ * English prompt words that are never evidence a symbol was NAMED, however
			
 
				+ * rare their segment happens to be in a given repo: function words, filler,
			
 
				+ * hyper-common dev verbs, and words ABOUT code rather than OF it ("rename
			
 
				+ * this file", "there's an issue"). Measured FPs that motivated this: "fix
			
 
				+ * THIS typo" matched `resolveDeferredThisMemberRefs` (repo-rare segment!),
			
 
				+ * "WRITE a haiku" matched `writeConfig`.
			
 
				+ *
			
 
				+ * English-only ON PURPOSE — this is not the #1126 keyword treadmill:
			
 
				+ * identifiers are written in English, so only English prose words can
			
 
				+ * accidentally collide with segments. Other languages' function words
			
 
				+ * ("avec", "pendant", "dieser") don't match anything and need no list.
			
 
				+ * Domain nouns ("state", "checkout", "order") stay OUT — they are exactly
			
 
				+ * the signal; the rarity/co-occurrence rules judge them per-repo.
			
 
				+ */
			
 
				+const ENGLISH_PROSE_STOPWORDS = new Set([
			
 
				+  'about', 'above', 'actually', 'after', 'again', 'against', 'almost', 'along', 'also', 'always',
			
 
				+  'another', 'anything', 'around', 'away', 'back', 'because', 'been', 'before', 'behind', 'being',
			
 
				+  'below', 'best', 'better', 'between', 'both', 'cannot', 'come', 'could', 'does', 'doing', 'done',
			
 
				+  'down', 'each', 'either', 'else', 'even', 'ever', 'every', 'everything', 'fine', 'first', 'from',
			
 
				+  'getting', 'give', 'goes', 'going', 'gone', 'good', 'great', 'have', 'having', 'help', 'here',
			
 
				+  'inside', 'instead', 'into', 'just', 'keep', 'know', 'last', 'least', 'less', 'like', 'likely',
			
 
				+  'little', 'look', 'looking', 'made', 'make', 'making', 'many', 'maybe', 'mind', 'more', 'most',
			
 
				+  'much', 'must', 'need', 'needs', 'never', 'next', 'nice', 'none', 'nothing', 'okay', 'only',
			
 
				+  'onto', 'other', 'otherwise', 'over', 'please', 'pretty', 'probably', 'quite', 'rather', 'really',
			
 
				+  'right', 'same', 'seem', 'seems', 'should', 'show', 'since', 'some', 'someone', 'something',
			
 
				+  'somewhere', 'soon', 'still', 'such', 'sure', 'take', 'than', 'thank', 'thanks', 'that', 'their',
			
 
				+  'them', 'then', 'there', 'these', 'they', 'thing', 'things', 'think', 'this', 'those', 'though',
			
 
				+  'tried', 'tries', 'trying', 'under', 'until', 'upon', 'very', 'want', 'wants', 'well', 'went',
			
 
				+  'were', 'what', 'when', 'which', 'while', 'will', 'wish', 'with', 'within', 'without', 'would',
			
 
				+  'wrong', 'your', 'yours',
			
 
				+  // words ABOUT code, not OF it — present in a huge share of prompts while
			
 
				+  // almost never naming the symbol the user means
			
 
				+  'again', 'change', 'changes', 'check', 'class', 'classes', 'code', 'detail', 'details',
			
 
				+  'directory', 'error', 'errors', 'example', 'examples', 'file', 'files', 'folder', 'function',
			
 
				+  'functions', 'issue', 'issues', 'line', 'lines', 'method', 'methods', 'name', 'names', 'problem',
			
 
				+  'problems', 'project', 'question', 'questions', 'rename', 'test', 'tests', 'type', 'types',
			
 
				+  'update', 'value', 'values', 'warning', 'warnings', 'work', 'working', 'write', 'writing',
			
 
				+]);
			
 
				+
			
 
				+/**
			
 
				+ * Candidate words from a prompt for segment-vocabulary lookup, in order of
			
 
				+ * appearance: Unicode letter/digit runs, normalized via
			
 
				+ * {@link normalizeProseWord}, length-bounded, digit-only dropped,
			
 
				+ * {@link ENGLISH_PROSE_STOPWORDS} dropped, deduped, capped. Everything that
			
 
				+ * survives is judged per-repo by the rarity and co-occurrence rules in
			
 
				+ * CodeGraph.getSegmentMatches — there is no domain-word list.
			
 
				+ */
			
 
				+export function extractProseCandidates(prompt: string): string[] {
			
 
				+  if (!prompt) return [];
			
 
				+  const seen = new Set<string>();
			
 
				+  for (const run of prompt.match(/[\p{L}\p{N}]+/gu) ?? []) {
			
 
				+    if (seen.size >= MAX_PROSE_CANDIDATES) break;
			
 
				+    if (run.length > MAX_PROSE_CHARS) continue;
			
 
				+    const w = normalizeProseWord(run);
			
 
				+    if (w.length < MIN_PROSE_CHARS || w.length > MAX_PROSE_CHARS) continue;
			
 
				+    if (/^\p{N}+$/u.test(w)) continue;
			
 
				+    if (ENGLISH_PROSE_STOPWORDS.has(w)) continue;
			
 
				+    seen.add(w);
			
 
				+  }
			
 
				+  return [...seen];
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * Lookup variants for a prose word: the word itself plus light plural folding,
			
 
				+ * so common plurals still hit their singular segment. Only a trailing "s"/"es"
			
 
				+ * is stripped ("services" → service); "-ies" → "-y" ("dependencies" →
			
 
				+ * dependency) is NOT attempted. Variants map back to the same original word.
			
 
				+ */
			
 
				+export function segmentLookupVariants(word: string): string[] {
			
 
				+  const variants = [word];
			
 
				+  if (word.endsWith('es') && word.length >= MIN_PROSE_CHARS + 2) variants.push(word.slice(0, -2));
			
 
				+  if (word.endsWith('s') && word.length >= MIN_PROSE_CHARS + 1) variants.push(word.slice(0, -1));
			
 
				+  return variants;
			
 
				+}
			
--- a/src/types.ts
+++ b/src/types.ts
@@ -410,6 +410,25 @@ export interface SearchResult {
 
				   highlights?: string[];
			
 
				 }
			
 
				 
			
 
				+/**
			
 
				+ * A symbol whose name-segments match prose words from a prompt — the
			
 
				+ * graph-derived signal behind the front-load hook's medium tier
			
 
				+ * (CodeGraph.getSegmentMatches). Always verified to exist in `nodes` at the
			
 
				+ * time it is returned.
			
 
				+ */
			
 
				+export interface SegmentMatch {
			
 
				+  /** Symbol name as indexed (e.g. `OrderStateMachine`). */
			
 
				+  name: string;
			
 
				+  /** Kind of the representative definition. */
			
 
				+  kind: NodeKind;
			
 
				+  /** File of the representative definition. */
			
 
				+  filePath: string;
			
 
				+  /** 1-based start line of the representative definition (0 when the node has none recorded). */
			
 
				+  startLine: number;
			
 
				+  /** The prompt words (normalized) that matched this name's segments. */
			
 
				+  matchedWords: string[];
			
 
				+}
			
 
				+
			
 
				 // =============================================================================
			
 
				 // Context Types
			
 
				 // =============================================================================