Bladeren bron

feat: Add stem variants to search term extraction for broader definition matching

Expands symbol lookup with morphological variants (e.g., "caching"→"cache", "eviction"→"evict") to find related class definitions that FTS prefix matching would otherwise miss. Includes comprehensive stemming rules for common English suffixes (-ing, -tion, -ed, -er, etc.) and integrates stem expansion into definition prefix search for improved symbol discovery.
Colby McHenry 2 maanden geleden
bovenliggende
commit
f668b2cd1c
2 gewijzigde bestanden met toevoegingen van 96 en 2 verwijderingen
  1. 9 1
      src/context/index.ts
  2. 87 1
      src/search/query-utils.ts

+ 9 - 1
src/context/index.ts

@@ -26,7 +26,7 @@ import { VectorManager } from '../vectors';
 import { formatContextAsMarkdown, formatContextAsJson } from './formatter';
 import { logDebug } from '../errors';
 import { validatePathWithinRoot } from '../utils';
-import { isTestFile, extractSearchTerms, scorePathRelevance } from '../search/query-utils';
+import { isTestFile, extractSearchTerms, scorePathRelevance, getStemVariants } from '../search/query-utils';
 
 /**
  * Extract likely symbol names from a natural language query
@@ -352,10 +352,18 @@ export class ContextBuilder {
     // Step 2b: Search for extracted symbols as definition (class/interface) prefixes.
     // When the user writes "REST", "bulk", or "allocation", they usually mean classes
     // like RestController, BulkRequest, AllocationService — not nodes named exactly that.
+    // Also tries stem variants: "caching" → "cache" finds Cache, CacheBuilder.
     if (symbolsFromQuery.length > 0) {
       const definitionKinds: NodeKind[] = ['class', 'interface', 'struct', 'trait',
         'protocol', 'enum', 'type_alias'];
+      // Expand symbols with stem variants for broader definition matching
+      const expandedSymbols = new Set(symbolsFromQuery);
       for (const sym of symbolsFromQuery) {
+        for (const variant of getStemVariants(sym)) {
+          expandedSymbols.add(variant);
+        }
+      }
+      for (const sym of expandedSymbols) {
         // Title-case the symbol: "REST" → "Rest", "bulk" → "Bulk", "allocation" → "Allocation"
         const titleCased = sym.charAt(0).toUpperCase() + sym.slice(1).toLowerCase();
         if (titleCased === sym) continue; // already title-case (e.g., "Engine") — handled by exact match

+ 87 - 1
src/search/query-utils.ts

@@ -28,6 +28,70 @@ export const STOP_WORDS = new Set([
   'fix', 'bug', 'called',
 ]);
 
+/**
+ * Produce lowercase stem variants of a search term by stripping common English suffixes (-ing, -tion/-sion, -ment, -ies/-es/-s, -ed, -er).
+ * Used for FTS query expansion so "caching" also finds "cache", "eviction" finds "evict", etc.; stems are PREFIX-matched, so they need not be real English words.
+ * @param term - word to stem (lowercased internally). @returns unique variants of length >= 3, never the lowercased term itself; empty when no rule applies.
+ */
+export function getStemVariants(term: string): string[] {
+  const variants = new Set<string>();
+  const t = term.toLowerCase();
+
+  // -ing (terms of 6+ chars): caching→cach/cache, handling→handl/handle, running→runn/runne/run (doubled final consonant also collapsed)
+  if (t.endsWith('ing') && t.length > 5) {
+    const base = t.slice(0, -3);
+    variants.add(base);
+    variants.add(base + 'e');
+    if (base.length >= 2 && base[base.length - 1] === base[base.length - 2]) {
+      variants.add(base.slice(0, -1));
+    }
+  }
+
+  // -tion/-sion (6+ chars): only the trailing "ion" is removed, so the t/s stays: eviction→evict, expression→express
+  if ((t.endsWith('tion') || t.endsWith('sion')) && t.length > 5) {
+    variants.add(t.slice(0, -3));
+  }
+
+  // -ment (7+ chars): management→manage
+  if (t.endsWith('ment') && t.length > 6) {
+    variants.add(t.slice(0, -4));
+  }
+
+  // Plural forms are mutually exclusive (first match wins). -ies (5+ chars): entries→entry
+  if (t.endsWith('ies') && t.length > 4) {
+    variants.add(t.slice(0, -3) + 'y');
+  }
+  // -es (5+ chars): processes→process, classes→class; note caches→cach (not cache), which still works as a prefix
+  else if (t.endsWith('es') && t.length > 4) {
+    variants.add(t.slice(0, -2));
+  }
+  // -s (5+ chars): errors→error (skip -ss endings like "class")
+  else if (t.endsWith('s') && !t.endsWith('ss') && t.length > 4) {
+    variants.add(t.slice(0, -1));
+  }
+
+  // -ed (5+ chars, not -eed): handled→handle/handl, carried→carrie/carri/carry. NOTE(review): no doubled-consonant collapse here (stopped→stoppe/stopp), unlike -ing/-er — confirm intended
+  if (t.endsWith('ed') && !t.endsWith('eed') && t.length > 4) {
+    variants.add(t.slice(0, -1));
+    variants.add(t.slice(0, -2));
+    if (t.endsWith('ied') && t.length > 5) {
+      variants.add(t.slice(0, -3) + 'y');
+    }
+  }
+
+  // -er (5+ chars): builder→build/builde, handler→handl/handle, getter→gett/gette/get (doubled final consonant also collapsed)
+  if (t.endsWith('er') && t.length > 4) {
+    const base = t.slice(0, -2);
+    variants.add(base);
+    variants.add(base + 'e');
+    if (base.length >= 2 && base[base.length - 1] === base[base.length - 2]) {
+      variants.add(base.slice(0, -1));
+    }
+  }
+
+  return [...variants].filter(v => v.length >= 3 && v !== t); // drop stems shorter than 3 chars and the lowercased term itself
+}
+
 /**
  * Extract meaningful search terms from a natural language query.
  * Splits camelCase, PascalCase, snake_case, SCREAMING_SNAKE, and dot.notation
@@ -36,6 +100,9 @@ export const STOP_WORDS = new Set([
  * Preserves original compound identifiers (e.g., "scrapeLoop") alongside
  * their split parts so that FTS can match both the full symbol name and
  * individual words within it.
+ *
+ * Also generates stem variants (e.g., "caching"→"cache", "eviction"→"evict")
+ * so FTS prefix matching can find related code symbols.
  */
 export function extractSearchTerms(query: string): string[] {
   const tokens = new Set<string>();
@@ -76,6 +143,21 @@ export function extractSearchTerms(query: string): string[] {
     tokens.add(lower);
   }
 
+  // Generate stem variants for broader FTS matching.
+  // "caching" → "cache" finds CacheBuilder; "eviction" → "evict" finds evictEntries.
+  // Also enables co-occurrence dampening by increasing term count above 1.
+  const stems = new Set<string>();
+  for (const token of tokens) {
+    for (const variant of getStemVariants(token)) {
+      if (!tokens.has(variant) && !STOP_WORDS.has(variant)) {
+        stems.add(variant);
+      }
+    }
+  }
+  for (const stem of stems) {
+    tokens.add(stem);
+  }
+
   return [...tokens];
 }
 
@@ -133,10 +215,14 @@ export function isTestFile(filePath: string): boolean {
     fileName.endsWith('_test.rs') ||
     fileName.endsWith('Tests.java') ||
     fileName.endsWith('Test.java') ||
+    fileName.endsWith('Tester.java') ||
+    fileName.endsWith('TestCase.java') ||
     lower.includes('/tests/') ||
     lower.includes('/test/') ||
     lower.includes('/__tests__/') ||
-    lower.includes('/spec/')
+    lower.includes('/spec/') ||
+    lower.includes('/testlib/') ||
+    lower.includes('/testing/')
   );
 }