Преглед изворног кода

feat: Add optional stem control to search term extraction and improve name match scoring

Addresses path relevance score inflation, where stem variants created many near-duplicate terms that all matched the same path segments. Adds a `stems` option to `extractSearchTerms` (default `true`) and disables stems for path scoring while keeping them for FTS matching. Also improves name match bonus scoring: prefix matches are now scaled by the query-to-name length ratio (10 + 30 × ratio, previously a flat 20), and exact matches score higher (30→80 for a full-name match, 25→60 for a match on a single query token).
Colby McHenry пре 2 месеци
родитељ
комит
392c146810
1 измењених фајлова са 24 додато и 14 уклоњено
  1. 24 14
      src/search/query-utils.ts

+ 24 - 14
src/search/query-utils.ts

@@ -107,7 +107,8 @@ export function getStemVariants(term: string): string[] {
  * By default also generates stem variants (e.g., "caching"→"cache", "eviction"→"evict")
  * so FTS prefix matching can find related code symbols; pass { stems: false } to skip them.
  */
-export function extractSearchTerms(query: string): string[] {
+export function extractSearchTerms(query: string, options?: { stems?: boolean }): string[] {
+  const includeStems = options?.stems !== false;
   const tokens = new Set<string>();
 
   // First, extract and preserve compound identifiers before splitting
@@ -149,16 +150,19 @@ export function extractSearchTerms(query: string): string[] {
   // Generate stem variants for broader FTS matching.
   // "caching" → "cache" finds CacheBuilder; "eviction" → "evict" finds evictEntries.
   // Also enables co-occurrence dampening by increasing term count above 1.
-  const stems = new Set<string>();
-  for (const token of tokens) {
-    for (const variant of getStemVariants(token)) {
-      if (!tokens.has(variant) && !STOP_WORDS.has(variant)) {
-        stems.add(variant);
+  // Stems are skipped when scoring path relevance (stems inflate path scores).
+  if (includeStems) {
+    const stems = new Set<string>();
+    for (const token of tokens) {
+      for (const variant of getStemVariants(token)) {
+        if (!tokens.has(variant) && !STOP_WORDS.has(variant)) {
+          stems.add(variant);
+        }
       }
     }
-  }
-  for (const stem of stems) {
-    tokens.add(stem);
+    for (const stem of stems) {
+      tokens.add(stem);
+    }
   }
 
   return [...tokens];
@@ -169,7 +173,9 @@ export function extractSearchTerms(query: string): string[] {
  * Higher score = more relevant path
  */
 export function scorePathRelevance(filePath: string, query: string): number {
-  const terms = extractSearchTerms(query);
+  // Use base terms only — stem variants inflate path scores by generating
+  // many near-duplicate terms that all match the same path segments.
+  const terms = extractSearchTerms(query, { stems: false });
   if (terms.length === 0) return 0;
 
   const pathLower = filePath.toLowerCase();
@@ -251,13 +257,17 @@ export function nameMatchBonus(nodeName: string, query: string): number {
   const queryLower = query.replace(/[\s]+/g, '').toLowerCase();
 
   // Exact match: query exactly equals the node name
-  if (nameLower === queryLower) return 30;
+  if (nameLower === queryLower) return 80;
 
   // Exact match on a query token: "CacheBuilder build" and node name is "build"
-  if (queryTokens.length > 1 && queryTokens.includes(nameLower)) return 25;
+  if (queryTokens.length > 1 && queryTokens.includes(nameLower)) return 60;
 
-  // Name starts with query (prefix search: "Cache" → "CacheBuilder")
-  if (nameLower.startsWith(queryLower)) return 20;
+  // Name starts with query — scale by length ratio so "Pod"→"Pod" (exact, handled above)
+  // scores much higher than "Pod"→"PodGCControllerOptions" (ratio 3/22 ≈ 0.14).
+  if (nameLower.startsWith(queryLower)) {
+    const ratio = queryLower.length / nameLower.length;
+    return Math.round(10 + 30 * ratio);
+  }
 
   // All camelCase-split terms appear in the name
   if (rawTerms.length > 1) {