Procházet zdrojové kódy

feat: Improve search tokenization with camelCase splitting and code-aware stop words

extractSearchTerms now splits camelCase, PascalCase, snake_case, and
dot.notation into individual tokens (e.g. "getUserName" → ["user", "name"];
"get" is dropped because it is a stop word).
Stop words expanded with code-specific noise words (code, file, function,
method, class, type, etc.) to improve search precision.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Colby McHenry před 2 měsíci
rodič
revize
0756636bde
Změněn 1 soubor: 33 přidání a 7 odebrání
  1. 33 7
      src/search/query-utils.ts

+ 33 - 7
src/search/query-utils.ts

@@ -8,9 +8,11 @@ import * as path from 'path';
 import { Node } from '../types';
 
 /**
- * Common stop words to filter from search queries
+ * Common stop words to filter from search queries.
+ * Includes generic English + code-specific noise words.
  */
 export const STOP_WORDS = new Set([
+  // English
   'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
   'of', 'with', 'by', 'from', 'is', 'it', 'that', 'this', 'are', 'was',
   'be', 'has', 'had', 'have', 'do', 'does', 'did', 'will', 'would', 'could',
@@ -18,17 +20,41 @@ export const STOP_WORDS = new Set([
   'every', 'how', 'what', 'where', 'when', 'who', 'which', 'why',
   'i', 'me', 'my', 'we', 'our', 'you', 'your', 'he', 'she', 'they',
   'find', 'show', 'get', 'list', 'give', 'tell',
+  'been', 'done', 'made', 'used', 'using', 'work', 'works', 'found',
+  'also', 'into', 'then', 'than', 'just', 'more', 'some', 'such',
+  'over', 'only', 'new', 'out', 'its', 'so', 'up', 'as', 'if',
+  // Code-specific noise
+  'code', 'file', 'files', 'function', 'method', 'class', 'type',
+  'build', 'run', 'test', 'fix', 'bug', 'call', 'called', 'set', 'add',
 ]);
 
 /**
- * Extract meaningful search terms from a natural language query
+ * Extract meaningful search terms from a natural language query.
+ * Splits camelCase, PascalCase, snake_case, SCREAMING_SNAKE, and dot.notation
+ * into individual tokens before filtering.
  */
 export function extractSearchTerms(query: string): string[] {
-  return query
-    .toLowerCase()
-    .replace(/[^\w\s-]/g, ' ')
-    .split(/\s+/)
-    .filter(term => term.length > 1 && !STOP_WORDS.has(term));
+  const tokens = new Set<string>();
+
+  // Split camelCase / PascalCase: "getUserName" → "get User Name"
+  const camelSplit = query
+    .replace(/([a-z])([A-Z])/g, '$1 $2')
+    .replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2');
+
+  // Replace underscores and dots with spaces (snake_case, dot.notation)
+  const normalised = camelSplit.replace(/[_.]+/g, ' ');
+
+  // Split on any non-alphanumeric character
+  const words = normalised.split(/[^a-zA-Z0-9]+/).filter(Boolean);
+
+  for (const word of words) {
+    const lower = word.toLowerCase();
+    if (lower.length < 3) continue;
+    if (STOP_WORDS.has(lower)) continue;
+    tokens.add(lower);
+  }
+
+  return [...tokens];
 }
 
 /**