Explorar el Código

feat: Improve search tokenization with camelCase splitting and code-aware stop words

extractSearchTerms now splits camelCase, PascalCase, snake_case, and
dot.notation into individual tokens (e.g. "getUserName" → ["user", "name"];
"get" is split out too, but is then dropped because it is a stop word).
Stop words expanded with code-specific noise words (code, file, function,
method, class, type, etc.) to improve search precision.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Colby McHenry hace 2 meses
padre
commit
0756636bde
Se ha modificado 1 fichero con 33 adiciones y 7 borrados
  1. 33 7
      src/search/query-utils.ts

+ 33 - 7
src/search/query-utils.ts

@@ -8,9 +8,11 @@ import * as path from 'path';
 import { Node } from '../types';
 
 /**
- * Common stop words to filter from search queries
+ * Common stop words to filter from search queries.
+ * Includes generic English + code-specific noise words.
  */
 export const STOP_WORDS = new Set([
+  // English
   'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
   'of', 'with', 'by', 'from', 'is', 'it', 'that', 'this', 'are', 'was',
   'be', 'has', 'had', 'have', 'do', 'does', 'did', 'will', 'would', 'could',
@@ -18,17 +20,41 @@ export const STOP_WORDS = new Set([
   'every', 'how', 'what', 'where', 'when', 'who', 'which', 'why',
   'i', 'me', 'my', 'we', 'our', 'you', 'your', 'he', 'she', 'they',
   'find', 'show', 'get', 'list', 'give', 'tell',
+  'been', 'done', 'made', 'used', 'using', 'work', 'works', 'found',
+  'also', 'into', 'then', 'than', 'just', 'more', 'some', 'such',
+  'over', 'only', 'new', 'out', 'its', 'so', 'up', 'as', 'if',
+  // Code-specific noise
+  'code', 'file', 'files', 'function', 'method', 'class', 'type',
+  'build', 'run', 'test', 'fix', 'bug', 'call', 'called', 'set', 'add',
 ]);
 
 /**
- * Extract meaningful search terms from a natural language query
+ * Extract meaningful search terms from a natural language query.
+ * Splits camelCase, PascalCase, snake_case, SCREAMING_SNAKE, and dot.notation
+ * into individual tokens before filtering.
  */
 export function extractSearchTerms(query: string): string[] {
-  return query
-    .toLowerCase()
-    .replace(/[^\w\s-]/g, ' ')
-    .split(/\s+/)
-    .filter(term => term.length > 1 && !STOP_WORDS.has(term));
+  const tokens = new Set<string>();
+
+  // Split camelCase / PascalCase: "getUserName" → "get User Name"
+  const camelSplit = query
+    .replace(/([a-z])([A-Z])/g, '$1 $2')
+    .replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2');
+
+  // Replace underscores and dots with spaces (snake_case, dot.notation)
+  const normalised = camelSplit.replace(/[_.]+/g, ' ');
+
+  // Split on any non-alphanumeric character
+  const words = normalised.split(/[^a-zA-Z0-9]+/).filter(Boolean);
+
+  for (const word of words) {
+    const lower = word.toLowerCase();
+    if (lower.length < 3) continue;
+    if (STOP_WORDS.has(lower)) continue;
+    tokens.add(lower);
+  }
+
+  return [...tokens];
 }
 
 /**