Просмотр исходного кода

feat: Improve search tokenization with camelCase splitting and code-aware stop words

extractSearchTerms now splits camelCase, PascalCase, snake_case, and
dot.notation into individual lowercase tokens (e.g. "getUserName" →
["user", "name"]; "get" is dropped because it is a stop word). Tokens
shorter than three characters are discarded and duplicates are removed.
Stop words expanded with code-specific noise words (code, file, function,
method, class, type, etc.) to improve search precision.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Colby McHenry 2 месяцев назад
Родитель
Коммит
0756636bde
1 изменённый файл: 33 добавлено и 7 удалено
  1. 33 7
      src/search/query-utils.ts

+ 33 - 7
src/search/query-utils.ts

@@ -8,9 +8,11 @@ import * as path from 'path';
 import { Node } from '../types';
 
 /**
- * Common stop words to filter from search queries
+ * Common stop words to filter from search queries.
+ * Includes generic English + code-specific noise words.
  */
 export const STOP_WORDS = new Set([
+  // English
   'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
   'of', 'with', 'by', 'from', 'is', 'it', 'that', 'this', 'are', 'was',
   'be', 'has', 'had', 'have', 'do', 'does', 'did', 'will', 'would', 'could',
@@ -18,17 +20,41 @@ export const STOP_WORDS = new Set([
   'every', 'how', 'what', 'where', 'when', 'who', 'which', 'why',
   'i', 'me', 'my', 'we', 'our', 'you', 'your', 'he', 'she', 'they',
   'find', 'show', 'get', 'list', 'give', 'tell',
+  'been', 'done', 'made', 'used', 'using', 'work', 'works', 'found',
+  'also', 'into', 'then', 'than', 'just', 'more', 'some', 'such',
+  'over', 'only', 'new', 'out', 'its', 'so', 'up', 'as', 'if',
+  // Code-specific noise
+  'code', 'file', 'files', 'function', 'method', 'class', 'type',
+  'build', 'run', 'test', 'fix', 'bug', 'call', 'called', 'set', 'add',
 ]);
 
 /**
- * Extract meaningful search terms from a natural language query
+ * Extract meaningful search terms from a natural language query.
+ * Splits camelCase, PascalCase, snake_case, SCREAMING_SNAKE, and dot.notation
+ * into individual tokens before filtering.
  */
 export function extractSearchTerms(query: string): string[] {
-  return query
-    .toLowerCase()
-    .replace(/[^\w\s-]/g, ' ')
-    .split(/\s+/)
-    .filter(term => term.length > 1 && !STOP_WORDS.has(term));
+  const tokens = new Set<string>();
+
+  // Split camelCase / PascalCase: "getUserName" → "get User Name"
+  const camelSplit = query
+    .replace(/([a-z])([A-Z])/g, '$1 $2')
+    .replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2');
+
+  // Replace underscores and dots with spaces (snake_case, dot.notation)
+  const normalised = camelSplit.replace(/[_.]+/g, ' ');
+
+  // Split on any non-alphanumeric character
+  const words = normalised.split(/[^a-zA-Z0-9]+/).filter(Boolean);
+
+  for (const word of words) {
+    const lower = word.toLowerCase();
+    if (lower.length < 3) continue;
+    if (STOP_WORDS.has(lower)) continue;
+    tokens.add(lower);
+  }
+
+  return [...tokens];
 }
 
 /**