瀏覽代碼

feat: Improve search tokenization with camelCase splitting and code-aware stop words

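extractSearchTerms now splits camelCase, PascalCase, snake_case, and
dot.notation into individual lowercase tokens (e.g. "getUserName" →
["user", "name"]; "get" is dropped as a stop word). Tokens shorter than
three characters are discarded and duplicate tokens are removed.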
Stop words expanded with code-specific noise words (code, file, function,
method, class, type, etc.) to improve search precision.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Colby McHenry 2 月之前
父節點
當前提交
0756636bde
共有 1 個文件被更改,包括 33 次插入7 次删除
  1. 33 7
      src/search/query-utils.ts

+ 33 - 7
src/search/query-utils.ts

@@ -8,9 +8,11 @@ import * as path from 'path';
 import { Node } from '../types';
 
 /**
- * Common stop words to filter from search queries
+ * Common stop words to filter from search queries.
+ * Includes generic English + code-specific noise words.
  */
 export const STOP_WORDS = new Set([
+  // English
   'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
   'of', 'with', 'by', 'from', 'is', 'it', 'that', 'this', 'are', 'was',
   'be', 'has', 'had', 'have', 'do', 'does', 'did', 'will', 'would', 'could',
@@ -18,17 +20,41 @@ export const STOP_WORDS = new Set([
   'every', 'how', 'what', 'where', 'when', 'who', 'which', 'why',
   'i', 'me', 'my', 'we', 'our', 'you', 'your', 'he', 'she', 'they',
   'find', 'show', 'get', 'list', 'give', 'tell',
+  'been', 'done', 'made', 'used', 'using', 'work', 'works', 'found',
+  'also', 'into', 'then', 'than', 'just', 'more', 'some', 'such',
+  'over', 'only', 'new', 'out', 'its', 'so', 'up', 'as', 'if',
+  // Code-specific noise
+  'code', 'file', 'files', 'function', 'method', 'class', 'type',
+  'build', 'run', 'test', 'fix', 'bug', 'call', 'called', 'set', 'add',
 ]);
 
 /**
- * Extract meaningful search terms from a natural language query
+ * Extract meaningful search terms from a natural language query.
+ * Splits camelCase, PascalCase, snake_case, SCREAMING_SNAKE, and dot.notation
+ * into lowercase tokens, then drops stop words and tokens under 3 characters.
  */
 export function extractSearchTerms(query: string): string[] {
-  return query
-    .toLowerCase()
-    .replace(/[^\w\s-]/g, ' ')
-    .split(/\s+/)
-    .filter(term => term.length > 1 && !STOP_WORDS.has(term));
+  const tokens = new Set<string>();
+
+  // Split camelCase / PascalCase: "getUserName" → "get User Name"
+  const camelSplit = query
+    .replace(/([a-z])([A-Z])/g, '$1 $2')
+    .replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2');
+
+  // Replace underscores and dots with spaces (snake_case, dot.notation)
+  const normalised = camelSplit.replace(/[_.]+/g, ' ');
+
+  // Split on runs of characters other than ASCII letters and digits
+  const words = normalised.split(/[^a-zA-Z0-9]+/).filter(Boolean);
+
+  for (const word of words) {
+    const lower = word.toLowerCase();
+    if (lower.length < 3) continue;
+    if (STOP_WORDS.has(lower)) continue;
+    tokens.add(lower);
+  }
+
+  return [...tokens];
 }
 
 /**