|
@@ -8,9 +8,11 @@ import * as path from 'path';
|
|
|
import { Node } from '../types';
|
|
import { Node } from '../types';
|
|
|
|
|
|
|
|
/**
|
|
/**
|
|
|
- * Common stop words to filter from search queries
|
|
|
|
|
|
|
+ * Common stop words to filter from search queries.
|
|
|
|
|
+ * Includes generic English + code-specific noise words.
|
|
|
*/
|
|
*/
|
|
|
export const STOP_WORDS = new Set([
|
|
export const STOP_WORDS = new Set([
|
|
|
|
|
+ // English
|
|
|
'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
|
|
'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
|
|
|
'of', 'with', 'by', 'from', 'is', 'it', 'that', 'this', 'are', 'was',
|
|
'of', 'with', 'by', 'from', 'is', 'it', 'that', 'this', 'are', 'was',
|
|
|
'be', 'has', 'had', 'have', 'do', 'does', 'did', 'will', 'would', 'could',
|
|
'be', 'has', 'had', 'have', 'do', 'does', 'did', 'will', 'would', 'could',
|
|
@@ -18,17 +20,41 @@ export const STOP_WORDS = new Set([
|
|
|
'every', 'how', 'what', 'where', 'when', 'who', 'which', 'why',
|
|
'every', 'how', 'what', 'where', 'when', 'who', 'which', 'why',
|
|
|
'i', 'me', 'my', 'we', 'our', 'you', 'your', 'he', 'she', 'they',
|
|
'i', 'me', 'my', 'we', 'our', 'you', 'your', 'he', 'she', 'they',
|
|
|
'find', 'show', 'get', 'list', 'give', 'tell',
|
|
'find', 'show', 'get', 'list', 'give', 'tell',
|
|
|
|
|
+ 'been', 'done', 'made', 'used', 'using', 'work', 'works', 'found',
|
|
|
|
|
+ 'also', 'into', 'then', 'than', 'just', 'more', 'some', 'such',
|
|
|
|
|
+ 'over', 'only', 'new', 'out', 'its', 'so', 'up', 'as', 'if',
|
|
|
|
|
+ // Code-specific noise
|
|
|
|
|
+ 'code', 'file', 'files', 'function', 'method', 'class', 'type',
|
|
|
|
|
+ 'build', 'run', 'test', 'fix', 'bug', 'call', 'called', 'set', 'add',
|
|
|
]);
|
|
]);
|
|
|
|
|
|
|
|
/**
 * Extract meaningful search terms from a natural language query.
 *
 * Identifiers embedded in the query are broken into their component words
 * before filtering: camelCase, PascalCase, snake_case, SCREAMING_SNAKE and
 * dot.notation are all split, including at a digit followed by a capitalised
 * word (e.g. "utf8Decoder" yields "utf8" and "decoder").
 *
 * Only ASCII letters and digits are kept; every other character (underscore,
 * dot, hyphen, whitespace, punctuation, non-ASCII letters) acts as a
 * separator. Each word is lower-cased, then dropped if it is shorter than
 * three characters or is listed in STOP_WORDS.
 *
 * @param query - Free-form query text, possibly containing code identifiers.
 * @returns Unique lower-cased terms, in order of first appearance.
 */
export function extractSearchTerms(query: string): string[] {
  const spaced = query
    // lower → Upper boundary: "getUser" → "get User"
    .replace(/([a-z])([A-Z])/g, '$1 $2')
    // acronym → word boundary: "HTTPResponse" → "HTTP Response"
    .replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2')
    // digit → capitalised word: "utf8Decoder" → "utf8 Decoder".
    // Requiring a lowercase letter after the capital keeps all-caps
    // tokens such as "2FA" in one piece.
    .replace(/([0-9])([A-Z][a-z])/g, '$1 $2');

  // A Set removes duplicates while preserving insertion order.
  const terms = new Set<string>();

  // Splitting on non-alphanumerics also handles "_" and "." separators;
  // empty fragments are discarded by the length check below.
  for (const word of spaced.split(/[^a-zA-Z0-9]+/)) {
    const lower = word.toLowerCase();
    if (lower.length < 3 || STOP_WORDS.has(lower)) continue;
    terms.add(lower);
  }

  return [...terms];
}
|
|
|
|
|
|
|
|
/**
|
|
/**
|