query-utils.ts 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369
  1. /**
  2. * Search Query Utilities
  3. *
  4. * Shared module for search term extraction and scoring.
  5. */
  6. import * as path from 'path';
  7. import { Node } from '../types';
  8. /**
  9. * Common stop words to filter from search queries.
  10. * Includes generic English + code-specific noise words.
  11. */
  12. export const STOP_WORDS = new Set([
  13. // English
  14. 'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
  15. 'of', 'with', 'by', 'from', 'is', 'it', 'that', 'this', 'are', 'was',
  16. 'be', 'has', 'had', 'have', 'do', 'does', 'did', 'will', 'would', 'could',
  17. 'should', 'may', 'might', 'can', 'shall', 'not', 'no', 'all', 'each',
  18. 'every', 'how', 'what', 'where', 'when', 'who', 'which', 'why',
  19. 'i', 'me', 'my', 'we', 'our', 'you', 'your', 'he', 'she', 'they',
  20. 'show', 'give', 'tell',
  21. 'been', 'done', 'made', 'used', 'using', 'work', 'works', 'found',
  22. 'also', 'into', 'then', 'than', 'just', 'more', 'some', 'such',
  23. 'over', 'only', 'out', 'its', 'so', 'up', 'as', 'if',
  24. 'look', 'need', 'needs', 'want', 'happen', 'happens',
  25. 'affect', 'affected', 'break', 'breaks', 'failing',
  26. 'implemented', 'implement',
  27. // Code-specific noise (avoid filtering common symbol names like get/set/add/build/find/list)
  28. 'code', 'file', 'files', 'function', 'method', 'class', 'type',
  29. 'fix', 'bug', 'called',
  30. ]);
  31. /**
  32. * Generate stem variants of a search term by removing common English suffixes.
  33. * Used for FTS query expansion so "caching" also finds "cache", "eviction" finds "evict", etc.
  34. * Stems are used as PREFIX matches in FTS, so they don't need to be perfect English words.
  35. */
  36. export function getStemVariants(term: string): string[] {
  37. const variants = new Set<string>();
  38. const t = term.toLowerCase();
  39. // -ing: caching→cach/cache, handling→handl/handle, running→run
  40. if (t.endsWith('ing') && t.length > 5) {
  41. const base = t.slice(0, -3);
  42. variants.add(base);
  43. variants.add(base + 'e');
  44. if (base.length >= 2 && base[base.length - 1] === base[base.length - 2]) {
  45. variants.add(base.slice(0, -1));
  46. }
  47. }
  48. // -tion/-sion: eviction→evict, expression→express
  49. if ((t.endsWith('tion') || t.endsWith('sion')) && t.length > 5) {
  50. variants.add(t.slice(0, -3));
  51. }
  52. // -ment: management→manage
  53. if (t.endsWith('ment') && t.length > 6) {
  54. variants.add(t.slice(0, -4));
  55. }
  56. // -ies: entries→entry
  57. if (t.endsWith('ies') && t.length > 4) {
  58. variants.add(t.slice(0, -3) + 'y');
  59. }
  60. // -es: processes→process, classes→class
  61. else if (t.endsWith('es') && t.length > 4) {
  62. variants.add(t.slice(0, -2));
  63. }
  64. // -s: errors→error (skip -ss endings like "class")
  65. else if (t.endsWith('s') && !t.endsWith('ss') && t.length > 4) {
  66. variants.add(t.slice(0, -1));
  67. }
  68. // -ed: handled→handle, propagated→propagate, carried→carry
  69. if (t.endsWith('ed') && !t.endsWith('eed') && t.length > 4) {
  70. variants.add(t.slice(0, -1));
  71. variants.add(t.slice(0, -2));
  72. if (t.endsWith('ied') && t.length > 5) {
  73. variants.add(t.slice(0, -3) + 'y');
  74. }
  75. }
  76. // -er: builder→build/builde, handler→handl/handle, getter→get
  77. if (t.endsWith('er') && t.length > 4) {
  78. const base = t.slice(0, -2);
  79. variants.add(base);
  80. variants.add(base + 'e');
  81. if (base.length >= 2 && base[base.length - 1] === base[base.length - 2]) {
  82. variants.add(base.slice(0, -1));
  83. }
  84. }
  85. return [...variants].filter(v => v.length >= 3 && v !== t);
  86. }
  87. /**
  88. * Extract meaningful search terms from a natural language query.
  89. * Splits camelCase, PascalCase, snake_case, SCREAMING_SNAKE, and dot.notation
  90. * into individual tokens before filtering.
  91. *
  92. * Preserves original compound identifiers (e.g., "scrapeLoop") alongside
  93. * their split parts so that FTS can match both the full symbol name and
  94. * individual words within it.
  95. *
  96. * Also generates stem variants (e.g., "caching"→"cache", "eviction"→"evict")
  97. * so FTS prefix matching can find related code symbols.
  98. */
  99. export function extractSearchTerms(query: string, options?: { stems?: boolean }): string[] {
  100. const includeStems = options?.stems !== false;
  101. const tokens = new Set<string>();
  102. // First, extract and preserve compound identifiers before splitting
  103. // CamelCase: scrapeLoop, UserService, getCallGraph
  104. const compoundPattern = /\b([a-zA-Z][a-zA-Z0-9]*(?:[A-Z][a-z]+)+|[A-Z][a-z]+(?:[A-Z][a-z]*)+)\b/g;
  105. let match;
  106. while ((match = compoundPattern.exec(query)) !== null) {
  107. if (match[1] && match[1].length >= 3) {
  108. tokens.add(match[1].toLowerCase()); // preserve full compound: "scrapeloop"
  109. }
  110. }
  111. // snake_case: scrape_loop, user_service
  112. const snakePattern = /\b([a-zA-Z][a-zA-Z0-9]*(?:_[a-zA-Z0-9]+)+)\b/g;
  113. while ((match = snakePattern.exec(query)) !== null) {
  114. if (match[1] && match[1].length >= 3) {
  115. tokens.add(match[1].toLowerCase());
  116. }
  117. }
  118. // Split camelCase / PascalCase: "getUserName" → "get User Name"
  119. const camelSplit = query
  120. .replace(/([a-z])([A-Z])/g, '$1 $2')
  121. .replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2');
  122. // Replace underscores and dots with spaces (snake_case, dot.notation)
  123. const normalised = camelSplit.replace(/[_.]+/g, ' ');
  124. // Split on any non-alphanumeric character
  125. const words = normalised.split(/[^a-zA-Z0-9]+/).filter(Boolean);
  126. for (const word of words) {
  127. const lower = word.toLowerCase();
  128. if (lower.length < 3) continue;
  129. if (STOP_WORDS.has(lower)) continue;
  130. tokens.add(lower);
  131. }
  132. // Generate stem variants for broader FTS matching.
  133. // "caching" → "cache" finds CacheBuilder; "eviction" → "evict" finds evictEntries.
  134. // Also enables co-occurrence dampening by increasing term count above 1.
  135. // Stems are skipped when scoring path relevance (stems inflate path scores).
  136. if (includeStems) {
  137. const stems = new Set<string>();
  138. for (const token of tokens) {
  139. for (const variant of getStemVariants(token)) {
  140. if (!tokens.has(variant) && !STOP_WORDS.has(variant)) {
  141. stems.add(variant);
  142. }
  143. }
  144. }
  145. for (const stem of stems) {
  146. tokens.add(stem);
  147. }
  148. }
  149. return [...tokens];
  150. }
  151. /**
  152. * Score path relevance to a query
  153. * Higher score = more relevant path
  154. */
  155. export function scorePathRelevance(filePath: string, query: string): number {
  156. // Use base terms only — stem variants inflate path scores by generating
  157. // many near-duplicate terms that all match the same path segments.
  158. const terms = extractSearchTerms(query, { stems: false });
  159. if (terms.length === 0) return 0;
  160. const pathLower = filePath.toLowerCase();
  161. const fileName = path.basename(filePath).toLowerCase();
  162. const dirName = path.dirname(filePath).toLowerCase();
  163. let score = 0;
  164. for (const term of terms) {
  165. // Exact filename match (strongest)
  166. if (fileName.includes(term)) score += 10;
  167. // Directory match
  168. if (dirName.includes(term)) score += 5;
  169. // General path match
  170. else if (pathLower.includes(term)) score += 3;
  171. }
  172. // Deprioritize test files unless the query is explicitly about tests
  173. const queryLower = query.toLowerCase();
  174. const isTestQuery = queryLower.includes('test') || queryLower.includes('spec');
  175. if (!isTestQuery && isTestFile(filePath)) {
  176. score -= 15;
  177. }
  178. return score;
  179. }
  180. /**
  181. * Check if a file path looks like a test file
  182. */
  183. export function isTestFile(filePath: string): boolean {
  184. const lower = filePath.toLowerCase();
  185. const fileName = path.basename(filePath); // original case — needed for camelCase boundaries
  186. const lowerName = fileName.toLowerCase();
  187. // --- Filename patterns ---
  188. if (
  189. lowerName.startsWith('test_') || // python: test_foo.py
  190. lowerName.startsWith('test.') ||
  191. // separator-delimited: foo_test.go, foo.test.ts, foo-spec.rb, bar_spec.py
  192. /[._-](test|tests|spec|specs)\.[a-z0-9]+$/.test(lowerName) ||
  193. // CamelCase suffix (Java/Kotlin/Swift/C#/Scala): FooTest.kt, BarTests.swift,
  194. // BazSpec.scala, QuxTestCase.java. Capital-led so "latest.kt"/"manifest.kt"
  195. // (lowercase "test") are NOT matched.
  196. /(?:Test|Tests|TestCase|Tester|Spec|Specs)\.[A-Za-z0-9]+$/.test(fileName)
  197. ) {
  198. return true;
  199. }
  200. // --- Directory patterns ---
  201. if (
  202. lower.includes('/tests/') || lower.includes('/test/') ||
  203. lower.includes('/__tests__/') || lower.includes('/spec/') ||
  204. lower.includes('/specs/') || lower.includes('/testlib/') ||
  205. lower.includes('/testing/') ||
  206. lower.startsWith('test/') || lower.startsWith('tests/') ||
  207. lower.startsWith('spec/') || lower.startsWith('specs/') ||
  208. // CamelCase test source-set dirs (Kotlin Multiplatform / Gradle / Xcode):
  209. // jvmTest/, commonTest/, androidTest/, iosTest/, integrationTest/. Capital-led
  210. // so "latest/" / "manifest/" are not matched.
  211. /(?:^|\/)[A-Za-z0-9]*(?:Test|Tests|Spec)\//.test(filePath)
  212. ) {
  213. return true;
  214. }
  215. // Non-production directories: examples, samples, benchmarks, fixtures, demos.
  216. // Check both mid-path (/integration/) and start-of-path (integration/) since
  217. // file paths may be stored as relative paths without a leading slash.
  218. return matchesNonProductionDir(lower);
  219. }
  220. /**
  221. * Check if a path is in a non-production directory (integration, sample, example, etc.)
  222. * Handles both absolute paths (/foo/integration/bar) and relative paths (integration/bar).
  223. */
  224. function matchesNonProductionDir(lowerPath: string): boolean {
  225. const dirs = [
  226. 'integration', 'sample', 'samples', 'example', 'examples',
  227. 'fixture', 'fixtures', 'benchmark', 'benchmarks', 'demo', 'demos',
  228. ];
  229. for (const dir of dirs) {
  230. if (lowerPath.includes('/' + dir + '/') || lowerPath.startsWith(dir + '/')) {
  231. return true;
  232. }
  233. }
  234. return false;
  235. }
  236. /**
  237. * Bonus when a node's name matches the search query.
  238. * Exact matches get the largest boost; prefix matches get smaller boosts.
  239. * Multi-word queries also check individual term matches against the name.
  240. */
  241. export function nameMatchBonus(nodeName: string, query: string): number {
  242. const nameLower = nodeName.toLowerCase();
  243. // Split query into word-level terms (handles "CacheBuilder build" → ["cache","builder","build"])
  244. const rawTerms = query
  245. .replace(/([a-z])([A-Z])/g, '$1 $2')
  246. .split(/[\s_.\-]+/)
  247. .map(t => t.toLowerCase())
  248. .filter(t => t.length >= 2);
  249. // Also keep original space-separated tokens for exact-term matching
  250. const queryTokens = query.split(/\s+/).map(t => t.toLowerCase()).filter(t => t.length >= 2);
  251. // Full query as a single token (for compound identifiers like "CacheBuilder")
  252. const queryLower = query.replace(/[\s]+/g, '').toLowerCase();
  253. // Exact match: query exactly equals the node name
  254. if (nameLower === queryLower) return 80;
  255. // Exact match on a query token: "CacheBuilder build" and node name is "build"
  256. if (queryTokens.length > 1 && queryTokens.includes(nameLower)) return 60;
  257. // Name starts with query — scale by length ratio so "Pod"→"Pod" (exact, handled above)
  258. // scores much higher than "Pod"→"PodGCControllerOptions" (ratio 0.125).
  259. if (nameLower.startsWith(queryLower)) {
  260. const ratio = queryLower.length / nameLower.length;
  261. return Math.round(10 + 30 * ratio);
  262. }
  263. // All camelCase-split terms appear in the name
  264. if (rawTerms.length > 1) {
  265. const allMatch = rawTerms.every(t => nameLower.includes(t));
  266. if (allMatch) return 15;
  267. }
  268. // Name contains the full query as substring
  269. if (nameLower.includes(queryLower)) return 10;
  270. return 0;
  271. }
  272. /**
  273. * Kind-based bonus for search ranking
  274. * Functions and classes are typically more relevant than variables/imports
  275. */
  276. export function kindBonus(kind: Node['kind']): number {
  277. const bonuses: Record<string, number> = {
  278. function: 10,
  279. method: 10,
  280. class: 8,
  281. interface: 9,
  282. type_alias: 6,
  283. struct: 6,
  284. trait: 9,
  285. enum: 5,
  286. component: 8,
  287. route: 9,
  288. module: 4,
  289. property: 3,
  290. field: 3,
  291. variable: 2,
  292. constant: 3,
  293. import: 1,
  294. export: 1,
  295. parameter: 0,
  296. namespace: 4,
  297. file: 0,
  298. protocol: 9,
  299. enum_member: 3,
  300. };
  301. return bonuses[kind] ?? 0;
  302. }
  303. /**
  304. * Whether a query token looks like a code identifier the user deliberately typed
  305. * (camelCase / PascalCase-with-internal-caps / snake_case / has a digit) rather
  306. * than a plain dictionary word ("flat", "object", "screen").
  307. *
  308. * Used to decide whether an EXACT name match earns the "the user named this
  309. * symbol" exemption from single-term dampening. A common English word that
  310. * happens to exact-match an unrelated symbol — the query "flat object" matching
  311. * a constant named `FLAT` — must NOT get that exemption, or the +exact-name
  312. * bonus floats it to the top of a prose query on its own.
  313. *
  314. * Classifies the token AS THE USER TYPED IT, not the matched symbol's name:
  315. * "flat" (lowercase, descriptive) is non-distinctive even though it matches
  316. * `FLAT`. A leading-capital-only word ("Screen", "Zustand") is also treated as
  317. * a plain word — sentence-start capitalization and proper nouns aren't reliable
  318. * identifier signals.
  319. */
  320. export function isDistinctiveIdentifier(token: string): boolean {
  321. if (!token) return false;
  322. // snake_case / SCREAMING_SNAKE, or an embedded digit → a deliberate identifier.
  323. if (/[_0-9]/.test(token)) return true;
  324. // An uppercase letter anywhere AFTER the first char → a camelCase/PascalCase
  325. // boundary (setLastEmail, OrgUserStore) or an acronym (REST, HTTP).
  326. if (/[A-Z]/.test(token.slice(1))) return true;
  327. return false;
  328. }