query-utils.ts 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441
  1. /**
  2. * Search Query Utilities
  3. *
  4. * Shared module for search term extraction and scoring.
  5. */
  6. import * as fs from 'fs';
  7. import * as path from 'path';
  8. import { Node } from '../types';
  9. /** Normalize a name to a comparable token: lowercase, alphanumerics only. */
  10. export function normalizeNameToken(raw: string): string {
  11. return raw.toLowerCase().replace(/[^a-z0-9]/g, '');
  12. }
  13. /**
  14. * Tokens that name the PROJECT as a whole — its `go.mod` module, `package.json`
  15. * name, or repo root directory — rather than any specific symbol. A user
  16. * naturally puts the project name in a query as context ("MyApp backend
  17. * routes"), but it carries no discriminative signal: when it's also a substring
  18. * of a symbol or path on one stack (a `MyAppFrontend/` dir, a `MyAppApp` class)
  19. * it lexically inflates that stack and buries the rest (#720).
  20. *
  21. * Returned normalized (lowercase, alphanumerics only) so a query word can be
  22. * compared by its normalized form. Only names ≥5 chars are kept — short ones
  23. * (`api`, `app`, `core`, `web`) collide with real query terms too often to
  24. * safely down-weight.
  25. */
  26. export function deriveProjectNameTokens(projectRoot: string): Set<string> {
  27. const tokens = new Set<string>();
  28. const add = (raw: string | undefined | null): void => {
  29. if (!raw) return;
  30. const norm = normalizeNameToken(raw);
  31. if (norm.length >= 5) tokens.add(norm);
  32. };
  33. // go.mod module last segment (the most reliable signal for Go repos).
  34. try {
  35. const gomod = fs.readFileSync(path.join(projectRoot, 'go.mod'), 'utf-8');
  36. const m = gomod.match(/^\s*module\s+(\S+)/m);
  37. if (m && m[1]) add(m[1].split('/').pop());
  38. } catch { /* no go.mod */ }
  39. // package.json name (strip an `@scope/` prefix).
  40. try {
  41. const pkg = JSON.parse(fs.readFileSync(path.join(projectRoot, 'package.json'), 'utf-8'));
  42. if (typeof pkg.name === 'string') add(pkg.name.replace(/^@[^/]+\//, ''));
  43. } catch { /* no / invalid package.json */ }
  44. // Repo root directory name — a fallback when neither manifest names the project.
  45. add(path.basename(path.resolve(projectRoot)));
  46. return tokens;
  47. }
  48. /**
  49. * Common stop words to filter from search queries.
  50. * Includes generic English + code-specific noise words.
  51. */
  52. export const STOP_WORDS = new Set([
  53. // English
  54. 'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
  55. 'of', 'with', 'by', 'from', 'is', 'it', 'that', 'this', 'are', 'was',
  56. 'be', 'has', 'had', 'have', 'do', 'does', 'did', 'will', 'would', 'could',
  57. 'should', 'may', 'might', 'can', 'shall', 'not', 'no', 'all', 'each',
  58. 'every', 'how', 'what', 'where', 'when', 'who', 'which', 'why',
  59. 'i', 'me', 'my', 'we', 'our', 'you', 'your', 'he', 'she', 'they',
  60. 'show', 'give', 'tell',
  61. 'been', 'done', 'made', 'used', 'using', 'work', 'works', 'found',
  62. 'also', 'into', 'then', 'than', 'just', 'more', 'some', 'such',
  63. 'over', 'only', 'out', 'its', 'so', 'up', 'as', 'if',
  64. 'look', 'need', 'needs', 'want', 'happen', 'happens',
  65. 'affect', 'affected', 'break', 'breaks', 'failing',
  66. 'implemented', 'implement',
  67. // Code-specific noise (avoid filtering common symbol names like get/set/add/build/find/list)
  68. 'code', 'file', 'files', 'function', 'method', 'class', 'type',
  69. 'fix', 'bug', 'called',
  70. ]);
  71. /**
  72. * Generate stem variants of a search term by removing common English suffixes.
  73. * Used for FTS query expansion so "caching" also finds "cache", "eviction" finds "evict", etc.
  74. * Stems are used as PREFIX matches in FTS, so they don't need to be perfect English words.
  75. */
  76. export function getStemVariants(term: string): string[] {
  77. const variants = new Set<string>();
  78. const t = term.toLowerCase();
  79. // -ing: caching→cach/cache, handling→handl/handle, running→run
  80. if (t.endsWith('ing') && t.length > 5) {
  81. const base = t.slice(0, -3);
  82. variants.add(base);
  83. variants.add(base + 'e');
  84. if (base.length >= 2 && base[base.length - 1] === base[base.length - 2]) {
  85. variants.add(base.slice(0, -1));
  86. }
  87. }
  88. // -tion/-sion: eviction→evict, expression→express
  89. if ((t.endsWith('tion') || t.endsWith('sion')) && t.length > 5) {
  90. variants.add(t.slice(0, -3));
  91. }
  92. // -ment: management→manage
  93. if (t.endsWith('ment') && t.length > 6) {
  94. variants.add(t.slice(0, -4));
  95. }
  96. // -ies: entries→entry
  97. if (t.endsWith('ies') && t.length > 4) {
  98. variants.add(t.slice(0, -3) + 'y');
  99. }
  100. // -es: processes→process, classes→class
  101. else if (t.endsWith('es') && t.length > 4) {
  102. variants.add(t.slice(0, -2));
  103. }
  104. // -s: errors→error (skip -ss endings like "class")
  105. else if (t.endsWith('s') && !t.endsWith('ss') && t.length > 4) {
  106. variants.add(t.slice(0, -1));
  107. }
  108. // -ed: handled→handle, propagated→propagate, carried→carry
  109. if (t.endsWith('ed') && !t.endsWith('eed') && t.length > 4) {
  110. variants.add(t.slice(0, -1));
  111. variants.add(t.slice(0, -2));
  112. if (t.endsWith('ied') && t.length > 5) {
  113. variants.add(t.slice(0, -3) + 'y');
  114. }
  115. }
  116. // -er: builder→build/builde, handler→handl/handle, getter→get
  117. if (t.endsWith('er') && t.length > 4) {
  118. const base = t.slice(0, -2);
  119. variants.add(base);
  120. variants.add(base + 'e');
  121. if (base.length >= 2 && base[base.length - 1] === base[base.length - 2]) {
  122. variants.add(base.slice(0, -1));
  123. }
  124. }
  125. return [...variants].filter(v => v.length >= 3 && v !== t);
  126. }
  127. /**
  128. * Extract meaningful search terms from a natural language query.
  129. * Splits camelCase, PascalCase, snake_case, SCREAMING_SNAKE, and dot.notation
  130. * into individual tokens before filtering.
  131. *
  132. * Preserves original compound identifiers (e.g., "scrapeLoop") alongside
  133. * their split parts so that FTS can match both the full symbol name and
  134. * individual words within it.
  135. *
  136. * Also generates stem variants (e.g., "caching"→"cache", "eviction"→"evict")
  137. * so FTS prefix matching can find related code symbols.
  138. */
  139. export function extractSearchTerms(query: string, options?: { stems?: boolean }): string[] {
  140. const includeStems = options?.stems !== false;
  141. const tokens = new Set<string>();
  142. // First, extract and preserve compound identifiers before splitting
  143. // CamelCase: scrapeLoop, UserService, getCallGraph
  144. const compoundPattern = /\b([a-zA-Z][a-zA-Z0-9]*(?:[A-Z][a-z]+)+|[A-Z][a-z]+(?:[A-Z][a-z]*)+)\b/g;
  145. let match;
  146. while ((match = compoundPattern.exec(query)) !== null) {
  147. if (match[1] && match[1].length >= 3) {
  148. tokens.add(match[1].toLowerCase()); // preserve full compound: "scrapeloop"
  149. }
  150. }
  151. // snake_case: scrape_loop, user_service
  152. const snakePattern = /\b([a-zA-Z][a-zA-Z0-9]*(?:_[a-zA-Z0-9]+)+)\b/g;
  153. while ((match = snakePattern.exec(query)) !== null) {
  154. if (match[1] && match[1].length >= 3) {
  155. tokens.add(match[1].toLowerCase());
  156. }
  157. }
  158. // Split camelCase / PascalCase: "getUserName" → "get User Name"
  159. const camelSplit = query
  160. .replace(/([a-z])([A-Z])/g, '$1 $2')
  161. .replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2');
  162. // Replace underscores and dots with spaces (snake_case, dot.notation)
  163. const normalised = camelSplit.replace(/[_.]+/g, ' ');
  164. // Split on any non-alphanumeric character
  165. const words = normalised.split(/[^a-zA-Z0-9]+/).filter(Boolean);
  166. for (const word of words) {
  167. const lower = word.toLowerCase();
  168. if (lower.length < 3) continue;
  169. if (STOP_WORDS.has(lower)) continue;
  170. tokens.add(lower);
  171. }
  172. // Generate stem variants for broader FTS matching.
  173. // "caching" → "cache" finds CacheBuilder; "eviction" → "evict" finds evictEntries.
  174. // Also enables co-occurrence dampening by increasing term count above 1.
  175. // Stems are skipped when scoring path relevance (stems inflate path scores).
  176. if (includeStems) {
  177. const stems = new Set<string>();
  178. for (const token of tokens) {
  179. for (const variant of getStemVariants(token)) {
  180. if (!tokens.has(variant) && !STOP_WORDS.has(variant)) {
  181. stems.add(variant);
  182. }
  183. }
  184. }
  185. for (const stem of stems) {
  186. tokens.add(stem);
  187. }
  188. }
  189. return [...tokens];
  190. }
  191. /**
  192. * Score path relevance to a query
  193. * Higher score = more relevant path
  194. */
  195. export function scorePathRelevance(
  196. filePath: string,
  197. query: string,
  198. projectNameTokens?: Set<string>,
  199. ): number {
  200. const pathLower = filePath.toLowerCase();
  201. const fileName = path.basename(filePath).toLowerCase();
  202. const dirName = path.dirname(filePath).toLowerCase();
  203. let score = 0;
  204. // Score per original query WORD, not per sub-token. A single PascalCase word
  205. // splits into many sub-tokens (a project name "SuperBizAgent" →
  206. // superbizagent / super / biz / agent) that all match the SAME path segment,
  207. // so summing per sub-token boosted that path 4× for one concept — enough to
  208. // bury the rest of the query's stack (#720). A word matches a path level if
  209. // ANY of its sub-tokens do, and counts ONCE; distinct words still each add.
  210. // Split the ORIGINAL-case query into words; extractSearchTerms does the
  211. // camelCase/snake split per word (so `getUserName` still matches a
  212. // `get_user_name` path) — we just attribute each word's matches once.
  213. const allWords = query.split(/\s+/).filter((w) => w.length > 0);
  214. if (allWords.length === 0) return 0;
  215. // A query word that just names the PROJECT (its go.mod / package.json / repo
  216. // name) carries no discriminative path signal — drop it so the rest of the
  217. // query decides the ranking, instead of every file under a `<ProjectName>…/`
  218. // tree winning on the project name alone (#720). Only when OTHER words remain,
  219. // so a bare project-name query still scores on its path.
  220. const words =
  221. projectNameTokens && projectNameTokens.size > 0
  222. ? allWords.filter((w) => !projectNameTokens.has(normalizeNameToken(w)))
  223. : allWords;
  224. const scored = words.length > 0 ? words : allWords;
  225. for (const word of scored) {
  226. // Use base terms only — stem variants inflate path scores by generating
  227. // many near-duplicate terms that all match the same path segments.
  228. const subtokens = extractSearchTerms(word, { stems: false });
  229. if (subtokens.length === 0) continue;
  230. // Exact filename match (strongest)
  231. if (subtokens.some((t) => fileName.includes(t))) score += 10;
  232. // Directory match
  233. if (subtokens.some((t) => dirName.includes(t))) score += 5;
  234. // General path match
  235. else if (subtokens.some((t) => pathLower.includes(t))) score += 3;
  236. }
  237. // Deprioritize test files unless the query is explicitly about tests
  238. const queryLower = query.toLowerCase();
  239. const isTestQuery = queryLower.includes('test') || queryLower.includes('spec');
  240. if (!isTestQuery && isTestFile(filePath)) {
  241. score -= 15;
  242. }
  243. return score;
  244. }
  245. /**
  246. * Check if a file path looks like a test file
  247. */
  248. export function isTestFile(filePath: string): boolean {
  249. const lower = filePath.toLowerCase();
  250. const fileName = path.basename(filePath); // original case — needed for camelCase boundaries
  251. const lowerName = fileName.toLowerCase();
  252. // --- Filename patterns ---
  253. if (
  254. lowerName.startsWith('test_') || // python: test_foo.py
  255. lowerName.startsWith('test.') ||
  256. // separator-delimited: foo_test.go, foo.test.ts, foo-spec.rb, bar_spec.py
  257. /[._-](test|tests|spec|specs)\.[a-z0-9]+$/.test(lowerName) ||
  258. // CamelCase suffix (Java/Kotlin/Swift/C#/Scala): FooTest.kt, BarTests.swift,
  259. // BazSpec.scala, QuxTestCase.java. Capital-led so "latest.kt"/"manifest.kt"
  260. // (lowercase "test") are NOT matched.
  261. /(?:Test|Tests|TestCase|Tester|Spec|Specs)\.[A-Za-z0-9]+$/.test(fileName)
  262. ) {
  263. return true;
  264. }
  265. // --- Directory patterns ---
  266. if (
  267. lower.includes('/tests/') || lower.includes('/test/') ||
  268. lower.includes('/__tests__/') || lower.includes('/spec/') ||
  269. lower.includes('/specs/') || lower.includes('/testlib/') ||
  270. lower.includes('/testing/') ||
  271. lower.startsWith('test/') || lower.startsWith('tests/') ||
  272. lower.startsWith('spec/') || lower.startsWith('specs/') ||
  273. // CamelCase test source-set dirs (Kotlin Multiplatform / Gradle / Xcode):
  274. // jvmTest/, commonTest/, androidTest/, iosTest/, integrationTest/. Capital-led
  275. // so "latest/" / "manifest/" are not matched.
  276. /(?:^|\/)[A-Za-z0-9]*(?:Test|Tests|Spec)\//.test(filePath)
  277. ) {
  278. return true;
  279. }
  280. // Non-production directories: examples, samples, benchmarks, fixtures, demos.
  281. // Check both mid-path (/integration/) and start-of-path (integration/) since
  282. // file paths may be stored as relative paths without a leading slash.
  283. return matchesNonProductionDir(lower);
  284. }
  285. /**
  286. * Check if a path is in a non-production directory (integration, sample, example, etc.)
  287. * Handles both absolute paths (/foo/integration/bar) and relative paths (integration/bar).
  288. */
  289. function matchesNonProductionDir(lowerPath: string): boolean {
  290. const dirs = [
  291. 'integration', 'sample', 'samples', 'example', 'examples',
  292. 'fixture', 'fixtures', 'benchmark', 'benchmarks', 'demo', 'demos',
  293. ];
  294. for (const dir of dirs) {
  295. if (lowerPath.includes('/' + dir + '/') || lowerPath.startsWith(dir + '/')) {
  296. return true;
  297. }
  298. }
  299. return false;
  300. }
  301. /**
  302. * Bonus when a node's name matches the search query.
  303. * Exact matches get the largest boost; prefix matches get smaller boosts.
  304. * Multi-word queries also check individual term matches against the name.
  305. */
  306. export function nameMatchBonus(nodeName: string, query: string): number {
  307. const nameLower = nodeName.toLowerCase();
  308. // Split query into word-level terms (handles "CacheBuilder build" → ["cache","builder","build"])
  309. const rawTerms = query
  310. .replace(/([a-z])([A-Z])/g, '$1 $2')
  311. .split(/[\s_.\-]+/)
  312. .map(t => t.toLowerCase())
  313. .filter(t => t.length >= 2);
  314. // Also keep original space-separated tokens for exact-term matching
  315. const queryTokens = query.split(/\s+/).map(t => t.toLowerCase()).filter(t => t.length >= 2);
  316. // Full query as a single token (for compound identifiers like "CacheBuilder")
  317. const queryLower = query.replace(/[\s]+/g, '').toLowerCase();
  318. // Exact match: query exactly equals the node name
  319. if (nameLower === queryLower) return 80;
  320. // Exact match on a query token: "CacheBuilder build" and node name is "build"
  321. if (queryTokens.length > 1 && queryTokens.includes(nameLower)) return 60;
  322. // Name starts with query — scale by length ratio so "Pod"→"Pod" (exact, handled above)
  323. // scores much higher than "Pod"→"PodGCControllerOptions" (ratio 0.125).
  324. if (nameLower.startsWith(queryLower)) {
  325. const ratio = queryLower.length / nameLower.length;
  326. return Math.round(10 + 30 * ratio);
  327. }
  328. // All camelCase-split terms appear in the name
  329. if (rawTerms.length > 1) {
  330. const allMatch = rawTerms.every(t => nameLower.includes(t));
  331. if (allMatch) return 15;
  332. }
  333. // Name contains the full query as substring
  334. if (nameLower.includes(queryLower)) return 10;
  335. return 0;
  336. }
  337. /**
  338. * Kind-based bonus for search ranking
  339. * Functions and classes are typically more relevant than variables/imports
  340. */
  341. export function kindBonus(kind: Node['kind']): number {
  342. const bonuses: Record<string, number> = {
  343. function: 10,
  344. method: 10,
  345. class: 8,
  346. interface: 9,
  347. type_alias: 6,
  348. struct: 6,
  349. trait: 9,
  350. enum: 5,
  351. component: 8,
  352. route: 9,
  353. module: 4,
  354. property: 3,
  355. field: 3,
  356. variable: 2,
  357. constant: 3,
  358. import: 1,
  359. export: 1,
  360. parameter: 0,
  361. namespace: 4,
  362. file: 0,
  363. protocol: 9,
  364. enum_member: 3,
  365. };
  366. return bonuses[kind] ?? 0;
  367. }
  368. /**
  369. * Whether a query token looks like a code identifier the user deliberately typed
  370. * (camelCase / PascalCase-with-internal-caps / snake_case / has a digit) rather
  371. * than a plain dictionary word ("flat", "object", "screen").
  372. *
  373. * Used to decide whether an EXACT name match earns the "the user named this
  374. * symbol" exemption from single-term dampening. A common English word that
  375. * happens to exact-match an unrelated symbol — the query "flat object" matching
  376. * a constant named `FLAT` — must NOT get that exemption, or the +exact-name
  377. * bonus floats it to the top of a prose query on its own.
  378. *
  379. * Classifies the token AS THE USER TYPED IT, not the matched symbol's name:
  380. * "flat" (lowercase, descriptive) is non-distinctive even though it matches
  381. * `FLAT`. A leading-capital-only word ("Screen", "Zustand") is also treated as
  382. * a plain word — sentence-start capitalization and proper nouns aren't reliable
  383. * identifier signals.
  384. */
  385. export function isDistinctiveIdentifier(token: string): boolean {
  386. if (!token) return false;
  387. // snake_case / SCREAMING_SNAKE, or an embedded digit → a deliberate identifier.
  388. if (/[_0-9]/.test(token)) return true;
  389. // An uppercase letter anywhere AFTER the first char → a camelCase/PascalCase
  390. // boundary (setLastEmail, OrgUserStore) or an acronym (REST, HTTP).
  391. if (/[A-Z]/.test(token.slice(1))) return true;
  392. return false;
  393. }