Explorar el Código

feat: search query utilities and multi-signal scoring

- Add src/search/query-utils.ts with extractSearchTerms, scorePathRelevance,
  kindBonus, detectApiIntent, inferRouteDirectories
- Add multi-signal scoring to searchNodes (kind bonus + path relevance)
- Improve FTS query sanitization (strip :^ chars, filter boolean operators)
- Add comprehensive search tests
Martin Oehlert hace 4 meses
padre
commit
c80378d73c
Se han modificado 3 ficheros con 356 adiciones y 1 borrado
  1. 214 0
      __tests__/search.test.ts
  2. 13 1
      src/db/queries.ts
  3. 129 0
      src/search/query-utils.ts

+ 214 - 0
__tests__/search.test.ts

@@ -0,0 +1,214 @@
+/**
+ * Search Query Utilities Tests
+ *
+ * Tests multi-signal scoring, kind bonuses, path relevance, and API intent detection.
+ */
+
+import { describe, it, expect } from 'vitest';
+import {
+  extractSearchTerms,
+  scorePathRelevance,
+  kindBonus,
+  detectApiIntent,
+  inferRouteDirectories,
+  STOP_WORDS,
+} from '../src/search/query-utils';
+
+describe('Search Query Utilities', () => { // unit tests for the exports of src/search/query-utils
+  describe('extractSearchTerms', () => { // tokenizing, lowercasing and stop-word filtering
+    it('should extract meaningful terms from a query', () => {
+      const terms = extractSearchTerms('find the login handler');
+      expect(terms).toContain('login');
+      expect(terms).toContain('handler');
+      // 'find' and 'the' are entries of STOP_WORDS, so they must be dropped
+      expect(terms).not.toContain('find');
+      expect(terms).not.toContain('the');
+    });
+
+    it('should filter stop words', () => {
+      const terms = extractSearchTerms('how does the authentication work');
+      expect(terms).not.toContain('how');
+      expect(terms).not.toContain('does');
+      expect(terms).not.toContain('the');
+      expect(terms).toContain('authentication');
+      expect(terms).toContain('work');
+    });
+
+    it('should handle camelCase by lowercasing', () => {
+      const terms = extractSearchTerms('UserService');
+      expect(terms).toContain('userservice'); // kept as one lowercased token; not split at the case boundary
+    });
+
+    it('should strip punctuation', () => {
+      const terms = extractSearchTerms('payment.process()');
+      expect(terms).toContain('payment');
+      expect(terms).toContain('process');
+    });
+
+    it('should return empty for all stop words', () => {
+      const terms = extractSearchTerms('how do I get the');
+      expect(terms).toHaveLength(0);
+    });
+
+    it('should filter single-character terms', () => {
+      const terms = extractSearchTerms('a b c auth');
+      expect(terms).toEqual(['auth']); // exact equality: also pins that nothing else is emitted
+    });
+  });
+
+  describe('scorePathRelevance', () => { // match scores are asserted as lower bounds, not exact values
+    it('should score filename matches highest', () => {
+      const score = scorePathRelevance('src/auth/login.ts', 'login');
+      expect(score).toBeGreaterThanOrEqual(10);
+    });
+
+    it('should score directory matches', () => {
+      const score = scorePathRelevance('src/auth/index.ts', 'auth');
+      expect(score).toBeGreaterThanOrEqual(5);
+    });
+
+    it('should return 0 for unrelated paths', () => {
+      const score = scorePathRelevance('src/utils/format.ts', 'payment');
+      expect(score).toBe(0);
+    });
+
+    it('should accumulate scores for multiple matching terms', () => {
+      const score = scorePathRelevance('src/auth/login.ts', 'auth login');
+      // 'auth' matches the directory and 'login' matches the filename; the two contributions add up
+      expect(score).toBeGreaterThanOrEqual(15);
+    });
+
+    it('should return 0 for empty query terms', () => {
+      const score = scorePathRelevance('src/auth/login.ts', 'the a an'); // every word here is a stop word
+      expect(score).toBe(0);
+    });
+  });
+
+  describe('kindBonus', () => { // ranking bonus per node kind
+    it('should give functions and methods highest bonus', () => {
+      expect(kindBonus('function')).toBe(10);
+      expect(kindBonus('method')).toBe(10);
+    });
+
+    it('should rank functions > classes > variables > imports', () => {
+      expect(kindBonus('function')).toBeGreaterThan(kindBonus('class'));
+      expect(kindBonus('class')).toBeGreaterThan(kindBonus('variable'));
+      expect(kindBonus('variable')).toBeGreaterThan(kindBonus('import'));
+    });
+
+    it('should give routes high priority', () => {
+      expect(kindBonus('route')).toBeGreaterThanOrEqual(9);
+    });
+
+    it('should give components high priority', () => {
+      expect(kindBonus('component')).toBeGreaterThanOrEqual(8);
+    });
+
+    it('should return 0 for parameter and file kinds', () => {
+      expect(kindBonus('parameter')).toBe(0);
+      expect(kindBonus('file')).toBe(0);
+    });
+
+    it('should return 0 for unknown kinds', () => {
+      expect(kindBonus('unknown_kind' as any)).toBe(0); // cast bypasses the kind type to reach the fallback
+    });
+  });
+
+  describe('detectApiIntent', () => { // keyword and HTTP-method heuristics
+    it('should detect API-related queries', () => {
+      expect(detectApiIntent('find the API endpoint for users')).toBe(true);
+      expect(detectApiIntent('where is the login route')).toBe(true);
+      expect(detectApiIntent('show me the request handler')).toBe(true);
+    });
+
+    it('should detect HTTP method patterns', () => {
+      expect(detectApiIntent('GET /api/users')).toBe(true); // also contains 'api', so the keyword match alone satisfies this
+      expect(detectApiIntent('post /users/create')).toBe(true); // no API keyword; relies on the method-plus-slash pattern
+    });
+
+    it('should detect REST and GraphQL', () => {
+      expect(detectApiIntent('REST API for payments')).toBe(true);
+      expect(detectApiIntent('GraphQL resolver for orders')).toBe(true);
+    });
+
+    it('should not detect non-API queries', () => {
+      expect(detectApiIntent('fix the login bug')).toBe(false);
+      expect(detectApiIntent('add dark mode support')).toBe(false);
+    });
+
+    it('should detect controller and middleware mentions', () => {
+      expect(detectApiIntent('find the auth controller')).toBe(true);
+      expect(detectApiIntent('CORS middleware configuration')).toBe(true);
+    });
+  });
+
+  describe('inferRouteDirectories', () => { // route-like directory discovery from a file list
+    it('should detect route directories', () => {
+      const files = [
+        'src/routes/auth.ts',
+        'src/routes/users.ts',
+        'src/utils/format.ts',
+      ];
+      const dirs = inferRouteDirectories(files);
+      expect(dirs).toBeDefined();
+      if (dirs) { // narrows away undefined for the type checker
+        expect(dirs.some(d => d.includes('route'))).toBe(true);
+      }
+    });
+
+    it('should detect controller directories', () => {
+      const files = [
+        'src/controllers/AuthController.ts',
+        'src/models/User.ts',
+      ];
+      const dirs = inferRouteDirectories(files);
+      expect(dirs).toBeDefined();
+      if (dirs) { // narrows away undefined for the type checker
+        expect(dirs.some(d => d.includes('controller'))).toBe(true);
+      }
+    });
+
+    it('should detect api directories', () => {
+      const files = [
+        'src/api/v1/users.ts',
+        'src/api/v1/orders.ts',
+      ];
+      const dirs = inferRouteDirectories(files);
+      expect(dirs).toBeDefined();
+      if (dirs) { // narrows away undefined for the type checker
+        expect(dirs.some(d => d.includes('api'))).toBe(true);
+      }
+    });
+
+    it('should return undefined when no route dirs found', () => {
+      const files = [
+        'src/utils/format.ts',
+        'src/models/User.ts',
+        'src/index.ts',
+      ];
+      const dirs = inferRouteDirectories(files);
+      expect(dirs).toBeUndefined(); // "nothing found" is undefined, not an empty array
+    });
+  });
+
+  describe('STOP_WORDS', () => { // spot checks on the exported stop-word set
+    it('should contain common English stop words', () => {
+      expect(STOP_WORDS.has('the')).toBe(true);
+      expect(STOP_WORDS.has('and')).toBe(true);
+      expect(STOP_WORDS.has('or')).toBe(true);
+    });
+
+    it('should contain action verbs used in queries', () => {
+      expect(STOP_WORDS.has('find')).toBe(true);
+      expect(STOP_WORDS.has('show')).toBe(true);
+      expect(STOP_WORDS.has('get')).toBe(true);
+      expect(STOP_WORDS.has('list')).toBe(true);
+    });
+
+    it('should not contain technical terms', () => {
+      expect(STOP_WORDS.has('function')).toBe(false);
+      expect(STOP_WORDS.has('class')).toBe(false);
+      expect(STOP_WORDS.has('auth')).toBe(false);
+    });
+  });
+});

+ 13 - 1
src/db/queries.ts

@@ -18,6 +18,7 @@ import {
   SearchResult,
 } from '../types';
 import { safeJsonParse } from '../utils';
+import { kindBonus, scorePathRelevance } from '../search/query-utils';
 
 /**
  * Database row types (snake_case from SQLite)
@@ -451,6 +452,15 @@ export class QueryBuilder {
       results = this.searchNodesLike(query, { kinds, languages, limit, offset });
     }
 
+    // Apply multi-signal scoring
+    if (results.length > 0 && query) {
+      results = results.map(r => ({
+        ...r,
+        score: r.score + kindBonus(r.node.kind) + scorePathRelevance(r.node.filePath, query),
+      }));
+      results.sort((a, b) => b.score - a.score);
+    }
+
     return results;
   }
 
@@ -463,9 +473,11 @@ export class QueryBuilder {
     // Add prefix wildcard for better matching (e.g., "auth" matches "AuthService", "authenticate")
     // Escape special FTS5 characters and add prefix wildcard
     const ftsQuery = query
-      .replace(/['"*()]/g, '') // Remove special chars
+      .replace(/['"*():^]/g, '') // Remove FTS5 special chars
       .split(/\s+/)
       .filter(term => term.length > 0)
+      // Strip FTS5 boolean operators to prevent query manipulation
+      .filter(term => !/^(AND|OR|NOT|NEAR)$/i.test(term))
       .map(term => `"${term}"*`) // Prefix match each term
       .join(' OR ');
 

+ 129 - 0
src/search/query-utils.ts

@@ -0,0 +1,129 @@
+/**
+ * Search Query Utilities
+ *
+ * Shared module for search term extraction, scoring, and intent detection.
+ */
+
+import * as path from 'path';
+import { Node } from '../types';
+
+/**
+ * Words dropped from search queries before scoring.
+ *
+ * Holds common English function words plus the imperative verbs that tend to
+ * open a natural-language query ("find", "show", "get", "list", "give", "tell").
+ * All entries are lowercase, so a term must be lowercased before it is looked up.
+ */
+export const STOP_WORDS = new Set([
+  'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
+  'of', 'with', 'by', 'from', 'is', 'it', 'that', 'this', 'are', 'was',
+  'be', 'has', 'had', 'have', 'do', 'does', 'did', 'will', 'would', 'could',
+  'should', 'may', 'might', 'can', 'shall', 'not', 'no', 'all', 'each',
+  'every', 'how', 'what', 'where', 'when', 'who', 'which', 'why',
+  'i', 'me', 'my', 'we', 'our', 'you', 'your', 'he', 'she', 'they',
+  'find', 'show', 'get', 'list', 'give', 'tell',
+]);
+
+/**
+ * Extract meaningful search terms from a natural language query.
+ *
+ * The query is lowercased, punctuation is replaced by spaces, and the result
+ * is split on whitespace. A token is kept when it is longer than one
+ * character, contains at least one letter or digit, and is not in STOP_WORDS.
+ * Hyphens and underscores inside a token are preserved ("user-service",
+ * "snake_case"); hyphens at either end are trimmed.
+ *
+ * @param query - Free-form query text.
+ * @returns Distinct terms in order of first appearance; empty when nothing
+ *          meaningful remains.
+ */
+export function extractSearchTerms(query: string): string[] {
+  const tokens = query
+    .toLowerCase()
+    // Unicode-aware: `\w` is ASCII-only and would split words such as "autenticación".
+    .replace(/[^\p{L}\p{N}_\s-]/gu, ' ')
+    .split(/\s+/);
+
+  // A Set removes repeats while keeping first-seen order.
+  const terms = new Set<string>();
+  for (const token of tokens) {
+    // "--verbose" -> "verbose"; a bare "--" becomes empty and is dropped below.
+    const term = token.replace(/^-+|-+$/g, '');
+    // Require a real letter or digit so tokens like "__" are not treated as terms.
+    if (term.length > 1 && /[\p{L}\p{N}]/u.test(term) && !STOP_WORDS.has(term)) {
+      terms.add(term);
+    }
+  }
+  return [...terms];
+}
+
+/**
+ * Score how relevant a file path is to a query.
+ * Higher score = more relevant path.
+ *
+ * The query is reduced to terms with extractSearchTerms, and each distinct
+ * term contributes, case-insensitively:
+ *   - 10 when the file name (last path segment) contains the term;
+ *   -  5 when the directory part of the path contains the term.
+ * Both can apply to the same term. Matching is by substring, not whole word.
+ *
+ * @param filePath - Path of the file the candidate node lives in.
+ * @param query - Raw user query.
+ * @returns Sum of the per-term contributions; 0 when the query has no usable
+ *          terms or nothing matches.
+ */
+export function scorePathRelevance(filePath: string, query: string): number {
+  // Distinct terms only: repeating a word in the query must not inflate the score.
+  const terms = new Set(extractSearchTerms(query));
+  if (terms.size === 0) return 0;
+
+  const fileName = path.basename(filePath).toLowerCase();
+  const dirName = path.dirname(filePath).toLowerCase();
+  let score = 0;
+
+  for (const term of terms) {
+    // File name match (strongest signal)
+    if (fileName.includes(term)) score += 10;
+    // Directory match
+    if (dirName.includes(term)) score += 5;
+    // No separate whole-path fallback: a term never contains a path separator,
+    // so any hit in the full path is already a hit in one of the two parts above.
+  }
+
+  return score;
+}
+
+/**
+ * Bonus per node kind, built once at module load.
+ * A Map is used so that names inherited from Object.prototype
+ * ("constructor", "toString", "__proto__") are never mistaken for entries.
+ */
+const KIND_BONUSES: ReadonlyMap<string, number> = new Map([
+  ['function', 10],
+  ['method', 10],
+  ['class', 8],
+  ['interface', 7],
+  ['type_alias', 6],
+  ['struct', 6],
+  ['trait', 6],
+  ['enum', 5],
+  ['component', 8],
+  ['route', 9],
+  ['module', 4],
+  ['property', 3],
+  ['field', 3],
+  ['variable', 2],
+  ['constant', 3],
+  ['import', 1],
+  ['export', 1],
+  ['parameter', 0],
+  ['namespace', 4],
+  ['file', 0],
+  ['protocol', 6],
+  ['enum_member', 3],
+]);
+
+/**
+ * Kind-based bonus for search ranking.
+ * Functions and classes are typically more relevant than variables/imports.
+ *
+ * @param kind - Kind of the node being ranked.
+ * @returns The bonus for that kind (0 to 10); 0 for any kind without an entry.
+ */
+export function kindBonus(kind: Node['kind']): number {
+  return KIND_BONUSES.get(kind) ?? 0;
+}
+
+/**
+ * Patterns that mark a query as API-related, built once at module load.
+ * None carries the `g` flag, so `test` keeps no state between calls.
+ */
+const API_INTENT_PATTERNS: readonly RegExp[] = [
+  /\bapi\b/i, /\bendpoint/i, /\broute/i, /\bhandler/i,
+  /\bcontroller/i, /\bmiddleware/i, /\brest\b/i, /\bgraphql/i,
+  // An HTTP method followed by a path, e.g. "GET /users" or "post /users/create".
+  /\b(?:get|post|put|patch|delete)\s+\//i,
+  /\brequest/i, /\bresponse/i, /\bhttp/i,
+];
+
+/**
+ * Detect if a query has API/endpoint intent.
+ *
+ * A query qualifies when it contains an API keyword (api, endpoint, route,
+ * handler, controller, middleware, rest, graphql, request, response, http)
+ * or an HTTP method followed by a path starting with "/". All checks ignore
+ * case.
+ *
+ * @param query - Raw user query.
+ * @returns true when any pattern matches.
+ */
+export function detectApiIntent(query: string): boolean {
+  return API_INTENT_PATTERNS.some(pattern => pattern.test(query));
+}
+
+/**
+ * Directory names that conventionally hold routes or controllers, each
+ * followed by a path separator. Both "/" and "\" are accepted so that
+ * Windows-style paths are recognised too. The patterns are not anchored to
+ * the start of a path segment, so a directory merely ending in one of these
+ * names (e.g. "subroutes/") matches as well.
+ */
+const ROUTE_DIR_PATTERNS: readonly RegExp[] = [
+  /routes?[\\/]/i, /controllers?[\\/]/i, /handlers?[\\/]/i,
+  /api[\\/]/i, /endpoints?[\\/]/i,
+];
+
+/**
+ * Infer route/controller directories from project structure.
+ *
+ * For every file and every pattern, the first match contributes the path
+ * prefix that ends with the matched directory name (without the trailing
+ * separator), e.g. "src/api/v1/users.ts" yields "src/api".
+ *
+ * @param files - File paths to inspect.
+ * @returns Distinct directories in order of discovery, or undefined if no
+ *          route directories are detected.
+ */
+export function inferRouteDirectories(files: string[]): string[] | undefined {
+  const routeDirs = new Set<string>();
+
+  for (const file of files) {
+    for (const pattern of ROUTE_DIR_PATTERNS) {
+      const match = pattern.exec(file);
+      if (match) {
+        // match[0] ends with the separator; the -1 drops it from the result.
+        routeDirs.add(file.slice(0, match.index + match[0].length - 1));
+      }
+    }
+  }
+
+  return routeDirs.size > 0 ? [...routeDirs] : undefined;
+}