Sfoglia il codice sorgente

feat: Enhance search ranking with name matching and field extraction improvements

Adds nameMatchBonus scoring to prioritize results where node names exactly or partially match query terms. Implements dedicated field extraction for Java/C# to properly categorize class fields vs variables. Optimizes BM25 search with column weights favoring name matches and increased result fetching before post-processing. Refines stop words list to preserve common programming terms like "get", "find", "list".
Colby McHenry 2 mesi fa
parent
commit
e5663c5952

+ 18 - 4
src/db/queries.ts

@@ -18,7 +18,7 @@ import {
   SearchResult,
 } from '../types';
 import { safeJsonParse } from '../utils';
-import { kindBonus, scorePathRelevance } from '../search/query-utils';
+import { kindBonus, nameMatchBonus, scorePathRelevance } from '../search/query-utils';
 
 /**
  * Database row types (snake_case from SQLite)
@@ -492,9 +492,16 @@ export class QueryBuilder {
     if (results.length > 0 && query) {
       results = results.map(r => ({
         ...r,
-        score: r.score + kindBonus(r.node.kind) + scorePathRelevance(r.node.filePath, query),
+        score: r.score
+          + kindBonus(r.node.kind)
+          + scorePathRelevance(r.node.filePath, query)
+          + nameMatchBonus(r.node.name, query),
       }));
       results.sort((a, b) => b.score - a.score);
+      // Trim to requested limit after rescoring
+      if (results.length > limit) {
+        results = results.slice(0, limit);
+      }
     }
 
     return results;
@@ -521,8 +528,15 @@ export class QueryBuilder {
       return [];
     }
 
+    // BM25 column weights: id=0, name=20, qualified_name=5, docstring=1, signature=2
+    // Heavy name weight ensures exact/prefix name matches rank above incidental
+    // mentions in long docstrings or qualified names of nested symbols.
+    // Fetch 5x the requested limit, but never fewer than 100 rows, so post-hoc
+    // rescoring (kindBonus, scorePathRelevance, nameMatchBonus) can promote
+    // results that BM25 alone undervalues.
+    const ftsLimit = Math.max(limit * 5, 100);
+
     let sql = `
-      SELECT nodes.*, bm25(nodes_fts) as score
+      SELECT nodes.*, bm25(nodes_fts, 0, 20, 5, 1, 2) as score
       FROM nodes_fts
       JOIN nodes ON nodes_fts.id = nodes.id
       WHERE nodes_fts MATCH ?
@@ -541,7 +555,7 @@ export class QueryBuilder {
     }
 
     sql += ' ORDER BY score LIMIT ? OFFSET ?';
-    params.push(limit, offset);
+    params.push(ftsLimit, offset);
 
     try {
       const rows = this.db.prepare(sql).all(...params) as (NodeRow & { score: number })[];

+ 2 - 1
src/extraction/languages/csharp.ts

@@ -13,7 +13,8 @@ export const csharpExtractor: LanguageExtractor = {
   typeAliasTypes: [],
   importTypes: ['using_directive'],
   callTypes: ['invocation_expression'],
-  variableTypes: ['local_declaration_statement', 'field_declaration'],
+  variableTypes: ['local_declaration_statement'],
+  fieldTypes: ['field_declaration'],
   nameField: 'name',
   bodyField: 'body',
   paramsField: 'parameter_list',

+ 2 - 1
src/extraction/languages/java.ts

@@ -13,7 +13,8 @@ export const javaExtractor: LanguageExtractor = {
   typeAliasTypes: [],
   importTypes: ['import_declaration'],
   callTypes: ['method_invocation'],
-  variableTypes: ['local_variable_declaration', 'field_declaration'],
+  variableTypes: ['local_variable_declaration'],
+  fieldTypes: ['field_declaration'],
   nameField: 'name',
   bodyField: 'body',
   paramsField: 'parameters',

+ 2 - 0
src/extraction/tree-sitter-types.ts

@@ -98,6 +98,8 @@ export interface LanguageExtractor {
   callTypes: string[];
   /** Node types that represent variable declarations (const, let, var, etc.) */
   variableTypes: string[];
+  /** Node types that represent class fields (extracted as 'field' kind inside class bodies) */
+  fieldTypes?: string[];
 
   // --- Field name mappings ---
 

+ 57 - 0
src/extraction/tree-sitter.ts

@@ -276,6 +276,11 @@ export class TreeSitterExtractor {
     else if (this.extractor.typeAliasTypes.includes(nodeType)) {
       this.extractTypeAlias(node);
     }
+    // Check for class fields (e.g. Java field_declaration, C# field_declaration)
+    else if (this.extractor.fieldTypes?.includes(nodeType) && this.isInsideClassLikeNode()) {
+      this.extractField(node);
+      skipChildren = true;
+    }
     // Check for variable declarations (const, let, var, etc.)
     // Only extract top-level variables (not inside functions/methods)
     else if (this.extractor.variableTypes.includes(nodeType) && !this.isInsideClassLikeNode()) {
@@ -656,6 +661,58 @@ export class TreeSitterExtractor {
     }
   }
 
+  /**
+   * Extract a class field declaration (e.g. Java field_declaration, C# field_declaration).
+   * Creates one 'field' kind node per declarator; all declarators of one declaration
+   * share that declaration's docstring, visibility and static flag.
+   *
+   * Two declaration layouts are handled:
+   *  - declarators as direct children of the declaration, e.g. Java
+   *    "private final String name = value;" -> modifiers, type, variable_declarator(s)
+   *  - declarators nested one level down inside a variable_declaration wrapper,
+   *    which is how tree-sitter-c-sharp shapes field_declaration
+   *
+   * When no declarator is found at all, falls back to a name/identifier child of
+   * the declaration node itself.
+   */
+  private extractField(node: SyntaxNode): void {
+    if (!this.extractor) return;
+
+    const docstring = getPrecedingDocstring(node, this.source);
+    const visibility = this.extractor.getVisibility?.(node);
+    const isStatic = this.extractor.isStatic?.(node) ?? false;
+
+    const isDeclarator = (c: SyntaxNode): boolean => c.type === 'variable_declarator';
+
+    // `owner` is the node whose children are the type and the declarators.
+    let owner: SyntaxNode = node;
+    let declarators = node.namedChildren.filter(isDeclarator);
+
+    // C#: field_declaration -> variable_declaration -> (type, variable_declarator+).
+    // Without this step no declarator is found and the field is silently dropped.
+    if (declarators.length === 0) {
+      const wrapper = node.namedChildren.find(c => c.type === 'variable_declaration');
+      if (wrapper) {
+        owner = wrapper;
+        declarators = wrapper.namedChildren.filter(isDeclarator);
+      }
+    }
+
+    if (declarators.length > 0) {
+      // Prefer the grammar's 'type' field; otherwise take the first child that is
+      // neither a modifier/annotation nor a declarator.
+      const typeNode = getChildByField(owner, 'type')
+        || owner.namedChildren.find(
+          c => c.type !== 'modifiers' && c.type !== 'variable_declarator'
+            && c.type !== 'marker_annotation' && c.type !== 'annotation'
+        );
+      const typeText = typeNode ? getNodeText(typeNode, this.source) : undefined;
+
+      for (const decl of declarators) {
+        // Some grammar versions expose the declarator name only as a bare identifier child.
+        const nameNode = getChildByField(decl, 'name')
+          || decl.namedChildren.find(c => c.type === 'identifier');
+        if (!nameNode) continue;
+        const name = getNodeText(nameNode, this.source);
+        const signature = typeText ? `${typeText} ${name}` : name;
+        this.createNode('field', name, decl, {
+          docstring,
+          signature,
+          visibility,
+          isStatic,
+        });
+      }
+    } else {
+      // Fallback: try to find an identifier child directly
+      const nameNode = getChildByField(node, 'name')
+        || node.namedChildren.find(c => c.type === 'identifier');
+      if (nameNode) {
+        const name = getNodeText(nameNode, this.source);
+        this.createNode('field', name, node, {
+          docstring,
+          visibility,
+          isStatic,
+        });
+      }
+    }
+  }
+
   /**
    * Extract a variable declaration (const, let, var, etc.)
    *

+ 47 - 5
src/search/query-utils.ts

@@ -19,13 +19,13 @@ export const STOP_WORDS = new Set([
   'should', 'may', 'might', 'can', 'shall', 'not', 'no', 'all', 'each',
   'every', 'how', 'what', 'where', 'when', 'who', 'which', 'why',
   'i', 'me', 'my', 'we', 'our', 'you', 'your', 'he', 'she', 'they',
-  'find', 'show', 'get', 'list', 'give', 'tell',
-  'been', 'done', 'made', 'used', 'using', 'work', 'works', 'found',
+  'show', 'give', 'tell',
+  'been', 'done', 'made', 'used', 'using', 'works', 'found',
   'also', 'into', 'then', 'than', 'just', 'more', 'some', 'such',
-  'over', 'only', 'new', 'out', 'its', 'so', 'up', 'as', 'if',
-  // Code-specific noise
+  'over', 'only', 'out', 'its', 'so', 'up', 'as', 'if',
+  // Code-specific noise (avoid filtering common symbol names like get/set/add/build/find/list)
   'code', 'file', 'files', 'function', 'method', 'class', 'type',
-  'build', 'fix', 'bug', 'called', 'set', 'add',
+  'fix', 'bug', 'called',
 ]);
 
 /**
@@ -140,6 +140,48 @@ export function isTestFile(filePath: string): boolean {
   );
 }
 
+/**
+ * Bonus when a node's name matches the search query.
+ *
+ * Comparison is case-insensitive. Checks run in priority order; the first hit wins:
+ *   30 - the name equals the query with all whitespace removed
+ *   25 - the query has 2+ whitespace-separated tokens and one of them equals the name
+ *   20 - the name starts with the whitespace-stripped query
+ *   15 - the query splits into 2+ terms (camelCase / space / "_" / "." / "-")
+ *        and every term occurs somewhere in the name
+ *   10 - the name contains the whitespace-stripped query
+ *    0 - none of the above, or the query is empty / whitespace-only
+ *
+ * Tokens and terms shorter than 2 characters are ignored.
+ *
+ * @param nodeName - Symbol name to score.
+ * @param query - Raw search query.
+ * @returns The bonus to add to the result's score.
+ */
+export function nameMatchBonus(nodeName: string, query: string): number {
+  const nameLower = nodeName.toLowerCase();
+
+  // Full query as a single token (for compound identifiers like "CacheBuilder")
+  const queryLower = query.replace(/\s+/g, '').toLowerCase();
+
+  // A blank query matches nothing. Without this guard startsWith('') and
+  // includes('') are always true, so every node would be handed a bonus.
+  if (queryLower.length === 0) return 0;
+
+  // Exact match: query exactly equals the node name
+  if (nameLower === queryLower) return 30;
+
+  // Original space-separated tokens, kept whole for exact-term matching
+  const queryTokens = query.split(/\s+/).map(t => t.toLowerCase()).filter(t => t.length >= 2);
+
+  // Exact match on a query token: "CacheBuilder build" and node name is "build"
+  if (queryTokens.length > 1 && queryTokens.includes(nameLower)) return 25;
+
+  // Name starts with query (prefix search: "Cache" → "CacheBuilder")
+  if (nameLower.startsWith(queryLower)) return 20;
+
+  // Word-level terms (handles "CacheBuilder build" → ["cache","builder","build"])
+  const rawTerms = query
+    .replace(/([a-z])([A-Z])/g, '$1 $2')
+    .split(/[\s_.\-]+/)
+    .map(t => t.toLowerCase())
+    .filter(t => t.length >= 2);
+
+  // All camelCase-split terms appear in the name
+  if (rawTerms.length > 1 && rawTerms.every(t => nameLower.includes(t))) return 15;
+
+  // Name contains the full query as substring
+  if (nameLower.includes(queryLower)) return 10;
+
+  return 0;
+}
+
 /**
  * Kind-based bonus for search ranking
  * Functions and classes are typically more relevant than variables/imports