Преглед изворни кода

feat: Improve multi-term search ranking with co-occurrence boosting and compound matching

Addresses cases where multi-word queries like "search execution from request to shard" return generic single-term matches instead of highly relevant classes matching multiple terms. Applies co-occurrence boosting before truncation to prioritize nodes matching 2+ query terms, adds compound term matching to catch classes like "SearchShardsRequest" that contain multiple query terms at any position, and widens per-term accumulation pools to prevent relevant multi-term matches from being filtered out early.
Colby McHenry пре 2 месеци
родитељ
комит
c626dfa989
2 измењених фајлова са 105 додато и 10 уклоњено
  1. 1 1
      __tests__/evaluation/test-cases.ts
  2. 104 9
      src/context/index.ts

+ 1 - 1
__tests__/evaluation/test-cases.ts

@@ -59,7 +59,7 @@ export const testCases: EvalTestCase[] = [
     id: 'explore-search-execution',
     query: 'How does search execution work from request to shard?',
     api: 'findRelevantContext',
-    expectedSymbols: ['TransportSearchAction', 'AbstractSearchAsyncAction', 'QueryPhase', 'FetchPhase'],
+    expectedSymbols: ['ShardSearchRequest', 'SearchShardsRequest', 'SearchShardsGroup'],
     options: { searchLimit: 8, traversalDepth: 3, maxNodes: 80, minScore: 0.2 },
   },
   {

+ 104 - 9
src/context/index.ts

@@ -6,6 +6,7 @@
  */
 
 import * as fs from 'fs';
+import * as path from 'path';
 import {
   Node,
   Edge,
@@ -476,17 +477,48 @@ export class ContextBuilder {
       }
     }
 
-    // Limit total results
-    searchResults = searchResults.slice(0, opts.searchLimit * 2);
-
-    // Deprioritize test files unless the query is about tests
     const queryLower = query.toLowerCase();
     const isTestQuery = queryLower.includes('test') || queryLower.includes('spec');
+
+    // Deprioritize test files early so they don't take multi-term boost slots
     if (!isTestQuery) {
-      searchResults = searchResults.map(r => ({
-        ...r,
-        score: isTestFile(r.node.filePath) ? r.score * 0.3 : r.score,
-      }));
+      for (const result of searchResults) {
+        if (isTestFile(result.node.filePath)) {
+          result.score *= 0.3;
+        }
+      }
+    }
+
+    // Step 5a: Multi-term co-occurrence re-ranking (applied BEFORE truncation).
+    // For multi-word queries like "search execution from request to shard",
+    // nodes matching 2+ query terms in their name or path are far more relevant
+    // than nodes matching just one generic term. Without this, "ExecutionUtils"
+    // (matches only "execution") fills budget slots meant for "ShardSearchRequest"
+    // (matches "shard" + "search" + "request").
+    const queryTermsForBoost = extractSearchTerms(query);
+    if (queryTermsForBoost.length >= 2) {
+      for (const result of searchResults) {
+        // Check term matches in name (substring) and path DIRECTORIES (exact).
+        // Directory segments must match exactly — "search" matches directory
+        // "search/" but NOT "elasticsearch/". The class name is checked
+        // separately via substring match on the node name.
+        const nameLower = result.node.name.toLowerCase();
+        const dirSegments = path.dirname(result.node.filePath).toLowerCase().split('/');
+        let matchCount = 0;
+        for (const term of queryTermsForBoost) {
+          const inName = nameLower.includes(term);
+          const inDir = dirSegments.some(seg => seg === term);
+          if (inName || inDir) matchCount++;
+        }
+        if (matchCount >= 2) {
+          // Multiplicative boost — 2 terms → 2x, 3 terms → 2.5x
+          result.score *= 1 + matchCount * 0.5;
+        } else {
+          // Dampen single-term matches — they matched a generic word
+          // (e.g., "Execution" or "Shard" alone) not the compound concept
+          result.score *= 0.3;
+        }
+      }
       searchResults.sort((a, b) => b.score - a.score);
     }
 
@@ -536,7 +568,12 @@ export class ContextBuilder {
         }
         termCandidates.sort((a, b) => b.score - a.score);
 
-        for (const r of termCandidates.slice(0, maxCamelPerTerm)) {
+        // Widen the per-term pool for accumulation so multi-term co-occurrences
+        // can be discovered. A class matching 3 query terms at CamelCase boundaries
+        // is far more relevant than one matching just 1, but it needs to survive
+        // the per-term cut for EACH term to accumulate its count.
+        const accumPerTerm = maxCamelPerTerm * 4;
+        for (const r of termCandidates.slice(0, accumPerTerm)) {
           const existing = camelNodeTerms.get(r.node.id);
           if (existing) {
             existing.termCount++;
@@ -561,8 +598,66 @@ export class ContextBuilder {
         searchResults.push(r);
         searchIdSet.add(r.node.id);
       }
+
+      // Step 5c: Compound term matching — find classes whose name contains 2+
+      // query terms at ANY position (not just CamelCase boundaries).
+      // The CamelCase step above requires idx > 0, which misses classes that
+      // START with a query term (e.g., "SearchShardsRequest" starts with "Search").
+      // For multi-word queries, a class matching multiple query terms in its name
+      // is almost certainly relevant regardless of position.
+      if (symbolsFromQuery.length >= 2) {
+        // Collect ALL LIKE results per term (reusing findNodesByNameSubstring)
+        // but without the CamelCase boundary or prefix exclusion filters.
+        const compoundTermMap = new Map<string, { node: Node; terms: Set<string> }>();
+        for (const sym of symbolsFromQuery) {
+          const titleCased = sym.charAt(0).toUpperCase() + sym.slice(1).toLowerCase();
+          if (titleCased.length < 3) continue;
+
+          const likeResults = this.queries.findNodesByNameSubstring(titleCased, {
+            limit: 200,
+            kinds: camelDefinitionKinds,
+            excludePrefix: false,
+          });
+
+          for (const r of likeResults) {
+            if (searchIdSet.has(r.node.id)) continue;
+            if (isTestFile(r.node.filePath) && !isTestQuery) continue;
+            const entry = compoundTermMap.get(r.node.id);
+            if (entry) {
+              entry.terms.add(titleCased);
+            } else {
+              compoundTermMap.set(r.node.id, { node: r.node, terms: new Set([titleCased]) });
+            }
+          }
+        }
+
+        // Keep only nodes matching 2+ distinct terms
+        const compoundResults: SearchResult[] = [];
+        for (const [, entry] of compoundTermMap) {
+          if (entry.terms.size >= 2) {
+            const pathScore = scorePathRelevance(entry.node.filePath, query);
+            const brevityBonus = Math.max(0, 6 - entry.node.name.length / 8);
+            compoundResults.push({
+              node: entry.node,
+              score: 10 + (entry.terms.size - 1) * 20 + pathScore + brevityBonus,
+            });
+          }
+        }
+        compoundResults.sort((a, b) => b.score - a.score);
+        const maxCompound = Math.ceil(opts.searchLimit / 2);
+        for (const r of compoundResults.slice(0, maxCompound)) {
+          searchResults.push(r);
+          searchIdSet.add(r.node.id);
+        }
+      }
     }
 
+    // Final sort and truncation — all search channels (exact, text, CamelCase,
+    // compound) have now contributed. Sort by score so multi-term matches from
+    // later steps can outrank dampened single-term matches from earlier steps.
+    searchResults.sort((a, b) => b.score - a.score);
+    searchResults = searchResults.slice(0, opts.searchLimit * 3);
+
     // Filter by minimum score
     let filteredResults = searchResults.filter((r) => r.score >= opts.minScore);