Răsfoiți Sursa

feat: Fix stem variant inflation in multi-term search boosting by grouping related terms

Addresses cases where stem variants like "index", "indexed", "indexe" were counted as separate term matches, artificially inflating match counts and giving false multi-term boosts to symbols matching one root word multiple times. Groups terms that are substrings of each other before counting matches to ensure each conceptual term contributes only once to the boost calculation.
Colby McHenry 2 luni în urmă
părinte
comite
e41431abc2
1 a modificat fișierele cu 28 adăugiri și 4 ștergeri
  1. 28 4
      src/context/index.ts

+ 28 - 4
src/context/index.ts

@@ -505,6 +505,27 @@ export class ContextBuilder {
     // (matches "shard" + "search" + "request").
     // (matches "shard" + "search" + "request").
     const queryTermsForBoost = extractSearchTerms(query);
     const queryTermsForBoost = extractSearchTerms(query);
     if (queryTermsForBoost.length >= 2) {
     if (queryTermsForBoost.length >= 2) {
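+      // Group query terms by substring containment so that stem variants of the
+      // same root word count as ONE concept match: "indexed", "indexe", "index"
+      // contribute one match, not three. Terms are visited longest-first; each
+      // still-unassigned term that contains, or is contained in, the group's seed
+      // term joins that group. Without this, stem variants inflate matchCount and
+      // give false multi-term boosts to symbols matching one root word multiple times.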
+      const termGroups: string[][] = [];
+      const sorted = [...queryTermsForBoost].sort((a, b) => b.length - a.length);
+      const assigned = new Set<string>();
+      for (const term of sorted) {
+        if (assigned.has(term)) continue;
+        const group = [term];
+        assigned.add(term);
+        for (const other of sorted) {
+          if (assigned.has(other)) continue;
+          if (term.includes(other) || other.includes(term)) {
+            group.push(other);
+            assigned.add(other);
+          }
+        }
+        termGroups.push(group);
+      }
+
       for (const result of searchResults) {
       for (const result of searchResults) {
         // Check term matches in name (substring) and path DIRECTORIES (exact).
         // Check term matches in name (substring) and path DIRECTORIES (exact).
         // Directory segments must match exactly — "search" matches directory
         // Directory segments must match exactly — "search" matches directory
@@ -513,10 +534,13 @@ export class ContextBuilder {
         const nameLower = result.node.name.toLowerCase();
         const nameLower = result.node.name.toLowerCase();
         const dirSegments = path.dirname(result.node.filePath).toLowerCase().split('/');
         const dirSegments = path.dirname(result.node.filePath).toLowerCase().split('/');
         let matchCount = 0;
         let matchCount = 0;
-        for (const term of queryTermsForBoost) {
-          const inName = nameLower.includes(term);
-          const inDir = dirSegments.some(seg => seg === term);
-          if (inName || inDir) matchCount++;
+        for (const group of termGroups) {
+          const groupMatches = group.some(term => {
+            const inName = nameLower.includes(term);
+            const inDir = dirSegments.some(seg => seg === term);
+            return inName || inDir;
+          });
+          if (groupMatches) matchCount++;
         }
         }
         if (matchCount >= 2) {
         if (matchCount >= 2) {
           // Multiplicative boost — 2 terms → 2x, 3 terms → 2.5x
           // Multiplicative boost — 2 terms → 2x, 3 terms → 2.5x