Просмотр исходного кода

feat(context): iter7 — core-directory boost to surface dominant-file siblings in search ranking

On projects with a single file holding the dense majority of internal
call edges (e.g. sinatra's `lib/sinatra/base.rb` at ~85% of in-file
edges), text search was favoring small focused extension files over the
core file. A small focused file like `multi_route.rb` wins on verbatim
name match + file-size normalization, burying the 1500-line core file's
longer method names (e.g. `route!` vs `route`).

Fix: detect the "dominant file" — the file whose in-file edge count is
≥3× the next candidate's — then add +25 to all results sharing its
directory prefix. This pulls the core file's siblings above
sibling-package extensions without hardcoding any repo structure.

`getDominantFile()` excludes test/spec files and generated files
(e.g. etcd's `rpc.pb.go` has 4× the in-file edges of `server.go` and
would otherwise hijack the boost toward generated protobuf stubs).
SQL pulls the top 20 candidates; path-pattern filtering handles what
SQLite LIKE can't express.
Colby McHenry 3 недель назад
Родитель
Сommit
f1a63643a1
2 измененных файлов с 106 добавлено и 0 удалено
  1. 31 0
      src/context/index.ts
  2. 75 0
      src/db/queries.ts

+ 31 - 0
src/context/index.ts

@@ -587,6 +587,37 @@ export class ContextBuilder {
       }
     }
 
+    // Iter7 — Core-directory boost. On projects with one file that holds
+    // the dense majority of internal call edges (e.g. sinatra's
+    // `lib/sinatra/base.rb` at 85% of all in-file edges), the agent's
+    // task usually asks about the framework's core. Without this boost,
+    // ranking favors small focused extension files (e.g. text search
+    // picks `sinatra-contrib/lib/sinatra/multi_route.rb`'s 10-line
+    // `route` method over `base.rb`'s `route!` because the extension
+    // file's `route` matches the query verbatim AND the file is small,
+    // dwarfing the longer name `route!` in a 1500-line file). Boost
+    // results that share a directory prefix with the dominant file's
+    // directory so the core file's siblings outrank sibling-package
+    // extensions.
+    try {
+      const dominant = this.queries.getDominantFile?.();
+      if (dominant && dominant.edgeCount >= 3 * dominant.nextEdgeCount) {
+        // Take the directory of the dominant file (everything up to the
+        // last slash). For `lib/sinatra/base.rb` → `lib/sinatra/`.
+        const slash = dominant.filePath.lastIndexOf('/');
+        if (slash > 0) {
+          const coreDir = dominant.filePath.slice(0, slash + 1);
+          for (const result of searchResults) {
+            if (result.node.filePath.startsWith(coreDir)) {
+              result.score += 25;
+            }
+          }
+        }
+      }
+    } catch {
+      // SQL failure — fall through, scoring works without the boost
+    }
+
     // Step 5a: Multi-term co-occurrence re-ranking (applied BEFORE truncation).
     // For multi-word queries like "search execution from request to shard",
     // nodes matching 2+ query terms in their name or path are far more relevant

+ 75 - 0
src/db/queries.ts

@@ -20,6 +20,32 @@ import {
 import { safeJsonParse } from '../utils';
 import { kindBonus, nameMatchBonus, scorePathRelevance } from '../search/query-utils';
 import { parseQuery, boundedEditDistance } from '../search/query-parser';
+import { isGeneratedFile } from '../extraction/generated-detection';
+
+/**
+ * Path-only heuristic for files that should not be candidates for
+ * "dominant file" detection: test/spec files and tool-generated files.
+ * Generated files (`*.pb.go`, `*.pulsar.go`, mock outputs, …) often
+ * have huge in-file edge counts that dwarf the real source — etcd's
+ * `rpc.pb.go` has 4× the in-file edges of `server.go`.
+ */
+function isLowValueFile(filePath: string): boolean {
+  const lp = filePath.toLowerCase();
+  return (
+    /(?:^|\/)(tests?|__tests?__|spec)\//.test(lp) ||
+    /_test\.go$/.test(lp) ||
+    /(?:^|\/)test_[^/]+\.py$/.test(lp) ||
+    /_test\.py$/.test(lp) ||
+    /_spec\.rb$/.test(lp) ||
+    /_test\.rb$/.test(lp) ||
+    /\.(test|spec)\.[jt]sx?$/.test(lp) ||
+    /(test|spec|tests)\.(java|kt|scala)$/.test(lp) ||
+    /(tests?|spec)\.cs$/.test(lp) ||
+    /tests?\.swift$/.test(lp) ||
+    /_test\.dart$/.test(lp) ||
+    isGeneratedFile(filePath)
+  );
+}
 
 const SQLITE_PARAM_CHUNK_SIZE = 500;
 
@@ -182,6 +208,7 @@ export class QueryBuilder {
     getUnresolvedBatch?: SqliteStatement;
     getAllFilePaths?: SqliteStatement;
     getAllNodeNames?: SqliteStatement;
+    getDominantFile?: SqliteStatement;
   } = {};
 
   constructor(db: SqliteDatabase) {
@@ -489,6 +516,54 @@ export class QueryBuilder {
     return rows.map(rowToNode);
   }
 
+  /**
+   * Find the file that holds the densest concentration of the project's
+   * internal call graph — the "core" file. Used by context-builder to
+   * boost ranking of symbols in that file's directory (so e.g. sinatra
+   * queries surface `lib/sinatra/base.rb`'s `route!` instead of
+   * `sinatra-contrib/lib/sinatra/multi_route.rb`'s `route` extension).
+   *
+   * Returns null if no file has a meaningful concentration (e.g. spread
+   * evenly across many files, or empty index).
+   *
+   * "Internal" = source and target are in the same file. Cross-file
+   * edges aren't useful here — they don't tell us which file is the
+   * functional center.
+   *
+   * Excludes test/spec files from candidacy via path-pattern. The agent's
+   * typical question is "how does X work", not "how is X tested", so
+   * boosting a test file's directory would be a misfire.
+   */
+  getDominantFile(): { filePath: string; edgeCount: number; nextEdgeCount: number } | null {
+    if (!this.stmts.getDominantFile) {
+      // Pull top 20 candidates; we then filter out test/generated files
+      // in code (regex-grade matching that SQL LIKE can't express). The
+      // generated-file filter is critical — without it, etcd's
+      // `api/etcdserverpb/rpc.pb.go` (1916 in-file edges, generated
+      // protobuf stub) outranks the real `server/etcdserver/server.go`
+      // (470 edges) by 4×, and the boost would push the agent toward
+      // generated code.
+      this.stmts.getDominantFile = this.db.prepare(`
+        SELECT n.file_path AS file_path, COUNT(*) AS edge_count
+        FROM edges e
+        JOIN nodes n ON e.source = n.id
+        JOIN nodes m ON e.target = m.id
+        WHERE n.file_path = m.file_path
+        GROUP BY n.file_path
+        ORDER BY edge_count DESC
+        LIMIT 20
+      `);
+    }
+    const rows = this.stmts.getDominantFile.all() as Array<{ file_path: string; edge_count: number }>;
+    const filtered = rows.filter(r => !isLowValueFile(r.file_path));
+    if (filtered.length === 0 || filtered[0]!.edge_count < 20) return null;
+    return {
+      filePath: filtered[0]!.file_path,
+      edgeCount: filtered[0]!.edge_count,
+      nextEdgeCount: filtered[1]?.edge_count ?? 0,
+    };
+  }
+
   /**
    * Get all nodes of a specific kind
    */