| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839 |
- /**
- * Database Queries
- *
- * Prepared statements for CRUD operations on the knowledge graph.
- */
- import { SqliteDatabase, SqliteStatement } from './sqlite-adapter';
- import {
- Node,
- Edge,
- FileRecord,
- UnresolvedReference,
- NodeKind,
- EdgeKind,
- Language,
- GraphStats,
- SearchOptions,
- SearchResult,
- } from '../types';
- import { safeJsonParse } from '../utils';
- import { kindBonus, nameMatchBonus, scorePathRelevance } from '../search/query-utils';
- import { parseQuery, boundedEditDistance } from '../search/query-parser';
- import { isGeneratedFile } from '../extraction/generated-detection';
- /**
- * Path-only heuristic for files that should not be candidates for
- * "dominant file" detection: test/spec files and tool-generated files.
- * Generated files (`*.pb.go`, `*.pulsar.go`, mock outputs, …) often
- * have huge in-file edge counts that dwarf the real source — etcd's
- * `rpc.pb.go` has 4× the in-file edges of `server.go`.
- */
- function isLowValueFile(filePath: string): boolean {
- const lp = filePath.toLowerCase();
- return (
- /(?:^|\/)(tests?|__tests?__|spec)\//.test(lp) ||
- /_test\.go$/.test(lp) ||
- /(?:^|\/)test_[^/]+\.py$/.test(lp) ||
- /_test\.py$/.test(lp) ||
- /_spec\.rb$/.test(lp) ||
- /_test\.rb$/.test(lp) ||
- /\.(test|spec)\.[jt]sx?$/.test(lp) ||
- /(test|spec|tests)\.(java|kt|scala)$/.test(lp) ||
- /(tests?|spec)\.cs$/.test(lp) ||
- /tests?\.swift$/.test(lp) ||
- /_test\.dart$/.test(lp) ||
- isGeneratedFile(filePath)
- );
- }
- const SQLITE_PARAM_CHUNK_SIZE = 500;
- /**
- * Database row types (snake_case from SQLite)
- */
- interface NodeRow {
- id: string;
- kind: string;
- name: string;
- qualified_name: string;
- file_path: string;
- language: string;
- start_line: number;
- end_line: number;
- start_column: number;
- end_column: number;
- docstring: string | null;
- signature: string | null;
- visibility: string | null;
- is_exported: number;
- is_async: number;
- is_static: number;
- is_abstract: number;
- decorators: string | null;
- type_parameters: string | null;
- return_type: string | null;
- updated_at: number;
- }
- interface EdgeRow {
- id: number;
- source: string;
- target: string;
- kind: string;
- metadata: string | null;
- line: number | null;
- col: number | null;
- provenance: string | null;
- }
- interface FileRow {
- path: string;
- content_hash: string;
- language: string;
- size: number;
- modified_at: number;
- indexed_at: number;
- node_count: number;
- errors: string | null;
- }
- interface UnresolvedRefRow {
- id: number;
- from_node_id: string;
- reference_name: string;
- reference_kind: string;
- line: number;
- col: number;
- candidates: string | null;
- file_path: string;
- language: string;
- }
- /**
- * Convert database row to Node object
- */
- function rowToNode(row: NodeRow): Node {
- return {
- id: row.id,
- kind: row.kind as NodeKind,
- name: row.name,
- qualifiedName: row.qualified_name,
- filePath: row.file_path,
- language: row.language as Language,
- startLine: row.start_line,
- endLine: row.end_line,
- startColumn: row.start_column,
- endColumn: row.end_column,
- docstring: row.docstring ?? undefined,
- signature: row.signature ?? undefined,
- visibility: row.visibility as Node['visibility'],
- isExported: row.is_exported === 1,
- isAsync: row.is_async === 1,
- isStatic: row.is_static === 1,
- isAbstract: row.is_abstract === 1,
- decorators: row.decorators ? safeJsonParse(row.decorators, undefined) : undefined,
- typeParameters: row.type_parameters ? safeJsonParse(row.type_parameters, undefined) : undefined,
- returnType: row.return_type ?? undefined,
- updatedAt: row.updated_at,
- };
- }
- /**
- * Convert database row to Edge object
- */
- function rowToEdge(row: EdgeRow): Edge {
- return {
- source: row.source,
- target: row.target,
- kind: row.kind as EdgeKind,
- metadata: row.metadata ? safeJsonParse(row.metadata, undefined) : undefined,
- line: row.line ?? undefined,
- column: row.col ?? undefined,
- provenance: row.provenance as Edge['provenance'],
- };
- }
- /**
- * Convert database row to FileRecord object
- */
- function rowToFileRecord(row: FileRow): FileRecord {
- return {
- path: row.path,
- contentHash: row.content_hash,
- language: row.language as Language,
- size: row.size,
- modifiedAt: row.modified_at,
- indexedAt: row.indexed_at,
- nodeCount: row.node_count,
- errors: row.errors ? safeJsonParse(row.errors, undefined) : undefined,
- };
- }
- /**
- * Query builder for the knowledge graph database
- */
- export class QueryBuilder {
- private db: SqliteDatabase;
- // Project-name tokens (go.mod / package.json / repo dir), normalized. A query
- // word matching one is dropped from path-relevance scoring — it names the
- // whole project, not a symbol, so it carries no discriminative signal (#720).
- // Set once by the CodeGraph instance; empty by default (no down-weighting).
- private projectNameTokens: Set<string> = new Set();
- // Node cache for frequently accessed nodes (LRU-style, max 1000 entries)
- private nodeCache: Map<string, Node> = new Map();
- private readonly maxCacheSize = 1000;
- // Prepared statements (lazily initialized)
- private stmts: {
- insertNode?: SqliteStatement;
- updateNode?: SqliteStatement;
- deleteNode?: SqliteStatement;
- deleteNodesByFile?: SqliteStatement;
- getNodeById?: SqliteStatement;
- getNodesByFile?: SqliteStatement;
- getNodesByKind?: SqliteStatement;
- insertEdge?: SqliteStatement;
- upsertFile?: SqliteStatement;
- deleteEdgesBySource?: SqliteStatement;
- deleteEdgesByTarget?: SqliteStatement;
- getEdgesBySource?: SqliteStatement;
- getEdgesByTarget?: SqliteStatement;
- insertFile?: SqliteStatement;
- updateFile?: SqliteStatement;
- deleteFile?: SqliteStatement;
- getFileByPath?: SqliteStatement;
- getAllFiles?: SqliteStatement;
- insertUnresolved?: SqliteStatement;
- deleteUnresolvedByNode?: SqliteStatement;
- getUnresolvedByName?: SqliteStatement;
- getNodesByName?: SqliteStatement;
- getNodesByQualifiedNameExact?: SqliteStatement;
- getNodesByLowerName?: SqliteStatement;
- getUnresolvedCount?: SqliteStatement;
- getUnresolvedBatch?: SqliteStatement;
- getAllFilePaths?: SqliteStatement;
- getAllNodeNames?: SqliteStatement;
- getDominantFile?: SqliteStatement;
- getTopRouteFile?: SqliteStatement;
- getRoutingManifest?: SqliteStatement;
- } = {};
- constructor(db: SqliteDatabase) {
- this.db = db;
- }
- /** Set the normalized project-name tokens used to down-weight non-discriminative
- * query words in path scoring (#720). Called once when the project opens. */
- setProjectNameTokens(tokens: Set<string>): void {
- this.projectNameTokens = tokens;
- }
- /** The normalized project-name tokens (#720); empty if none were derived. */
- getProjectNameTokens(): Set<string> {
- return this.projectNameTokens;
- }
- // ===========================================================================
- // Node Operations
- // ===========================================================================
- /**
- * Insert a new node
- */
- insertNode(node: Node): void {
- if (!this.stmts.insertNode) {
- this.stmts.insertNode = this.db.prepare(`
- INSERT OR REPLACE INTO nodes (
- id, kind, name, qualified_name, file_path, language,
- start_line, end_line, start_column, end_column,
- docstring, signature, visibility,
- is_exported, is_async, is_static, is_abstract,
- decorators, type_parameters, return_type, updated_at
- ) VALUES (
- @id, @kind, @name, @qualifiedName, @filePath, @language,
- @startLine, @endLine, @startColumn, @endColumn,
- @docstring, @signature, @visibility,
- @isExported, @isAsync, @isStatic, @isAbstract,
- @decorators, @typeParameters, @returnType, @updatedAt
- )
- `);
- }
- // Validate required fields to prevent SQLite bind errors
- if (!node.id || !node.kind || !node.name || !node.filePath || !node.language) {
- console.error('[CodeGraph] Skipping node with missing required fields:', {
- id: node.id,
- kind: node.kind,
- name: node.name,
- filePath: node.filePath,
- language: node.language,
- });
- return;
- }
- // INSERT OR REPLACE may overwrite a node we have cached. Drop the
- // stale entry so the next getNodeById sees the new row, not the old
- // one (matches the cache-invalidation pattern used by updateNode and
- // deleteNode below).
- this.nodeCache.delete(node.id);
- this.stmts.insertNode.run({
- id: node.id,
- kind: node.kind,
- name: node.name,
- qualifiedName: node.qualifiedName ?? node.name,
- filePath: node.filePath,
- language: node.language,
- startLine: node.startLine ?? 0,
- endLine: node.endLine ?? 0,
- startColumn: node.startColumn ?? 0,
- endColumn: node.endColumn ?? 0,
- docstring: node.docstring ?? null,
- signature: node.signature ?? null,
- visibility: node.visibility ?? null,
- isExported: node.isExported ? 1 : 0,
- isAsync: node.isAsync ? 1 : 0,
- isStatic: node.isStatic ? 1 : 0,
- isAbstract: node.isAbstract ? 1 : 0,
- decorators: node.decorators ? JSON.stringify(node.decorators) : null,
- typeParameters: node.typeParameters ? JSON.stringify(node.typeParameters) : null,
- returnType: node.returnType ?? null,
- updatedAt: node.updatedAt ?? Date.now(),
- });
- }
- /**
- * Insert multiple nodes in a transaction
- */
- insertNodes(nodes: Node[]): void {
- this.db.transaction(() => {
- for (const node of nodes) {
- this.insertNode(node);
- }
- })();
- }
- /**
- * Update an existing node
- */
- updateNode(node: Node): void {
- if (!this.stmts.updateNode) {
- this.stmts.updateNode = this.db.prepare(`
- UPDATE nodes SET
- kind = @kind,
- name = @name,
- qualified_name = @qualifiedName,
- file_path = @filePath,
- language = @language,
- start_line = @startLine,
- end_line = @endLine,
- start_column = @startColumn,
- end_column = @endColumn,
- docstring = @docstring,
- signature = @signature,
- visibility = @visibility,
- is_exported = @isExported,
- is_async = @isAsync,
- is_static = @isStatic,
- is_abstract = @isAbstract,
- decorators = @decorators,
- type_parameters = @typeParameters,
- return_type = @returnType,
- updated_at = @updatedAt
- WHERE id = @id
- `);
- }
- // Invalidate cache before update
- this.nodeCache.delete(node.id);
- // Validate required fields
- if (!node.id || !node.kind || !node.name || !node.filePath || !node.language) {
- console.error('[CodeGraph] Skipping node update with missing required fields:', node.id);
- return;
- }
- this.stmts.updateNode.run({
- id: node.id,
- kind: node.kind,
- name: node.name,
- qualifiedName: node.qualifiedName ?? node.name,
- filePath: node.filePath,
- language: node.language,
- startLine: node.startLine ?? 0,
- endLine: node.endLine ?? 0,
- startColumn: node.startColumn ?? 0,
- endColumn: node.endColumn ?? 0,
- docstring: node.docstring ?? null,
- signature: node.signature ?? null,
- visibility: node.visibility ?? null,
- isExported: node.isExported ? 1 : 0,
- isAsync: node.isAsync ? 1 : 0,
- isStatic: node.isStatic ? 1 : 0,
- isAbstract: node.isAbstract ? 1 : 0,
- decorators: node.decorators ? JSON.stringify(node.decorators) : null,
- typeParameters: node.typeParameters ? JSON.stringify(node.typeParameters) : null,
- returnType: node.returnType ?? null,
- updatedAt: node.updatedAt ?? Date.now(),
- });
- }
- /**
- * Delete a node by ID
- */
- deleteNode(id: string): void {
- if (!this.stmts.deleteNode) {
- this.stmts.deleteNode = this.db.prepare('DELETE FROM nodes WHERE id = ?');
- }
- // Invalidate cache
- this.nodeCache.delete(id);
- this.stmts.deleteNode.run(id);
- }
- /**
- * Delete all nodes for a file
- */
- deleteNodesByFile(filePath: string): void {
- if (!this.stmts.deleteNodesByFile) {
- this.stmts.deleteNodesByFile = this.db.prepare('DELETE FROM nodes WHERE file_path = ?');
- }
- // Invalidate cache for nodes in this file
- for (const [id, node] of this.nodeCache) {
- if (node.filePath === filePath) {
- this.nodeCache.delete(id);
- }
- }
- this.stmts.deleteNodesByFile.run(filePath);
- }
- /**
- * Get a node by ID
- */
- getNodeById(id: string): Node | null {
- // Check cache first
- if (this.nodeCache.has(id)) {
- const cached = this.nodeCache.get(id)!;
- // Move to end to implement LRU (delete and re-add)
- this.nodeCache.delete(id);
- this.nodeCache.set(id, cached);
- return cached;
- }
- if (!this.stmts.getNodeById) {
- this.stmts.getNodeById = this.db.prepare('SELECT * FROM nodes WHERE id = ?');
- }
- const row = this.stmts.getNodeById.get(id) as NodeRow | undefined;
- if (!row) {
- return null;
- }
- const node = rowToNode(row);
- this.cacheNode(node);
- return node;
- }
- /**
- * Batch lookup: fetch many nodes by ID in a single SQL round-trip.
- *
- * Replaces the N+1 pattern in graph traversal where every edge would
- * trigger its own `getNodeById` call. For a function with 50 callers
- * this collapses 50 point reads into one IN-list query (~10-50x
- * faster end-to-end).
- *
- * Returns a Map keyed by id so callers can preserve their own ordering
- * (typically the order edges were returned from the graph). Missing IDs
- * are simply absent from the map.
- *
- * Cache-aware: ids already in the LRU cache are served from memory and
- * the SQL query only touches the misses.
- */
- getNodesByIds(ids: readonly string[]): Map<string, Node> {
- const out = new Map<string, Node>();
- if (ids.length === 0) return out;
- // Serve cache hits first; build the miss list for SQL.
- const misses: string[] = [];
- for (const id of ids) {
- const cached = this.nodeCache.get(id);
- if (cached !== undefined) {
- // LRU touch
- this.nodeCache.delete(id);
- this.nodeCache.set(id, cached);
- out.set(id, cached);
- } else {
- misses.push(id);
- }
- }
- if (misses.length === 0) return out;
- // Chunk under SQLite's parameter limit (default 999, raised to 32766
- // in better-sqlite3 builds — chunk at 500 for safety across both
- // backends and to keep the query plan simple).
- for (let i = 0; i < misses.length; i += SQLITE_PARAM_CHUNK_SIZE) {
- const chunk = misses.slice(i, i + SQLITE_PARAM_CHUNK_SIZE);
- const placeholders = chunk.map(() => '?').join(',');
- const rows = this.db
- .prepare(`SELECT * FROM nodes WHERE id IN (${placeholders})`)
- .all(...chunk) as NodeRow[];
- for (const row of rows) {
- const node = rowToNode(row);
- out.set(node.id, node);
- this.cacheNode(node);
- }
- }
- return out;
- }
- private getExistingNodeIds(ids: readonly string[]): Set<string> {
- const out = new Set<string>();
- if (ids.length === 0) return out;
- const uniqueIds = [...new Set(ids)];
- for (let i = 0; i < uniqueIds.length; i += SQLITE_PARAM_CHUNK_SIZE) {
- const chunk = uniqueIds.slice(i, i + SQLITE_PARAM_CHUNK_SIZE);
- const placeholders = chunk.map(() => '?').join(',');
- const rows = this.db
- .prepare(`SELECT id FROM nodes WHERE id IN (${placeholders})`)
- .all(...chunk) as { id: string }[];
- for (const row of rows) {
- out.add(row.id);
- }
- }
- return out;
- }
- /**
- * Add a node to the cache, evicting oldest if needed
- */
- private cacheNode(node: Node): void {
- if (this.nodeCache.size >= this.maxCacheSize) {
- // Evict oldest (first) entry
- const firstKey = this.nodeCache.keys().next().value;
- if (firstKey) {
- this.nodeCache.delete(firstKey);
- }
- }
- this.nodeCache.set(node.id, node);
- }
- /**
- * Clear the node cache
- */
- clearCache(): void {
- this.nodeCache.clear();
- }
- /**
- * Get all nodes in a file
- */
- getNodesByFile(filePath: string): Node[] {
- if (!this.stmts.getNodesByFile) {
- this.stmts.getNodesByFile = this.db.prepare(
- 'SELECT * FROM nodes WHERE file_path = ? ORDER BY start_line'
- );
- }
- const rows = this.stmts.getNodesByFile.all(filePath) as NodeRow[];
- return rows.map(rowToNode);
- }
- /**
- * Find the file that holds the densest concentration of the project's
- * internal call graph — the "core" file. Used by context-builder to
- * boost ranking of symbols in that file's directory (so e.g. sinatra
- * queries surface `lib/sinatra/base.rb`'s `route!` instead of
- * `sinatra-contrib/lib/sinatra/multi_route.rb`'s `route` extension).
- *
- * Returns null if no file has a meaningful concentration (e.g. spread
- * evenly across many files, or empty index).
- *
- * "Internal" = source and target are in the same file. Cross-file
- * edges aren't useful here — they don't tell us which file is the
- * functional center.
- *
- * Excludes test/spec files from candidacy via path-pattern. The agent's
- * typical question is "how does X work", not "how is X tested", so
- * boosting a test file's directory would be a misfire.
- */
- getDominantFile(): { filePath: string; edgeCount: number; nextEdgeCount: number } | null {
- if (!this.stmts.getDominantFile) {
- // Pull top 20 candidates; we then filter out test/generated files
- // in code (regex-grade matching that SQL LIKE can't express). The
- // generated-file filter is critical — without it, etcd's
- // `api/etcdserverpb/rpc.pb.go` (1916 in-file edges, generated
- // protobuf stub) outranks the real `server/etcdserver/server.go`
- // (470 edges) by 4×, and the boost would push the agent toward
- // generated code.
- this.stmts.getDominantFile = this.db.prepare(`
- SELECT n.file_path AS file_path, COUNT(*) AS edge_count
- FROM edges e
- JOIN nodes n ON e.source = n.id
- JOIN nodes m ON e.target = m.id
- WHERE n.file_path = m.file_path
- GROUP BY n.file_path
- ORDER BY edge_count DESC
- LIMIT 20
- `);
- }
- const rows = this.stmts.getDominantFile.all() as Array<{ file_path: string; edge_count: number }>;
- const filtered = rows.filter(r => !isLowValueFile(r.file_path));
- if (filtered.length === 0 || filtered[0]!.edge_count < 20) return null;
- return {
- filePath: filtered[0]!.file_path,
- edgeCount: filtered[0]!.edge_count,
- nextEdgeCount: filtered[1]?.edge_count ?? 0,
- };
- }
- /**
- * Find the file that holds the densest concentration of the project's
- * `route` nodes (framework-emitted: Express/Gin/Flask/Rails/Drupal/etc.).
- * Used by handleContext on small repos to inline the project's routing
- * config when the agent's query is about request flow — eliminating the
- * "Glob + Read routes.rb" pattern that beats codegraph on tiny realworld
- * template repos.
- *
- * Excludes test/generated files from candidacy. Returns null if there
- * are fewer than 3 non-test routes total, or if no file holds at least
- * 30% of them (diffuse routing → no single answer file).
- */
- getTopRouteFile(): { filePath: string; routeCount: number; totalRoutes: number } | null {
- if (!this.stmts.getTopRouteFile) {
- this.stmts.getTopRouteFile = this.db.prepare(`
- SELECT file_path, COUNT(*) AS cnt
- FROM nodes
- WHERE kind = 'route'
- GROUP BY file_path
- ORDER BY cnt DESC
- LIMIT 20
- `);
- }
- const rows = this.stmts.getTopRouteFile.all() as Array<{ file_path: string; cnt: number }>;
- const filtered = rows.filter(r => !isLowValueFile(r.file_path));
- if (filtered.length === 0) return null;
- const totalRoutes = filtered.reduce((sum, r) => sum + r.cnt, 0);
- const top = filtered[0]!;
- if (totalRoutes < 3 || top.cnt < 3) return null;
- if (top.cnt / totalRoutes < 0.30) return null;
- return { filePath: top.file_path, routeCount: top.cnt, totalRoutes };
- }
- /**
- * Build a URL → handler manifest from the index. Each route node's
- * `references` edge points at the function/method that handles the
- * request. We join them in one pass; the agent gets the canonical
- * routing answer ("POST /users/login → AuthController#login") without
- * having to parse the framework's route DSL itself.
- *
- * Also returns the file with the most handler endpoints — used as the
- * "top handler file" to inline source for, so the agent has both the
- * mapping AND the handler implementations.
- */
- getRoutingManifest(limit: number = 40): {
- entries: Array<{ url: string; handler: string; handlerFile: string; handlerLine: number; handlerKind: string }>;
- topHandlerFile: string | null;
- topHandlerFileCount: number;
- totalRoutes: number;
- } | null {
- if (!this.stmts.getRoutingManifest) {
- // Edge kind varies across framework resolvers: Spring/Rails/
- // Laravel/Drupal emit `references`, Express emits `calls`. Accept
- // both — the semantic is the same (route → its handler).
- this.stmts.getRoutingManifest = this.db.prepare(`
- SELECT
- r.name AS url,
- h.name AS handler,
- h.file_path AS handler_file,
- h.start_line AS handler_line,
- h.kind AS handler_kind
- FROM nodes r
- JOIN edges e ON e.source = r.id
- JOIN nodes h ON e.target = h.id
- WHERE r.kind = 'route'
- AND e.kind IN ('references', 'calls')
- AND h.kind IN ('function', 'method', 'class')
- ORDER BY r.file_path, r.start_line
- LIMIT ?
- `);
- }
- const rows = this.stmts.getRoutingManifest.all(limit) as Array<{
- url: string; handler: string; handler_file: string; handler_line: number; handler_kind: string;
- }>;
- // Drop test/generated handlers — same hygiene as elsewhere.
- const filtered = rows.filter(r => !isLowValueFile(r.handler_file));
- if (filtered.length < 3) return null;
- // Identify the file holding the most handlers (the "primary handler file").
- const fileCounts = new Map<string, number>();
- for (const r of filtered) {
- fileCounts.set(r.handler_file, (fileCounts.get(r.handler_file) ?? 0) + 1);
- }
- let topHandlerFile: string | null = null;
- let topHandlerFileCount = 0;
- for (const [file, count] of fileCounts) {
- if (count > topHandlerFileCount) {
- topHandlerFile = file;
- topHandlerFileCount = count;
- }
- }
- return {
- entries: filtered.map(r => ({
- url: r.url,
- handler: r.handler,
- handlerFile: r.handler_file,
- handlerLine: r.handler_line,
- handlerKind: r.handler_kind,
- })),
- topHandlerFile,
- topHandlerFileCount,
- totalRoutes: filtered.length,
- };
- }
- /**
- * Get all nodes of a specific kind
- */
- getNodesByKind(kind: NodeKind): Node[] {
- if (!this.stmts.getNodesByKind) {
- this.stmts.getNodesByKind = this.db.prepare('SELECT * FROM nodes WHERE kind = ?');
- }
- const rows = this.stmts.getNodesByKind.all(kind) as NodeRow[];
- return rows.map(rowToNode);
- }
- /**
- * Stream every node of a kind one at a time (lazy) instead of materializing
- * them all like {@link getNodesByKind}. For unbounded kinds (`function`,
- * `method`) on a symbol-dense project the full array is gigabytes; the
- * dynamic-edge synthesizers only scan-and-filter, so they iterate to keep
- * memory O(1) in the node count rather than O(nodes) (#610).
- */
- *iterateNodesByKind(kind: NodeKind): IterableIterator<Node> {
- // Fresh statement per call (not a cached one): an iterator holds an open
- // cursor, so a shared statement would conflict across overlapping scans.
- const stmt = this.db.prepare('SELECT * FROM nodes WHERE kind = ?');
- for (const row of stmt.iterate(kind)) {
- yield rowToNode(row as NodeRow);
- }
- }
- /**
- * Get all nodes in the database
- */
- getAllNodes(): Node[] {
- const rows = this.db.prepare('SELECT * FROM nodes').all() as NodeRow[];
- return rows.map(rowToNode);
- }
- /**
- * Get nodes by exact name match (uses idx_nodes_name index)
- */
- getNodesByName(name: string): Node[] {
- if (!this.stmts.getNodesByName) {
- this.stmts.getNodesByName = this.db.prepare('SELECT * FROM nodes WHERE name = ?');
- }
- const rows = this.stmts.getNodesByName.all(name) as NodeRow[];
- return rows.map(rowToNode);
- }
- /**
- * Get nodes by exact qualified name match (uses idx_nodes_qualified_name index)
- */
- getNodesByQualifiedNameExact(qualifiedName: string): Node[] {
- if (!this.stmts.getNodesByQualifiedNameExact) {
- this.stmts.getNodesByQualifiedNameExact = this.db.prepare(
- 'SELECT * FROM nodes WHERE qualified_name = ?'
- );
- }
- const rows = this.stmts.getNodesByQualifiedNameExact.all(qualifiedName) as NodeRow[];
- return rows.map(rowToNode);
- }
- /**
- * Get nodes by lowercase name match (uses idx_nodes_lower_name expression index)
- */
- getNodesByLowerName(lowerName: string): Node[] {
- if (!this.stmts.getNodesByLowerName) {
- this.stmts.getNodesByLowerName = this.db.prepare(
- 'SELECT * FROM nodes WHERE lower(name) = ?'
- );
- }
- const rows = this.stmts.getNodesByLowerName.all(lowerName) as NodeRow[];
- return rows.map(rowToNode);
- }
- /**
- * Search nodes by name using FTS with fallback to LIKE for better matching
- *
- * Search strategy:
- * 1. Try FTS5 prefix match (query*) for word-start matching
- * 2. If no results, try LIKE for substring matching (e.g., "signIn" finds "signInWithGoogle")
- * 3. Score results based on match quality
- */
- searchNodes(query: string, options: SearchOptions = {}): SearchResult[] {
- const { limit = 100, offset = 0 } = options;
- // Parse field-qualified bits out of the raw query (kind:, lang:,
- // path:, name:). Anything not recognised stays in `text` and goes
- // to FTS unchanged. Filters compose with the SearchOptions arg —
- // both are applied (intersection-style).
- const parsed = parseQuery(query);
- const mergedKinds =
- parsed.kinds.length > 0
- ? Array.from(new Set([...(options.kinds ?? []), ...parsed.kinds]))
- : options.kinds;
- const mergedLanguages =
- parsed.languages.length > 0
- ? Array.from(new Set([...(options.languages ?? []), ...parsed.languages]))
- : options.languages;
- const pathFilters = parsed.pathFilters;
- const nameFilters = parsed.nameFilters;
- // The text portion drives FTS/LIKE; if all the user typed was
- // filters (`kind:function`), we still need *some* candidate set,
- // so synthesise an empty-text path that returns everything matching
- // the filters.
- const text = parsed.text;
- const kinds = mergedKinds;
- const languages = mergedLanguages;
- // First try FTS5 with prefix matching
- let results = text
- ? this.searchNodesFTS(text, { kinds, languages, limit, offset })
- // Over-fetch by 5× when running filter-only (no text). The
- // post-scoring path: + name: filters can be very selective, so
- // a smaller multiplier risks returning fewer than `limit`
- // results despite the DB having plenty of matches.
- : this.searchAllByFilters({ kinds, languages, limit: limit * 5 });
- // If no FTS results, try LIKE-based substring search
- if (results.length === 0 && text.length >= 2) {
- results = this.searchNodesLike(text, { kinds, languages, limit, offset });
- }
- // Final fuzzy fallback: scan all known names and keep those within
- // a tight Levenshtein distance. Only fires when both FTS and LIKE
- // returned nothing AND there's a text portion long enough to be
- // worth fuzzing (1-char queries would match too much).
- if (results.length === 0 && text.length >= 3) {
- results = this.searchNodesFuzzy(text, { kinds, languages, limit });
- }
- // Supplement: ensure exact name matches are always candidates.
- // BM25 can bury short exact-match names (e.g. "getBean") under hundreds of
- // compound names (e.g. "getBeanDescriptor") in large codebases,
- // pushing them past the FTS fetch limit before post-hoc scoring can help.
- // Use the max BM25 score as the base so the nameMatchBonus (exact=30 vs
- // prefix=20) actually differentiates them after rescoring.
- if (results.length > 0 && query) {
- const existingIds = new Set(results.map(r => r.node.id));
- const maxFtsScore = Math.max(...results.map(r => r.score));
- const terms = query.split(/\s+/).filter(t => t.length >= 2);
- for (const term of terms) {
- let sql = 'SELECT * FROM nodes WHERE name = ? COLLATE NOCASE';
- const params: (string | number)[] = [term];
- if (kinds && kinds.length > 0) {
- sql += ` AND kind IN (${kinds.map(() => '?').join(',')})`;
- params.push(...kinds);
- }
- if (languages && languages.length > 0) {
- sql += ` AND language IN (${languages.map(() => '?').join(',')})`;
- params.push(...languages);
- }
- sql += ' LIMIT 20';
- const rows = this.db.prepare(sql).all(...params) as NodeRow[];
- for (const row of rows) {
- if (!existingIds.has(row.id)) {
- results.push({ node: rowToNode(row), score: maxFtsScore });
- existingIds.add(row.id);
- }
- }
- }
- }
- // Apply multi-signal scoring
- if (results.length > 0 && (text || query)) {
- const scoringQuery = text || query;
- results = results.map(r => ({
- ...r,
- score: r.score
- + kindBonus(r.node.kind)
- + scorePathRelevance(r.node.filePath, scoringQuery, this.projectNameTokens)
- + nameMatchBonus(r.node.name, scoringQuery),
- }));
- results.sort((a, b) => b.score - a.score);
- // Trim to requested limit after rescoring
- if (results.length > limit) {
- results = results.slice(0, limit);
- }
- }
- // Apply path: + name: filters AFTER scoring. Scoring already uses
- // path/name as a soft signal; the explicit filters here are a hard
- // gate. Done last so the FTS limit fetched plenty of candidates to
- // narrow from.
- if (pathFilters.length > 0) {
- const lowered = pathFilters.map((p) => p.toLowerCase());
- results = results.filter((r) => {
- const fp = r.node.filePath.toLowerCase();
- return lowered.some((p) => fp.includes(p));
- });
- }
- if (nameFilters.length > 0) {
- const lowered = nameFilters.map((n) => n.toLowerCase());
- results = results.filter((r) => {
- const nm = r.node.name.toLowerCase();
- return lowered.some((n) => nm.includes(n));
- });
- }
- return results;
- }
- /**
- * Match-everything path used when the user supplied only field
- * filters (`kind:function lang:typescript`) with no text. Returns
- * candidates ordered by name; the caller's filter pass narrows to
- * what was asked for.
- */
- private searchAllByFilters(options: {
- kinds?: NodeKind[];
- languages?: Language[];
- limit: number;
- }): SearchResult[] {
- const { kinds, languages, limit } = options;
- let sql = 'SELECT * FROM nodes WHERE 1=1';
- const params: (string | number)[] = [];
- if (kinds && kinds.length > 0) {
- sql += ` AND kind IN (${kinds.map(() => '?').join(',')})`;
- params.push(...kinds);
- }
- if (languages && languages.length > 0) {
- sql += ` AND language IN (${languages.map(() => '?').join(',')})`;
- params.push(...languages);
- }
- sql += ' ORDER BY name LIMIT ?';
- params.push(limit);
- const rows = this.db.prepare(sql).all(...params) as NodeRow[];
- return rows.map((row) => ({ node: rowToNode(row), score: 1 }));
- }
- /**
- * Fuzzy fallback: when zero FTS/LIKE hits, try an edit-distance
- * sweep over the distinct symbol-name set. Caps `maxDist` at 2 so
- * `getUssr` finds `getUser` but `process` doesn't match `prosody`.
- * Bounded edit distance keeps each comparison cheap; the per-query
- * scan is O(distinct-name-count) which is far smaller than total
- * node count on any real codebase.
- */
- private searchNodesFuzzy(
- text: string,
- options: { kinds?: NodeKind[]; languages?: Language[]; limit: number }
- ): SearchResult[] {
- const { kinds, languages, limit } = options;
- const lowered = text.toLowerCase();
- const maxDist = lowered.length <= 4 ? 1 : 2;
- // Pull the distinct name list once. The set is cached on QueryBuilder
- // by getAllNodeNames(); even on a 200k-node project the distinct
- // name set is typically O(10k) because most names repeat. The
- // candidate-cap below bounds memory regardless.
- const allNames = this.getAllNodeNames();
- const candidates: Array<{ name: string; dist: number }> = [];
- for (const name of allNames) {
- const dist = boundedEditDistance(name.toLowerCase(), lowered, maxDist);
- if (dist <= maxDist) candidates.push({ name, dist });
- }
- candidates.sort((a, b) => a.dist - b.dist);
- // Cap the per-name follow-up queries. Each survivor triggers a
- // separate `SELECT * FROM nodes WHERE name = ?`; without this cap
- // a project with many similar names (`getUser1`, `getUser2`...)
- // could fan out far beyond `limit` queries before the inner-loop
- // limit kicks in.
- const FUZZY_FOLLOWUP_CAP = Math.max(limit * 2, 50);
- const cappedCandidates = candidates.slice(0, FUZZY_FOLLOWUP_CAP);
- const results: SearchResult[] = [];
- const seen = new Set<string>();
- for (const c of cappedCandidates) {
- if (results.length >= limit) break;
- let sql = 'SELECT * FROM nodes WHERE name = ?';
- const params: (string | number)[] = [c.name];
- if (kinds && kinds.length > 0) {
- sql += ` AND kind IN (${kinds.map(() => '?').join(',')})`;
- params.push(...kinds);
- }
- if (languages && languages.length > 0) {
- sql += ` AND language IN (${languages.map(() => '?').join(',')})`;
- params.push(...languages);
- }
- sql += ' LIMIT 5';
- const rows = this.db.prepare(sql).all(...params) as NodeRow[];
- for (const row of rows) {
- if (seen.has(row.id)) continue;
- seen.add(row.id);
- // Lower the score for each edit step away from the query so
- // exact-match fallbacks (dist 0) outrank dist-2 typos.
- results.push({ node: rowToNode(row), score: 1 / (1 + c.dist) });
- if (results.length >= limit) break;
- }
- }
- return results;
- }
- /**
- * FTS5 search with prefix matching
- */
- private searchNodesFTS(query: string, options: SearchOptions): SearchResult[] {
- const { kinds, languages, limit = 100, offset = 0 } = options;
- // Add prefix wildcard for better matching (e.g., "auth" matches "AuthService", "authenticate")
- // Escape special FTS5 characters and add prefix wildcard.
- //
- // `::` is a qualifier separator in Rust/C++/Ruby, not a token char,
- // so treat it as whitespace before the strip step. Otherwise queries
- // like `stage_apply::run` collapse to `stage_applyrun` (the colons
- // are stripped without splitting) and find nothing. See #173.
- const ftsQuery = query
- .replace(/::/g, ' ') // Rust/C++/Ruby qualifier separator
- .replace(/['"*():^]/g, '') // Remove FTS5 special chars
- .split(/\s+/)
- .filter(term => term.length > 0)
- // Strip FTS5 boolean operators to prevent query manipulation
- .filter(term => !/^(AND|OR|NOT|NEAR)$/i.test(term))
- .map(term => `"${term}"*`) // Prefix match each term
- .join(' OR ');
- if (!ftsQuery) {
- return [];
- }
- // BM25 column weights: id=0, name=20, qualified_name=5, docstring=1, signature=2
- // Heavy name weight ensures exact/prefix name matches rank above incidental
- // mentions in long docstrings or qualified names of nested symbols.
- // Fetch 5x requested limit so post-hoc rescoring (kindBonus, pathRelevance,
- // nameMatchBonus) can promote results that BM25 alone undervalues.
- const ftsLimit = Math.max(limit * 5, 100);
- let sql = `
- SELECT nodes.*, bm25(nodes_fts, 0, 20, 5, 1, 2) as score
- FROM nodes_fts
- JOIN nodes ON nodes_fts.id = nodes.id
- WHERE nodes_fts MATCH ?
- `;
- const params: (string | number)[] = [ftsQuery];
- if (kinds && kinds.length > 0) {
- sql += ` AND nodes.kind IN (${kinds.map(() => '?').join(',')})`;
- params.push(...kinds);
- }
- if (languages && languages.length > 0) {
- sql += ` AND nodes.language IN (${languages.map(() => '?').join(',')})`;
- params.push(...languages);
- }
- sql += ' ORDER BY score LIMIT ? OFFSET ?';
- params.push(ftsLimit, offset);
- try {
- const rows = this.db.prepare(sql).all(...params) as (NodeRow & { score: number })[];
- return rows.map((row) => ({
- node: rowToNode(row),
- score: Math.abs(row.score), // bm25 returns negative scores
- }));
- } catch {
- // FTS query failed, return empty
- return [];
- }
- }
- /**
- * LIKE-based substring search for cases where FTS doesn't match
- * Useful for camelCase matching (e.g., "signIn" finds "signInWithGoogle")
- */
- private searchNodesLike(query: string, options: SearchOptions): SearchResult[] {
- const { kinds, languages, limit = 100, offset = 0 } = options;
- let sql = `
- SELECT nodes.*,
- CASE
- WHEN name = ? THEN 1.0
- WHEN name LIKE ? THEN 0.9
- WHEN name LIKE ? THEN 0.8
- WHEN qualified_name LIKE ? THEN 0.7
- ELSE 0.5
- END as score
- FROM nodes
- WHERE (
- name LIKE ? OR
- qualified_name LIKE ? OR
- name LIKE ?
- )
- `;
- // Pattern variants for better matching
- const exactMatch = query;
- const startsWith = `${query}%`;
- const contains = `%${query}%`;
- const params: (string | number)[] = [
- exactMatch, // Exact match score
- startsWith, // Starts with score
- contains, // Contains score
- contains, // Qualified name score
- contains, // WHERE: name contains
- contains, // WHERE: qualified_name contains
- startsWith, // WHERE: name starts with
- ];
- if (kinds && kinds.length > 0) {
- sql += ` AND kind IN (${kinds.map(() => '?').join(',')})`;
- params.push(...kinds);
- }
- if (languages && languages.length > 0) {
- sql += ` AND language IN (${languages.map(() => '?').join(',')})`;
- params.push(...languages);
- }
- sql += ' ORDER BY score DESC, length(name) ASC LIMIT ? OFFSET ?';
- params.push(limit, offset);
- const rows = this.db.prepare(sql).all(...params) as (NodeRow & { score: number })[];
- return rows.map((row) => ({
- node: rowToNode(row),
- score: row.score,
- }));
- }
- /**
- * Find nodes by exact name match
- *
- * Used for hybrid search - looks up symbols by exact name or case-insensitive match.
- * Returns high-confidence matches for known symbol names extracted from query.
- *
- * @param names - Array of symbol names to look up
- * @param options - Search options (kinds, languages, limit)
- * @returns SearchResult array with exact matches scored at 1.0
- */
- findNodesByExactName(names: string[], options: SearchOptions = {}): SearchResult[] {
- if (names.length === 0) return [];
- const { kinds, languages, limit = 50 } = options;
- // Two-pass approach to handle common names (e.g., "run" has 40+ matches):
- // Pass 1: Find which files contain distinctive (rare) symbols from the query.
- // Pass 2: Query each name, boosting results that co-locate with distinctive symbols.
- // Pass 1: Find files containing each queried name, identify distinctive names
- const nameToFiles = new Map<string, Set<string>>();
- for (const name of names) {
- let sql = 'SELECT DISTINCT file_path FROM nodes WHERE name COLLATE NOCASE = ?';
- const params: (string | number)[] = [name];
- if (kinds && kinds.length > 0) {
- sql += ` AND kind IN (${kinds.map(() => '?').join(',')})`;
- params.push(...kinds);
- }
- sql += ' LIMIT 100';
- const rows = this.db.prepare(sql).all(...params) as { file_path: string }[];
- nameToFiles.set(name.toLowerCase(), new Set(rows.map(r => r.file_path)));
- }
- // Distinctive names are those with fewer than 10 file matches (e.g., "scrapeLoop" = 1 file)
- const distinctiveFiles = new Set<string>();
- for (const [, files] of nameToFiles) {
- if (files.size > 0 && files.size < 10) {
- for (const f of files) distinctiveFiles.add(f);
- }
- }
- // Pass 2: Query each name with per-name limit, scoring by co-location
- const perNameLimit = Math.max(8, Math.ceil(limit / names.length));
- const allResults: SearchResult[] = [];
- const seenIds = new Set<string>();
- for (const name of names) {
- let sql = `
- SELECT nodes.*, 1.0 as score
- FROM nodes
- WHERE name COLLATE NOCASE = ?
- `;
- const params: (string | number)[] = [name];
- if (kinds && kinds.length > 0) {
- sql += ` AND kind IN (${kinds.map(() => '?').join(',')})`;
- params.push(...kinds);
- }
- if (languages && languages.length > 0) {
- sql += ` AND language IN (${languages.map(() => '?').join(',')})`;
- params.push(...languages);
- }
- // Fetch enough to find co-located results among common names
- sql += ' LIMIT ?';
- params.push(Math.max(perNameLimit * 3, 50));
- const rows = this.db.prepare(sql).all(...params) as (NodeRow & { score: number })[];
- const nameResults: SearchResult[] = [];
- for (const row of rows) {
- const node = rowToNode(row);
- if (seenIds.has(node.id)) continue;
- // Boost results in files that also contain distinctive symbols
- const coLocationBoost = distinctiveFiles.has(node.filePath) ? 20 : 0;
- nameResults.push({ node, score: row.score + coLocationBoost });
- }
- // Sort by score (co-located first), take per-name limit
- nameResults.sort((a, b) => b.score - a.score);
- for (const r of nameResults.slice(0, perNameLimit)) {
- seenIds.add(r.node.id);
- allResults.push(r);
- }
- }
- // Sort all results by score so co-located results bubble up
- allResults.sort((a, b) => b.score - a.score);
- return allResults.slice(0, limit);
- }
- /**
- * Find nodes whose name contains a substring (LIKE-based).
- * Useful for CamelCase-part matching where FTS fails because
- * e.g. "TransportSearchAction" is one FTS token, not matchable by "Search"*.
- *
- * Results are ordered by name length (shorter = more likely to be the core type).
- */
- findNodesByNameSubstring(
- substring: string,
- options: SearchOptions & { excludePrefix?: boolean } = {}
- ): SearchResult[] {
- const { kinds, languages, limit = 30, excludePrefix } = options;
- let sql = `
- SELECT nodes.*, 1.0 as score
- FROM nodes
- WHERE name LIKE ?
- `;
- const params: (string | number)[] = [`%${substring}%`];
- // Exclude prefix matches (handled by FTS-based prefix search in Step 2b)
- if (excludePrefix) {
- sql += ` AND name NOT LIKE ?`;
- params.push(`${substring}%`);
- }
- if (kinds && kinds.length > 0) {
- sql += ` AND kind IN (${kinds.map(() => '?').join(',')})`;
- params.push(...kinds);
- }
- if (languages && languages.length > 0) {
- sql += ` AND language IN (${languages.map(() => '?').join(',')})`;
- params.push(...languages);
- }
- sql += ' ORDER BY length(name) ASC LIMIT ?';
- params.push(limit);
- const rows = this.db.prepare(sql).all(...params) as (NodeRow & { score: number })[];
- return rows.map((row) => ({
- node: rowToNode(row),
- score: row.score,
- }));
- }
- // ===========================================================================
- // Edge Operations
- // ===========================================================================
- /**
- * Insert a new edge
- */
- insertEdge(edge: Edge): void {
- if (!this.stmts.insertEdge) {
- this.stmts.insertEdge = this.db.prepare(`
- INSERT OR IGNORE INTO edges (source, target, kind, metadata, line, col, provenance)
- VALUES (@source, @target, @kind, @metadata, @line, @col, @provenance)
- `);
- }
- this.stmts.insertEdge.run({
- source: edge.source,
- target: edge.target,
- kind: edge.kind,
- metadata: edge.metadata ? JSON.stringify(edge.metadata) : null,
- line: edge.line ?? null,
- col: edge.column ?? null,
- provenance: edge.provenance ?? null,
- });
- }
- /**
- * Insert multiple edges in a transaction
- */
- insertEdges(edges: Edge[]): void {
- if (edges.length === 0) return;
- this.db.transaction(() => {
- const endpointIds = new Set<string>();
- for (const edge of edges) {
- endpointIds.add(edge.source);
- endpointIds.add(edge.target);
- }
- const existingNodeIds = this.getExistingNodeIds([...endpointIds]);
- for (const edge of edges) {
- if (!existingNodeIds.has(edge.source) || !existingNodeIds.has(edge.target)) {
- continue;
- }
- this.insertEdge(edge);
- }
- })();
- }
- /**
- * Delete all edges from a source node
- */
- deleteEdgesBySource(sourceId: string): void {
- if (!this.stmts.deleteEdgesBySource) {
- this.stmts.deleteEdgesBySource = this.db.prepare('DELETE FROM edges WHERE source = ?');
- }
- this.stmts.deleteEdgesBySource.run(sourceId);
- }
- /**
- * Get outgoing edges from a node
- */
- getOutgoingEdges(sourceId: string, kinds?: EdgeKind[], provenance?: string): Edge[] {
- if ((kinds && kinds.length > 0) || provenance) {
- let sql = 'SELECT * FROM edges WHERE source = ?';
- const params: (string | number)[] = [sourceId];
- if (kinds && kinds.length > 0) {
- sql += ` AND kind IN (${kinds.map(() => '?').join(',')})`;
- params.push(...kinds);
- }
- if (provenance) {
- sql += ' AND provenance = ?';
- params.push(provenance);
- }
- const rows = this.db.prepare(sql).all(...params) as EdgeRow[];
- return rows.map(rowToEdge);
- }
- if (!this.stmts.getEdgesBySource) {
- this.stmts.getEdgesBySource = this.db.prepare('SELECT * FROM edges WHERE source = ?');
- }
- const rows = this.stmts.getEdgesBySource.all(sourceId) as EdgeRow[];
- return rows.map(rowToEdge);
- }
- /**
- * Get incoming edges to a node
- */
- getIncomingEdges(targetId: string, kinds?: EdgeKind[]): Edge[] {
- if (kinds && kinds.length > 0) {
- const sql = `SELECT * FROM edges WHERE target = ? AND kind IN (${kinds.map(() => '?').join(',')})`;
- const rows = this.db.prepare(sql).all(targetId, ...kinds) as EdgeRow[];
- return rows.map(rowToEdge);
- }
- if (!this.stmts.getEdgesByTarget) {
- this.stmts.getEdgesByTarget = this.db.prepare('SELECT * FROM edges WHERE target = ?');
- }
- const rows = this.stmts.getEdgesByTarget.all(targetId) as EdgeRow[];
- return rows.map(rowToEdge);
- }
- /**
- * Find all edges where both source and target are in the given node set.
- * Useful for recovering inter-node connectivity after BFS.
- */
- findEdgesBetweenNodes(nodeIds: string[], kinds?: EdgeKind[]): Edge[] {
- if (nodeIds.length === 0) return [];
- const idsJson = JSON.stringify(nodeIds);
- let sql = `SELECT * FROM edges WHERE source IN (SELECT value FROM json_each(?)) AND target IN (SELECT value FROM json_each(?))`;
- const params: string[] = [idsJson, idsJson];
- if (kinds && kinds.length > 0) {
- sql += ` AND kind IN (${kinds.map(() => '?').join(',')})`;
- params.push(...kinds);
- }
- const rows = this.db.prepare(sql).all(...params) as EdgeRow[];
- return rows.map(rowToEdge);
- }
- /**
- * Distinct file paths that DEPEND ON `filePath`: every file containing a
- * symbol with a cross-file edge (any kind except `contains`) into a symbol
- * of this file. This is the file-level projection of the symbol dependency
- * graph and the basis for blast-radius / `affected` test selection.
- *
- * It deliberately does NOT restrict to `imports` edges. In this graph an
- * `imports` edge connects a file to its own local import declarations
- * (it is always same-file), so an imports-only lookup returns zero
- * cross-file dependents for every file. The real cross-file dependency
- * signal is the resolved call/reference graph — calls, references,
- * instantiates, extends, implements, overrides, type_of, returns,
- * decorates — exactly what {@link GraphTraverser.getImpactRadius} traverses.
- * `contains` is excluded: a parent containing a symbol does not *depend* on
- * it. One indexed query (idx_nodes_file_path + idx_edges_target_kind).
- */
- getDependentFilePaths(filePath: string): string[] {
- const sql = `SELECT DISTINCT src.file_path AS fp
- FROM edges e
- JOIN nodes tgt ON tgt.id = e.target
- JOIN nodes src ON src.id = e.source
- WHERE tgt.file_path = ?
- AND e.kind != 'contains'
- AND src.file_path != ?`;
- const rows = this.db.prepare(sql).all(filePath, filePath) as Array<{ fp: string }>;
- return rows.map((r) => r.fp);
- }
- /**
- * Distinct file paths that `filePath` DEPENDS ON — the inverse of
- * {@link getDependentFilePaths}: every file containing a symbol that a
- * symbol of this file has a cross-file edge into. Same edge-kind rules
- * (all kinds except `contains`); same reason imports-only is insufficient.
- */
- getDependencyFilePaths(filePath: string): string[] {
- const sql = `SELECT DISTINCT tgt.file_path AS fp
- FROM edges e
- JOIN nodes src ON src.id = e.source
- JOIN nodes tgt ON tgt.id = e.target
- WHERE src.file_path = ?
- AND e.kind != 'contains'
- AND tgt.file_path != ?`;
- const rows = this.db.prepare(sql).all(filePath, filePath) as Array<{ fp: string }>;
- return rows.map((r) => r.fp);
- }
- // ===========================================================================
- // File Operations
- // ===========================================================================
- /**
- * Insert or update a file record
- */
- upsertFile(file: FileRecord): void {
- if (!this.stmts.upsertFile) {
- this.stmts.upsertFile = this.db.prepare(`
- INSERT INTO files (path, content_hash, language, size, modified_at, indexed_at, node_count, errors)
- VALUES (@path, @contentHash, @language, @size, @modifiedAt, @indexedAt, @nodeCount, @errors)
- ON CONFLICT(path) DO UPDATE SET
- content_hash = @contentHash,
- language = @language,
- size = @size,
- modified_at = @modifiedAt,
- indexed_at = @indexedAt,
- node_count = @nodeCount,
- errors = @errors
- `);
- }
- this.stmts.upsertFile.run({
- path: file.path,
- contentHash: file.contentHash,
- language: file.language,
- size: file.size,
- modifiedAt: file.modifiedAt,
- indexedAt: file.indexedAt,
- nodeCount: file.nodeCount,
- errors: file.errors ? JSON.stringify(file.errors) : null,
- });
- }
- /**
- * Delete a file record and its nodes
- */
- deleteFile(filePath: string): void {
- this.db.transaction(() => {
- this.deleteNodesByFile(filePath);
- if (!this.stmts.deleteFile) {
- this.stmts.deleteFile = this.db.prepare('DELETE FROM files WHERE path = ?');
- }
- this.stmts.deleteFile.run(filePath);
- })();
- }
- /**
- * Get a file record by path
- */
- getFileByPath(filePath: string): FileRecord | null {
- if (!this.stmts.getFileByPath) {
- this.stmts.getFileByPath = this.db.prepare('SELECT * FROM files WHERE path = ?');
- }
- const row = this.stmts.getFileByPath.get(filePath) as FileRow | undefined;
- return row ? rowToFileRecord(row) : null;
- }
- /**
- * Get all tracked files
- */
- getAllFiles(): FileRecord[] {
- if (!this.stmts.getAllFiles) {
- this.stmts.getAllFiles = this.db.prepare('SELECT * FROM files ORDER BY path');
- }
- const rows = this.stmts.getAllFiles.all() as FileRow[];
- return rows.map(rowToFileRecord);
- }
- /**
- * Most recent index timestamp (ms since epoch) across all tracked files, or
- * null when nothing is indexed yet. One indexed aggregate, no per-row scan. (#329)
- */
- getLastIndexedAt(): number | null {
- const row = this.db
- .prepare('SELECT MAX(indexed_at) AS last FROM files')
- .get() as { last: number | null } | undefined;
- return row?.last ?? null;
- }
- /**
- * Get files that need re-indexing (hash changed)
- */
- getStaleFiles(currentHashes: Map<string, string>): FileRecord[] {
- const files = this.getAllFiles();
- return files.filter((f) => {
- const currentHash = currentHashes.get(f.path);
- return currentHash && currentHash !== f.contentHash;
- });
- }
- // ===========================================================================
- // Unresolved References
- // ===========================================================================
- /**
- * Insert an unresolved reference
- */
- insertUnresolvedRef(ref: UnresolvedReference): void {
- if (!this.stmts.insertUnresolved) {
- this.stmts.insertUnresolved = this.db.prepare(`
- INSERT INTO unresolved_refs (from_node_id, reference_name, reference_kind, line, col, candidates, file_path, language)
- VALUES (@fromNodeId, @referenceName, @referenceKind, @line, @col, @candidates, @filePath, @language)
- `);
- }
- this.stmts.insertUnresolved.run({
- fromNodeId: ref.fromNodeId,
- referenceName: ref.referenceName,
- referenceKind: ref.referenceKind,
- line: ref.line,
- col: ref.column,
- candidates: ref.candidates ? JSON.stringify(ref.candidates) : null,
- filePath: ref.filePath ?? '',
- language: ref.language ?? 'unknown',
- });
- }
- /**
- * Insert multiple unresolved references in a transaction
- */
- insertUnresolvedRefsBatch(refs: UnresolvedReference[]): void {
- if (refs.length === 0) return;
- const insert = this.db.transaction(() => {
- for (const ref of refs) {
- this.insertUnresolvedRef(ref);
- }
- });
- insert();
- }
- /**
- * Delete unresolved references from a node
- */
- deleteUnresolvedByNode(nodeId: string): void {
- if (!this.stmts.deleteUnresolvedByNode) {
- this.stmts.deleteUnresolvedByNode = this.db.prepare(
- 'DELETE FROM unresolved_refs WHERE from_node_id = ?'
- );
- }
- this.stmts.deleteUnresolvedByNode.run(nodeId);
- }
- /**
- * Get unresolved references by name (for resolution)
- */
- getUnresolvedByName(name: string): UnresolvedReference[] {
- if (!this.stmts.getUnresolvedByName) {
- this.stmts.getUnresolvedByName = this.db.prepare(
- 'SELECT * FROM unresolved_refs WHERE reference_name = ?'
- );
- }
- const rows = this.stmts.getUnresolvedByName.all(name) as UnresolvedRefRow[];
- return rows.map((row) => ({
- fromNodeId: row.from_node_id,
- referenceName: row.reference_name,
- referenceKind: row.reference_kind as EdgeKind,
- line: row.line,
- column: row.col,
- candidates: row.candidates ? safeJsonParse(row.candidates, undefined) : undefined,
- filePath: row.file_path,
- language: row.language as Language,
- }));
- }
- /**
- * Get all unresolved references
- */
- getUnresolvedReferences(): UnresolvedReference[] {
- const rows = this.db.prepare('SELECT * FROM unresolved_refs').all() as UnresolvedRefRow[];
- return rows.map((row) => ({
- fromNodeId: row.from_node_id,
- referenceName: row.reference_name,
- referenceKind: row.reference_kind as EdgeKind,
- line: row.line,
- column: row.col,
- candidates: row.candidates ? safeJsonParse(row.candidates, undefined) : undefined,
- filePath: row.file_path,
- language: row.language as Language,
- }));
- }
- /**
- * Get the count of unresolved references without loading them into memory
- */
- getUnresolvedReferencesCount(): number {
- if (!this.stmts.getUnresolvedCount) {
- this.stmts.getUnresolvedCount = this.db.prepare(
- 'SELECT COUNT(*) as count FROM unresolved_refs'
- );
- }
- const row = this.stmts.getUnresolvedCount.get() as { count: number };
- return row.count;
- }
- /**
- * Get a batch of unresolved references using LIMIT/OFFSET pagination.
- * Used to process references in bounded memory chunks.
- */
- getUnresolvedReferencesBatch(offset: number, limit: number): UnresolvedReference[] {
- if (!this.stmts.getUnresolvedBatch) {
- this.stmts.getUnresolvedBatch = this.db.prepare(
- 'SELECT * FROM unresolved_refs LIMIT ? OFFSET ?'
- );
- }
- const rows = this.stmts.getUnresolvedBatch.all(limit, offset) as UnresolvedRefRow[];
- return rows.map((row) => ({
- fromNodeId: row.from_node_id,
- referenceName: row.reference_name,
- referenceKind: row.reference_kind as EdgeKind,
- line: row.line,
- column: row.col,
- candidates: row.candidates ? safeJsonParse(row.candidates, undefined) : undefined,
- filePath: row.file_path,
- language: row.language as Language,
- }));
- }
- /**
- * Get all tracked file paths (lightweight — no full FileRecord objects)
- */
- getAllFilePaths(): string[] {
- if (!this.stmts.getAllFilePaths) {
- this.stmts.getAllFilePaths = this.db.prepare('SELECT path FROM files ORDER BY path');
- }
- const rows = this.stmts.getAllFilePaths.all() as Array<{ path: string }>;
- return rows.map((r) => r.path);
- }
- /**
- * Get all distinct node names (lightweight — just name strings for pre-filtering)
- */
- getAllNodeNames(): string[] {
- if (!this.stmts.getAllNodeNames) {
- this.stmts.getAllNodeNames = this.db.prepare('SELECT DISTINCT name FROM nodes');
- }
- const rows = this.stmts.getAllNodeNames.all() as Array<{ name: string }>;
- return rows.map((r) => r.name);
- }
- /**
- * Get unresolved references scoped to specific file paths.
- * Uses the idx_unresolved_file_path index for efficient lookup.
- */
- getUnresolvedReferencesByFiles(filePaths: string[]): UnresolvedReference[] {
- if (filePaths.length === 0) return [];
- // Chunk under SQLite's parameter limit: the first sync of a very large repo
- // passes every changed file here, which an unbounded `IN (...)` would bind
- // as one parameter each — exceeding MAX_VARIABLE_NUMBER and aborting with
- // "too many SQL variables". (#540)
- const rows: UnresolvedRefRow[] = [];
- for (let i = 0; i < filePaths.length; i += SQLITE_PARAM_CHUNK_SIZE) {
- const chunk = filePaths.slice(i, i + SQLITE_PARAM_CHUNK_SIZE);
- const placeholders = chunk.map(() => '?').join(',');
- const chunkRows = this.db
- .prepare(`SELECT * FROM unresolved_refs WHERE file_path IN (${placeholders})`)
- .all(...chunk) as UnresolvedRefRow[];
- rows.push(...chunkRows);
- }
- return rows.map((row) => ({
- fromNodeId: row.from_node_id,
- referenceName: row.reference_name,
- referenceKind: row.reference_kind as EdgeKind,
- line: row.line,
- column: row.col,
- candidates: row.candidates ? safeJsonParse(row.candidates, undefined) : undefined,
- filePath: row.file_path,
- language: row.language as Language,
- }));
- }
- /**
- * Delete all unresolved references (after resolution)
- */
- clearUnresolvedReferences(): void {
- this.db.exec('DELETE FROM unresolved_refs');
- }
- /**
- * Delete resolved references by their IDs
- */
- deleteResolvedReferences(fromNodeIds: string[]): void {
- if (fromNodeIds.length === 0) return;
- const placeholders = fromNodeIds.map(() => '?').join(',');
- this.db.prepare(`DELETE FROM unresolved_refs WHERE from_node_id IN (${placeholders})`).run(...fromNodeIds);
- }
- /**
- * Delete specific resolved references by (fromNodeId, referenceName, referenceKind) tuples.
- * More precise than deleteResolvedReferences — only removes refs that were actually resolved.
- */
- deleteSpecificResolvedReferences(refs: Array<{ fromNodeId: string; referenceName: string; referenceKind: string }>): void {
- if (refs.length === 0) return;
- const stmt = this.db.prepare(
- 'DELETE FROM unresolved_refs WHERE from_node_id = ? AND reference_name = ? AND reference_kind = ?'
- );
- const deleteMany = this.db.transaction((items: typeof refs) => {
- for (const ref of items) {
- stmt.run(ref.fromNodeId, ref.referenceName, ref.referenceKind);
- }
- });
- deleteMany(refs);
- }
- // ===========================================================================
- // Statistics
- // ===========================================================================
- /**
- * Lightweight (nodes, edges) count snapshot. Used around an index/sync
- * run to compute true additions across extraction + resolution +
- * synthesis — the per-phase counter in the orchestrator only sees
- * extraction's contribution, which is why the CLI summary under-reported
- * the edge count (resolution + synthesizer edges were invisible).
- */
- getNodeAndEdgeCount(): { nodes: number; edges: number } {
- return this.db
- .prepare('SELECT (SELECT COUNT(*) FROM nodes) AS nodes, (SELECT COUNT(*) FROM edges) AS edges')
- .get() as { nodes: number; edges: number };
- }
- /**
- * Get graph statistics
- */
- getStats(): GraphStats {
- // Single query for all three aggregate counts
- const counts = this.db.prepare(`
- SELECT
- (SELECT COUNT(*) FROM nodes) AS node_count,
- (SELECT COUNT(*) FROM edges) AS edge_count,
- (SELECT COUNT(*) FROM files) AS file_count
- `).get() as { node_count: number; edge_count: number; file_count: number };
- const nodesByKind = {} as Record<NodeKind, number>;
- const nodeKindRows = this.db
- .prepare('SELECT kind, COUNT(*) as count FROM nodes GROUP BY kind')
- .all() as Array<{ kind: string; count: number }>;
- for (const row of nodeKindRows) {
- nodesByKind[row.kind as NodeKind] = row.count;
- }
- const edgesByKind = {} as Record<EdgeKind, number>;
- const edgeKindRows = this.db
- .prepare('SELECT kind, COUNT(*) as count FROM edges GROUP BY kind')
- .all() as Array<{ kind: string; count: number }>;
- for (const row of edgeKindRows) {
- edgesByKind[row.kind as EdgeKind] = row.count;
- }
- const filesByLanguage = {} as Record<Language, number>;
- const languageRows = this.db
- .prepare('SELECT language, COUNT(*) as count FROM files GROUP BY language')
- .all() as Array<{ language: string; count: number }>;
- for (const row of languageRows) {
- filesByLanguage[row.language as Language] = row.count;
- }
- return {
- nodeCount: counts.node_count,
- edgeCount: counts.edge_count,
- fileCount: counts.file_count,
- nodesByKind,
- edgesByKind,
- filesByLanguage,
- dbSizeBytes: 0, // Set by caller using DatabaseConnection.getSize()
- lastUpdated: Date.now(),
- };
- }
- // ===========================================================================
- // Project Metadata
- // ===========================================================================
- /**
- * Get a metadata value by key
- */
- getMetadata(key: string): string | null {
- const row = this.db.prepare('SELECT value FROM project_metadata WHERE key = ?').get(key) as { value: string } | undefined;
- return row?.value ?? null;
- }
- /**
- * Set a metadata key-value pair (upsert)
- */
- setMetadata(key: string, value: string): void {
- this.db.prepare(
- 'INSERT INTO project_metadata (key, value, updated_at) VALUES (?, ?, ?) ON CONFLICT(key) DO UPDATE SET value = excluded.value, updated_at = excluded.updated_at'
- ).run(key, value, Date.now());
- }
- /**
- * Get all metadata as a key-value record
- */
- getAllMetadata(): Record<string, string> {
- const rows = this.db.prepare('SELECT key, value FROM project_metadata').all() as { key: string; value: string }[];
- const result: Record<string, string> = {};
- for (const row of rows) {
- result[row.key] = row.value;
- }
- return result;
- }
- /**
- * Clear all data from the database
- */
- clear(): void {
- this.nodeCache.clear();
- this.db.transaction(() => {
- this.db.exec('DELETE FROM unresolved_refs');
- this.db.exec('DELETE FROM edges');
- this.db.exec('DELETE FROM nodes');
- this.db.exec('DELETE FROM files');
- })();
- }
- }
|