search.ts 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471
  1. /**
  2. * Vector Search
  3. *
  4. * Provides vector similarity search using sqlite-vss extension.
  5. * Falls back to brute-force cosine similarity if sqlite-vss is not available.
  6. */
  7. import Database from 'better-sqlite3';
  8. import { Node } from '../types';
  9. import { TextEmbedder, EMBEDDING_DIMENSION } from './embedder';
  10. /**
  11. * Options for vector search
  12. */
  13. export interface VectorSearchOptions {
  14. /** Maximum number of results to return */
  15. limit?: number;
  16. /** Minimum similarity score (0-1) */
  17. minScore?: number;
  18. /** Node kinds to filter results */
  19. nodeKinds?: Node['kind'][];
  20. }
  21. /**
  22. * Vector Search Manager
  23. *
  24. * Handles vector storage and similarity search for semantic code search.
  25. */
  26. export class VectorSearchManager {
  27. private db: Database.Database;
  28. private vssEnabled = false;
  29. private embeddingDimension: number;
  30. constructor(db: Database.Database, dimension: number = EMBEDDING_DIMENSION) {
  31. this.db = db;
  32. this.embeddingDimension = dimension;
  33. }
  34. /**
  35. * Initialize vector search
  36. *
  37. * Attempts to load sqlite-vss extension. Falls back to brute-force
  38. * search if the extension is not available.
  39. */
  40. async initialize(): Promise<void> {
  41. try {
  42. // Try to load sqlite-vss extension
  43. await this.loadVssExtension();
  44. this.vssEnabled = true;
  45. console.log('sqlite-vss extension loaded successfully');
  46. // Create the VSS virtual table
  47. this.createVssTable();
  48. } catch (error) {
  49. // Fall back to brute-force search
  50. console.warn(
  51. 'sqlite-vss extension not available, falling back to brute-force search:',
  52. error instanceof Error ? error.message : String(error)
  53. );
  54. this.vssEnabled = false;
  55. }
  56. // Ensure the vectors table exists (for both VSS and fallback modes)
  57. this.ensureVectorsTable();
  58. }
  59. /**
  60. * Load the sqlite-vss extension
  61. */
  62. private async loadVssExtension(): Promise<void> {
  63. try {
  64. // The sqlite-vss npm package provides functions to load extensions
  65. const vss = await import('sqlite-vss');
  66. // Use the load function which loads both vector0 and vss0
  67. if (typeof vss.load === 'function') {
  68. vss.load(this.db);
  69. } else if (typeof vss.default?.load === 'function') {
  70. vss.default.load(this.db);
  71. } else {
  72. throw new Error('sqlite-vss load function not found');
  73. }
  74. } catch (error) {
  75. throw new Error(`Failed to load sqlite-vss: ${error instanceof Error ? error.message : String(error)}`);
  76. }
  77. }
  78. /**
  79. * Create the VSS virtual table for vector search
  80. */
  81. private createVssTable(): void {
  82. // Check if the table already exists
  83. const tableExists = this.db
  84. .prepare("SELECT name FROM sqlite_master WHERE type='table' AND name='vss_vectors'")
  85. .get();
  86. if (!tableExists) {
  87. // Create VSS virtual table
  88. // vss0 is the vector search extension
  89. this.db.exec(`
  90. CREATE VIRTUAL TABLE IF NOT EXISTS vss_vectors USING vss0(
  91. embedding(${this.embeddingDimension})
  92. );
  93. `);
  94. // Create mapping table to link VSS rowids to node IDs
  95. this.db.exec(`
  96. CREATE TABLE IF NOT EXISTS vss_map (
  97. rowid INTEGER PRIMARY KEY,
  98. node_id TEXT NOT NULL UNIQUE
  99. );
  100. `);
  101. // Create index on node_id
  102. this.db.exec(`
  103. CREATE INDEX IF NOT EXISTS idx_vss_map_node ON vss_map(node_id);
  104. `);
  105. }
  106. }
  107. /**
  108. * Ensure the basic vectors table exists (for fallback mode)
  109. */
  110. private ensureVectorsTable(): void {
  111. this.db.exec(`
  112. CREATE TABLE IF NOT EXISTS vectors (
  113. node_id TEXT PRIMARY KEY,
  114. embedding BLOB NOT NULL,
  115. model TEXT NOT NULL,
  116. created_at INTEGER NOT NULL
  117. );
  118. `);
  119. }
  120. /**
  121. * Check if VSS extension is enabled
  122. */
  123. isVssEnabled(): boolean {
  124. return this.vssEnabled;
  125. }
  126. /**
  127. * Store a vector embedding for a node
  128. *
  129. * @param nodeId - ID of the node
  130. * @param embedding - Vector embedding
  131. * @param model - Model used to generate embedding
  132. */
  133. storeVector(nodeId: string, embedding: Float32Array, model: string): void {
  134. const now = Date.now();
  135. // Store in the vectors table (always, for persistence)
  136. const blob = Buffer.from(embedding.buffer);
  137. this.db
  138. .prepare(
  139. `
  140. INSERT OR REPLACE INTO vectors (node_id, embedding, model, created_at)
  141. VALUES (?, ?, ?, ?)
  142. `
  143. )
  144. .run(nodeId, blob, model, now);
  145. // Also store in VSS table if enabled
  146. if (this.vssEnabled) {
  147. this.storeInVss(nodeId, embedding);
  148. }
  149. }
  150. /**
  151. * Store vector in VSS virtual table
  152. */
  153. private storeInVss(nodeId: string, embedding: Float32Array): void {
  154. try {
  155. // Check if already exists
  156. const existing = this.db
  157. .prepare('SELECT rowid FROM vss_map WHERE node_id = ?')
  158. .get(nodeId) as { rowid: number } | undefined;
  159. if (existing) {
  160. // Update existing vector
  161. const vectorJson = JSON.stringify(Array.from(embedding));
  162. this.db
  163. .prepare('UPDATE vss_vectors SET embedding = ? WHERE rowid = ?')
  164. .run(vectorJson, existing.rowid);
  165. } else {
  166. // Insert new vector - get max rowid and increment
  167. const maxRow = this.db
  168. .prepare('SELECT MAX(rowid) as max FROM vss_map')
  169. .get() as { max: number | null } | undefined;
  170. const newRowid = (maxRow?.max ?? 0) + 1;
  171. const vectorJson = JSON.stringify(Array.from(embedding));
  172. this.db
  173. .prepare('INSERT INTO vss_vectors (rowid, embedding) VALUES (?, ?)')
  174. .run(newRowid, vectorJson);
  175. // Map the rowid to node_id
  176. this.db
  177. .prepare('INSERT INTO vss_map (rowid, node_id) VALUES (?, ?)')
  178. .run(newRowid, nodeId);
  179. }
  180. } catch (error) {
  181. // VSS operations can fail for various reasons (dimension mismatch, etc.)
  182. // Fall back to brute-force search silently
  183. console.warn(
  184. 'VSS storage failed, using brute-force search:',
  185. error instanceof Error ? error.message : String(error)
  186. );
  187. }
  188. }
  189. /**
  190. * Store multiple vectors in a batch
  191. *
  192. * @param entries - Array of node IDs and embeddings
  193. * @param model - Model used to generate embeddings
  194. */
  195. storeVectorBatch(
  196. entries: Array<{ nodeId: string; embedding: Float32Array }>,
  197. model: string
  198. ): void {
  199. const now = Date.now();
  200. // Use a transaction for better performance
  201. this.db.transaction(() => {
  202. for (const entry of entries) {
  203. const blob = Buffer.from(entry.embedding.buffer);
  204. this.db
  205. .prepare(
  206. `
  207. INSERT OR REPLACE INTO vectors (node_id, embedding, model, created_at)
  208. VALUES (?, ?, ?, ?)
  209. `
  210. )
  211. .run(entry.nodeId, blob, model, now);
  212. if (this.vssEnabled) {
  213. this.storeInVss(entry.nodeId, entry.embedding);
  214. }
  215. }
  216. })();
  217. }
  218. /**
  219. * Get vector for a node
  220. *
  221. * @param nodeId - ID of the node
  222. * @returns Embedding or null if not found
  223. */
  224. getVector(nodeId: string): Float32Array | null {
  225. const row = this.db
  226. .prepare('SELECT embedding FROM vectors WHERE node_id = ?')
  227. .get(nodeId) as { embedding: Buffer } | undefined;
  228. if (!row) {
  229. return null;
  230. }
  231. return new Float32Array(row.embedding.buffer.slice(
  232. row.embedding.byteOffset,
  233. row.embedding.byteOffset + row.embedding.byteLength
  234. ));
  235. }
  236. /**
  237. * Delete vector for a node
  238. *
  239. * @param nodeId - ID of the node
  240. */
  241. deleteVector(nodeId: string): void {
  242. this.db.prepare('DELETE FROM vectors WHERE node_id = ?').run(nodeId);
  243. if (this.vssEnabled) {
  244. // Get the rowid before deleting
  245. const mapping = this.db
  246. .prepare('SELECT rowid FROM vss_map WHERE node_id = ?')
  247. .get(nodeId) as { rowid: number } | undefined;
  248. if (mapping) {
  249. this.db.prepare('DELETE FROM vss_vectors WHERE rowid = ?').run(mapping.rowid);
  250. this.db.prepare('DELETE FROM vss_map WHERE node_id = ?').run(nodeId);
  251. }
  252. }
  253. }
  254. /**
  255. * Search for similar vectors
  256. *
  257. * @param queryEmbedding - Query vector to search for
  258. * @param options - Search options
  259. * @returns Array of node IDs with similarity scores
  260. */
  261. search(
  262. queryEmbedding: Float32Array,
  263. options: VectorSearchOptions = {}
  264. ): Array<{ nodeId: string; score: number }> {
  265. const { limit = 10, minScore = 0 } = options;
  266. if (this.vssEnabled) {
  267. return this.searchWithVss(queryEmbedding, limit, minScore);
  268. } else {
  269. return this.searchBruteForce(queryEmbedding, limit, minScore);
  270. }
  271. }
  272. /**
  273. * Search using sqlite-vss KNN search
  274. */
  275. private searchWithVss(
  276. queryEmbedding: Float32Array,
  277. limit: number,
  278. minScore: number
  279. ): Array<{ nodeId: string; score: number }> {
  280. try {
  281. const vectorJson = JSON.stringify(Array.from(queryEmbedding));
  282. // Sanitize limit to prevent SQL injection (ensure it's a positive integer)
  283. const safeLimit = Math.max(1, Math.floor(limit));
  284. // Use VSS KNN search
  285. // The distance is L2 (euclidean), we need to convert to similarity score
  286. // Note: sqlite-vss requires LIMIT to be a literal, not a parameter
  287. const rows = this.db
  288. .prepare(
  289. `
  290. SELECT m.node_id, v.distance
  291. FROM (
  292. SELECT rowid, distance
  293. FROM vss_vectors
  294. WHERE vss_search(embedding, ?)
  295. LIMIT ${safeLimit}
  296. ) v
  297. JOIN vss_map m ON m.rowid = v.rowid
  298. `
  299. )
  300. .all(vectorJson) as Array<{ node_id: string; distance: number }>;
  301. // Convert L2 distance to similarity score (1 / (1 + distance))
  302. return rows
  303. .map((row) => ({
  304. nodeId: row.node_id,
  305. score: 1 / (1 + row.distance),
  306. }))
  307. .filter((r) => r.score >= minScore);
  308. } catch (error) {
  309. // VSS search failed, fall back to brute force
  310. console.warn(
  311. 'VSS search failed, using brute-force:',
  312. error instanceof Error ? error.message : String(error)
  313. );
  314. return this.searchBruteForce(queryEmbedding, limit, minScore);
  315. }
  316. }
  317. /**
  318. * Brute-force search using cosine similarity
  319. */
  320. private searchBruteForce(
  321. queryEmbedding: Float32Array,
  322. limit: number,
  323. minScore: number
  324. ): Array<{ nodeId: string; score: number }> {
  325. // Get all vectors
  326. const rows = this.db
  327. .prepare('SELECT node_id, embedding FROM vectors')
  328. .all() as Array<{ node_id: string; embedding: Buffer }>;
  329. // Calculate cosine similarity for each
  330. const results: Array<{ nodeId: string; score: number }> = [];
  331. for (const row of rows) {
  332. const embedding = new Float32Array(row.embedding.buffer.slice(
  333. row.embedding.byteOffset,
  334. row.embedding.byteOffset + row.embedding.byteLength
  335. ));
  336. const score = TextEmbedder.cosineSimilarity(queryEmbedding, embedding);
  337. if (score >= minScore) {
  338. results.push({ nodeId: row.node_id, score });
  339. }
  340. }
  341. // Sort by score descending and limit
  342. results.sort((a, b) => b.score - a.score);
  343. return results.slice(0, limit);
  344. }
  345. /**
  346. * Get count of stored vectors
  347. */
  348. getVectorCount(): number {
  349. const result = this.db
  350. .prepare('SELECT COUNT(*) as count FROM vectors')
  351. .get() as { count: number };
  352. return result.count;
  353. }
  354. /**
  355. * Check if a node has a vector
  356. */
  357. hasVector(nodeId: string): boolean {
  358. const result = this.db
  359. .prepare('SELECT 1 FROM vectors WHERE node_id = ? LIMIT 1')
  360. .get(nodeId);
  361. return !!result;
  362. }
  363. /**
  364. * Get all node IDs that have vectors
  365. */
  366. getIndexedNodeIds(): string[] {
  367. const rows = this.db
  368. .prepare('SELECT node_id FROM vectors')
  369. .all() as Array<{ node_id: string }>;
  370. return rows.map((r) => r.node_id);
  371. }
  372. /**
  373. * Clear all vectors
  374. */
  375. clear(): void {
  376. this.db.prepare('DELETE FROM vectors').run();
  377. if (this.vssEnabled) {
  378. this.db.prepare('DELETE FROM vss_vectors').run();
  379. this.db.prepare('DELETE FROM vss_map').run();
  380. }
  381. }
  382. /**
  383. * Rebuild VSS index from vectors table
  384. *
  385. * Useful after bulk operations or if VSS index gets out of sync.
  386. */
  387. rebuildVssIndex(): void {
  388. if (!this.vssEnabled) {
  389. return;
  390. }
  391. // Clear VSS tables
  392. this.db.prepare('DELETE FROM vss_vectors').run();
  393. this.db.prepare('DELETE FROM vss_map').run();
  394. // Reload from vectors table
  395. const rows = this.db
  396. .prepare('SELECT node_id, embedding FROM vectors')
  397. .all() as Array<{ node_id: string; embedding: Buffer }>;
  398. this.db.transaction(() => {
  399. for (const row of rows) {
  400. const embedding = new Float32Array(row.embedding.buffer.slice(
  401. row.embedding.byteOffset,
  402. row.embedding.byteOffset + row.embedding.byteLength
  403. ));
  404. this.storeInVss(row.node_id, embedding);
  405. }
  406. })();
  407. }
  408. }
  409. /**
  410. * Create a vector search manager
  411. */
  412. export function createVectorSearch(
  413. db: Database.Database,
  414. dimension?: number
  415. ): VectorSearchManager {
  416. return new VectorSearchManager(db, dimension);
  417. }