search.ts 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472
  1. /**
  2. * Vector Search
  3. *
  4. * Provides vector similarity search using sqlite-vss extension.
  5. * Falls back to brute-force cosine similarity if sqlite-vss is not available.
  6. */
  7. import { SqliteDatabase } from '../db/sqlite-adapter';
  8. import { Node } from '../types';
  9. import { TextEmbedder, EMBEDDING_DIMENSION } from './embedder';
  10. /**
  11. * Options for vector search
  12. */
  13. export interface VectorSearchOptions {
  14. /** Maximum number of results to return */
  15. limit?: number;
  16. /** Minimum similarity score (0-1) */
  17. minScore?: number;
  18. /** Node kinds to filter results */
  19. nodeKinds?: Node['kind'][];
  20. }
  21. /**
  22. * Vector Search Manager
  23. *
  24. * Handles vector storage and similarity search for semantic code search.
  25. */
  26. export class VectorSearchManager {
  27. private db: SqliteDatabase;
  28. private vssEnabled = false;
  29. private embeddingDimension: number;
  30. constructor(db: SqliteDatabase, dimension: number = EMBEDDING_DIMENSION) {
  31. this.db = db;
  32. this.embeddingDimension = dimension;
  33. }
  34. /**
  35. * Initialize vector search
  36. *
  37. * Attempts to load sqlite-vss extension. Falls back to brute-force
  38. * search if the extension is not available.
  39. */
  40. async initialize(): Promise<void> {
  41. try {
  42. // Try to load sqlite-vss extension
  43. await this.loadVssExtension();
  44. this.vssEnabled = true;
  45. console.log('sqlite-vss extension loaded successfully');
  46. // Create the VSS virtual table
  47. this.createVssTable();
  48. } catch (error) {
  49. // Fall back to brute-force search
  50. console.warn(
  51. 'sqlite-vss extension not available, falling back to brute-force search:',
  52. error instanceof Error ? error.message : String(error)
  53. );
  54. this.vssEnabled = false;
  55. }
  56. // Ensure the vectors table exists (for both VSS and fallback modes)
  57. this.ensureVectorsTable();
  58. }
  59. /**
  60. * Load the sqlite-vss extension
  61. */
  62. private async loadVssExtension(): Promise<void> {
  63. try {
  64. // The sqlite-vss npm package provides functions to load extensions
  65. const vss = await import('sqlite-vss');
  66. // Use the load function which loads both vector0 and vss0
  67. // VSS extension expects the raw better-sqlite3 Database instance
  68. if (typeof vss.load === 'function') {
  69. vss.load(this.db as any);
  70. } else if (typeof vss.default?.load === 'function') {
  71. vss.default.load(this.db as any);
  72. } else {
  73. throw new Error('sqlite-vss load function not found');
  74. }
  75. } catch (error) {
  76. throw new Error(`Failed to load sqlite-vss: ${error instanceof Error ? error.message : String(error)}`);
  77. }
  78. }
  79. /**
  80. * Create the VSS virtual table for vector search
  81. */
  82. private createVssTable(): void {
  83. // Check if the table already exists
  84. const tableExists = this.db
  85. .prepare("SELECT name FROM sqlite_master WHERE type='table' AND name='vss_vectors'")
  86. .get();
  87. if (!tableExists) {
  88. // Create VSS virtual table
  89. // vss0 is the vector search extension
  90. this.db.exec(`
  91. CREATE VIRTUAL TABLE IF NOT EXISTS vss_vectors USING vss0(
  92. embedding(${this.embeddingDimension})
  93. );
  94. `);
  95. // Create mapping table to link VSS rowids to node IDs
  96. this.db.exec(`
  97. CREATE TABLE IF NOT EXISTS vss_map (
  98. rowid INTEGER PRIMARY KEY,
  99. node_id TEXT NOT NULL UNIQUE
  100. );
  101. `);
  102. // Create index on node_id
  103. this.db.exec(`
  104. CREATE INDEX IF NOT EXISTS idx_vss_map_node ON vss_map(node_id);
  105. `);
  106. }
  107. }
  108. /**
  109. * Ensure the basic vectors table exists (for fallback mode)
  110. */
  111. private ensureVectorsTable(): void {
  112. this.db.exec(`
  113. CREATE TABLE IF NOT EXISTS vectors (
  114. node_id TEXT PRIMARY KEY,
  115. embedding BLOB NOT NULL,
  116. model TEXT NOT NULL,
  117. created_at INTEGER NOT NULL
  118. );
  119. `);
  120. }
  121. /**
  122. * Check if VSS extension is enabled
  123. */
  124. isVssEnabled(): boolean {
  125. return this.vssEnabled;
  126. }
  127. /**
  128. * Store a vector embedding for a node
  129. *
  130. * @param nodeId - ID of the node
  131. * @param embedding - Vector embedding
  132. * @param model - Model used to generate embedding
  133. */
  134. storeVector(nodeId: string, embedding: Float32Array, model: string): void {
  135. const now = Date.now();
  136. // Store in the vectors table (always, for persistence)
  137. const blob = Buffer.from(embedding.buffer);
  138. this.db
  139. .prepare(
  140. `
  141. INSERT OR REPLACE INTO vectors (node_id, embedding, model, created_at)
  142. VALUES (?, ?, ?, ?)
  143. `
  144. )
  145. .run(nodeId, blob, model, now);
  146. // Also store in VSS table if enabled
  147. if (this.vssEnabled) {
  148. this.storeInVss(nodeId, embedding);
  149. }
  150. }
  151. /**
  152. * Store vector in VSS virtual table
  153. */
  154. private storeInVss(nodeId: string, embedding: Float32Array): void {
  155. try {
  156. // Check if already exists
  157. const existing = this.db
  158. .prepare('SELECT rowid FROM vss_map WHERE node_id = ?')
  159. .get(nodeId) as { rowid: number } | undefined;
  160. if (existing) {
  161. // Update existing vector
  162. const vectorJson = JSON.stringify(Array.from(embedding));
  163. this.db
  164. .prepare('UPDATE vss_vectors SET embedding = ? WHERE rowid = ?')
  165. .run(vectorJson, existing.rowid);
  166. } else {
  167. // Insert new vector - get max rowid and increment
  168. const maxRow = this.db
  169. .prepare('SELECT MAX(rowid) as max FROM vss_map')
  170. .get() as { max: number | null } | undefined;
  171. const newRowid = (maxRow?.max ?? 0) + 1;
  172. const vectorJson = JSON.stringify(Array.from(embedding));
  173. this.db
  174. .prepare('INSERT INTO vss_vectors (rowid, embedding) VALUES (?, ?)')
  175. .run(newRowid, vectorJson);
  176. // Map the rowid to node_id
  177. this.db
  178. .prepare('INSERT INTO vss_map (rowid, node_id) VALUES (?, ?)')
  179. .run(newRowid, nodeId);
  180. }
  181. } catch (error) {
  182. // VSS operations can fail for various reasons (dimension mismatch, etc.)
  183. // Fall back to brute-force search silently
  184. console.warn(
  185. 'VSS storage failed, using brute-force search:',
  186. error instanceof Error ? error.message : String(error)
  187. );
  188. }
  189. }
  190. /**
  191. * Store multiple vectors in a batch
  192. *
  193. * @param entries - Array of node IDs and embeddings
  194. * @param model - Model used to generate embeddings
  195. */
  196. storeVectorBatch(
  197. entries: Array<{ nodeId: string; embedding: Float32Array }>,
  198. model: string
  199. ): void {
  200. const now = Date.now();
  201. // Use a transaction for better performance
  202. this.db.transaction(() => {
  203. for (const entry of entries) {
  204. const blob = Buffer.from(entry.embedding.buffer);
  205. this.db
  206. .prepare(
  207. `
  208. INSERT OR REPLACE INTO vectors (node_id, embedding, model, created_at)
  209. VALUES (?, ?, ?, ?)
  210. `
  211. )
  212. .run(entry.nodeId, blob, model, now);
  213. if (this.vssEnabled) {
  214. this.storeInVss(entry.nodeId, entry.embedding);
  215. }
  216. }
  217. })();
  218. }
  219. /**
  220. * Get vector for a node
  221. *
  222. * @param nodeId - ID of the node
  223. * @returns Embedding or null if not found
  224. */
  225. getVector(nodeId: string): Float32Array | null {
  226. const row = this.db
  227. .prepare('SELECT embedding FROM vectors WHERE node_id = ?')
  228. .get(nodeId) as { embedding: Buffer } | undefined;
  229. if (!row) {
  230. return null;
  231. }
  232. return new Float32Array(row.embedding.buffer.slice(
  233. row.embedding.byteOffset,
  234. row.embedding.byteOffset + row.embedding.byteLength
  235. ));
  236. }
  237. /**
  238. * Delete vector for a node
  239. *
  240. * @param nodeId - ID of the node
  241. */
  242. deleteVector(nodeId: string): void {
  243. this.db.prepare('DELETE FROM vectors WHERE node_id = ?').run(nodeId);
  244. if (this.vssEnabled) {
  245. // Get the rowid before deleting
  246. const mapping = this.db
  247. .prepare('SELECT rowid FROM vss_map WHERE node_id = ?')
  248. .get(nodeId) as { rowid: number } | undefined;
  249. if (mapping) {
  250. this.db.prepare('DELETE FROM vss_vectors WHERE rowid = ?').run(mapping.rowid);
  251. this.db.prepare('DELETE FROM vss_map WHERE node_id = ?').run(nodeId);
  252. }
  253. }
  254. }
  255. /**
  256. * Search for similar vectors
  257. *
  258. * @param queryEmbedding - Query vector to search for
  259. * @param options - Search options
  260. * @returns Array of node IDs with similarity scores
  261. */
  262. search(
  263. queryEmbedding: Float32Array,
  264. options: VectorSearchOptions = {}
  265. ): Array<{ nodeId: string; score: number }> {
  266. const { limit = 10, minScore = 0 } = options;
  267. if (this.vssEnabled) {
  268. return this.searchWithVss(queryEmbedding, limit, minScore);
  269. } else {
  270. return this.searchBruteForce(queryEmbedding, limit, minScore);
  271. }
  272. }
  273. /**
  274. * Search using sqlite-vss KNN search
  275. */
  276. private searchWithVss(
  277. queryEmbedding: Float32Array,
  278. limit: number,
  279. minScore: number
  280. ): Array<{ nodeId: string; score: number }> {
  281. try {
  282. const vectorJson = JSON.stringify(Array.from(queryEmbedding));
  283. // Sanitize limit to prevent SQL injection (ensure it's a positive integer)
  284. const safeLimit = Math.max(1, Math.floor(limit));
  285. // Use VSS KNN search
  286. // The distance is L2 (euclidean), we need to convert to similarity score
  287. // Note: sqlite-vss requires LIMIT to be a literal, not a parameter
  288. const rows = this.db
  289. .prepare(
  290. `
  291. SELECT m.node_id, v.distance
  292. FROM (
  293. SELECT rowid, distance
  294. FROM vss_vectors
  295. WHERE vss_search(embedding, ?)
  296. LIMIT ${safeLimit}
  297. ) v
  298. JOIN vss_map m ON m.rowid = v.rowid
  299. `
  300. )
  301. .all(vectorJson) as Array<{ node_id: string; distance: number }>;
  302. // Convert L2 distance to similarity score (1 / (1 + distance))
  303. return rows
  304. .map((row) => ({
  305. nodeId: row.node_id,
  306. score: 1 / (1 + row.distance),
  307. }))
  308. .filter((r) => r.score >= minScore);
  309. } catch (error) {
  310. // VSS search failed, fall back to brute force
  311. console.warn(
  312. 'VSS search failed, using brute-force:',
  313. error instanceof Error ? error.message : String(error)
  314. );
  315. return this.searchBruteForce(queryEmbedding, limit, minScore);
  316. }
  317. }
  318. /**
  319. * Brute-force search using cosine similarity
  320. */
  321. private searchBruteForce(
  322. queryEmbedding: Float32Array,
  323. limit: number,
  324. minScore: number
  325. ): Array<{ nodeId: string; score: number }> {
  326. // Get all vectors
  327. const rows = this.db
  328. .prepare('SELECT node_id, embedding FROM vectors')
  329. .all() as Array<{ node_id: string; embedding: Buffer }>;
  330. // Calculate cosine similarity for each
  331. const results: Array<{ nodeId: string; score: number }> = [];
  332. for (const row of rows) {
  333. const embedding = new Float32Array(row.embedding.buffer.slice(
  334. row.embedding.byteOffset,
  335. row.embedding.byteOffset + row.embedding.byteLength
  336. ));
  337. const score = TextEmbedder.cosineSimilarity(queryEmbedding, embedding);
  338. if (score >= minScore) {
  339. results.push({ nodeId: row.node_id, score });
  340. }
  341. }
  342. // Sort by score descending and limit
  343. results.sort((a, b) => b.score - a.score);
  344. return results.slice(0, limit);
  345. }
  346. /**
  347. * Get count of stored vectors
  348. */
  349. getVectorCount(): number {
  350. const result = this.db
  351. .prepare('SELECT COUNT(*) as count FROM vectors')
  352. .get() as { count: number };
  353. return result.count;
  354. }
  355. /**
  356. * Check if a node has a vector
  357. */
  358. hasVector(nodeId: string): boolean {
  359. const result = this.db
  360. .prepare('SELECT 1 FROM vectors WHERE node_id = ? LIMIT 1')
  361. .get(nodeId);
  362. return !!result;
  363. }
  364. /**
  365. * Get all node IDs that have vectors
  366. */
  367. getIndexedNodeIds(): string[] {
  368. const rows = this.db
  369. .prepare('SELECT node_id FROM vectors')
  370. .all() as Array<{ node_id: string }>;
  371. return rows.map((r) => r.node_id);
  372. }
  373. /**
  374. * Clear all vectors
  375. */
  376. clear(): void {
  377. this.db.prepare('DELETE FROM vectors').run();
  378. if (this.vssEnabled) {
  379. this.db.prepare('DELETE FROM vss_vectors').run();
  380. this.db.prepare('DELETE FROM vss_map').run();
  381. }
  382. }
  383. /**
  384. * Rebuild VSS index from vectors table
  385. *
  386. * Useful after bulk operations or if VSS index gets out of sync.
  387. */
  388. rebuildVssIndex(): void {
  389. if (!this.vssEnabled) {
  390. return;
  391. }
  392. // Clear VSS tables
  393. this.db.prepare('DELETE FROM vss_vectors').run();
  394. this.db.prepare('DELETE FROM vss_map').run();
  395. // Reload from vectors table
  396. const rows = this.db
  397. .prepare('SELECT node_id, embedding FROM vectors')
  398. .all() as Array<{ node_id: string; embedding: Buffer }>;
  399. this.db.transaction(() => {
  400. for (const row of rows) {
  401. const embedding = new Float32Array(row.embedding.buffer.slice(
  402. row.embedding.byteOffset,
  403. row.embedding.byteOffset + row.embedding.byteLength
  404. ));
  405. this.storeInVss(row.node_id, embedding);
  406. }
  407. })();
  408. }
  409. }
  410. /**
  411. * Create a vector search manager
  412. */
  413. export function createVectorSearch(
  414. db: SqliteDatabase,
  415. dimension?: number
  416. ): VectorSearchManager {
  417. return new VectorSearchManager(db, dimension);
  418. }