1
0

embedder.ts 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410
  1. /**
  2. * Text Embedder
  3. *
  4. * Generates vector embeddings using the nomic-embed-text model via Transformers.js.
  5. * Uses ONNX runtime under the hood for fast local inference.
  6. */
  7. import * as path from 'path';
  8. import * as fs from 'fs';
  9. import { homedir } from 'os';
  /**
   * Default on-disk location for downloaded embedding models:
   * `<home>/.codegraph/models`, resolved from the current user's home directory.
   */
  const GLOBAL_MODELS_DIR = path.join(homedir(), '.codegraph', 'models');
  // @xenova/transformers is loaded with a dynamic import() (the package is
  // ESM-only and this file also has to work in CommonJS builds). The loaded
  // module is memoised here so the import is only resolved once.
  let transformersModule: typeof import('@xenova/transformers') | null = null;

  /**
   * Return the @xenova/transformers module, importing it on first use and
   * serving the memoised module on every later call.
   */
  async function getTransformers() {
    if (transformersModule !== null) {
      return transformersModule;
    }
    const loaded = await import('@xenova/transformers');
    transformersModule = loaded;
    return loaded;
  }
  // Loose alias for the feature-extraction pipeline object; deliberately left
  // as `any` so the callable pipeline can be stored and invoked without typing it.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  type FeatureExtractionPipeline = any;

  /**
   * Model used when the caller does not supply a model ID.
   */
  export const DEFAULT_MODEL = 'nomic-ai/nomic-embed-text-v1.5';

  /**
   * Embedding length assumed for the default model
   * (nomic-embed-text-v1.5 produces 768-dimensional vectors).
   */
  export const EMBEDDING_DIMENSION = 768;
  /**
   * Construction options for the embedder. Every field is optional.
   */
  export interface EmbedderOptions {
    /** ID of the model to load; falls back to nomic-ai/nomic-embed-text-v1.5 when omitted. */
    modelId?: string;

    /** Directory where model files are cached; falls back to ~/.codegraph/models when omitted. */
    cacheDir?: string;

    /** When true, download progress is reported while the model is loaded. */
    showProgress?: boolean;
  }
  /**
   * Outcome of embedding one piece of text.
   */
  export interface EmbeddingResult {
    /** Vector produced for the input text. */
    embedding: Float32Array;

    /** Number of components in `embedding`. */
    dimension: number;

    /** ID of the model that produced the vector. */
    model: string;
  }
  /**
   * Outcome of embedding several texts in one call.
   */
  export interface BatchEmbeddingResult {
    /** One vector per input text, in the same order as the inputs. */
    embeddings: Float32Array[];

    /** Number of components in each vector. */
    dimension: number;

    /** ID of the model that produced the vectors. */
    model: string;

    /** Wall-clock time spent on the batch, in milliseconds. */
    durationMs: number;
  }
  /**
   * Text Embedder using Transformers.js
   *
   * Wraps a `feature-extraction` pipeline (default model:
   * nomic-embed-text-v1.5) and turns code descriptions and natural-language
   * queries into embedding vectors.
   *
   * Lifecycle: construct, `await initialize()`, then call `embed`,
   * `embedQuery` or `embedBatch`. `dispose()` drops the pipeline; the
   * instance can be initialized again afterwards.
   */
  export class TextEmbedder {
    private modelId: string;
    private cacheDir: string;
    private pipeline: FeatureExtractionPipeline | null = null;
    private initialized = false;
    private showProgress: boolean;

    /**
     * @param options - Optional overrides for the model ID, cache directory
     *   and progress reporting. Empty or missing values fall back to the
     *   module defaults.
     */
    constructor(options: EmbedderOptions = {}) {
      this.modelId = options.modelId || DEFAULT_MODEL;
      this.cacheDir = options.cacheDir || GLOBAL_MODELS_DIR;
      this.showProgress = options.showProgress ?? false;
    }

    /**
     * Initialize the embedder by loading the model.
     *
     * The model is downloaded on first use if it is not already cached.
     * Calling this again after a successful initialization is a no-op.
     */
    async initialize(): Promise<void> {
      if (this.initialized) {
        return;
      }

      // Load transformers.js dynamically (ESM-only package)
      const { pipeline, env } = await getTransformers();

      // Point transformers.js at our cache directory and make sure it exists.
      // `recursive: true` makes mkdirSync a no-op for an existing directory.
      env.cacheDir = this.cacheDir;
      fs.mkdirSync(this.cacheDir, { recursive: true });

      // Skip remote lookups when the model already appears to be cached; this
      // speeds up initialization significantly.
      // NOTE(review): this assumes the cache stores the model under
      // `<org>--<name>` inside cacheDir — confirm against the cache layout
      // actually produced by Transformers.js.
      const modelCacheExists = fs.existsSync(
        path.join(this.cacheDir, this.modelId.replace('/', '--'))
      );

      // Load the pipeline with quantized model to reduce WASM memory pressure.
      // Quantized (int8/uint8) is ~4x smaller than FP32 with minimal quality loss.
      this.pipeline = await pipeline('feature-extraction', this.modelId, {
        quantized: true,
        // Per-call switch instead of mutating the process-wide
        // `env.allowRemoteModels`, which would stay disabled for every
        // embedder created later (including ones whose model is not cached).
        local_files_only: modelCacheExists,
        progress_callback: this.showProgress
          ? (progress: { status: string; file?: string; progress?: number }) => {
              if (
                progress.status === 'progress' &&
                progress.file &&
                typeof progress.progress === 'number'
              ) {
                // Explicit number check so that a 0% report is not dropped.
                const pct = Math.round(progress.progress);
                process.stdout.write(`\rDownloading ${progress.file}: ${pct}%`);
              } else if (progress.status === 'done') {
                process.stdout.write('\n');
              }
            }
          : undefined,
      });

      this.initialized = true;
    }

    /**
     * Check if the embedder is initialized
     */
    isInitialized(): boolean {
      return this.initialized;
    }

    /**
     * Get the model ID being used
     */
    getModelId(): string {
      return this.modelId;
    }

    /**
     * Get the embedding dimension.
     *
     * Always returns the module-level `EMBEDDING_DIMENSION`, whichever model
     * ID this instance was configured with.
     */
    getDimension(): number {
      return EMBEDDING_DIMENSION;
    }

    /**
     * Generate embedding for a single text, prefixed as a document.
     *
     * @param text - Text to embed
     * @returns Embedding result
     * @throws Error if `initialize()` has not completed
     */
    async embed(text: string): Promise<EmbeddingResult> {
      return this.embedSingle(text, 'document');
    }

    /**
     * Generate embedding for a query (uses the search-query prefix).
     *
     * @param query - Query text to embed
     * @returns Embedding result
     * @throws Error if `initialize()` has not completed
     */
    async embedQuery(query: string): Promise<EmbeddingResult> {
      return this.embedSingle(query, 'search_query');
    }

    /**
     * Generate embeddings for multiple texts in a batch
     *
     * @param texts - Array of texts to embed
     * @param type - Type of text (document or search_query)
     * @returns Batch embedding result; an empty input yields an empty result
     *   without invoking the model
     * @throws Error if `initialize()` has not completed
     */
    async embedBatch(
      texts: string[],
      type: 'document' | 'search_query' = 'document'
    ): Promise<BatchEmbeddingResult> {
      this.assertReady();

      if (texts.length === 0) {
        return {
          embeddings: [],
          dimension: EMBEDDING_DIMENSION,
          model: this.modelId,
          durationMs: 0,
        };
      }

      const startTime = Date.now();
      const preparedTexts = texts.map((t) => this.prepareText(t, type));

      const outputs = await this.pipeline(preparedTexts, {
        pooling: 'mean',
        normalize: true,
      });

      // The per-vector length is read from the second entry of the output
      // dims, falling back to the module constant when it is absent.
      // Assumes `outputs.data` is one flat buffer holding the vectors back to
      // back in input order — TODO confirm against the pipeline output.
      const dims = outputs.dims as number[];
      const dimension = dims[1] ?? EMBEDDING_DIMENSION;
      const flatData = this.toFloat32Array(outputs.data as unknown);

      const embeddings: Float32Array[] = [];
      for (let i = 0; i < texts.length; i++) {
        const start = i * dimension;
        // slice() copies, so each embedding owns its own storage.
        embeddings.push(flatData.slice(start, start + dimension));
      }

      return {
        embeddings,
        dimension,
        model: this.modelId,
        durationMs: Date.now() - startTime,
      };
    }

    /**
     * Throw unless `initialize()` has completed and a pipeline is loaded.
     */
    private assertReady(): void {
      if (!this.initialized || !this.pipeline) {
        throw new Error('Embedder not initialized. Call initialize() first.');
      }
    }

    /**
     * Shared implementation of `embed` and `embedQuery`: prefix the text for
     * the given task, run the pipeline with mean pooling and normalization,
     * and wrap the resulting vector.
     */
    private async embedSingle(
      text: string,
      type: 'document' | 'search_query'
    ): Promise<EmbeddingResult> {
      this.assertReady();

      const preparedText = this.prepareText(text, type);
      const output = await this.pipeline(preparedText, {
        pooling: 'mean',
        normalize: true,
      });

      const embedding = this.toFloat32Array(output.data as unknown);
      return {
        embedding,
        dimension: embedding.length,
        model: this.modelId,
      };
    }

    /**
     * Convert various array formats to Float32Array.
     *
     * A Float32Array is returned as-is (no copy); plain arrays and other
     * objects exposing `length` are copied into a new Float32Array.
     *
     * @throws Error if `data` is none of the supported shapes
     */
    private toFloat32Array(data: unknown): Float32Array {
      if (data instanceof Float32Array) {
        return data;
      }
      if (Array.isArray(data)) {
        return new Float32Array(data);
      }
      if (data && typeof data === 'object' && 'length' in data) {
        // Handle TypedArray-like objects
        return Float32Array.from(data as ArrayLike<number>);
      }
      throw new Error('Unsupported data format for embedding');
    }

    /**
     * Prepare text for the nomic-embed-text model
     *
     * The model expects specific prefixes for different tasks:
     * - "search_document: " for documents to be searched
     * - "search_query: " for search queries
     *
     * Over-long input is cut to at most 8192 UTF-16 code units before the
     * prefix is added, without ever splitting a surrogate pair.
     */
    private prepareText(text: string, type: 'document' | 'search_query'): string {
      // The cap is applied to string length (UTF-16 code units), used as a
      // cheap stand-in for the model's 8192-token limit.
      const maxLength = 8192;
      let truncatedText = text;
      if (text.length > maxLength) {
        truncatedText = text.slice(0, maxLength);
        // If the cut landed between the two halves of a surrogate pair, drop
        // the dangling high surrogate rather than emit a malformed string.
        const lastUnit = truncatedText.charCodeAt(truncatedText.length - 1);
        if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
          truncatedText = truncatedText.slice(0, -1);
        }
      }

      const prefix = type === 'search_query' ? 'search_query: ' : 'search_document: ';
      return `${prefix}${truncatedText}`;
    }

    /**
     * Create text representation of a code node for embedding
     *
     * Joins, one per line: kind and name, the qualified name (only when it
     * differs from the name), the file path, and the signature and docstring
     * when present.
     */
    static createNodeText(node: {
      name: string;
      kind: string;
      qualifiedName?: string;
      signature?: string;
      docstring?: string;
      filePath: string;
    }): string {
      const parts: string[] = [`${node.kind}: ${node.name}`];

      if (node.qualifiedName && node.qualifiedName !== node.name) {
        parts.push(`path: ${node.qualifiedName}`);
      }

      parts.push(`file: ${node.filePath}`);

      if (node.signature) {
        parts.push(`signature: ${node.signature}`);
      }
      if (node.docstring) {
        parts.push(`documentation: ${node.docstring}`);
      }

      return parts.join('\n');
    }

    /**
     * Compute cosine similarity between two embeddings
     *
     * @returns The cosine of the angle between `a` and `b`, or 0 when either
     *   vector has zero magnitude
     * @throws Error if the vectors differ in length
     */
    static cosineSimilarity(a: Float32Array, b: Float32Array): number {
      if (a.length !== b.length) {
        throw new Error('Embeddings must have the same dimension');
      }

      let dotProduct = 0;
      let normA = 0;
      let normB = 0;
      for (let i = 0; i < a.length; i++) {
        const aVal = a[i]!;
        const bVal = b[i]!;
        dotProduct += aVal * bVal;
        normA += aVal * aVal;
        normB += bVal * bVal;
      }

      normA = Math.sqrt(normA);
      normB = Math.sqrt(normB);
      if (normA === 0 || normB === 0) {
        return 0;
      }
      return dotProduct / (normA * normB);
    }

    /**
     * Release resources: drops the pipeline reference and marks the embedder
     * as uninitialized.
     */
    dispose(): void {
      this.pipeline = null;
      this.initialized = false;
    }
  }
  /**
   * Factory helper: build a new, not-yet-initialized text embedder.
   *
   * @param options - Optional embedder settings, passed straight to the constructor
   * @returns The freshly constructed embedder
   */
  export function createEmbedder(options?: EmbedderOptions): TextEmbedder {
    const embedder = new TextEmbedder(options);
    return embedder;
  }