embedder.ts 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404
  1. /**
  2. * Text Embedder
  3. *
  4. * Generates vector embeddings using the nomic-embed-text model via Transformers.js.
  5. * Uses ONNX runtime under the hood for fast local inference.
  6. */
  7. import * as path from 'path';
  8. import * as fs from 'fs';
  // @xenova/transformers ships as ESM only. It is pulled in through a dynamic
  // import() so that this file can still be compiled to CommonJS.
  // The loaded module namespace is kept here so later calls reuse it.
  let transformersModule: typeof import('@xenova/transformers') | null = null;

  /**
   * Return the @xenova/transformers module, importing it on the first call
   * and handing back the stored namespace on every later call.
   */
  async function getTransformers() {
    if (transformersModule !== null) {
      return transformersModule;
    }
    const loaded = await import('@xenova/transformers');
    transformersModule = loaded;
    return loaded;
  }

  // Alias for the feature-extraction pipeline object; deliberately left
  // untyped (`any`) in this file.
  type FeatureExtractionPipeline = any;
  /**
   * Default model used for embeddings: nomic-embed-text-v1.5.
   */
  export const DEFAULT_MODEL = 'nomic-ai/nomic-embed-text-v1.5';

  /**
   * Embedding vector length for the default model.
   * nomic-embed-text-v1.5 uses 768 dimensions.
   */
  export const EMBEDDING_DIMENSION = 768;
  /**
   * Configuration accepted when constructing an embedder.
   * Every field is optional.
   */
  export interface EmbedderOptions {
    /** Print download progress while the model is being fetched */
    showProgress?: boolean;
    /** Where model files are cached (default: .codegraph/models) */
    cacheDir?: string;
    /** Identifier of the model to load (default: nomic-ai/nomic-embed-text-v1.5) */
    modelId?: string;
  }
  /**
   * Outcome of embedding one piece of text.
   */
  export interface EmbeddingResult {
    /** Identifier of the model that produced the vector */
    model: string;
    /** Number of components in the vector */
    dimension: number;
    /** The embedding vector itself */
    embedding: Float32Array;
  }
  /**
   * Outcome of embedding several texts in one call.
   */
  export interface BatchEmbeddingResult {
    /** Identifier of the model that produced the vectors */
    model: string;
    /** Number of components in each vector */
    dimension: number;
    /** Elapsed processing time, in milliseconds */
    durationMs: number;
    /** One vector per input text, in input order */
    embeddings: Float32Array[];
  }
  /**
   * Text embedder built on a Transformers.js feature-extraction pipeline.
   *
   * Defaults to the nomic-embed-text-v1.5 model and produces mean-pooled,
   * normalized embeddings for document/code text and for search queries.
   * `initialize()` must complete before any embed method is called.
   */
  export class TextEmbedder {
    private readonly modelId: string;
    private readonly cacheDir: string;
    private pipeline: FeatureExtractionPipeline | null = null;
    private initialized = false;
    private readonly showProgress: boolean;

    /**
     * @param options - Optional model ID, cache directory and progress flag;
     *   missing (or empty-string) model ID / cache directory fall back to the
     *   defaults.
     */
    constructor(options: EmbedderOptions = {}) {
      this.modelId = options.modelId || DEFAULT_MODEL;
      this.cacheDir = options.cacheDir || '.codegraph/models';
      this.showProgress = options.showProgress ?? false;
    }

    /**
     * Initialize the embedder by loading the model pipeline.
     *
     * Does nothing if the embedder is already initialized. The model is
     * downloaded on first use if it is not already cached.
     */
    async initialize(): Promise<void> {
      if (this.initialized) {
        return;
      }

      // Load transformers.js dynamically (ESM-only package)
      const { pipeline, env } = await getTransformers();

      // Point transformers.js at our cache directory and make sure it exists
      env.cacheDir = this.cacheDir;
      if (!fs.existsSync(this.cacheDir)) {
        fs.mkdirSync(this.cacheDir, { recursive: true });
      }

      // Skip remote model checks when the model already appears to be cached;
      // this speeds up initialization.
      // NOTE(review): assumes the cached model directory is named after the
      // model ID with the first '/' replaced by '--' - confirm against the
      // cache layout Transformers.js actually uses.
      const modelCacheExists = fs.existsSync(
        path.join(this.cacheDir, this.modelId.replace('/', '--'))
      );
      if (modelCacheExists) {
        env.allowRemoteModels = false;
      }

      // Load the pipeline, optionally reporting download progress on stdout
      this.pipeline = await pipeline('feature-extraction', this.modelId, {
        progress_callback: this.showProgress
          ? (progress: { status: string; file?: string; progress?: number }) => {
              if (progress.status === 'progress' && progress.file && progress.progress) {
                const pct = Math.round(progress.progress);
                process.stdout.write(`\rDownloading ${progress.file}: ${pct}%`);
              } else if (progress.status === 'done') {
                process.stdout.write('\n');
              }
            }
          : undefined,
      });
      this.initialized = true;
    }

    /**
     * Check if the embedder is initialized
     */
    isInitialized(): boolean {
      return this.initialized;
    }

    /**
     * Get the model ID being used
     */
    getModelId(): string {
      return this.modelId;
    }

    /**
     * Get the embedding dimension.
     *
     * Returns the module-level EMBEDDING_DIMENSION constant; it is not
     * measured from the loaded model.
     */
    getDimension(): number {
      return EMBEDDING_DIMENSION;
    }

    /**
     * Generate an embedding for a single document text.
     *
     * @param text - Text to embed (prefixed as a search document)
     * @returns Embedding result
     * @throws Error if `initialize()` has not completed
     */
    async embed(text: string): Promise<EmbeddingResult> {
      return this.embedSingle(text, 'document');
    }

    /**
     * Generate an embedding for a search query (uses the query prefix).
     *
     * @param query - Query text to embed
     * @returns Embedding result
     * @throws Error if `initialize()` has not completed
     */
    async embedQuery(query: string): Promise<EmbeddingResult> {
      return this.embedSingle(query, 'search_query');
    }

    /**
     * Generate embeddings for multiple texts in one pipeline call.
     *
     * @param texts - Array of texts to embed
     * @param type - Type of text (document or search_query)
     * @returns Batch embedding result; for an empty input the result has no
     *   embeddings, the default dimension and a duration of 0
     * @throws Error if `initialize()` has not completed
     */
    async embedBatch(
      texts: string[],
      type: 'document' | 'search_query' = 'document'
    ): Promise<BatchEmbeddingResult> {
      this.assertReady();

      if (texts.length === 0) {
        return {
          embeddings: [],
          dimension: EMBEDDING_DIMENSION,
          model: this.modelId,
          durationMs: 0,
        };
      }

      const startTime = Date.now();

      const preparedTexts = texts.map((t) => this.prepareText(t, type));
      const outputs = await this.pipeline(preparedTexts, {
        pooling: 'mean',
        normalize: true,
      });

      // The output is read as one flat buffer holding texts.length rows;
      // dims[1] gives the row width (falling back to the default dimension).
      const dims = outputs.dims as number[];
      const dimension = dims[1] ?? EMBEDDING_DIMENSION;
      const flatData = this.toFloat32Array(outputs.data as unknown);

      const embeddings: Float32Array[] = [];
      for (let i = 0; i < texts.length; i++) {
        const start = i * dimension;
        // slice() copies, so each embedding owns its own buffer
        embeddings.push(flatData.slice(start, start + dimension));
      }

      return {
        embeddings,
        dimension,
        model: this.modelId,
        durationMs: Date.now() - startTime,
      };
    }

    /**
     * Throw unless `initialize()` has completed and a pipeline is loaded.
     */
    private assertReady(): void {
      if (!this.initialized || !this.pipeline) {
        throw new Error('Embedder not initialized. Call initialize() first.');
      }
    }

    /**
     * Shared implementation of `embed` and `embedQuery`: prefix the text,
     * run the pipeline with mean pooling and normalization, and wrap the
     * resulting vector.
     */
    private async embedSingle(
      text: string,
      type: 'document' | 'search_query'
    ): Promise<EmbeddingResult> {
      this.assertReady();

      const preparedText = this.prepareText(text, type);
      const output = await this.pipeline(preparedText, {
        pooling: 'mean',
        normalize: true,
      });

      const embedding = this.toFloat32Array(output.data as unknown);
      return {
        embedding,
        dimension: embedding.length,
        model: this.modelId,
      };
    }

    /**
     * Convert various array formats to Float32Array.
     *
     * A Float32Array is returned as-is (not copied); plain arrays and other
     * array-like objects (e.g. other typed arrays) are copied element by
     * element.
     *
     * @throws Error if `data` is not array-like
     */
    private toFloat32Array(data: unknown): Float32Array {
      if (data instanceof Float32Array) {
        return data;
      }
      if (Array.isArray(data)) {
        return new Float32Array(data);
      }
      if (data && typeof data === 'object' && 'length' in data) {
        // Copy the values: passing only the length to the constructor would
        // allocate a zero-filled array and drop the embedding data.
        return Float32Array.from(data as ArrayLike<number>);
      }
      throw new Error('Unsupported data format for embedding');
    }

    /**
     * Prepare text for the nomic-embed-text model
     *
     * The model expects specific prefixes for different tasks:
     * - "search_document: " for documents to be searched
     * - "search_query: " for search queries
     */
    private prepareText(text: string, type: 'document' | 'search_query'): string {
      // Truncate very long texts. The cap is applied in characters (string
      // length), chosen to match the 8192-token limit quoted for
      // nomic-embed-text-v1.5; it is not a token count.
      const maxLength = 8192;
      const truncatedText = text.length > maxLength ? text.slice(0, maxLength) : text;

      if (type === 'search_query') {
        return `search_query: ${truncatedText}`;
      }
      return `search_document: ${truncatedText}`;
    }

    /**
     * Create text representation of a code node for embedding
     *
     * Joins kind/name, qualified name (when it differs from the name), file
     * path, signature and docstring into newline-separated labelled lines.
     */
    static createNodeText(node: {
      name: string;
      kind: string;
      qualifiedName?: string;
      signature?: string;
      docstring?: string;
      filePath: string;
    }): string {
      const parts: string[] = [];

      parts.push(`${node.kind}: ${node.name}`);

      // Only add the qualified name when it carries extra information
      if (node.qualifiedName && node.qualifiedName !== node.name) {
        parts.push(`path: ${node.qualifiedName}`);
      }

      parts.push(`file: ${node.filePath}`);

      if (node.signature) {
        parts.push(`signature: ${node.signature}`);
      }

      if (node.docstring) {
        parts.push(`documentation: ${node.docstring}`);
      }

      return parts.join('\n');
    }

    /**
     * Compute cosine similarity between two embeddings.
     *
     * @returns The cosine of the angle between `a` and `b`, or 0 if either
     *   vector has zero norm
     * @throws Error if the vectors differ in length
     */
    static cosineSimilarity(a: Float32Array, b: Float32Array): number {
      if (a.length !== b.length) {
        throw new Error('Embeddings must have the same dimension');
      }

      let dotProduct = 0;
      let normA = 0;
      let normB = 0;
      for (let i = 0; i < a.length; i++) {
        // Indices are within bounds of both arrays (lengths checked above)
        const aVal = a[i]!;
        const bVal = b[i]!;
        dotProduct += aVal * bVal;
        normA += aVal * aVal;
        normB += bVal * bVal;
      }

      normA = Math.sqrt(normA);
      normB = Math.sqrt(normB);

      // Avoid dividing by zero for all-zero vectors
      if (normA === 0 || normB === 0) {
        return 0;
      }
      return dotProduct / (normA * normB);
    }

    /**
     * Release resources: drops the pipeline reference and marks the embedder
     * as uninitialized, so `initialize()` must be called again before use.
     */
    dispose(): void {
      this.pipeline = null;
      this.initialized = false;
    }
  }
  /**
   * Factory helper: build a new TextEmbedder from the given options.
   *
   * @param options - Optional embedder configuration, passed straight to the
   *   TextEmbedder constructor
   * @returns The newly constructed embedder
   */
  export function createEmbedder(options?: EmbedderOptions): TextEmbedder {
    const embedder = new TextEmbedder(options);
    return embedder;
  }