1
0

embedder.ts 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408
  1. /**
  2. * Text Embedder
  3. *
  4. * Generates vector embeddings using the nomic-embed-text model via Transformers.js.
  5. * Uses ONNX runtime under the hood for fast local inference.
  6. */
  7. import * as path from 'path';
  8. import * as fs from 'fs';
  9. import { homedir } from 'os';
  // Shared on-disk location for downloaded embedding models:
  // "<home directory>/.codegraph/models".
  const GLOBAL_MODELS_DIR = path.join(...[homedir(), '.codegraph', 'models']);
  // @xenova/transformers is loaded through a dynamic import() (the package is
  // ESM-only) and the resolved module is memoised so later calls reuse it.
  let transformersModule: typeof import('@xenova/transformers') | null = null;

  /**
   * Return the @xenova/transformers module, importing it on the first call
   * and serving the cached module object afterwards.
   */
  async function getTransformers() {
    if (transformersModule) {
      return transformersModule;
    }
    const loaded = await import('@xenova/transformers');
    transformersModule = loaded;
    return loaded;
  }
  // Loose alias for the object returned by `pipeline('feature-extraction', ...)`.
  // It is deliberately untyped: this file calls it like a function and reads
  // `.data` / `.dims` from its result.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  type FeatureExtractionPipeline = any;
  /**
   * Default model used for embeddings.
   *
   * The vector length this file assumes for it is EMBEDDING_DIMENSION (768).
   */
  export const DEFAULT_MODEL = 'nomic-ai/nomic-embed-text-v1.5';

  /** Embedding vector length assumed for the default model (768). */
  export const EMBEDDING_DIMENSION = 768;
  /**
   * Construction-time settings for a TextEmbedder. Every field is optional.
   */
  export interface EmbedderOptions {
    /** Model identifier to load; falls back to nomic-ai/nomic-embed-text-v1.5. */
    modelId?: string;
    /** Folder used as the model cache; falls back to ~/.codegraph/models. */
    cacheDir?: string;
    /** Print download progress to stdout while the model loads (off when omitted). */
    showProgress?: boolean;
  }
  /**
   * Outcome of embedding one piece of text.
   */
  export interface EmbeddingResult {
    /** The vector produced for the input text. */
    embedding: Float32Array;
    /** Number of components in `embedding`. */
    dimension: number;
    /** Identifier of the model that produced the vector. */
    model: string;
  }
  /**
   * Outcome of embedding several texts in one call.
   */
  export interface BatchEmbeddingResult {
    /** One vector per input text, in input order. */
    embeddings: Float32Array[];
    /** Number of components in each vector. */
    dimension: number;
    /** Identifier of the model that produced the vectors. */
    model: string;
    /** Wall-clock time spent on the batch, in milliseconds. */
    durationMs: number;
  }
  /**
   * Text embedder built on a Transformers.js feature-extraction pipeline.
   *
   * Typical use: construct, `await initialize()`, then call `embed`,
   * `embedQuery` or `embedBatch`. Every input is given a task prefix
   * ("search_document: " or "search_query: ") before it is sent to the model.
   */
  export class TextEmbedder {
    private readonly modelId: string;
    private readonly cacheDir: string;
    private pipeline: FeatureExtractionPipeline | null = null;
    private initialized = false;
    private readonly showProgress: boolean;

    /**
     * @param options - Optional overrides; missing (or empty-string) model id
     *   and cache directory fall back to the module defaults, and progress
     *   output is off unless requested.
     */
    constructor(options: EmbedderOptions = {}) {
      this.modelId = options.modelId || DEFAULT_MODEL;
      this.cacheDir = options.cacheDir || GLOBAL_MODELS_DIR;
      this.showProgress = options.showProgress ?? false;
    }

    /**
     * Load the feature-extraction pipeline for the configured model.
     *
     * Does nothing when the embedder is already initialized. Creates the cache
     * directory if it is missing and points the library's cache at it.
     */
    async initialize(): Promise<void> {
      if (this.initialized) {
        return;
      }

      // The library is loaded lazily through the module-level helper.
      const { pipeline, env } = await getTransformers();

      // Point the library's model cache at our directory and make sure it exists.
      env.cacheDir = this.cacheDir;
      if (!fs.existsSync(this.cacheDir)) {
        fs.mkdirSync(this.cacheDir, { recursive: true });
      }

      // Turn off remote model lookups when a folder for this model is present.
      // NOTE(review): `env` is library-wide state and this flag is never turned
      // back on here; confirm that is acceptable when several models are
      // loaded in the same process.
      // NOTE(review): the probe looks for "<cacheDir>/<owner>--<name>" (only
      // the first "/" of the model id is replaced); confirm this matches the
      // folder layout the library actually writes.
      const modelCacheExists = fs.existsSync(
        path.join(this.cacheDir, this.modelId.replace('/', '--'))
      );
      if (modelCacheExists) {
        env.allowRemoteModels = false;
      }

      const reportProgress = (progress: {
        status: string;
        file?: string;
        progress?: number;
      }): void => {
        // Test for a number rather than truthiness so a 0% update is not dropped.
        if (
          progress.status === 'progress' &&
          progress.file &&
          typeof progress.progress === 'number'
        ) {
          const pct = Math.round(progress.progress);
          process.stdout.write(`\rDownloading ${progress.file}: ${pct}%`);
        } else if (progress.status === 'done') {
          process.stdout.write('\n');
        }
      };

      this.pipeline = await pipeline('feature-extraction', this.modelId, {
        progress_callback: this.showProgress ? reportProgress : undefined,
      });
      this.initialized = true;
    }

    /**
     * Report whether `initialize()` has completed (and `dispose()` has not
     * been called since).
     */
    isInitialized(): boolean {
      return this.initialized;
    }

    /**
     * Return the model identifier this embedder was configured with.
     */
    getModelId(): string {
      return this.modelId;
    }

    /**
     * Return the module-level EMBEDDING_DIMENSION constant.
     *
     * The value does not depend on the configured model id.
     */
    getDimension(): number {
      return EMBEDDING_DIMENSION;
    }

    /**
     * Generate an embedding for a single document text.
     *
     * @param text - Text to embed (sent with the "search_document: " prefix)
     * @returns The embedding, its length, and the model id
     * @throws Error if the embedder has not been initialized
     */
    async embed(text: string): Promise<EmbeddingResult> {
      return this.embedSingle(text, 'document');
    }

    /**
     * Generate an embedding for a search query.
     *
     * @param query - Query text to embed (sent with the "search_query: " prefix)
     * @returns The embedding, its length, and the model id
     * @throws Error if the embedder has not been initialized
     */
    async embedQuery(query: string): Promise<EmbeddingResult> {
      return this.embedSingle(query, 'search_query');
    }

    /**
     * Generate embeddings for several texts with a single pipeline call.
     *
     * @param texts - Texts to embed; an empty array short-circuits to an empty result
     * @param type - Which task prefix to apply (document or search_query)
     * @returns One vector per input, in input order, plus timing information
     * @throws Error if the embedder has not been initialized, or if the
     *   pipeline output does not contain exactly one vector per input
     */
    async embedBatch(
      texts: string[],
      type: 'document' | 'search_query' = 'document'
    ): Promise<BatchEmbeddingResult> {
      this.assertReady();

      if (texts.length === 0) {
        return {
          embeddings: [],
          dimension: EMBEDDING_DIMENSION,
          model: this.modelId,
          durationMs: 0,
        };
      }

      const startTime = Date.now();
      const preparedTexts = texts.map((t) => this.prepareText(t, type));

      const outputs = await this.pipeline(preparedTexts, {
        pooling: 'mean',
        normalize: true,
      });

      // The second entry of `dims` is used as the per-vector length; fall back
      // to the module constant when the pipeline reports no usable shape.
      const dims = outputs.dims as number[] | undefined;
      const dimension = dims?.[1] ?? EMBEDDING_DIMENSION;
      const flatData = this.toFloat32Array(outputs.data as unknown);

      // Refuse to hand back truncated or empty vectors: slicing past the end of
      // the buffer would otherwise silently produce short embeddings.
      if (flatData.length !== texts.length * dimension) {
        throw new Error(
          `Unexpected embedding output size: got ${flatData.length} values for ` +
            `${texts.length} texts of dimension ${dimension}`
        );
      }

      // Each slice is an independent copy of one row of the flat buffer.
      const embeddings = texts.map((_, i) =>
        flatData.slice(i * dimension, (i + 1) * dimension)
      );

      return {
        embeddings,
        dimension,
        model: this.modelId,
        durationMs: Date.now() - startTime,
      };
    }

    /**
     * Throw unless `initialize()` has completed and a pipeline is available.
     */
    private assertReady(): void {
      if (!this.initialized || !this.pipeline) {
        throw new Error('Embedder not initialized. Call initialize() first.');
      }
    }

    /**
     * Shared implementation of `embed` and `embedQuery`: prefix the text, run
     * the pipeline with mean pooling and normalization requested, and wrap the
     * resulting vector.
     */
    private async embedSingle(
      text: string,
      type: 'document' | 'search_query'
    ): Promise<EmbeddingResult> {
      this.assertReady();

      const preparedText = this.prepareText(text, type);
      const output = await this.pipeline(preparedText, {
        pooling: 'mean',
        normalize: true,
      });

      const embedding = this.toFloat32Array(output.data as unknown);
      return {
        embedding,
        dimension: embedding.length,
        model: this.modelId,
      };
    }

    /**
     * Convert the pipeline's output data to a Float32Array.
     *
     * A Float32Array is returned as-is (no copy); plain arrays and other
     * objects exposing `length` are copied into a new Float32Array.
     *
     * @throws Error if the value is none of the supported shapes
     */
    private toFloat32Array(data: unknown): Float32Array {
      if (data instanceof Float32Array) {
        return data;
      }
      if (Array.isArray(data)) {
        return new Float32Array(data);
      }
      if (data && typeof data === 'object' && 'length' in data) {
        // Array-like objects (e.g. other typed arrays) are copied element-wise.
        return Float32Array.from(data as ArrayLike<number>);
      }
      throw new Error('Unsupported data format for embedding');
    }

    /**
     * Truncate the text and add the task prefix:
     * - "search_document: " for documents to be searched
     * - "search_query: " for search queries
     */
    private prepareText(text: string, type: 'document' | 'search_query'): string {
      // The cap counts UTF-16 code units, not tokens.
      // NOTE(review): the original comment cited an 8192-token model limit; a
      // character cap is only a rough stand-in for that. Confirm it is enough.
      const maxLength = 8192;
      let truncatedText = text;
      if (text.length > maxLength) {
        let end = maxLength;
        const lastUnit = text.charCodeAt(end - 1);
        // Do not cut a surrogate pair in half: a trailing high surrogate would
        // leave a malformed string, so drop it together with its lost partner.
        if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
          end -= 1;
        }
        truncatedText = text.slice(0, end);
      }

      return type === 'search_query'
        ? `search_query: ${truncatedText}`
        : `search_document: ${truncatedText}`;
    }

    /**
     * Build the text that represents a code node for embedding.
     *
     * Emits one line per available field, in this order: kind and name,
     * qualified name (only when it differs from the name), file path,
     * signature, documentation. Lines are joined with "\n".
     */
    static createNodeText(node: {
      name: string;
      kind: string;
      qualifiedName?: string;
      signature?: string;
      docstring?: string;
      filePath: string;
    }): string {
      const parts: string[] = [`${node.kind}: ${node.name}`];

      if (node.qualifiedName && node.qualifiedName !== node.name) {
        parts.push(`path: ${node.qualifiedName}`);
      }

      parts.push(`file: ${node.filePath}`);

      if (node.signature) {
        parts.push(`signature: ${node.signature}`);
      }
      if (node.docstring) {
        parts.push(`documentation: ${node.docstring}`);
      }

      return parts.join('\n');
    }

    /**
     * Compute the cosine similarity of two embeddings.
     *
     * @returns dot(a, b) / (|a| * |b|), or 0 when either vector has zero norm
     * @throws Error if the vectors differ in length
     */
    static cosineSimilarity(a: Float32Array, b: Float32Array): number {
      if (a.length !== b.length) {
        throw new Error('Embeddings must have the same dimension');
      }

      let dotProduct = 0;
      let normA = 0;
      let normB = 0;
      for (let i = 0; i < a.length; i++) {
        const aVal = a[i]!;
        const bVal = b[i]!;
        dotProduct += aVal * bVal;
        normA += aVal * aVal;
        normB += bVal * bVal;
      }

      normA = Math.sqrt(normA);
      normB = Math.sqrt(normB);

      // A zero vector has no direction; report 0 rather than dividing by zero.
      if (normA === 0 || normB === 0) {
        return 0;
      }
      return dotProduct / (normA * normB);
    }

    /**
     * Drop the pipeline reference and mark the embedder as uninitialized.
     * `initialize()` can be called again afterwards.
     */
    dispose(): void {
      this.pipeline = null;
      this.initialized = false;
    }
  }
  /**
   * Factory helper: build a TextEmbedder from the given (optional) options.
   */
  export function createEmbedder(options?: EmbedderOptions): TextEmbedder {
    const embedder = new TextEmbedder(options);
    return embedder;
  }
  344. }