| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410 |
- /**
- * Text Embedder
- *
- * Generates vector embeddings using the nomic-embed-text model via Transformers.js.
- * Uses ONNX runtime under the hood for fast local inference.
- */
- import * as path from 'path';
- import * as fs from 'fs';
- import { homedir } from 'os';
- // Global model cache directory - uses codegraph's models directory for shared embedding models
- const GLOBAL_MODELS_DIR = path.join(homedir(), '.codegraph', 'models');
- // Dynamic import for @xenova/transformers (ESM-only package)
- // We use dynamic import to support CommonJS builds
- let transformersModule: typeof import('@xenova/transformers') | null = null;
- async function getTransformers() {
- if (!transformersModule) {
- transformersModule = await import('@xenova/transformers');
- }
- return transformersModule;
- }
- // Type for the feature extraction pipeline
- type FeatureExtractionPipeline = any;
- /**
- * Default model for embeddings
- * nomic-embed-text-v1.5 produces 384-dimensional embeddings
- */
- export const DEFAULT_MODEL = 'nomic-ai/nomic-embed-text-v1.5';
- export const EMBEDDING_DIMENSION = 768; // nomic-embed-text-v1.5 uses 768 dimensions
- /**
- * Options for the embedder
- */
- export interface EmbedderOptions {
- /** Model ID to use (default: nomic-ai/nomic-embed-text-v1.5) */
- modelId?: string;
- /** Directory to cache the model (default: ~/.codegraph/models) */
- cacheDir?: string;
- /** Whether to show progress during model download */
- showProgress?: boolean;
- }
- /**
- * Text embedding result
- */
- export interface EmbeddingResult {
- /** The embedding vector */
- embedding: Float32Array;
- /** Dimension of the embedding */
- dimension: number;
- /** Model used to generate the embedding */
- model: string;
- }
- /**
- * Batch embedding result
- */
- export interface BatchEmbeddingResult {
- /** Array of embeddings in same order as input */
- embeddings: Float32Array[];
- /** Dimension of each embedding */
- dimension: number;
- /** Model used to generate embeddings */
- model: string;
- /** Processing time in milliseconds */
- durationMs: number;
- }
- /**
- * Text Embedder using Transformers.js
- *
- * Uses the nomic-embed-text-v1.5 model to generate embeddings for code
- * and natural language queries.
- */
- export class TextEmbedder {
- private modelId: string;
- private cacheDir: string;
- private pipeline: FeatureExtractionPipeline | null = null;
- private initialized = false;
- private showProgress: boolean;
- constructor(options: EmbedderOptions = {}) {
- this.modelId = options.modelId || DEFAULT_MODEL;
- this.cacheDir = options.cacheDir || GLOBAL_MODELS_DIR;
- this.showProgress = options.showProgress ?? false;
- }
- /**
- * Initialize the embedder by loading the model
- *
- * This will download the model on first use if not already cached.
- */
- async initialize(): Promise<void> {
- if (this.initialized) {
- return;
- }
- // Load transformers.js dynamically (ESM-only package)
- const { pipeline, env } = await getTransformers();
- // Configure transformers.js to use local cache
- env.cacheDir = this.cacheDir;
- // Ensure cache directory exists
- if (!fs.existsSync(this.cacheDir)) {
- fs.mkdirSync(this.cacheDir, { recursive: true });
- }
- // Disable remote model checking if model is already cached
- // This speeds up initialization significantly
- const modelCacheExists = fs.existsSync(
- path.join(this.cacheDir, this.modelId.replace('/', '--'))
- );
- if (modelCacheExists) {
- env.allowRemoteModels = false;
- }
- // Load the pipeline with quantized model to reduce WASM memory pressure.
- // Quantized (int8/uint8) is ~4x smaller than FP32 with minimal quality loss.
- this.pipeline = await pipeline('feature-extraction', this.modelId, {
- quantized: true,
- progress_callback: this.showProgress
- ? (progress: { status: string; file?: string; progress?: number }) => {
- if (progress.status === 'progress' && progress.file && progress.progress) {
- const pct = Math.round(progress.progress);
- process.stdout.write(`\rDownloading ${progress.file}: ${pct}%`);
- } else if (progress.status === 'done') {
- process.stdout.write('\n');
- }
- }
- : undefined,
- });
- this.initialized = true;
- }
- /**
- * Check if the embedder is initialized
- */
- isInitialized(): boolean {
- return this.initialized;
- }
- /**
- * Get the model ID being used
- */
- getModelId(): string {
- return this.modelId;
- }
- /**
- * Get the embedding dimension
- */
- getDimension(): number {
- return EMBEDDING_DIMENSION;
- }
- /**
- * Generate embedding for a single text
- *
- * @param text - Text to embed
- * @returns Embedding result
- */
- async embed(text: string): Promise<EmbeddingResult> {
- if (!this.initialized || !this.pipeline) {
- throw new Error('Embedder not initialized. Call initialize() first.');
- }
- // Prepare text for nomic-embed-text (it expects specific prefixes)
- const preparedText = this.prepareText(text, 'document');
- // Generate embedding
- const output = await this.pipeline(preparedText, {
- pooling: 'mean',
- normalize: true,
- });
- // Extract the embedding array - handle various data formats
- const data = output.data as unknown;
- const embedding = this.toFloat32Array(data);
- return {
- embedding,
- dimension: embedding.length,
- model: this.modelId,
- };
- }
- /**
- * Generate embedding for a query (uses different prefix)
- *
- * @param query - Query text to embed
- * @returns Embedding result
- */
- async embedQuery(query: string): Promise<EmbeddingResult> {
- if (!this.initialized || !this.pipeline) {
- throw new Error('Embedder not initialized. Call initialize() first.');
- }
- // Prepare text for nomic-embed-text query
- const preparedText = this.prepareText(query, 'search_query');
- // Generate embedding
- const output = await this.pipeline(preparedText, {
- pooling: 'mean',
- normalize: true,
- });
- // Extract the embedding array - handle various data formats
- const data = output.data as unknown;
- const embedding = this.toFloat32Array(data);
- return {
- embedding,
- dimension: embedding.length,
- model: this.modelId,
- };
- }
- /**
- * Generate embeddings for multiple texts in a batch
- *
- * @param texts - Array of texts to embed
- * @param type - Type of text (document or search_query)
- * @returns Batch embedding result
- */
- async embedBatch(
- texts: string[],
- type: 'document' | 'search_query' = 'document'
- ): Promise<BatchEmbeddingResult> {
- if (!this.initialized || !this.pipeline) {
- throw new Error('Embedder not initialized. Call initialize() first.');
- }
- if (texts.length === 0) {
- return {
- embeddings: [],
- dimension: EMBEDDING_DIMENSION,
- model: this.modelId,
- durationMs: 0,
- };
- }
- const startTime = Date.now();
- // Prepare all texts
- const preparedTexts = texts.map((t) => this.prepareText(t, type));
- // Generate embeddings
- const outputs = await this.pipeline(preparedTexts, {
- pooling: 'mean',
- normalize: true,
- });
- // Extract embeddings
- const embeddings: Float32Array[] = [];
- const dims = outputs.dims as number[];
- const dimension = dims[1] ?? EMBEDDING_DIMENSION;
- const data = outputs.data as unknown;
- const flatData = this.toFloat32Array(data);
- for (let i = 0; i < texts.length; i++) {
- const start = i * dimension;
- const end = start + dimension;
- embeddings.push(flatData.slice(start, end));
- }
- return {
- embeddings,
- dimension,
- model: this.modelId,
- durationMs: Date.now() - startTime,
- };
- }
- /**
- * Convert various array formats to Float32Array
- */
- private toFloat32Array(data: unknown): Float32Array {
- if (data instanceof Float32Array) {
- return data;
- }
- if (Array.isArray(data)) {
- return new Float32Array(data);
- }
- if (data && typeof data === 'object' && 'length' in data) {
- // Handle TypedArray-like objects
- const arr = data as ArrayLike<number>;
- return Float32Array.from(Array.from(arr));
- }
- throw new Error('Unsupported data format for embedding');
- }
- /**
- * Prepare text for the nomic-embed-text model
- *
- * The model expects specific prefixes for different tasks:
- * - "search_document: " for documents to be searched
- * - "search_query: " for search queries
- */
- private prepareText(text: string, type: 'document' | 'search_query'): string {
- // Truncate very long texts (model has a max token limit)
- const maxLength = 8192; // nomic-embed-text-v1.5 supports 8192 tokens
- const truncatedText = text.length > maxLength ? text.slice(0, maxLength) : text;
- // Add appropriate prefix
- if (type === 'search_query') {
- return `search_query: ${truncatedText}`;
- } else {
- return `search_document: ${truncatedText}`;
- }
- }
- /**
- * Create text representation of a code node for embedding
- *
- * Combines name, signature, docstring, and code snippet into
- * a searchable text representation.
- */
- static createNodeText(node: {
- name: string;
- kind: string;
- qualifiedName?: string;
- signature?: string;
- docstring?: string;
- filePath: string;
- }): string {
- const parts: string[] = [];
- // Add kind and name
- parts.push(`${node.kind}: ${node.name}`);
- // Add qualified name if different from name
- if (node.qualifiedName && node.qualifiedName !== node.name) {
- parts.push(`path: ${node.qualifiedName}`);
- }
- // Add file path
- parts.push(`file: ${node.filePath}`);
- // Add signature if present
- if (node.signature) {
- parts.push(`signature: ${node.signature}`);
- }
- // Add docstring if present
- if (node.docstring) {
- parts.push(`documentation: ${node.docstring}`);
- }
- return parts.join('\n');
- }
- /**
- * Compute cosine similarity between two embeddings
- */
- static cosineSimilarity(a: Float32Array, b: Float32Array): number {
- if (a.length !== b.length) {
- throw new Error('Embeddings must have the same dimension');
- }
- let dotProduct = 0;
- let normA = 0;
- let normB = 0;
- for (let i = 0; i < a.length; i++) {
- const aVal = a[i]!;
- const bVal = b[i]!;
- dotProduct += aVal * bVal;
- normA += aVal * aVal;
- normB += bVal * bVal;
- }
- normA = Math.sqrt(normA);
- normB = Math.sqrt(normB);
- if (normA === 0 || normB === 0) {
- return 0;
- }
- return dotProduct / (normA * normB);
- }
- /**
- * Release resources
- */
- dispose(): void {
- this.pipeline = null;
- this.initialized = false;
- }
- }
- /**
- * Create a text embedder instance
- */
- export function createEmbedder(options?: EmbedderOptions): TextEmbedder {
- return new TextEmbedder(options);
- }
|