""" This script handles document embedding using EmbeddingGemma. This is the entry point for indexing documents. """ from sentence_transformers import SentenceTransformer import numpy as np import faiss import os import pickle from typing import List, Tuple def embed_documents(path: str, config: dict) -> List[Tuple[str, np.ndarray]]: """ Embed documents from a directory and save to FAISS index. Args: path (str): Path to the directory containing the documents to embed. config (dict): Configuration dictionary. Returns: List of tuples containing (filename, embedding) """ try: model = SentenceTransformer(config["embedding"]["model_path"]) print( f"Initialized embedding model: {config['embedding']['model_path']}") except Exception as e: # Changed to catch broader exception print(f"Error initializing embedding model: {e}") return [] embeddings = [] texts = [] filenames = [] # Read all documents for fname in os.listdir(path): fpath = os.path.join(path, fname) if os.path.isfile(fpath): try: # Try different encodings to handle various file types for encoding in ['utf-8', 'latin-1', 'cp1252']: try: with open(fpath, "r", encoding=encoding) as f: text = f.read() break except UnicodeDecodeError: continue else: print( f"Could not decode file {fpath} with common encodings") continue if text.strip(): # Only process non-empty files emb = model.encode(text) # Ensure all embeddings have the same dimension if embeddings and emb.shape[0] != embeddings[0].shape[0]: print(f"Dimension mismatch in file {fname}, skipping") continue embeddings.append(emb) texts.append(text) filenames.append(fname) except Exception as e: print(f"Error processing file {fpath}: {e}") if not embeddings: print("No documents were successfully embedded.") return [] print("Embedder script started", flush=True) print(f"Documents in path: {os.listdir(path)}") print(f"Successfully processed {len(embeddings)} documents") # Create FAISS index dimension = embeddings[0].shape[0] index = faiss.IndexFlatIP(dimension) # Convert to numpy array and normalize embeddings_matrix = np.array(embeddings).astype("float32") faiss.normalize_L2(embeddings_matrix) # Normalize for cosine similarity # Add normalized embeddings to index index.add(embeddings_matrix) # Save FAISS index and metadata os.makedirs("vector_cache", exist_ok=True) faiss.write_index(index, "vector_cache/faiss_index.bin") # Save metadata with open("vector_cache/metadata.pkl", "wb") as f: pickle.dump({"texts": texts, "filenames": filenames}, f) print( f"Saved FAISS index to vector_cache/ with {len(embeddings)} documents.") print(f"Total embeddings created: {len(embeddings)}") return list(zip(filenames, embeddings)) # Example usage if __name__ == "__main__": config = { "embedding": { "model_path": "sentence-transformers/all-MiniLM-L6-v2" # Example model } } result = embed_documents("./docs", config)