"""Build and load a FAISS index over the documents of the RAGBench subsets."""

import json
import os

import numpy as np
import torch
from datasets import load_dataset
from langchain.embeddings import HuggingFaceEmbeddings
# NOTE(review): the functions below call IndexFlatL2 / write_index /
# read_index, which are native-FAISS APIs. This import binds LangChain's
# wrapper module instead -- confirm whether a plain `import faiss` was intended.
from langchain.vectorstores import faiss
from sentence_transformers import SentenceTransformer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load embedding model
embedding_model = HuggingFaceEmbeddings(
    model_name="paraphrase-MiniLM-L3-v2",
    model_kwargs={"device": device},
)

# Subsets of "rungalileo/ragbench" whose documents get indexed.
RAGBENCH_SUBSETS = (
    "covidqa",
    "cuad",
    "delucionqa",
    "emanual",
    "expertqa",
    "finqa",
    "hagrid",
    "hotpotqa",
    "msmarco",
    "pubmedqa",
    "tatqa",
    "techqa",
)

# Build the paths with os.path.join: the former literal "data_local\rag7_..."
# contained "\r", which Python interprets as a carriage return.
DATA_DIR = "data_local"
INDEX_PATH = os.path.join(DATA_DIR, "rag7_index.faiss")
DOCS_PATH = os.path.join(DATA_DIR, "rag7_docs.json")

# Module-level state: filled by create_faiss_index_file() / the load_* helpers.
all_documents = []
index = None
actual_docs = None


def create_faiss_index_file():
    """Embed every RAGBench document and persist the index and the texts.

    For each subset in ``RAGBENCH_SUBSETS`` and each of its splits, the
    ``"documents"`` field of every row is collected into ``all_documents``
    (list values are joined with single spaces). The texts are embedded with
    ``embedding_model``, added to a ``faiss.IndexFlatL2`` index, and the index
    is written to ``INDEX_PATH``; the document texts are written as JSON to
    ``DOCS_PATH``.
    """
    # Start from an empty list so a repeated call does not index duplicates.
    all_documents.clear()

    for subset in RAGBENCH_SUBSETS:
        ragbench_dataset = load_dataset("rungalileo/ragbench", subset)
        for split in ragbench_dataset.keys():
            for row in ragbench_dataset[split]:
                doc = row["documents"]
                if isinstance(doc, list):
                    doc = " ".join(doc)
                all_documents.append(doc)

    # Convert to embeddings, then to a float32 NumPy array
    embeddings = embedding_model.embed_documents(all_documents)
    embeddings_np = np.array(embeddings, dtype=np.float32)

    # Store in FAISS; the index dimension is the embedding width
    built_index = faiss.IndexFlatL2(embeddings_np.shape[1])
    built_index.add(embeddings_np)

    # Make sure the output directory exists before writing into it
    os.makedirs(DATA_DIR, exist_ok=True)

    # Save FAISS index
    faiss.write_index(built_index, INDEX_PATH)

    # Save documents in JSON (metadata storage)
    with open(DOCS_PATH, "w", encoding="utf-8") as f:
        json.dump(all_documents, f)

    print("data is stored!")


def load_data_from_faiss():
    """Load both the FAISS index and the document list into module state."""
    load_faiss()
    load_metatdata()


def load_faiss():
    """Read the index from ``INDEX_PATH`` into the module-level ``index``.

    Returns:
        The loaded index object (also stored in ``index``).
    """
    # Without this declaration the assignment would only create a local
    # variable and the module-level ``index`` would stay None.
    global index
    index = faiss.read_index(INDEX_PATH)
    return index


def load_metatdata():
    """Read the documents from ``DOCS_PATH`` into module-level ``actual_docs``.

    Returns:
        The decoded JSON content (also stored in ``actual_docs``).
    """
    # Same reason as in load_faiss(): bind the module-level name.
    global actual_docs
    with open(DOCS_PATH, "r", encoding="utf-8") as f:
        actual_docs = json.load(f)  # Contains all documents for this dataset
    return actual_docs