Spaces:

cb1716pics
/

23RAG7

Sleeping

App Files Files Community

cb1716pics committed on Feb 20

Commit

d346441

verified ·

1 Parent(s): a523549

Upload data_processing.py

Browse files

Files changed (1) hide show

data_processing.py +55 -37

data_processing.py CHANGED Viewed

@@ -5,6 +5,7 @@ from sentence_transformers import SentenceTransformer
 from datasets import load_dataset
 import torch
 import json
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -16,56 +17,73 @@ embedding_model = HuggingFaceEmbeddings(
 all_documents = []
 ragbench = {}
 def create_faiss_index_file():
-   for dataset in ['covidqa', 'cuad', 'delucionqa', 'emanual', 'expertqa',
-                'finqa', 'hagrid', 'hotpotqa', 'msmarco', 'pubmedqa',
-                'tatqa', 'techqa']:
-    ragbench_dataset = load_dataset("rungalileo/ragbench", dataset)
-    for split in ragbench_dataset.keys():
-        for row in ragbench_dataset[split]:
-            doc = row["documents"]
-            if isinstance(doc, list):
-                doc = " ".join(doc)
-            all_documents.append(doc)
-     # Convert to embeddings
-    embeddings = embedding_model.embed_documents(all_documents)
-    # Convert embeddings to a NumPy array
-    embeddings_np = np.array(embeddings, dtype=np.float32)
-    global index_w
-    # Store in FAISS using the NumPy array's shape
-    index_w = faiss.IndexFlatL2(embeddings_np.shape[1])
-    index_w.add(embeddings_np)
     # Save FAISS index
-    faiss.write_index(index, f"data_local/rag7_index.faiss")
-    # Save documents in JSON (metadata storage)
-    with open(f"data_local/rag7_docs.json", "w") as f:
         json.dump(all_documents, f)
-    print(f"data is stored!")
-def load_data_from_faiss():
-    load_faiss()
-    load_metatdata()
 def load_ragbench():
-    ragbench = {}
-    for dataset in ['covidqa', 'cuad', 'delucionqa', 'emanual', 'expertqa', 'finqa', 'hagrid', 'hotpotqa', 'msmarco', 'pubmedqa', 'tatqa', 'techqa']:
         ragbench[dataset] = load_dataset("rungalileo/ragbench", dataset)
-def load_faiss():
     global index
-    faiss_index_path = f"data_local/rag7_index.faiss"
-    index = faiss.read_index(faiss_index_path)
-def load_metatdata():
     global actual_docs
-    with open(f"data_local/rag7_docs.json", "r") as f:
-        actual_docs = json.load(f)  # Contains all documents for this dataset

 from datasets import load_dataset
 import torch
 import json
+import os
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 all_documents = []
 ragbench = {}
+index = None
+actual_docs = []
+# Ensure data directory exists
+os.makedirs("data_local", exist_ok=True)
 def create_faiss_index_file():
     """Rebuild the RAGBench FAISS index and its document list on disk.

     Every row of every split of each listed RAGBench subset contributes one
     entry taken from its ``documents`` field; list values are joined with
     single spaces.  The entries are embedded with the module-level
     ``embedding_model`` and stored in a flat L2 FAISS index.

     Side effects: refills the module-level ``all_documents`` list in place,
     rebinds the module-level ``index``, overwrites
     ``data_local/rag7_index.faiss`` and ``data_local/rag7_docs.json``, and
     prints a confirmation line.
     """
     global index

     subset_names = (
         'covidqa', 'cuad', 'delucionqa', 'emanual', 'expertqa', 'finqa',
         'hagrid', 'hotpotqa', 'msmarco', 'pubmedqa', 'tatqa', 'techqa',
     )

     # Emptied in place (not rebound) so existing references to the list
     # observe the rebuilt content.
     all_documents.clear()
     for subset in subset_names:
         splits = load_dataset("rungalileo/ragbench", subset)
         for rows in splits.values():
             for record in rows:
                 text = record["documents"]
                 if isinstance(text, list):
                     text = " ".join(text)
                 all_documents.append(text)

     # FAISS is fed a float32 matrix; its second dimension is the embedding width.
     vectors = np.array(
         embedding_model.embed_documents(all_documents), dtype=np.float32
     )
     index = faiss.IndexFlatL2(vectors.shape[1])
     index.add(vectors)

     # Persist the index and the document texts side by side.
     faiss.write_index(index, "data_local/rag7_index.faiss")
     with open("data_local/rag7_docs.json", "w") as f:
         json.dump(all_documents, f)

     print("FAISS index and metadata saved successfully!")
 def load_ragbench():
     """Reload every listed RAGBench subset into the module-level ``ragbench``.

     The dict is emptied and then filled in place, keyed by subset name, with
     whatever ``load_dataset("rungalileo/ragbench", name)`` returns.  Nothing
     is returned.
     """
     subset_names = (
         'covidqa', 'cuad', 'delucionqa', 'emanual', 'expertqa', 'finqa',
         'hagrid', 'hotpotqa', 'msmarco', 'pubmedqa', 'tatqa', 'techqa',
     )

     # In-place mutation only, so no ``global`` declaration is required.
     ragbench.clear()
     for name in subset_names:
         ragbench[name] = load_dataset("rungalileo/ragbench", name)
def load_faiss():
    """Load the saved FAISS index into the module-level ``index``.

    Reads ``data_local/rag7_index.faiss`` when that path exists.  When it
    does not, a hint is printed and ``index`` is left untouched; no exception
    is raised for the missing file.
    """
    global index

    faiss_index_path = "data_local/rag7_index.faiss"
    if not os.path.exists(faiss_index_path):
        print("FAISS index file not found. Run create_faiss_index_file() first.")
        return

    index = faiss.read_index(faiss_index_path)
    print("FAISS index loaded successfully.")
def load_metadata():
    """Load the saved document list into the module-level ``actual_docs``.

    Reads ``data_local/rag7_docs.json`` and binds the decoded JSON value to
    ``actual_docs``.  If the file does not exist, a hint is printed and
    ``actual_docs`` is left untouched; no exception is raised for that case.
    Any other I/O error, and any JSON decoding error, propagates to the
    caller.
    """
    global actual_docs

    metadata_path = "data_local/rag7_docs.json"
    try:
        # Decode as UTF-8 explicitly: open() otherwise falls back to the
        # locale's preferred encoding, which can mis-decode non-ASCII text.
        with open(metadata_path, "r", encoding="utf-8") as f:
            loaded = json.load(f)
    except FileNotFoundError:
        # Attempt the open and handle the miss, rather than checking
        # existence first and racing against the file being removed.
        print("Metadata file not found. Run create_faiss_index_file() first.")
        return

    actual_docs = loaded
    print("Metadata loaded successfully.")
def load_data_from_faiss():
    """Load the saved FAISS index, then the saved document list.

    Delegates to ``load_faiss()`` and ``load_metadata()`` in that order and
    returns nothing.
    """
    load_faiss()
    load_metadata()