23RAG7

Sleeping

App Files Files Community

cb1716pics committed on Feb 20

Commit

99afa50

verified ·

1 Parent(s): 192559e

Upload 4 files

Browse files

Files changed (4) hide show

app.py +3 -0
data_processing.py +66 -41
requirements.txt +2 -1
retrieval.py +12 -1

app.py CHANGED Viewed

@@ -8,6 +8,9 @@ import time
 # Page Title
 st.title("RAG7 - Real World RAG System")
 # @st.cache_data
 # def load_data():
 #     load_data_from_faiss()

 # Page Title
 st.title("RAG7 - Real World RAG System")
# Documents returned by the most recent retrieval. The name is bound at module
# scope, which already makes it a module global, so no `global` statement is
# needed here (at module level that statement has no effect).
# NOTE(review): if this list is meant to survive Streamlit script reruns it
# would have to live in st.session_state instead - confirm the intent.
retrieved_documents = []
 # @st.cache_data
 # def load_data():
 #     load_data_from_faiss()

data_processing.py CHANGED Viewed

@@ -1,60 +1,78 @@
-import numpy as np
 import faiss
-from langchain.embeddings import HuggingFaceEmbeddings
-from sentence_transformers import SentenceTransformer
-from datasets import load_dataset
 import torch
 import json
 import os
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 # Load embedding model
 embedding_model = HuggingFaceEmbeddings(
-    model_name="paraphrase-MiniLM-L3-v2",
     model_kwargs={"device": device}
 )
 all_documents = []
 ragbench = {}
 index = None
-actual_docs = []
 # Ensure data directory exists
 os.makedirs("data_local", exist_ok=True)
-def create_faiss_index_file():
-    global index  # Ensure we use the global FAISS index
-    all_documents.clear()  # Reset document list
-    for dataset in ['covidqa', 'cuad', 'delucionqa', 'emanual', 'expertqa',
-                    'finqa', 'hagrid', 'hotpotqa', 'msmarco', 'pubmedqa',
-                    'tatqa', 'techqa']:
-        ragbench_dataset = load_dataset("rungalileo/ragbench", dataset)
-        for split in ragbench_dataset.keys():
-            for row in ragbench_dataset[split]:
-                doc = row["documents"]
-                if isinstance(doc, list):
-                    doc = " ".join(doc)  # Convert list to string if needed
-                all_documents.append(doc)
-    # Convert documents to embeddings
-    embeddings = embedding_model.embed_documents(all_documents)
     embeddings_np = np.array(embeddings, dtype=np.float32)
-    # Initialize and store in FAISS
-    index = faiss.IndexFlatL2(embeddings_np.shape[1])
-    index.add(embeddings_np)
     # Save FAISS index
-    faiss.write_index(index, "data_local/rag7_index.faiss")
-    # Save documents metadata
-    with open("data_local/rag7_docs.json", "w") as f:
-        json.dump(all_documents, f)
-    print("FAISS index and metadata saved successfully!")
 def load_ragbench():
     global ragbench
@@ -64,26 +82,33 @@ def load_ragbench():
                     'tatqa', 'techqa']:
         ragbench[dataset] = load_dataset("rungalileo/ragbench", dataset)
-def load_faiss():
     global index
-    faiss_index_path = "data_local/rag7_index.faiss"
     if os.path.exists(faiss_index_path):
         index = faiss.read_index(faiss_index_path)
         print("FAISS index loaded successfully.")
     else:
-        print("FAISS index file not found. Run create_faiss_index_file() first.")
-def load_metadata():
-    global actual_docs
-    metadata_path = "data_local/rag7_docs.json"
     if os.path.exists(metadata_path):
         with open(metadata_path, "r") as f:
-            actual_docs = json.load(f)
         print("Metadata loaded successfully.")
     else:
         print("Metadata file not found. Run create_faiss_index_file() first.")
-def load_data_from_faiss():
-    load_faiss()
-    load_metadata()
-    #return index, actual_docs

 import faiss
 import torch
 import json
 import os
+import numpy as np
+from sentence_transformers import SentenceTransformer
+from langchain.vectorstores import FAISS
+from langchain.embeddings import HuggingFaceEmbeddings
+from datasets import load_dataset
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from sentence_transformers import CrossEncoder
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 # Load embedding model
 embedding_model = HuggingFaceEmbeddings(
+    model_name="all-MiniLM-L12-v2",
     model_kwargs={"device": device}
 )
+reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
 all_documents = []
 ragbench = {}
 index = None
+chunk_docs = []
+documents = []
 # Ensure data directory exists
 os.makedirs("data_local", exist_ok=True)
+# Initialize a text splitter
+text_splitter = RecursiveCharacterTextSplitter(
+    chunk_size=1024,
+    chunk_overlap=100
+)
def chunk_documents(docs):
    """Split every text in *docs* and return all pieces as one flat list.

    Each document is passed to the module-level ``text_splitter``; the
    resulting pieces are concatenated in document order.
    """
    pieces = []
    for text in docs:
        pieces.extend(text_splitter.split_text(text))
    return pieces
def create_faiss_index(dataset):
    """Build and save a FAISS index plus chunk texts for one RAGBench subset.

    Every row of every split of ``rungalileo/ragbench`` / *dataset*
    contributes its ``documents`` field (a list is joined with spaces into a
    single string). The texts are split with ``chunk_documents``, embedded
    with ``embedding_model`` and added to an HNSW index.

    Two files are written to the current working directory:
    ``{dataset}_chunked_docs.json`` (chunk texts, in the same order as the
    index rows) and ``{dataset}_chunked_index.faiss``.

    Args:
        dataset: Name of the RAGBench configuration to load, e.g. "covidqa".

    Raises:
        ValueError: If the dataset produces no chunks, so there is nothing
            to index.
    """
    # NOTE(review): load_faiss and load_chunks read from "data_local/", and
    # load_faiss expects a "_quantized.faiss" file name. The files written
    # here use neither - confirm how they are moved/converted.
    ragbench_dataset = load_dataset("rungalileo/ragbench", dataset)

    # `documents` is module-level state. Reset it so that a second call does
    # not also index the texts collected for a previously processed dataset.
    documents.clear()
    for split in ragbench_dataset.keys():
        for row in ragbench_dataset[split]:
            doc = row["documents"]
            if isinstance(doc, list):
                # Join list entries so every appended item is a single string.
                doc = " ".join(doc)
            documents.append(doc)

    chunked_documents = chunk_documents(documents)
    if not chunked_documents:
        # Fail before writing any file; an empty embedding array has no
        # second dimension to size the index with.
        raise ValueError(
            f"No documents found for dataset {dataset!r}; nothing to index."
        )

    # Save the chunk texts (metadata storage) in index order.
    with open(f"{dataset}_chunked_docs.json", "w") as f:
        json.dump(chunked_documents, f)
    print(len(chunked_documents))

    # Embed the chunks and convert to the float32 matrix FAISS expects.
    embeddings = embedding_model.embed_documents(chunked_documents)
    embeddings_np = np.array(embeddings, dtype=np.float32)

    # HNSW index; 32 is M, the number of neighbours linked per graph node.
    index = faiss.IndexHNSWFlat(embeddings_np.shape[1], 32)
    index.add(embeddings_np)
    faiss.write_index(index, f"{dataset}_chunked_index.faiss")
    print(f"{dataset} stored as individual FAISS index!")
 def load_ragbench():
     global ragbench
                     'tatqa', 'techqa']:
         ragbench[dataset] = load_dataset("rungalileo/ragbench", dataset)
def load_faiss(query_dataset):
    """Load the FAISS index for *query_dataset* into the module-level ``index``.

    Reads ``data_local/{query_dataset}_quantized.faiss``. If that file does
    not exist, a message naming the missing path is printed and ``index`` is
    left unchanged.

    Args:
        query_dataset: Dataset name used to build the index file name.
    """
    global index
    faiss_index_path = f"data_local/{query_dataset}_quantized.faiss"
    if not os.path.exists(faiss_index_path):
        # Report the exact path: no function in this file is named
        # create_faiss_index_file(), so pointing at it would mislead.
        print(f"FAISS index file not found: {faiss_index_path}")
        return
    index = faiss.read_index(faiss_index_path)
    print("FAISS index loaded successfully.")
def load_chunks(query_dataset):
    """Load the chunk texts for *query_dataset* into module-level ``chunk_docs``.

    Reads the JSON file ``data_local/{query_dataset}_chunked_docs.json``. If
    that file does not exist, a message naming the missing path is printed
    and ``chunk_docs`` is left unchanged.

    Args:
        query_dataset: Dataset name used to build the metadata file name.
    """
    global chunk_docs
    metadata_path = f"data_local/{query_dataset}_chunked_docs.json"
    if not os.path.exists(metadata_path):
        # Report the exact path: no function in this file is named
        # create_faiss_index_file(), so pointing at it would mislead.
        print(f"Metadata file not found: {metadata_path}")
        return
    with open(metadata_path, "r", encoding="utf-8") as f:
        chunk_docs = json.load(f)
    print("Metadata loaded successfully.")
def load_data_from_faiss(query_dataset):
    """Load the FAISS index and then the chunk texts for *query_dataset*.

    Both loaders store their result in module-level state; nothing is
    returned from here.
    """
    for loader in (load_faiss, load_chunks):
        loader(query_dataset)
def rerank_documents(query, retrieved_docs, top_k=5):
    """Re-order *retrieved_docs* by cross-encoder relevance to *query*.

    Args:
        query: The query text each document is scored against.
        retrieved_docs: Candidate documents; each is passed to the reranker
            unchanged, paired with *query*.
        top_k: Maximum number of documents to return (default 5).

    Returns:
        A list of at most *top_k* documents, highest score first. Documents
        with equal scores keep their original relative order. An empty
        input yields an empty list without calling the model.
    """
    candidates = list(retrieved_docs)
    if not candidates:
        return []

    scores = reranker.predict([[query, doc] for doc in candidates])

    # Sort on the score only. Comparing whole (score, doc) tuples would fall
    # back to comparing the documents themselves whenever two scores tie,
    # which raises TypeError for documents that are not orderable.
    ranked = sorted(
        zip(scores, candidates),
        key=lambda pair: pair[0],
        reverse=True,
    )
    return [doc for _, doc in ranked[:top_k]]

requirements.txt CHANGED Viewed

@@ -14,4 +14,5 @@ rank_bm25
 nltk
 requests
 rouge-score
-numpy

 nltk
 requests
 rouge-score
+numpy
+rank_bm25

retrieval.py CHANGED Viewed

@@ -33,4 +33,15 @@ def remove_duplicate_documents(documents):
         if doc_content not in seen_documents:
             unique_documents.append(doc)
             seen_documents.add(doc_content)
-    return unique_documents

         if doc_content not in seen_documents:
             unique_documents.append(doc)
             seen_documents.add(doc_content)
+    return unique_documents
def find_query_dataset(query):
    """Return the dataset name whose indexed question is nearest to *query*.

    On every call this reads ``question_index.faiss`` and
    ``dataset_mapping.json`` from the current working directory, embeds
    *query* with ``embedding_model`` and searches the index for the single
    nearest neighbour.

    Args:
        query: The question text to route.

    Returns:
        The entry of ``dataset_mapping.json`` at the nearest neighbour's
        position.

    Raises:
        ValueError: If the index returns no neighbour for the query.
    """
    index = faiss.read_index("question_index.faiss")
    with open("dataset_mapping.json", "r", encoding="utf-8") as f:
        # assumes this is a list aligned with the index rows - TODO confirm
        dataset_names = json.load(f)

    question_embedding = np.array(
        embedding_model.embed_documents([query]), dtype=np.float32
    )
    _, nearest_index = index.search(question_embedding, 1)

    # FAISS reports "no result" as label -1. Used as a list index that would
    # silently select the last dataset, so reject it explicitly.
    position = int(nearest_index[0][0])
    if position < 0:
        raise ValueError("question_index.faiss returned no match for the query")
    return dataset_names[position]