23RAG7

Sleeping

App Files Files Community

cb1716pics committed on Feb 20

Commit

1b04b96

verified ·

1 Parent(s): 00dc0db

Upload 7 files

Browse files

Files changed (8) hide show

.gitattributes +2 -0
app.py +13 -2
data_local/rag7_docs.json +3 -0
data_local/rag7_index.faiss +3 -0
data_processing.py +66 -0
evaluation.py +76 -0
generator.py +29 -0
retrieval.py +34 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data_local/rag7_docs.json filter=lfs diff=lfs merge=lfs -text
+data_local/rag7_index.faiss filter=lfs diff=lfs merge=lfs -text

app.py CHANGED Viewed

@@ -1,15 +1,26 @@
 import streamlit as st
 # Page Title
 st.title("RAG7 - Real World RAG System")
 # Question Section
 st.subheader("Hi, What do you want to know today?")
 question = st.text_area("Enter your question:", placeholder="Type your question here...", height=100)
 # Submit Button
 if st.button("Submit"):
-    response = f"Response for: {question}"  # Placeholder response
 else:
     response = ""
@@ -24,7 +35,7 @@ col1, col2 = st.columns([1, 3])  # Creating two columns for button and metrics d
 with col1:
     if st.button("Calculate Metrics"):
-        metrics = "Accuracy: 90%\nLatency: 50ms\nRelevance Score: 0.85"  # Placeholder metrics
     else:
         metrics = ""

 import streamlit as st
+from generator import generate_response_from_document
+from retrieval import retrieve_documents
+from evaluation import calculate_metrics
+from data_processing import load_data_from_faiss, ragbench
 # Page Title
 st.title("RAG7 - Real World RAG System")
+@st.cache_data
+def load_data():
+    load_data_from_faiss()
+data_status = load_data()
 # Question Section
 st.subheader("Hi, What do you want to know today?")
 question = st.text_area("Enter your question:", placeholder="Type your question here...", height=100)
 # Submit Button
 if st.button("Submit"):
+    retrieved_documents = retrieve_documents(question, 5)
+    response = generate_response_from_document(question, retrieved_documents)
 else:
     response = ""
 with col1:
     if st.button("Calculate Metrics"):
+        metrics = calculate_metrics(question, response, retrieved_documents, ragbench)
     else:
         metrics = ""

data_local/rag7_docs.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ffab81dea126b119f8cd0866da062a433c4bd5d24aedd06c0dcbc3bcbd9a4433
+size 409081045

data_local/rag7_index.faiss ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:70a478fdf54591eb877f2ae772b3b2cc2ca85ed4a29ae174c72287f5eeb28b63
+size 146505261

data_processing.py ADDED Viewed

	@@ -0,0 +1,66 @@

+import numpy as np
+from langchain.vectorstores import faiss
+from langchain.embeddings import HuggingFaceEmbeddings
+from sentence_transformers import SentenceTransformer
+from datasets import load_dataset
+import torch
+import json
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# Load embedding model
+embedding_model = HuggingFaceEmbeddings(
+    model_name="paraphrase-MiniLM-L3-v2",
+    model_kwargs={"device": device}
+)
+all_documents = []
+index = None
+actual_docs = None
+def create_faiss_index_file():
+   for dataset in ['covidqa', 'cuad', 'delucionqa', 'emanual', 'expertqa',
+                'finqa', 'hagrid', 'hotpotqa', 'msmarco', 'pubmedqa',
+                'tatqa', 'techqa']:
+    ragbench_dataset = load_dataset("rungalileo/ragbench", dataset)
+    for split in ragbench_dataset.keys():
+        for row in ragbench_dataset[split]:
+            doc = row["documents"]
+            if isinstance(doc, list):
+                doc = " ".join(doc)
+            all_documents.append(doc)
+     # Convert to embeddings
+    embeddings = embedding_model.embed_documents(all_documents)
+    # Convert embeddings to a NumPy array
+    embeddings_np = np.array(embeddings, dtype=np.float32)
+    # Store in FAISS using the NumPy array's shape
+    index = faiss.IndexFlatL2(embeddings_np.shape[1])
+    index.add(embeddings_np)
+    # Save FAISS index
+    faiss.write_index(index, f"data_local\rag7_index.faiss")
+    # Save documents in JSON (metadata storage)
+    with open(f"data_local\rag7_docs.json", "w") as f:
+        json.dump(all_documents, f)
+    print(f"data is stored!")
+def load_data_from_faiss():
+    load_faiss()
+    load_metatdata()
+def load_faiss():
+    # Load the correct FAISS index
+    faiss_index_path = f"data_local\rag7_index.faiss"
+    index = faiss.read_index(faiss_index_path)
+def load_metatdata():
+    # Load document metadata
+    with open(f"data_local\rag7_docs.json", "r") as f:
+        actual_docs = json.load(f)  # Contains all documents for this dataset

evaluation.py ADDED Viewed

	@@ -0,0 +1,76 @@

+import numpy as np
+from sklearn.metrics import mean_squared_error, roc_auc_score
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+ground_truth_answer = ''
+ground_truth_metrics = {}
+def calculate_metrics(question, response, docs,data):
+    retrieve_ground_truths(question,data)
+    # Predicted metrics
+    predicted_metrics = {
+        "context_relevance": context_relevance(question, docs),
+        "context_utilization": context_utilization(response, docs),
+        "completeness": completeness(response, ground_truth_answer),
+        "adherence": adherence(response, docs)
+    }
+    return predicted_metrics
+def retrieve_ground_truths(question,ragbench):
+    # Iterate through all splits (train, test, validation)
+    for dataset_name in ragbench.keys():
+        for split_name,instances in ragbench[dataset_name].items(): # Fixed: Removed extra '.' and corrected indentation
+            print(f"Processing {split_name} split")
+            for instance in instances: # Fixed: Corrected indentation
+                # Check if the question (data) matches the query
+                if instance['question'] == question:
+                    # If a match is found, retrieve id and response
+                    instance_id = instance['id']
+                    instance_response = instance['response']
+                    ground_truth_metrics = {
+                        "context_relevance": instance['relevance_score'],
+                        "context_utilization": instance['utilization_score'],
+                        "completeness": instance['completeness_score'],
+                        "adherence": instance['adherence_score']
+                    }
+                    ground_truth_answer = instance_response
+                    print(f"Match found in {split_name} split!")
+                    print(f"ID: {instance_id}, Response: {instance_response}")
+                    break  # Exit after finding the first match (optional)
+# Step 1: Helper function to compute cosine similarity
+def compute_cosine_similarity(text1, text2):
+    vectorizer = TfidfVectorizer()
+    vectors = vectorizer.fit_transform([text1, text2])
+    return cosine_similarity(vectors[0], vectors[1])[0][0]
+# Step 2: Metric 1 - Context Relevance
+def context_relevance(question, relevant_documents):
+    combined_docs = " ".join([doc.page_content for doc in relevant_documents])
+    return compute_cosine_similarity(question, combined_docs)
+# Step 3: Metric 2 - Context Utilization
+def context_utilization(response, relevant_documents):
+    combined_docs = " ".join([doc.page_content for doc in relevant_documents])
+    return compute_cosine_similarity(response, combined_docs)
+# Step 4: Metric 3 - Completeness
+def completeness(response, ground_truth_answer):
+    return compute_cosine_similarity(response, ground_truth_answer)
+# Step 5: Metric 4 - Adherence
+def adherence(response, relevant_documents):
+    combined_docs = " ".join([doc.page_content for doc in relevant_documents])
+    response_tokens = set(response.split())
+    relevant_tokens = set(combined_docs.split())
+    supported_tokens = response_tokens.intersection(relevant_tokens)
+    return len(supported_tokens) / len(response_tokens)
+# Step 6: Compute RMSE for metrics
+def compute_rmse(predicted_values, ground_truth_values):
+    return np.sqrt(mean_squared_error(ground_truth_values, predicted_values))

generator.py ADDED Viewed

	@@ -0,0 +1,29 @@

+import openai
+import time
+import os
+# Set your OpenAI API key
+openai.api_key = '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'
+def generate_response_from_document(query, retrieved_docs):
+    if not retrieved_docs:  # Check if no documents were retrieved
+        return "I cannot answer the question due to insufficient information in the documents."
+    context = " ".join([doc.page_content for doc in retrieved_docs]) # Now iterates over Document objects
+    prompt = (
+            "You are a highly intelligent assistant tasked with answering a question based strictly on the provided context. "
+            f"Given Question: {query} \n\n"
+            f"Context: {context} \n"
+            "Answer the question directly and concisely using only the information available in the context."
+        )
+    try:
+        response = openai.chat.completions.create( # Use the new chat completions API
+            model= "gpt-3.5-turbo", #"gpt-4", #"gpt-3.5-turbo"   Or use another suitable model like gpt-4
+            messages=[{"role": "user", "content": prompt}],
+            max_tokens=300,
+            temperature=0.7,
+        )
+        return response.choices[0].message.content.strip() # Extract content from message
+    except Exception as e:
+        return f"Error generating response: {str(e)}"

retrieval.py ADDED Viewed

	@@ -0,0 +1,34 @@

+import json
+import numpy as np
+from langchain.schema import Document
+from langchain.vectorstores import faiss
+from data_processing import embedding_model, index
+# Retrieval Function
+def retrieve_documents(query, top_k=5):
+    # Embed the query
+    query_embedding = np.array(embedding_model.embed_documents([query]), dtype=np.float32)
+    # Search in FAISS (top 5 results)
+    _, nearest_indices = index.search(query_embedding, top_k)
+    # Load document metadata
+    with open(f"data_local\rag7_docs.json", "r") as f:
+        documents = json.load(f)  # Contains all documents for this dataset
+    # Retrieve the actual documents and create Document objects
+    retrieved_docs = [Document(page_content=documents[i]) for i in nearest_indices[0]]
+    return retrieved_docs
+def remove_duplicate_documents(documents):
+    unique_documents = []
+    seen_documents = set()  # To keep track of seen documents
+    for doc in documents:
+        # Using the page_content as a unique identifier for deduplication
+        doc_content = doc.page_content
+        if doc_content not in seen_documents:
+            unique_documents.append(doc)
+            seen_documents.add(doc_content)
+    return unique_documents