Spaces:
Sleeping
Sleeping
Upload 7 files
Browse files- .gitattributes +2 -0
- app.py +13 -2
- data_local/rag7_docs.json +3 -0
- data_local/rag7_index.faiss +3 -0
- data_processing.py +66 -0
- evaluation.py +76 -0
- generator.py +29 -0
- retrieval.py +34 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
data_local/rag7_docs.json filter=lfs diff=lfs merge=lfs -text
|
37 |
+
data_local/rag7_index.faiss filter=lfs diff=lfs merge=lfs -text
|
app.py
CHANGED
@@ -1,15 +1,26 @@
|
|
1 |
import streamlit as st
|
|
|
|
|
|
|
|
|
2 |
|
3 |
# Page Title
|
4 |
st.title("RAG7 - Real World RAG System")
|
5 |
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
# Question Section
|
7 |
st.subheader("Hi, What do you want to know today?")
|
8 |
question = st.text_area("Enter your question:", placeholder="Type your question here...", height=100)
|
9 |
|
10 |
# Submit Button
|
11 |
if st.button("Submit"):
|
12 |
-
|
|
|
13 |
else:
|
14 |
response = ""
|
15 |
|
@@ -24,7 +35,7 @@ col1, col2 = st.columns([1, 3]) # Creating two columns for button and metrics d
|
|
24 |
|
25 |
with col1:
|
26 |
if st.button("Calculate Metrics"):
|
27 |
-
metrics =
|
28 |
else:
|
29 |
metrics = ""
|
30 |
|
|
|
1 |
import streamlit as st
|
2 |
+
from generator import generate_response_from_document
|
3 |
+
from retrieval import retrieve_documents
|
4 |
+
from evaluation import calculate_metrics
|
5 |
+
from data_processing import load_data_from_faiss, ragbench
|
6 |
|
7 |
# Page Title
|
8 |
st.title("RAG7 - Real World RAG System")
|
9 |
|
10 |
+
@st.cache_data
def load_data():
    """Trigger loading of the FAISS index and document store.

    Delegates to ``load_data_from_faiss`` and has no explicit return value,
    so the cached result (and therefore ``data_status`` below) is ``None``.
    """
    load_data_from_faiss()


# Runs the load when the script executes; the value itself is always None.
data_status = load_data()
|
15 |
+
|
16 |
# Question Section
st.subheader("Hi, What do you want to know today?")
question = st.text_area("Enter your question:", placeholder="Type your question here...", height=100)

# Submit Button
# Streamlit re-runs this whole script on every widget interaction, and
# st.button() is True only for the run triggered by that click. The answer and
# the retrieved documents are therefore kept in st.session_state so they are
# still defined when a later run is triggered by another widget (for example
# the "Calculate Metrics" button, which needs both values).
if st.button("Submit"):
    retrieved_documents = retrieve_documents(question, 5)
    response = generate_response_from_document(question, retrieved_documents)
    st.session_state["retrieved_documents"] = retrieved_documents
    st.session_state["response"] = response
else:
    # Nothing submitted in this run: fall back to the last submitted result,
    # or to empty values when no question has been submitted yet.
    retrieved_documents = st.session_state.get("retrieved_documents", [])
    response = st.session_state.get("response", "")
|
26 |
|
|
|
35 |
|
36 |
with col1:
|
37 |
if st.button("Calculate Metrics"):
|
38 |
+
metrics = calculate_metrics(question, response, retrieved_documents, ragbench)
|
39 |
else:
|
40 |
metrics = ""
|
41 |
|
data_local/rag7_docs.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ffab81dea126b119f8cd0866da062a433c4bd5d24aedd06c0dcbc3bcbd9a4433
|
3 |
+
size 409081045
|
data_local/rag7_index.faiss
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:70a478fdf54591eb877f2ae772b3b2cc2ca85ed4a29ae174c72287f5eeb28b63
|
3 |
+
size 146505261
|
data_processing.py
ADDED
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
from langchain.vectorstores import faiss
|
3 |
+
from langchain.embeddings import HuggingFaceEmbeddings
|
4 |
+
from sentence_transformers import SentenceTransformer
|
5 |
+
from datasets import load_dataset
|
6 |
+
import torch
|
7 |
+
import json
|
8 |
+
|
9 |
+
# Use the GPU for embedding when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load embedding model
# Shared by index construction in this module and imported by retrieval.py.
embedding_model = HuggingFaceEmbeddings(
    model_name="paraphrase-MiniLM-L3-v2",
    model_kwargs={"device": device}
)

# Document texts collected by create_faiss_index_file(); list position is the
# row position of the matching vector in the FAISS index.
all_documents = []
# Intended to hold the loaded FAISS index (retrieval.py imports this name).
index = None
# Intended to hold the document list read back from the JSON store.
actual_docs = None
|
20 |
+
|
21 |
+
|
22 |
+
def create_faiss_index_file():
    """Build the FAISS index over the RAGBench documents and write it to disk.

    For every listed RAGBench subset and every split, each row's
    ``documents`` field is flattened to a single string (list entries are
    joined with a space) and appended to the module-level ``all_documents``.
    All collected documents are then embedded with ``embedding_model``, added
    to a flat L2 index, and persisted as:

    * ``data_local/rag7_index.faiss`` - the vector index
    * ``data_local/rag7_docs.json``   - the document texts, in index order

    NOTE(review): ``faiss`` is bound by ``from langchain.vectorstores import
    faiss`` at the top of this file, while ``IndexFlatL2`` / ``write_index``
    are APIs of the standalone faiss package - confirm the import resolves
    to it.
    """
    dataset_names = [
        'covidqa', 'cuad', 'delucionqa', 'emanual', 'expertqa',
        'finqa', 'hagrid', 'hotpotqa', 'msmarco', 'pubmedqa',
        'tatqa', 'techqa',
    ]
    for dataset in dataset_names:
        ragbench_dataset = load_dataset("rungalileo/ragbench", dataset)
        for split in ragbench_dataset.keys():
            for row in ragbench_dataset[split]:
                doc = row["documents"]
                if isinstance(doc, list):
                    doc = " ".join(doc)

                all_documents.append(doc)

    # Convert to embeddings
    embeddings = embedding_model.embed_documents(all_documents)

    # Convert embeddings to a NumPy array
    embeddings_np = np.array(embeddings, dtype=np.float32)

    # Store in FAISS using the NumPy array's shape (vector dimensionality).
    # A distinct local name avoids shadowing the module-level ``index``.
    faiss_index = faiss.IndexFlatL2(embeddings_np.shape[1])
    faiss_index.add(embeddings_np)

    # Save FAISS index.
    # Forward slashes are used on purpose: in a normal string literal the
    # sequence backslash + "r" is the carriage-return escape, so a
    # backslash-separated "data_local" + "rag7_..." path would not name the
    # intended file.
    faiss.write_index(faiss_index, "data_local/rag7_index.faiss")

    # Save documents in JSON (metadata storage)
    with open("data_local/rag7_docs.json", "w") as f:
        json.dump(all_documents, f)

    print(f"data is stored!")
|
53 |
+
|
54 |
+
def load_data_from_faiss():
    """Load the persisted FAISS index and then the document store.

    Calls ``load_faiss`` followed by ``load_metatdata``; returns nothing.
    """
    load_faiss()
    load_metatdata()
|
57 |
+
|
58 |
+
def load_faiss():
    """Read the persisted FAISS index into the module-level ``index``.

    The ``global`` declaration is required: without it the assignment below
    would only create a local variable that is discarded on return, leaving
    the module-level ``index`` at ``None``.
    """
    global index
    # Load the correct FAISS index.
    # Forward slash on purpose: backslash + "r" inside a normal string literal
    # is the carriage-return escape, not a path separator.
    faiss_index_path = "data_local/rag7_index.faiss"
    index = faiss.read_index(faiss_index_path)
|
62 |
+
|
63 |
+
def load_metatdata():
    """Read the stored document texts into the module-level ``actual_docs``.

    The ``global`` declaration is required: without it the loaded list would
    be bound to a local name and thrown away when the function returns.
    """
    global actual_docs
    # Load document metadata.
    # Forward slash on purpose: backslash + "r" inside a normal string literal
    # is the carriage-return escape, not a path separator.
    with open("data_local/rag7_docs.json", "r") as f:
        actual_docs = json.load(f)  # Contains all documents for this dataset
|
evaluation.py
ADDED
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import numpy as np
|
3 |
+
from sklearn.metrics import mean_squared_error, roc_auc_score
|
4 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
5 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
6 |
+
|
7 |
+
# Reference answer for the current question; read by calculate_metrics() when
# it scores completeness.
ground_truth_answer = ''
# Reference metric scores for the current question, keyed like the predicted
# metrics dictionary returned by calculate_metrics().
ground_truth_metrics = {}
|
9 |
+
|
10 |
+
|
11 |
+
def calculate_metrics(question, response, docs,data):
    """Score a generated answer against its retrieved context.

    First looks up the ground truth for ``question`` in ``data`` via
    ``retrieve_ground_truths``, then computes the four predicted metrics.

    Args:
        question: The user question.
        response: The generated answer text.
        docs: Retrieved documents (objects exposing ``page_content``).
        data: The benchmark data searched for a matching question.

    Returns:
        dict with the keys ``context_relevance``, ``context_utilization``,
        ``completeness`` and ``adherence``.
    """
    retrieve_ground_truths(question, data)

    # Predicted metrics, filled in the same order they are reported.
    scores = {}
    scores["context_relevance"] = context_relevance(question, docs)
    scores["context_utilization"] = context_utilization(response, docs)
    scores["completeness"] = completeness(response, ground_truth_answer)
    scores["adherence"] = adherence(response, docs)
    return scores
|
21 |
+
|
22 |
+
def retrieve_ground_truths(question,ragbench):
    """Find the benchmark entry for ``question`` and publish its ground truth.

    Walks every dataset and every split of ``ragbench`` and stops at the
    first instance whose ``question`` field equals ``question``. The match is
    stored in the module-level ``ground_truth_answer`` and
    ``ground_truth_metrics``; both are reset first, so a question without a
    match never inherits values from an earlier call.

    Args:
        question: The question text to look up (exact string match).
        ragbench: Mapping of dataset name -> mapping of split name ->
            iterable of instance dicts.

    Returns:
        The matched instance's ``response``, or ``None`` when no instance
        matches.
    """
    # ``global`` is required: plain assignment would only create locals and
    # leave the module-level values untouched.
    global ground_truth_answer, ground_truth_metrics
    ground_truth_answer = ''
    ground_truth_metrics = {}

    # Iterate through all datasets and their splits (train, test, validation)
    for dataset_name in ragbench.keys():
        for split_name, instances in ragbench[dataset_name].items():
            print(f"Processing {split_name} split")
            for instance in instances:
                # Check if the question matches the query
                if instance['question'] != question:
                    continue

                # A match is found: retrieve id, response and scores
                instance_id = instance['id']
                instance_response = instance['response']
                ground_truth_metrics = {
                    "context_relevance": instance['relevance_score'],
                    "context_utilization": instance['utilization_score'],
                    "completeness": instance['completeness_score'],
                    "adherence": instance['adherence_score']
                }
                ground_truth_answer = instance_response
                print(f"Match found in {split_name} split!")
                print(f"ID: {instance_id}, Response: {instance_response}")
                # Return (not break) so the outer loops stop as well and a
                # later duplicate cannot overwrite the first match.
                return ground_truth_answer

    return None
|
43 |
+
|
44 |
+
# Step 1: Helper function to compute cosine similarity
|
45 |
+
def compute_cosine_similarity(text1, text2):
    """Return the TF-IDF cosine similarity of two texts.

    Both texts are vectorized together with a fresh ``TfidfVectorizer`` and
    the cosine similarity of the two resulting rows is returned.

    A missing or blank text shares no terms with anything, so the result is
    ``0.0`` in that case. The same value is returned when the vectorizer
    cannot build a vocabulary at all (it raises ``ValueError`` when neither
    text contains a usable token).
    """
    if not text1 or not text2 or not text1.strip() or not text2.strip():
        return 0.0

    vectorizer = TfidfVectorizer()
    try:
        vectors = vectorizer.fit_transform([text1, text2])
    except ValueError:
        # Empty vocabulary: no token survived tokenization in either text.
        return 0.0
    return cosine_similarity(vectors[0], vectors[1])[0][0]
|
49 |
+
|
50 |
+
# Step 2: Metric 1 - Context Relevance
|
51 |
+
def context_relevance(question, relevant_documents):
    """Score how closely the retrieved context matches the question.

    The ``page_content`` of every document is joined with single spaces and
    compared to ``question`` with ``compute_cosine_similarity``.
    """
    context_text = " ".join(document.page_content for document in relevant_documents)
    score = compute_cosine_similarity(question, context_text)
    return score
|
54 |
+
|
55 |
+
# Step 3: Metric 2 - Context Utilization
|
56 |
+
def context_utilization(response, relevant_documents):
    """Score how closely the response matches the retrieved context.

    The ``page_content`` of every document is joined with single spaces and
    compared to ``response`` with ``compute_cosine_similarity``.
    """
    context_text = " ".join(document.page_content for document in relevant_documents)
    score = compute_cosine_similarity(response, context_text)
    return score
|
59 |
+
|
60 |
+
# Step 4: Metric 3 - Completeness
|
61 |
+
def completeness(response, ground_truth_answer):
    """Score the response against the reference answer.

    Returns the result of ``compute_cosine_similarity`` for the two texts.
    """
    similarity = compute_cosine_similarity(response, ground_truth_answer)
    return similarity
|
63 |
+
|
64 |
+
# Step 5: Metric 4 - Adherence
|
65 |
+
def adherence(response, relevant_documents):
    """Return the fraction of distinct response tokens found in the context.

    Tokens are whitespace-separated and compared exactly (case-sensitive,
    punctuation included). The context is the ``page_content`` of every
    document joined with single spaces.

    Returns:
        A float in ``[0.0, 1.0]``; ``0.0`` when the response has no tokens
        (empty or whitespace-only), which would otherwise divide by zero.
    """
    combined_docs = " ".join([doc.page_content for doc in relevant_documents])
    response_tokens = set(response.split())
    if not response_tokens:
        # Nothing was said, so nothing is supported by the context.
        return 0.0
    relevant_tokens = set(combined_docs.split())
    supported_tokens = response_tokens.intersection(relevant_tokens)
    return len(supported_tokens) / len(response_tokens)
|
71 |
+
|
72 |
+
# Step 6: Compute RMSE for metrics
|
73 |
+
def compute_rmse(predicted_values, ground_truth_values):
    """Return the root-mean-squared error of predictions versus ground truth.

    The mean squared error is computed with ``mean_squared_error`` (ground
    truth first, predictions second) and its square root is returned.
    """
    mse = mean_squared_error(ground_truth_values, predicted_values)
    return np.sqrt(mse)
|
75 |
+
|
76 |
+
|
generator.py
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import openai
|
2 |
+
import time
|
3 |
+
import os
|
4 |
+
|
5 |
+
# Set your OpenAI API key
# The key is read from the OPENAI_API_KEY environment variable (configure it as
# a secret of the deployment) and must never be committed to the repository.
# SECURITY: a literal key was previously hard-coded on this line; it remains
# visible in the repository history and should be revoked and replaced.
openai.api_key = os.environ.get("OPENAI_API_KEY")
|
7 |
+
|
8 |
+
def generate_response_from_document(query, retrieved_docs):
    """Answer ``query`` with the chat model, using only the retrieved context.

    Args:
        query: The user question.
        retrieved_docs: Documents exposing ``page_content``; their texts are
            joined with single spaces to form the prompt context.

    Returns:
        The stripped model answer. A fixed fallback sentence is returned when
        no documents were retrieved, and a message starting with
        ``"Error generating response: "`` when the API call raises.
    """
    # Guard: without context there is nothing to ground an answer on.
    if not retrieved_docs:
        return "I cannot answer the question due to insufficient information in the documents."

    passages = [doc.page_content for doc in retrieved_docs]
    context = " ".join(passages)

    prompt_parts = [
        "You are a highly intelligent assistant tasked with answering a question based strictly on the provided context. ",
        f"Given Question: {query} \n\n",
        f"Context: {context} \n",
        "Answer the question directly and concisely using only the information available in the context.",
    ]
    prompt = "".join(prompt_parts)
    chat_messages = [{"role": "user", "content": prompt}]

    try:
        completion = openai.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=chat_messages,
            max_tokens=300,
            temperature=0.7,
        )
        # The answer text lives on the first choice's message.
        answer = completion.choices[0].message.content
        return answer.strip()
    except Exception as e:
        # Best effort for the UI: surface the failure as text instead of raising.
        return f"Error generating response: {str(e)}"
|
retrieval.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import numpy as np
|
3 |
+
from langchain.schema import Document
|
4 |
+
from langchain.vectorstores import faiss
|
5 |
+
|
6 |
+
from data_processing import embedding_model, index
|
7 |
+
|
8 |
+
# Retrieval Function
|
9 |
+
def retrieve_documents(query, top_k=5):
    """Return the documents nearest to ``query`` in the FAISS index.

    Args:
        query: The question text to embed and search for.
        top_k: Maximum number of documents to return.

    Returns:
        A list of ``Document`` objects (at most ``top_k``), nearest first.
    """
    # Read the shared state from the module at call time. The top-of-file
    # ``from data_processing import index`` copies the value that existed at
    # import time (None), so it never sees an index that is loaded later.
    import data_processing

    faiss_index = data_processing.index
    if faiss_index is None:
        # Not loaded yet: read it once and keep it for subsequent queries.
        # Forward slash on purpose: backslash + "r" inside a normal string
        # literal is the carriage-return escape, not a path separator.
        faiss_index = faiss.read_index("data_local/rag7_index.faiss")
        data_processing.index = faiss_index

    documents = data_processing.actual_docs
    if documents is None:
        # Load document metadata once instead of re-parsing the JSON file on
        # every query.
        with open("data_local/rag7_docs.json", "r") as f:
            documents = json.load(f)  # Contains all documents for this dataset
        data_processing.actual_docs = documents

    # Embed the query
    query_embedding = np.array(embedding_model.embed_documents([query]), dtype=np.float32)

    # Search in FAISS (top_k results)
    _, nearest_indices = faiss_index.search(query_embedding, top_k)

    # Retrieve the actual documents and create Document objects. FAISS pads
    # the result with -1 when fewer than top_k vectors exist; those entries
    # are skipped, otherwise -1 would silently select the last document.
    retrieved_docs = [
        Document(page_content=documents[i])
        for i in nearest_indices[0]
        if i >= 0
    ]

    return retrieved_docs
|
24 |
+
|
25 |
+
def remove_duplicate_documents(documents):
    """Drop documents whose ``page_content`` has already been seen.

    The first document for each distinct content is kept and the original
    order is preserved. A new list is returned; the input is not modified.
    """
    # dicts keep insertion order, so the values come out in first-seen order.
    first_by_content = {}
    for candidate in documents:
        first_by_content.setdefault(candidate.page_content, candidate)
    return list(first_by_content.values())
|