cb1716pics committed
Commit 1b04b96 · verified · 1 Parent(s): 00dc0db

Upload 7 files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data_local/rag7_docs.json filter=lfs diff=lfs merge=lfs -text
+data_local/rag7_index.faiss filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -1,15 +1,26 @@
 import streamlit as st
+from generator import generate_response_from_document
+from retrieval import retrieve_documents
+from evaluation import calculate_metrics
+from data_processing import load_data_from_faiss, ragbench
 
 # Page Title
 st.title("RAG7 - Real World RAG System")
 
+@st.cache_data
+def load_data():
+    load_data_from_faiss()
+
+data_status = load_data()
+
 # Question Section
 st.subheader("Hi, What do you want to know today?")
 question = st.text_area("Enter your question:", placeholder="Type your question here...", height=100)
 
 # Submit Button
 if st.button("Submit"):
-    response = f"Response for: {question}"  # Placeholder response
+    retrieved_documents = retrieve_documents(question, 5)
+    response = generate_response_from_document(question, retrieved_documents)
 else:
     response = ""
 
@@ -24,7 +35,7 @@ col1, col2 = st.columns([1, 3]) # Creating two columns for button and metrics d
 
 with col1:
     if st.button("Calculate Metrics"):
-        metrics = "Accuracy: 90%\nLatency: 50ms\nRelevance Score: 0.85"  # Placeholder metrics
+        metrics = calculate_metrics(question, response, retrieved_documents, ragbench)
     else:
         metrics = ""
 
data_local/rag7_docs.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ffab81dea126b119f8cd0866da062a433c4bd5d24aedd06c0dcbc3bcbd9a4433
+size 409081045
data_local/rag7_index.faiss ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:70a478fdf54591eb877f2ae772b3b2cc2ca85ed4a29ae174c72287f5eeb28b63
+size 146505261
data_processing.py ADDED
@@ -0,0 +1,72 @@
+import numpy as np
+import faiss
+import json
+import torch
+from langchain.embeddings import HuggingFaceEmbeddings
+from datasets import load_dataset
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+# Load embedding model
+embedding_model = HuggingFaceEmbeddings(
+    model_name="paraphrase-MiniLM-L3-v2",
+    model_kwargs={"device": device}
+)
+
+all_documents = []
+index = None
+actual_docs = None
+ragbench = {}  # dataset name -> loaded DatasetDict; consumed by evaluation.calculate_metrics
+
+
+def create_faiss_index_file():
+    global index
+    for dataset in ['covidqa', 'cuad', 'delucionqa', 'emanual', 'expertqa',
+                    'finqa', 'hagrid', 'hotpotqa', 'msmarco', 'pubmedqa',
+                    'tatqa', 'techqa']:
+        ragbench_dataset = load_dataset("rungalileo/ragbench", dataset)
+        ragbench[dataset] = ragbench_dataset
+        for split in ragbench_dataset.keys():
+            for row in ragbench_dataset[split]:
+                doc = row["documents"]
+                if isinstance(doc, list):
+                    doc = " ".join(doc)
+                all_documents.append(doc)
+
+    # Convert to embeddings
+    embeddings = embedding_model.embed_documents(all_documents)
+
+    # Convert embeddings to a NumPy array
+    embeddings_np = np.array(embeddings, dtype=np.float32)
+
+    # Store in FAISS using the NumPy array's shape
+    index = faiss.IndexFlatL2(embeddings_np.shape[1])
+    index.add(embeddings_np)
+
+    # Save FAISS index (forward slashes: a backslash here would make "\r" a
+    # carriage-return escape rather than a path separator)
+    faiss.write_index(index, "data_local/rag7_index.faiss")
+
+    # Save documents in JSON (metadata storage)
+    with open("data_local/rag7_docs.json", "w") as f:
+        json.dump(all_documents, f)
+
+    print("data is stored!")
+
+
+def load_data_from_faiss():
+    load_faiss()
+    load_metadata()
+
+
+def load_faiss():
+    global index
+    # Load the saved FAISS index into the module-level variable
+    index = faiss.read_index("data_local/rag7_index.faiss")
+
+
+def load_metadata():
+    global actual_docs
+    # Load document metadata (all documents for the indexed datasets)
+    with open("data_local/rag7_docs.json", "r") as f:
+        actual_docs = json.load(f)
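For orientation, the intended flow is build-once, then load-at-startup; a minimal sketch, assuming the LFS artifacts above land in data_local/:

from data_processing import create_faiss_index_file, load_data_from_faiss

create_faiss_index_file()   # one-off: downloads the 12 RAGBench configs, embeds, writes data_local/
load_data_from_faiss()      # at app startup: populates the module-level index and actual_docs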
evaluation.py ADDED
@@ -0,0 +1,85 @@
+import numpy as np
+from sklearn.metrics import mean_squared_error
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+
+ground_truth_answer = ''
+ground_truth_metrics = {}
+
+
+def calculate_metrics(question, response, docs, data):
+    retrieve_ground_truths(question, data)
+    # Predicted metrics
+    predicted_metrics = {
+        "context_relevance": context_relevance(question, docs),
+        "context_utilization": context_utilization(response, docs),
+        "completeness": completeness(response, ground_truth_answer),
+        "adherence": adherence(response, docs)
+    }
+    return predicted_metrics
+
+
+def retrieve_ground_truths(question, ragbench):
+    global ground_truth_answer, ground_truth_metrics
+    # Iterate through all datasets and their splits (train, test, validation)
+    for dataset_name in ragbench.keys():
+        for split_name, instances in ragbench[dataset_name].items():
+            print(f"Processing {split_name} split")
+            for instance in instances:
+                # Check if this instance matches the query
+                if instance['question'] == question:
+                    # If a match is found, retrieve id and response
+                    instance_id = instance['id']
+                    instance_response = instance['response']
+                    ground_truth_metrics = {
+                        "context_relevance": instance['relevance_score'],
+                        "context_utilization": instance['utilization_score'],
+                        "completeness": instance['completeness_score'],
+                        "adherence": instance['adherence_score']
+                    }
+                    ground_truth_answer = instance_response
+                    print(f"Match found in {split_name} split!")
+                    print(f"ID: {instance_id}, Response: {instance_response}")
+                    return  # Stop after the first match
+
+
+# Helper function to compute cosine similarity between two texts
+def compute_cosine_similarity(text1, text2):
+    vectorizer = TfidfVectorizer()
+    vectors = vectorizer.fit_transform([text1, text2])
+    return cosine_similarity(vectors[0], vectors[1])[0][0]
+
+
+# Metric 1 - Context Relevance: question vs. retrieved documents
+def context_relevance(question, relevant_documents):
+    combined_docs = " ".join([doc.page_content for doc in relevant_documents])
+    return compute_cosine_similarity(question, combined_docs)
+
+
+# Metric 2 - Context Utilization: response vs. retrieved documents
+def context_utilization(response, relevant_documents):
+    combined_docs = " ".join([doc.page_content for doc in relevant_documents])
+    return compute_cosine_similarity(response, combined_docs)
+
+
+# Metric 3 - Completeness: response vs. ground-truth answer
+def completeness(response, ground_truth_answer):
+    if not ground_truth_answer:
+        return 0.0  # No ground-truth match was found for this question
+    return compute_cosine_similarity(response, ground_truth_answer)
+
+
+# Metric 4 - Adherence: fraction of response tokens supported by the documents
+def adherence(response, relevant_documents):
+    combined_docs = " ".join([doc.page_content for doc in relevant_documents])
+    response_tokens = set(response.split())
+    if not response_tokens:
+        return 0.0
+    relevant_tokens = set(combined_docs.split())
+    supported_tokens = response_tokens.intersection(relevant_tokens)
+    return len(supported_tokens) / len(response_tokens)
+
+
+# Compute RMSE between predicted and ground-truth metric values
+def compute_rmse(predicted_values, ground_truth_values):
+    return np.sqrt(mean_squared_error(ground_truth_values, predicted_values))
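The adherence metric is plain token overlap, so it can be checked by hand; a toy example, not part of the commit:

from langchain.schema import Document
from evaluation import adherence

docs = [Document(page_content="Paris is the capital of France")]
# Response tokens: {"Paris", "is", "the", "capital", "city"}; all but "city"
# appear in the document, so adherence = 4 / 5
print(adherence("Paris is the capital city", docs))  # 0.8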
generator.py ADDED
@@ -0,0 +1,29 @@
+import os
+import openai
+
+# Read the OpenAI API key from the environment; never hard-code secrets
+openai.api_key = os.environ.get("OPENAI_API_KEY")
+
+
+def generate_response_from_document(query, retrieved_docs):
+    if not retrieved_docs:  # Check if no documents were retrieved
+        return "I cannot answer the question due to insufficient information in the documents."
+
+    context = " ".join([doc.page_content for doc in retrieved_docs])  # Iterates over Document objects
+    prompt = (
+        "You are a highly intelligent assistant tasked with answering a question based strictly on the provided context. "
+        f"Given Question: {query} \n\n"
+        f"Context: {context} \n"
+        "Answer the question directly and concisely using only the information available in the context."
+    )
+
+    try:
+        response = openai.chat.completions.create(  # Chat Completions API (openai >= 1.0)
+            model="gpt-3.5-turbo",  # or another suitable model such as gpt-4
+            messages=[{"role": "user", "content": prompt}],
+            max_tokens=300,
+            temperature=0.7,
+        )
+        return response.choices[0].message.content.strip()  # Extract content from the message
+    except Exception as e:
+        return f"Error generating response: {str(e)}"
retrieval.py ADDED
@@ -0,0 +1,37 @@
+import json
+import numpy as np
+from langchain.schema import Document
+
+import data_processing  # module reference, so the index loaded at runtime is visible here
+from data_processing import embedding_model
+
+
+# Retrieval Function
+def retrieve_documents(query, top_k=5):
+    # Embed the query
+    query_embedding = np.array(embedding_model.embed_documents([query]), dtype=np.float32)
+
+    # Search in FAISS for the top_k nearest neighbours; read the index as a
+    # module attribute, since a from-import would bind the initial None
+    _, nearest_indices = data_processing.index.search(query_embedding, top_k)
+
+    # Load document metadata
+    with open("data_local/rag7_docs.json", "r") as f:
+        documents = json.load(f)  # Contains all documents for the indexed datasets
+
+    # Retrieve the actual documents and create Document objects
+    retrieved_docs = [Document(page_content=documents[i]) for i in nearest_indices[0]]
+
+    return retrieved_docs
+
+
+def remove_duplicate_documents(documents):
+    unique_documents = []
+    seen_documents = set()  # To keep track of seen documents
+    for doc in documents:
+        # Use the page_content as the identity for deduplication
+        doc_content = doc.page_content
+        if doc_content not in seen_documents:
+            unique_documents.append(doc)
+            seen_documents.add(doc_content)
+    return unique_documents
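Putting the pieces together, a sketch of end-to-end retrieval, assuming the index and metadata files shipped in this commit are present locally:

from data_processing import load_data_from_faiss
from retrieval import retrieve_documents, remove_duplicate_documents

load_data_from_faiss()  # must run first so data_processing.index is populated
docs = remove_duplicate_documents(retrieve_documents("What is FAISS?", top_k=5))
for doc in docs:
    print(doc.page_content[:80])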