DrishtiSharma committed on
Commit
afdff8e
·
verified ·
1 Parent(s): 3ed92ae

Create interim.py

Browse files
Files changed (1) hide show
  1. interim.py +181 -0
interim.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ from openai import OpenAI
4
+ import tempfile
5
+ from langchain.chains import ConversationalRetrievalChain
6
+ from langchain_openai import ChatOpenAI, OpenAIEmbeddings
7
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
8
+ from langchain_community.vectorstores import Chroma
9
+ from langchain_community.document_loaders import (
10
+ PyPDFLoader,
11
+ TextLoader,
12
+ CSVLoader
13
+ )
14
+ from datetime import datetime
15
+ import pytz
16
+
17
+ # DocumentRAG class with environment variable support for API Key
18
class DocumentRAG:
    """Question answering over user-uploaded documents.

    Uploaded PDF/TXT/CSV files are loaded, split into chunks, embedded into a
    Chroma vector store, and exposed through a ConversationalRetrievalChain.
    A short summary of the uploaded content is generated as a by-product.

    The OpenAI API key is read from the ``OPENAI_API_KEY`` environment
    variable when the instance is created.
    """

    # Lower-case file extensions that process_documents knows how to load.
    _SUPPORTED_SUFFIXES = (".pdf", ".txt", ".csv")

    def __init__(self):
        """Initialise empty state and read the API key from the environment.

        Raises:
            ValueError: if ``OPENAI_API_KEY`` is unset or empty.
        """
        self.document_store = None
        self.qa_chain = None
        self.document_summary = ""
        self.chat_history = []
        self.last_processed_time = None
        self.api_key = os.getenv("OPENAI_API_KEY")  # Fetch the API key from environment variable
        self.init_time = datetime.now(pytz.UTC)

        if not self.api_key:
            raise ValueError("API Key not found. Make sure to set the 'OPENAI_API_KEY' environment variable.")

    def _load_uploaded_file(self, uploaded_file):
        """Load one uploaded file and return its documents.

        The loaders take a filesystem path, so the upload is spooled to a
        temporary file. That file is always removed again before returning.
        Files with an unsupported extension are skipped without touching the
        disk. The extension check is case-insensitive.

        Args:
            uploaded_file: object exposing ``.name`` and ``.read()``.

        Returns:
            The list produced by the loader's ``load()``; an empty list when
            the extension is unsupported or ``load()`` raised.
        """
        suffix = os.path.splitext(uploaded_file.name)[1].lower()
        if suffix not in self._SUPPORTED_SUFFIXES:
            return []

        # delete=False so the loader can reopen the path after the handle is
        # closed; cleanup is done explicitly in the ``finally`` below.
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
            temp_file.write(uploaded_file.read())
            temp_file_path = temp_file.name

        try:
            if suffix == ".pdf":
                loader = PyPDFLoader(temp_file_path)
            elif suffix == ".txt":
                loader = TextLoader(temp_file_path)
            else:
                loader = CSVLoader(temp_file_path)

            try:
                return loader.load()
            except Exception as e:
                # Best effort: one unreadable file must not abort the batch.
                print(f"Error loading {temp_file_path}: {str(e)}")
                return []
        finally:
            try:
                os.unlink(temp_file_path)
            except OSError:
                pass

    def process_documents(self, uploaded_files):
        """Process uploaded files and build the retrieval chain.

        On success this sets ``document_summary``, ``document_store``,
        ``qa_chain`` and ``last_processed_time``.

        Args:
            uploaded_files: iterable of objects exposing ``.name`` and
                ``.read()``.

        Returns:
            A human-readable status string. Only the success message contains
            the word "successfully". Failures are reported through the return
            value rather than raised.
        """
        if not self.api_key:
            return "Please set the OpenAI API key in the environment variables."
        if not uploaded_files:
            return "Please upload documents first."

        try:
            raw_documents = []
            for uploaded_file in uploaded_files:
                raw_documents.extend(self._load_uploaded_file(uploaded_file))

            if not raw_documents:
                return "No valid documents were processed. Please check your files."

            # Split text for better processing
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000,
                chunk_overlap=200,
                length_function=len
            )
            chunks = text_splitter.split_documents(raw_documents)

            # Summarise from the unsplit documents: the chunks overlap by 200
            # characters, so joining them would repeat text inside the limited
            # window that generate_summary sends to the model.
            combined_text = " ".join(doc.page_content for doc in raw_documents)
            self.document_summary = self.generate_summary(combined_text)

            # Create embeddings and initialize retrieval chain
            embeddings = OpenAIEmbeddings(api_key=self.api_key)
            self.document_store = Chroma.from_documents(chunks, embeddings)
            self.qa_chain = ConversationalRetrievalChain.from_llm(
                ChatOpenAI(temperature=0, model_name='gpt-4', api_key=self.api_key),
                self.document_store.as_retriever(search_kwargs={'k': 6}),
                return_source_documents=True,
                verbose=False
            )

            self.last_processed_time = datetime.now(pytz.UTC)
            return "Documents processed successfully!"
        except Exception as e:
            return f"Error processing documents: {str(e)}"

    def generate_summary(self, text):
        """Generate a summary of the provided text.

        Only the first 4000 characters of ``text`` are sent to the model.

        Args:
            text: the document text to summarise.

        Returns:
            The model's reply, or an error message string if the API key is
            missing or the request failed.
        """
        if not self.api_key:
            return "API Key not set. Please set it in the environment variables."
        try:
            client = OpenAI(api_key=self.api_key)
            response = client.chat.completions.create(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": "Summarize the document content concisely and provide 3-5 key points for discussion."},
                    {"role": "user", "content": text[:4000]}
                ],
                temperature=0.3
            )
            return response.choices[0].message.content
        except Exception as e:
            return f"Error generating summary: {str(e)}"

    def handle_query(self, question, history):
        """Answer ``question`` with the retrieval chain and extend the history.

        Args:
            question: the user's question.
            history: list of ``(question, answer)`` pairs from earlier turns.

        Returns:
            The conversation history including this turn. On success the new
            pair is appended to ``history`` in place and that same list is
            returned. On failure a new list is returned whose last entry is
            ``("System", <message>)`` and ``history`` is left untouched.
        """
        if not self.qa_chain:
            return history + [("System", "Please process the documents first.")]
        try:
            preface = """
            Instruction: Respond in English. Be professional and concise, keeping the response under 300 words.
            If you cannot provide an answer, say: "I am not sure about this question. Please try asking something else."
            """
            query = f"{preface}\nQuery: {question}"

            result = self.qa_chain({
                "question": query,
                "chat_history": [(q, a) for q, a in history]
            })

            if "answer" not in result:
                return history + [("System", "Sorry, an error occurred.")]

            # Store the user's original question, not the prefixed prompt.
            history.append((question, result["answer"]))
            return history
        except Exception as e:
            return history + [("System", f"Error: {str(e)}")]
133
+
134
# Streamlit UI
#
# Streamlit re-executes this script from the top on every widget interaction,
# so anything that must survive between interactions (the RAG system with its
# retrieval chain, and the Q&A history) is kept in st.session_state.
st.title("Document Analyzer and Podcast Generator")

# Report the API key status
if not os.getenv("OPENAI_API_KEY"):
    st.error("The 'OPENAI_API_KEY' environment variable is not set. Please configure it in your hosting environment.")
else:
    st.success("API Key successfully loaded from environment variable.")

# Initialize the RAG system once per session. Re-creating it on every rerun
# would discard the retrieval chain built by "Process Documents", leaving the
# question step permanently unavailable.
if "rag_system" not in st.session_state:
    try:
        st.session_state["rag_system"] = DocumentRAG()
    except ValueError as e:
        st.error(str(e))
        st.stop()
rag_system = st.session_state["rag_system"]

# Conversation history as a list of (question, answer) pairs.
if "qa_history" not in st.session_state:
    st.session_state["qa_history"] = []

# File upload
st.subheader("Step 1: Upload Documents")
uploaded_files = st.file_uploader("Upload files (PDF, TXT, CSV)", accept_multiple_files=True)

if st.button("Process Documents"):
    if uploaded_files:
        # Process the uploaded files
        result = rag_system.process_documents(uploaded_files)

        # process_documents reports its outcome as a status string; only the
        # success message contains "successfully".
        if isinstance(result, str):
            if "successfully" in result:
                st.success(result)
            else:
                st.error(result)
        else:
            st.error("An unexpected error occurred during document processing.")
    else:
        st.warning("No files uploaded.")

# Document Q&A
st.subheader("Step 2: Ask Questions")
if rag_system.qa_chain:
    user_question = st.text_input("Ask a question:")
    if st.button("Submit Question"):
        st.session_state["qa_history"] = rag_system.handle_query(
            user_question, st.session_state["qa_history"]
        )
    # Render the whole conversation on every rerun so earlier turns stay visible.
    for question, answer in st.session_state["qa_history"]:
        st.chat_message("user").write(question)
        st.chat_message("assistant").write(answer)
else:
    st.info("Please process documents before asking questions.")