uditk99 committed (verified)
Commit 739895a · 1 Parent(s): bc748c7

Upload app.py

Files changed (1): app.py (+284, -0)
app.py ADDED
@@ -0,0 +1,284 @@
# -*- coding: utf-8 -*-
"""app.ipynb

Automatically generated by Colab.

Original file is located at
https://colab.research.google.com/drive/1BmH6jAmykO3k3aZv-Cjz-TWvDrDzrB10
"""

# =============================================================================
# Imports & Setup
# =============================================================================
import os
import numpy as np
import pandas as pd
import faiss  # For fast vector similarity search
from sentence_transformers import SentenceTransformer  # For generating text embeddings
from rank_bm25 import BM25Okapi  # For BM25 keyword-based retrieval
import spacy  # For tokenization
from sklearn.metrics.pairwise import cosine_similarity  # For computing cosine similarity
from sklearn.preprocessing import normalize  # For normalizing BM25 scores

# For the Gradio UI
import gradio as gr

# For response generation using a small language model (we use FLAN-T5-Small)
from transformers import pipeline, set_seed

# Set a random seed for reproducibility
set_seed(42)

# Load the spaCy English model (make sure to download it with: python -m spacy download en_core_web_sm)
nlp = spacy.load("en_core_web_sm")

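# -----------------------------------------------------------------------------
# (Added note) A minimal sketch of the dependencies the imports above assume,
# using the usual PyPI package names; adjust for your environment:
#   pip install numpy pandas faiss-cpu sentence-transformers rank_bm25 \
#       spacy scikit-learn gradio transformers torch
#   python -m spacy download en_core_web_sm
# -----------------------------------------------------------------------------
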
# =============================================================================
# 1. Data Collection & Preprocessing
# =============================================================================
# Load the CSV file containing the financial data.
# (Make sure "MSFT_1986-03-13_2025-02-04.csv" sits next to this script,
# or adjust the path below.)
csv_file_path = r"MSFT_1986-03-13_2025-02-04.csv"
# Load the CSV file into a DataFrame
df = pd.read_csv(csv_file_path)

# Display basic info about the dataset
df.info()  # .info() prints directly; wrapping it in print() would also print "None"

# Data Cleaning & Structuring

# Convert 'Date' column to datetime format
df['Date'] = pd.to_datetime(df['Date'])

# Sort data by Date
df = df.sort_values(by='Date')

# Extract Year from Date
df['Year'] = df['Date'].dt.year

# Aggregate data by Year to generate financial summaries
yearly_summary = df.groupby('Year').agg(
    Open_Min=('Open', 'min'),
    Open_Max=('Open', 'max'),
    Close_Min=('Close', 'min'),
    Close_Max=('Close', 'max'),
    Avg_Volume=('Volume', 'mean')
).reset_index()

# Create a textual summary for each year
yearly_summary['Summary'] = yearly_summary.apply(
    lambda row: f"In {row['Year']}, the stock opened between ${row['Open_Min']:.2f} and ${row['Open_Max']:.2f}, "
                f"while closing between ${row['Close_Min']:.2f} and ${row['Close_Max']:.2f}. "
                f"The average trading volume was {row['Avg_Volume']:,.0f} shares.",
    axis=1
)

# Display the cleaned and structured data
print(yearly_summary.head())  # Use this for terminal/console
# yearly_summary.head()       # Use this in a Jupyter notebook

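# (Added sanity check) Print one generated summary to confirm the template
# above rendered correctly; the exact figures depend on the CSV contents.
print("Sample summary:", yearly_summary["Summary"].iloc[0])
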
# =============================================================================
# 2. Basic RAG Implementation
# =============================================================================
# Convert financial summaries into text chunks and generate vector embeddings.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Convert yearly financial summaries into vector embeddings
summary_texts = yearly_summary["Summary"].tolist()  # Extract summaries as text
summary_embeddings = embedding_model.encode(summary_texts, convert_to_numpy=True)  # Generate embeddings

# Embeddings are a NumPy array of shape (num_years, embedding_size)
print("Embeddings shape:", summary_embeddings.shape)

# Dimension of the embeddings (384 for the all-MiniLM-L6-v2 model)
embedding_dim = summary_embeddings.shape[1]

# Create a FAISS index (a flat L2 index for now; can be optimized later)
faiss_index = faiss.IndexFlatL2(embedding_dim)

# Convert embeddings to float32 (FAISS requires this format)
summary_embeddings = summary_embeddings.astype('float32')

# Add embeddings to the FAISS index
faiss_index.add(summary_embeddings)

# Store the year information for retrieval
year_map = {i: yearly_summary["Year"].iloc[i] for i in range(len(yearly_summary))}

# Verify that the embeddings were stored successfully
print("Vectors in FAISS index:", faiss_index.ntotal)

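# (Added sketch) A minimal round-trip check of the index: encode a probe query
# and confirm FAISS returns a valid row id that maps back to a year.
probe = embedding_model.encode(["stock price range"], convert_to_numpy=True).astype('float32')
_, probe_idx = faiss_index.search(probe, 1)
print("Nearest year to probe query:", year_map[int(probe_idx[0][0])])
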
# =============================================================================
# 3. Advanced RAG Implementation
# =============================================================================
# 3.1: BM25 for Keyword-Based Search
# Tokenize each summary using spaCy (tokens are converted to lowercase).
tokenized_summaries = [[token.text.lower() for token in nlp(summary)] for summary in summary_texts]
# Build the BM25 index.
bm25 = BM25Okapi(tokenized_summaries)

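# (Added illustration) BM25 scores a tokenized query against every summary;
# higher means a stronger keyword match. Queries are tokenized with spaCy,
# mirroring the retrieval functions defined below.
probe_scores = bm25.get_scores([t.text.lower() for t in nlp("average trading volume")])
print("Best BM25 match:", year_map[int(np.argmax(probe_scores))])
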
# 3.2: Define Retrieval Functions

def retrieve_similar_summaries(query_text, top_k=3):
    """
    Retrieve similar financial summaries using FAISS vector search.
    """
    query_embedding = embedding_model.encode([query_text], convert_to_numpy=True).astype('float32')
    distances, indices = faiss_index.search(query_embedding, top_k)
    results = []
    for idx in indices[0]:
        results.append((year_map[idx], yearly_summary.iloc[idx]["Summary"]))
    return pd.DataFrame(results, columns=["Year", "Summary"])

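# (Added usage sketch) Example call: the top-k semantically closest yearly
# summaries come back as a DataFrame.
print(retrieve_similar_summaries("Which years had the highest closing prices?", top_k=2))
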
def hybrid_retrieve(query_text, top_k=3, alpha=0.5):
    """
    Hybrid retrieval combining FAISS (vector search) and BM25 (keyword search).
    The two rank lists are fused with the weighting factor 'alpha': each hit
    contributes (top_k - rank) points, scaled by alpha for FAISS and
    (1 - alpha) for BM25.
    """
    query_embedding = embedding_model.encode([query_text], convert_to_numpy=True).astype('float32')
    _, faiss_indices = faiss_index.search(query_embedding, top_k)

    bm25_scores = bm25.get_scores([token.text.lower() for token in nlp(query_text)])
    bm25_top_indices = np.argsort(bm25_scores)[::-1][:top_k]

    combined_scores = {}
    for rank, idx in enumerate(faiss_indices[0]):
        combined_scores[idx] = alpha * (top_k - rank)
    for rank, idx in enumerate(bm25_top_indices):
        if idx in combined_scores:
            combined_scores[idx] += (1 - alpha) * (top_k - rank)
        else:
            combined_scores[idx] = (1 - alpha) * (top_k - rank)

    sorted_results = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)
    results = [(year_map[idx], yearly_summary.iloc[idx]["Summary"]) for idx, _ in sorted_results]
    return pd.DataFrame(results, columns=["Year", "Summary"])

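# (Added worked example) In hybrid_retrieve with top_k=3 and alpha=0.5, a
# summary ranked 1st by FAISS (rank 0) and 2nd by BM25 (rank 1) scores
# 0.5 * (3 - 0) + 0.5 * (3 - 1) = 2.5, while one found only at BM25's top
# rank scores 0.5 * (3 - 0) = 1.5, so agreement between retrievers wins out.
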
def adaptive_retrieve(query_text, top_k=3, alpha=0.5):
    """
    Adaptive retrieval re-ranks results by combining FAISS and BM25 scores:
    FAISS ranks are mapped onto [1, 0] and BM25 scores are L2-normalized,
    then the two are blended with weight 'alpha'.
    """
    query_embedding = embedding_model.encode([query_text], convert_to_numpy=True).astype('float32')
    _, faiss_indices = faiss_index.search(query_embedding, top_k)

    query_tokens = [token.text.lower() for token in nlp(query_text)]
    bm25_scores = bm25.get_scores(query_tokens)
    bm25_top_indices = np.argsort(bm25_scores)[::-1][:top_k]

    faiss_scores = np.linspace(1, 0, num=top_k)  # 1.0 for the best FAISS hit, 0.0 for the last
    bm25_norm_scores = normalize([bm25_scores])[0]  # L2-normalize so BM25 magnitudes are comparable

    combined_scores = {}
    for rank, idx in enumerate(faiss_indices[0]):
        combined_scores[idx] = alpha * faiss_scores[rank]
    for idx in bm25_top_indices:
        if idx in combined_scores:
            combined_scores[idx] += (1 - alpha) * bm25_norm_scores[idx]
        else:
            combined_scores[idx] = (1 - alpha) * bm25_norm_scores[idx]

    sorted_results = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)
    results = [(year_map[idx], yearly_summary.iloc[idx]["Summary"]) for idx, _ in sorted_results]
    return pd.DataFrame(results, columns=["Year", "Summary"])

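# (Added usage sketch) The same kind of query through the adaptive retriever;
# alpha=1.0 would weight FAISS only, alpha=0.0 BM25 only.
print(adaptive_retrieve("How did trading volume change over the years?", top_k=3, alpha=0.5))
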
def merge_similar_chunks(threshold=0.95):
    """
    Chunk Merging: Merge similar financial summaries based on cosine similarity.
    This reduces redundancy when multiple chunks are very similar.
    """
    merged_summaries = []
    used_indices = set()
    for i in range(len(summary_embeddings)):
        if i in used_indices:
            continue
        similarities = cosine_similarity([summary_embeddings[i]], summary_embeddings)[0]
        similar_indices = np.where(similarities >= threshold)[0]
        merged_text = " ".join(yearly_summary.iloc[idx]["Summary"] for idx in similar_indices)
        merged_summaries.append((yearly_summary.iloc[i]["Year"], merged_text))
        used_indices.update(similar_indices)
    return pd.DataFrame(merged_summaries, columns=["Year", "Merged Summary"])

# Optional: Check merged summaries for debugging.
merged_summary_df = merge_similar_chunks(threshold=0.95)
print("Merged summaries shape:", merged_summary_df.shape)
print(merged_summary_df.head())

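# (Added note) threshold=0.95 merges only near-duplicates; lowering it
# (e.g. to 0.85) merges more aggressively. Comparing row counts shows the effect:
print("Years before merging:", len(yearly_summary), "| after merging:", len(merged_summary_df))
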
# =============================================================================
# 4. UI Development using Gradio (Updated for newer API)
# =============================================================================
# Load FLAN-T5-Small once at startup so each query doesn't reload the model.
generator = pipeline('text2text-generation', model='google/flan-t5-small')

def generate_response(query_text, top_k=3, alpha=0.5):
    """
    Generate an answer for a financial query by:
    - Validating the query with an input-side guardrail.
    - Retrieving context using adaptive retrieval.
    - Generating a refined answer using FLAN-T5-Small.
    Returns:
        answer (str): The generated answer.
        confidence (float): A mock confidence score based on BM25 scores.
    """
    # -----------------------------------------------------------------------------
    # Guardrail Implementation (Input-Side)
    # -----------------------------------------------------------------------------
    financial_keywords = ["open", "close", "stock", "price", "volume", "trading"]
    if not any(keyword in query_text.lower() for keyword in financial_keywords):
        return ("Guardrail Triggered: Your query does not appear to be related to "
                "financial data. Please ask a financial question."), 0.0

    # Retrieve context using adaptive retrieval.
    context_df = adaptive_retrieve(query_text, top_k=top_k, alpha=alpha)
    context_text = " ".join(context_df["Summary"].tolist())

    # Frame the prompt with clear instructions.
    prompt = f"Given the following financial data:\n{context_text}\nAnswer this question: {query_text}."

    # Generate via the text2text-generation pipeline; max_length=200 allows
    # longer answers.
    generated = generator(prompt, max_length=200, num_return_sequences=1)
    answer = generated[0]['generated_text'].replace(prompt, "").strip()

    # Fallback message if the answer is empty.
    if not answer:
        answer = "I'm sorry, I couldn't generate a clear answer. Please try rephrasing your question."

    # Compute a mock confidence score: the mean BM25 score across all summaries,
    # scaled by the maximum score so the result lies in [0, 1].
    query_tokens = [token.text.lower() for token in nlp(query_text)]
    bm25_scores = bm25.get_scores(query_tokens)
    max_score = np.max(bm25_scores) if np.max(bm25_scores) > 0 else 1
    confidence = round(np.mean(bm25_scores) / max_score, 2)

    return answer, confidence

# Create the Gradio interface using the newer API.
iface = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(lines=2, placeholder="Enter your financial question here..."),
    outputs=[gr.Textbox(label="Answer"), gr.Textbox(label="Confidence Score")],
    title="Financial RAG Model Interface",
    description="Ask questions based on the company's yearly financial summaries."
)

# Launch the Gradio interface. Note: when run as a plain script, launch()
# blocks the main thread, so the tests below only run once the server stops.
iface.launch()

# =============================================================================
# 5. Testing & Validation (Updated)
# =============================================================================
def print_test_results(query_text, top_k=3, alpha=0.5):
    answer, confidence = generate_response(query_text, top_k, alpha)
    print("Question: ", query_text)
    print("Answer: ", answer)
    print("Confidence Score: ", confidence)
    print("-" * 50)

# Test 1: High-confidence financial query.
query_high = "What year had the lowest stock prices?"
print_test_results(query_high)

# Test 2: Low-confidence financial query.
query_low = "How did the trading volume vary?"
print_test_results(query_low)

# Test 3: Irrelevant query (should trigger the guardrail).
query_irrelevant = "What is the capital of France?"
print_test_results(query_irrelevant)