Upload app.py
Browse files
app.py
ADDED
@@ -0,0 +1,284 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""app.ipynb
|
3 |
+
|
4 |
+
Automatically generated by Colab.
|
5 |
+
|
6 |
+
Original file is located at
|
7 |
+
https://colab.research.google.com/drive/1BmH6jAmykO3k3aZv-Cjz-TWvDrDzrB10
|
8 |
+
"""
|
9 |
+
|
10 |
+
# =============================================================================
|
11 |
+
# Imports & Setup
|
12 |
+
# =============================================================================
|
13 |
+
import os
|
14 |
+
import numpy as np
|
15 |
+
import pandas as pd
|
16 |
+
import faiss # For fast vector similarity search
|
17 |
+
from sentence_transformers import SentenceTransformer # For generating text embeddings
|
18 |
+
from rank_bm25 import BM25Okapi # For BM25 keyword-based retrieval
|
19 |
+
import spacy # For tokenization
|
20 |
+
from sklearn.metrics.pairwise import cosine_similarity # For computing cosine similarity
|
21 |
+
from sklearn.preprocessing import normalize # For normalizing BM25 scores
|
22 |
+
|
23 |
+
# For the Gradio UI
|
24 |
+
import gradio as gr
|
25 |
+
|
26 |
+
# For response generation using a small language model (we use FLAN-T5-Small)
|
27 |
+
from transformers import pipeline, set_seed
|
28 |
+
|
29 |
+
# Set a random seed for reproducibility
|
30 |
+
set_seed(42)
|
31 |
+
|
32 |
+
# Load SpaCy English model (make sure to download it with: python -m spacy download en_core_web_sm)
|
33 |
+
nlp = spacy.load("en_core_web_sm")
|
34 |
+
|
35 |
+
# =============================================================================
|
36 |
+
# 1. Data Collection & Preprocessing
|
37 |
+
# =============================================================================
|
38 |
+
# Load the CSV file containing the daily price/volume history.
# The path is relative to the current working directory; adjust it if the
# CSV is stored elsewhere (for example inside a "data" sub-folder).
csv_file_path = r"MSFT_1986-03-13_2025-02-04.csv"
df = pd.read_csv(csv_file_path)

# Display basic info about the dataset.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
df.info()

# Data Cleaning & Structuring

# Convert 'Date' column to datetime format
df['Date'] = pd.to_datetime(df['Date'])

# Sort data by Date
df = df.sort_values(by='Date')

# Extract Year from Date
df['Year'] = df['Date'].dt.year

# Aggregate data by Year to generate financial summaries:
# min/max of the daily Open and Close prices plus the mean daily Volume.
yearly_summary = df.groupby('Year').agg(
    Open_Min=('Open', 'min'),
    Open_Max=('Open', 'max'),
    Close_Min=('Close', 'min'),
    Close_Max=('Close', 'max'),
    Avg_Volume=('Volume', 'mean'),
).reset_index()

# Create a textual summary for each year.
# Row-wise apply hands each row over as a single-dtype Series; because the
# price/volume columns are floats the integer Year is upcast to float as well,
# so it is cast back to int to avoid text such as "In 1986.0, ...".
yearly_summary['Summary'] = yearly_summary.apply(
    lambda row: (
        f"In {int(row['Year'])}, the stock opened between ${row['Open_Min']:.2f} and ${row['Open_Max']:.2f}, "
        f"while closing between ${row['Close_Min']:.2f} and ${row['Close_Max']:.2f}. "
        f"The average trading volume was {row['Avg_Volume']:,.0f} shares."
    ),
    axis=1,
)

# Display the cleaned and structured data
print(yearly_summary.head())  # Use this for terminal/console
# yearly_summary.head()  # Use this in Jupyter Notebook
|
78 |
+
|
79 |
+
# =============================================================================
|
80 |
+
# 2. Basic RAG Implementation
|
81 |
+
# =============================================================================
|
82 |
+
# Convert financial summaries into text chunks and generate vector embeddings.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# One text chunk per year, in the same row order as yearly_summary.
summary_texts = yearly_summary["Summary"].tolist()

# Encode the summaries; the result has shape (num_years, embedding_size).
# FAISS requires float32 input, so the array is converted once here.
summary_embeddings = embedding_model.encode(
    summary_texts, convert_to_numpy=True
).astype('float32')

# Take the embedding dimension from the data instead of hard-coding 384, so
# the index stays correct if the embedding model is swapped for another one.
embedding_dim = summary_embeddings.shape[1]

# Create a FAISS index (Flat L2 index for now, can be optimized later)
# and add every summary embedding to it.
faiss_index = faiss.IndexFlatL2(embedding_dim)
faiss_index.add(summary_embeddings)

# Map FAISS row positions back to the year they describe.
year_map = dict(enumerate(yearly_summary["Year"].to_numpy()))
|
109 |
+
|
110 |
+
# =============================================================================
|
111 |
+
# 3. Advanced RAG Implementation
|
112 |
+
# =============================================================================
|
113 |
+
# 3.1: BM25 for Keyword-Based Search
|
114 |
+
# Keyword index: every summary becomes a list of lowercase SpaCy tokens,
# and BM25 is fitted on those token lists.
def _lowercase_tokens(text):
    """Return the lowercase text of every SpaCy token found in *text*."""
    return [token.text.lower() for token in nlp(text)]


tokenized_summaries = list(map(_lowercase_tokens, summary_texts))

# Build the BM25 index over the tokenized corpus.
bm25 = BM25Okapi(tokenized_summaries)
|
118 |
+
|
119 |
+
# 3.2: Define Retrieval Functions
|
120 |
+
|
121 |
+
def retrieve_similar_summaries(query_text, top_k=3):
    """
    Retrieve similar financial summaries using FAISS vector search.

    Args:
        query_text: Natural-language query to embed and search with.
        top_k: Maximum number of summaries to return.

    Returns:
        DataFrame with columns "Year" and "Summary", ordered as returned by
        the FAISS search. It has fewer than ``top_k`` rows when the index
        holds fewer vectors, and is empty when ``top_k`` is not positive.
    """
    if top_k <= 0:
        return pd.DataFrame([], columns=["Year", "Summary"])

    query_embedding = embedding_model.encode([query_text], convert_to_numpy=True).astype('float32')
    _, indices = faiss_index.search(query_embedding, top_k)

    # FAISS pads the result with -1 when it cannot find top_k neighbours;
    # those placeholders are not valid row positions and must be skipped.
    results = [
        (year_map[idx], yearly_summary.iloc[idx]["Summary"])
        for idx in indices[0]
        if idx >= 0
    ]
    return pd.DataFrame(results, columns=["Year", "Summary"])
|
131 |
+
|
132 |
+
def hybrid_retrieve(query_text, top_k=3, alpha=0.5):
    """
    Hybrid retrieval combining FAISS (vector search) and BM25 (keyword search).

    Each method contributes a rank-based score (``top_k`` for its best hit,
    decreasing by one per rank). FAISS scores are weighted by ``alpha`` and
    BM25 scores by ``1 - alpha``; a summary found by both methods receives
    the sum of the two.

    Args:
        query_text: Natural-language query.
        top_k: Number of candidates taken from each retrieval method.
        alpha: Weight of the FAISS ranking; BM25 gets ``1 - alpha``.

    Returns:
        DataFrame with columns "Year" and "Summary", sorted by combined score
        (highest first). Because it is the union of both candidate lists it
        can contain up to ``2 * top_k`` rows; it is empty when ``top_k`` is
        not positive.
    """
    if top_k <= 0:
        return pd.DataFrame([], columns=["Year", "Summary"])

    query_embedding = embedding_model.encode([query_text], convert_to_numpy=True).astype('float32')
    _, faiss_indices = faiss_index.search(query_embedding, top_k)

    query_tokens = [token.text.lower() for token in nlp(query_text)]
    bm25_scores = bm25.get_scores(query_tokens)
    bm25_top_indices = np.argsort(bm25_scores)[::-1][:top_k]

    combined_scores = {}
    for rank, idx in enumerate(faiss_indices[0]):
        # FAISS pads with -1 when fewer than top_k vectors are indexed.
        if idx < 0:
            continue
        combined_scores[idx] = alpha * (top_k - rank)
    for rank, idx in enumerate(bm25_top_indices):
        combined_scores[idx] = combined_scores.get(idx, 0.0) + (1 - alpha) * (top_k - rank)

    sorted_results = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)
    results = [(year_map[idx], yearly_summary.iloc[idx]["Summary"]) for idx, _ in sorted_results]
    return pd.DataFrame(results, columns=["Year", "Summary"])
|
156 |
+
|
157 |
+
def adaptive_retrieve(query_text, top_k=3, alpha=0.5):
    """
    Adaptive retrieval re-ranks results by combining FAISS and BM25 scores.

    FAISS hits get a rank-based score spread linearly from 1 (best) down to
    0 (last of ``top_k``), weighted by ``alpha``. BM25 hits get their
    L2-normalized BM25 score weighted by ``1 - alpha``. A summary found by
    both methods receives the sum of the two.

    Args:
        query_text: Natural-language query.
        top_k: Number of candidates taken from each retrieval method.
        alpha: Weight of the FAISS score; BM25 gets ``1 - alpha``.

    Returns:
        DataFrame with columns "Year" and "Summary", sorted by combined score
        (highest first). It is the union of both candidate lists, so it can
        contain up to ``2 * top_k`` rows; it is empty when ``top_k`` is not
        positive.
    """
    if top_k <= 0:
        return pd.DataFrame([], columns=["Year", "Summary"])

    query_embedding = embedding_model.encode([query_text], convert_to_numpy=True).astype('float32')
    _, faiss_indices = faiss_index.search(query_embedding, top_k)

    query_tokens = [token.text.lower() for token in nlp(query_text)]
    bm25_scores = bm25.get_scores(query_tokens)
    bm25_top_indices = np.argsort(bm25_scores)[::-1][:top_k]

    faiss_scores = np.linspace(1, 0, num=top_k)
    bm25_norm_scores = normalize([bm25_scores])[0]

    combined_scores = {}
    for rank, idx in enumerate(faiss_indices[0]):
        # FAISS pads with -1 when fewer than top_k vectors are indexed.
        if idx < 0:
            continue
        combined_scores[idx] = alpha * faiss_scores[rank]
    for idx in bm25_top_indices:
        combined_scores[idx] = combined_scores.get(idx, 0.0) + (1 - alpha) * bm25_norm_scores[idx]

    sorted_results = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)
    results = [(year_map[idx], yearly_summary.iloc[idx]["Summary"]) for idx, _ in sorted_results]
    return pd.DataFrame(results, columns=["Year", "Summary"])
|
183 |
+
|
184 |
+
def merge_similar_chunks(threshold=0.95):
    """
    Chunk Merging: Merge similar financial summaries based on cosine similarity.
    This reduces redundancy when multiple chunks are very similar.

    Summaries are visited in row order. Each summary that has not been merged
    yet starts a new group and absorbs every other not-yet-merged summary
    whose embedding has cosine similarity >= ``threshold`` with it. A summary
    therefore ends up in at most one group.

    Args:
        threshold: Minimum cosine similarity for two summaries to be merged.

    Returns:
        DataFrame with columns "Year" (the year of the summary that started
        the group) and "Merged Summary" (the group's summaries joined by a
        single space).
    """
    merged_summaries = []
    used_indices = set()
    for i in range(len(summary_embeddings)):
        if i in used_indices:
            continue
        similarities = cosine_similarity([summary_embeddings[i]], summary_embeddings)[0]
        # Skip summaries already absorbed by an earlier group; otherwise the
        # same text would be repeated in several merged chunks.
        similar_indices = [
            int(idx)
            for idx in np.where(similarities >= threshold)[0]
            if int(idx) not in used_indices
        ]
        merged_text = " ".join(yearly_summary.iloc[idx]["Summary"] for idx in similar_indices)
        merged_summaries.append((yearly_summary.iloc[i]["Year"], merged_text))
        used_indices.update(similar_indices)
    return pd.DataFrame(merged_summaries, columns=["Year", "Merged Summary"])
|
200 |
+
|
201 |
+
# Optional: Check merged summaries for debugging.
merged_summary_df = merge_similar_chunks(threshold=0.95)
print("Merged summaries shape:", merged_summary_df.shape)
# A bare `merged_summary_df.head()` only renders inside a notebook cell;
# print it so the preview is also visible when this file runs as a script.
print(merged_summary_df.head())
|
205 |
+
|
206 |
+
# =============================================================================
|
207 |
+
# 4. UI Development using Gradio (Updated for newer API)
|
208 |
+
# =============================================================================
|
209 |
+
def generate_response(query_text, top_k=3, alpha=0.5):
    """
    Generate an answer for a financial query by:
      - Validating the query with an input-side guardrail.
      - Retrieving context using adaptive retrieval.
      - Generating a refined answer using FLAN-T5-Small.

    Args:
        query_text (str): The user's question.
        top_k (int): Number of candidates each retriever contributes.
        alpha (float): Weight of the vector-search score in the re-ranking.

    Returns:
        answer (str): The generated answer, a fallback message when the model
            produced no text, or the guardrail message.
        confidence (float): A mock confidence score based on BM25 scores
            (mean score divided by the maximum score, rounded to 2 decimals);
            0.0 when the guardrail rejects the query.
    """
    # -------------------------------------------------------------------------
    # Guard Rail Implementation (Input-Side)
    # -------------------------------------------------------------------------
    # Plain substring match on the lowercased query: at least one keyword
    # must occur somewhere in the text.
    financial_keywords = ["open", "close", "stock", "price", "volume", "trading"]
    lowered_query = query_text.lower()
    if not any(keyword in lowered_query for keyword in financial_keywords):
        return ("Guardrail Triggered: Your query does not appear to be related to financial data. Please ask a financial question."), 0.0

    # Retrieve context using adaptive retrieval.
    context_df = adaptive_retrieve(query_text, top_k=top_k, alpha=alpha)
    context_text = " ".join(context_df["Summary"].tolist())

    # Adjust the prompt to provide clear instructions.
    prompt = f"Given the following financial data:\n{context_text}\nAnswer this question: {query_text}."

    # Use FLAN-T5-Small for text generation via the text2text-generation
    # pipeline. The pipeline is built once and reused: constructing it loads
    # the model, which is far too slow to repeat for every request.
    generator = _get_generator()
    generated = generator(prompt, max_length=200, num_return_sequences=1)
    # Strip the prompt in case the model echoes it back.
    answer = generated[0]['generated_text'].replace(prompt, "").strip()

    # Fallback message if answer is empty.
    if not answer:
        answer = "I'm sorry, I couldn't generate a clear answer. Please try rephrasing your question."

    # Compute a mock confidence score using normalized BM25 scores.
    query_tokens = [token.text.lower() for token in nlp(query_text)]
    bm25_scores = bm25.get_scores(query_tokens)
    max_score = np.max(bm25_scores)
    if not max_score > 0:
        # Avoid dividing by zero (or by a negative maximum).
        max_score = 1
    confidence = round(np.mean(bm25_scores) / max_score, 2)

    return answer, confidence


# Lazily created text2text-generation pipeline shared by all requests.
_GENERATOR = None


def _get_generator():
    """Return the shared FLAN-T5-Small pipeline, creating it on first use."""
    global _GENERATOR
    if _GENERATOR is None:
        _GENERATOR = pipeline('text2text-generation', model='google/flan-t5-small')
    return _GENERATOR
|
250 |
+
|
251 |
+
# Build the Gradio UI: one question box in, answer + confidence score out.
_question_input = gr.Textbox(lines=2, placeholder="Enter your financial question here...")
_response_outputs = [
    gr.Textbox(label="Answer"),
    gr.Textbox(label="Confidence Score"),
]

iface = gr.Interface(
    fn=generate_response,
    inputs=_question_input,
    outputs=_response_outputs,
    title="Financial RAG Model Interface",
    description="Ask questions based on the company's financial summaries ",
)

# Start serving the interface.
# NOTE(review): outside a notebook, launch() presumably blocks until the
# server stops, so statements after this line would only run afterwards;
# confirm this is intended.
iface.launch()
|
263 |
+
|
264 |
+
# =============================================================================
|
265 |
+
# 6. Testing & Validation (Updated)
|
266 |
+
# =============================================================================
|
267 |
+
def print_test_results(query_text, top_k=3, alpha=0.5):
    """Run generate_response for one query and print a labelled report.

    Prints the question, the generated answer and the confidence score,
    followed by a 50-character dashed separator line.
    """
    answer, confidence = generate_response(query_text, top_k, alpha)
    report = (
        ("Question: ", query_text),
        ("Answer: ", answer),
        ("Confidence Score: ", confidence),
    )
    for label, value in report:
        print(label, value)
    print("-" * 50)
|
273 |
+
|
274 |
+
# Smoke-test queries, run in this order:
#   1. High-confidence financial query.
#   2. Low-confidence financial query.
#   3. Irrelevant query (should trigger guardrail).
query_high = "What year had the lowest stock prices?"
query_low = "How did the trading volume vary?"
query_irrelevant = "What is the capital of France?"

for _test_query in (query_high, query_low, query_irrelevant):
    print_test_results(_test_query)
|