Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -8,6 +8,7 @@ import pandas as pd
|
|
8 |
import re
|
9 |
import numpy as np
|
10 |
import json
|
|
|
11 |
|
12 |
# Set up logging
|
13 |
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
@@ -17,7 +18,7 @@ logger = logging.getLogger(__name__)
|
|
17 |
device = torch.device("cpu")
|
18 |
logger.info(f"Using device: {device}")
|
19 |
|
20 |
-
# Load dataset
|
21 |
csv_path = "flat-ui__data-Sun Jul 06 2025.csv"
|
22 |
try:
|
23 |
df = pd.read_csv(csv_path)
|
@@ -30,6 +31,20 @@ except Exception as e:
|
|
30 |
logger.error(f"Error loading dataset: {e}")
|
31 |
df = None
|
32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
# Response cache with financial data entries
|
34 |
response_cache = {
|
35 |
"hi": "Hello! I'm FinChat, your financial advisor. How can I help with investing?",
|
@@ -162,6 +177,9 @@ response_cache = {
|
|
162 |
"what is the average annual growth rate of the s&p 500 from 2011 to 2016?": (
|
163 |
"The S&P 500βs average annual growth rate from 2011 to 2016 was approximately 12.7%, including dividends, driven by post-financial crisis recovery."
|
164 |
),
|
|
|
|
|
|
|
165 |
"what will my return be in 10 years if i invest $5000 into s&p 500 right now?": (
|
166 |
"Assuming a 10% average annual return, a $5,000 investment in the S&P 500 would grow to approximately $12,974 in 10 years with annual compounding. "
|
167 |
"This is based on the historical average return of 10β12% (1927β2025). Future returns vary and are not guaranteed. Consult a financial planner."
|
@@ -225,26 +243,24 @@ prompt_prefix = (
|
|
225 |
"1. This uses the historical average return of 10β12% (1927β2025).\n"
|
226 |
"2. Future returns vary and are not guaranteed.\n\n"
|
227 |
"Example 3:\n"
|
228 |
-
"Q: What
|
229 |
-
"A: The S&P 500βs average annual return
|
230 |
-
"1. This period includes strong recovery
|
231 |
"2. Dividends contribute significantly to total returns.\n\n"
|
232 |
"Q: "
|
233 |
)
|
234 |
prefix_tokens = tokenizer(prompt_prefix, return_tensors="pt", truncation=True, max_length=512).to(device)
|
235 |
|
236 |
-
# Substring matching for cache
|
237 |
def get_closest_cache_key(message, cache_keys):
|
238 |
message = message.lower().strip()
|
239 |
-
|
240 |
-
|
241 |
-
return key
|
242 |
-
return None
|
243 |
|
244 |
# Parse period from user input
|
245 |
def parse_period(query):
|
246 |
-
# Match specific year ranges (e.g., "
|
247 |
-
match = re.search(r'(\d{4})\s*(?:to|-|β)\s*(\d{4})', query, re.IGNORECASE)
|
248 |
if match:
|
249 |
start_year, end_year = map(int, match.groups())
|
250 |
return start_year, end_year, None
|
@@ -255,10 +271,10 @@ def parse_period(query):
|
|
255 |
end_year = start_year + duration - 1
|
256 |
return start_year, end_year, duration
|
257 |
# Match general duration queries (e.g., "past 10 years", "3-year growth rate")
|
258 |
-
match = re.search(r'past\s*(\d+)-year|\b(\d+)-year.*growth\s*rate', query, re.IGNORECASE)
|
259 |
if match:
|
260 |
duration = int(match.group(1) or match.group(2))
|
261 |
-
max_year =
|
262 |
start_year = max_year - duration + 1
|
263 |
end_year = max_year
|
264 |
return start_year, end_year, duration
|
@@ -266,16 +282,16 @@ def parse_period(query):
|
|
266 |
|
267 |
# Calculate average growth rate
|
268 |
def calculate_growth_rate(start_year, end_year, duration=None):
|
269 |
-
if
|
270 |
return None, "Data not available or invalid period."
|
271 |
-
df_period =
|
272 |
if df_period.empty:
|
273 |
return None, f"No data available for {start_year} to {end_year}."
|
274 |
avg_return = df_period['Return'].mean()
|
275 |
if duration:
|
276 |
-
response = f"The S&P 500βs {duration}-year average annual
|
277 |
else:
|
278 |
-
response = f"The S&P 500βs average annual
|
279 |
return avg_return, response
|
280 |
|
281 |
# Parse investment return query
|
@@ -289,7 +305,7 @@ def parse_investment_query(query):
|
|
289 |
|
290 |
# Calculate future value
|
291 |
def calculate_future_value(amount, years):
|
292 |
-
if
|
293 |
return None, "Data not available or invalid input."
|
294 |
avg_annual_return = 10.0 # Historical S&P 500 average (1927β2025)
|
295 |
future_value = amount * (1 + avg_annual_return / 100) ** years
|
@@ -385,7 +401,7 @@ def chat_with_model(user_input, history=None, is_processing=False):
|
|
385 |
gen_start_time = time.time()
|
386 |
outputs = model.generate(
|
387 |
**inputs,
|
388 |
-
max_new_tokens=
|
389 |
min_length=20,
|
390 |
do_sample=False,
|
391 |
repetition_penalty=2.0,
|
|
|
8 |
import re
|
9 |
import numpy as np
|
10 |
import json
|
11 |
+
import difflib
|
12 |
|
13 |
# Set up logging
|
14 |
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
|
|
18 |
device = torch.device("cpu")
|
19 |
logger.info(f"Using device: {device}")
|
20 |
|
21 |
+
# Load dataset and precompute period data
|
22 |
csv_path = "flat-ui__data-Sun Jul 06 2025.csv"
|
23 |
try:
|
24 |
df = pd.read_csv(csv_path)
|
|
|
31 |
logger.error(f"Error loading dataset: {e}")
|
32 |
df = None
|
33 |
|
34 |
+
# Precompute yearly aggregates for faster lookups
|
35 |
+
# Precompute one row per calendar year so period lookups do not have to
# re-scan the full dataset on every query.
# NOTE(review): assumes df['Date'] was already parsed to a datetime dtype by
# the loading code above (not visible in this hunk) — confirm.
if df is not None:
    try:
        df_yearly = (
            df.groupby(df['Date'].dt.year)
            .agg({
                'SP500': 'mean',
                'Return': 'mean',
                'Real Return': 'mean',
                'Dividend': 'mean',
                'Earnings': 'mean',
                'PE10': 'mean',
            })
            .reset_index()
            # The group key inherits the 'Date' name; it now holds years.
            .rename(columns={'Date': 'Year'})
        )
    except (KeyError, AttributeError, TypeError) as e:
        # Missing column, non-datetime 'Date', or non-numeric data: degrade
        # the same way a failed CSV load does instead of crashing at import.
        logger.error(f"Error precomputing yearly aggregates: {e}")
        df_yearly = None
else:
    df_yearly = None
|
47 |
+
|
48 |
# Response cache with financial data entries
|
49 |
response_cache = {
|
50 |
"hi": "Hello! I'm FinChat, your financial advisor. How can I help with investing?",
|
|
|
177 |
"what is the average annual growth rate of the s&p 500 from 2011 to 2016?": (
|
178 |
"The S&P 500βs average annual growth rate from 2011 to 2016 was approximately 12.7%, including dividends, driven by post-financial crisis recovery."
|
179 |
),
|
180 |
+
"what was the average annual return of the s&p 500 between 2010 and 2020?": (
|
181 |
+
"The S&P 500βs average annual return from 2010 to 2020 was approximately 13.6%, including dividends, driven by post-financial crisis recovery."
|
182 |
+
),
|
183 |
"what will my return be in 10 years if i invest $5000 into s&p 500 right now?": (
|
184 |
"Assuming a 10% average annual return, a $5,000 investment in the S&P 500 would grow to approximately $12,974 in 10 years with annual compounding. "
|
185 |
"This is based on the historical average return of 10β12% (1927β2025). Future returns vary and are not guaranteed. Consult a financial planner."
|
|
|
243 |
"1. This uses the historical average return of 10β12% (1927β2025).\n"
|
244 |
"2. Future returns vary and are not guaranteed.\n\n"
|
245 |
"Example 3:\n"
|
246 |
+
"Q: What was the average annual return of the S&P 500 between 2010 and 2020?\n"
|
247 |
+
"A: The S&P 500βs average annual return from 2010 to 2020 was approximately 13.6%, including dividends.\n"
|
248 |
+
"1. This period includes strong recovery post-financial crisis.\n"
|
249 |
"2. Dividends contribute significantly to total returns.\n\n"
|
250 |
"Q: "
|
251 |
)
|
252 |
prefix_tokens = tokenizer(prompt_prefix, return_tensors="pt", truncation=True, max_length=512).to(device)
|
253 |
|
254 |
+
# Fuzzy (difflib) matching of the user message against the cache keys
|
255 |
def get_closest_cache_key(message, cache_keys):
    """Return the cache key that best matches ``message``, or ``None``.

    The message is lower-cased and stripped, then compared against
    ``cache_keys``. An exact hit is returned immediately; otherwise the
    single closest key with a difflib similarity ratio of at least 0.8 is
    returned.

    Args:
        message: Raw user input. Non-string values yield ``None``.
        cache_keys: Iterable of candidate key strings.

    Returns:
        The matching key string, or ``None`` when nothing is close enough.
    """
    if not isinstance(message, str):
        return None
    message = message.lower().strip()
    # Materialise once so one-shot iterables survive both passes below.
    candidates = list(cache_keys)
    # An exact hit would also win the fuzzy scan (ratio 1.0); returning it
    # here skips the much costlier SequenceMatcher comparisons.
    if message in candidates:
        return message
    matches = difflib.get_close_matches(message, candidates, n=1, cutoff=0.8)
    return matches[0] if matches else None
|
|
|
|
|
259 |
|
260 |
# Parse period from user input
|
261 |
def parse_period(query):
|
262 |
+
# Match specific year ranges (e.g., "between 2010 and 2020", "2000β2008")
|
263 |
+
match = re.search(r'(?:between|from)\s*(\d{4})\s*(?:and|to|-|β)\s*(\d{4})', query, re.IGNORECASE)
|
264 |
if match:
|
265 |
start_year, end_year = map(int, match.groups())
|
266 |
return start_year, end_year, None
|
|
|
271 |
end_year = start_year + duration - 1
|
272 |
return start_year, end_year, duration
|
273 |
# Match general duration queries (e.g., "past 10 years", "3-year growth rate")
|
274 |
+
match = re.search(r'past\s*(\d+)-year|\b(\d+)-year.*(?:return|growth\s*rate)', query, re.IGNORECASE)
|
275 |
if match:
|
276 |
duration = int(match.group(1) or match.group(2))
|
277 |
+
max_year = df_yearly['Year'].max() if df_yearly is not None else 2025
|
278 |
start_year = max_year - duration + 1
|
279 |
end_year = max_year
|
280 |
return start_year, end_year, duration
|
|
|
282 |
|
283 |
# Calculate average growth rate
|
284 |
def calculate_growth_rate(start_year, end_year, duration=None):
    """Average the yearly 'Return' values between two years (inclusive).

    Reads the module-level ``df_yearly`` table and uses its 'Year' and
    'Return' columns.

    Args:
        start_year: First year of the period, or ``None``.
        end_year: Last year of the period, or ``None``. A reversed range
            (start after end) is normalised rather than reported as empty.
        duration: Optional length of the period in years; when truthy it is
            mentioned in the response text.

    Returns:
        ``(avg_return, response)`` on success, or ``(None, message)`` when
        the table is unavailable, a year is missing, or the period holds no
        usable data.
    """
    if df_yearly is None or start_year is None or end_year is None:
        return None, "Data not available or invalid period."
    # Accept ranges typed backwards, e.g. "between 2020 and 2010".
    if start_year > end_year:
        start_year, end_year = end_year, start_year
    df_period = df_yearly[(df_yearly['Year'] >= start_year) & (df_yearly['Year'] <= end_year)]
    if df_period.empty:
        return None, f"No data available for {start_year} to {end_year}."
    avg_return = df_period['Return'].mean()
    # Rows can exist while every 'Return' is NaN; do not report "nan%".
    if pd.isna(avg_return):
        return None, f"No data available for {start_year} to {end_year}."
    # The apostrophe below restores a character that was mis-decoded.
    if duration:
        response = f"The S&P 500’s {duration}-year average annual return from {start_year} to {end_year} was approximately {avg_return:.1f}%, including dividends."
    else:
        response = f"The S&P 500’s average annual return from {start_year} to {end_year} was approximately {avg_return:.1f}%, including dividends."
    return avg_return, response
|
296 |
|
297 |
# Parse investment return query
|
|
|
305 |
|
306 |
# Calculate future value
|
307 |
def calculate_future_value(amount, years):
|
308 |
+
if df_yearly is None or amount is None or years is None:
|
309 |
return None, "Data not available or invalid input."
|
310 |
avg_annual_return = 10.0 # Historical S&P 500 average (1927β2025)
|
311 |
future_value = amount * (1 + avg_annual_return / 100) ** years
|
|
|
401 |
gen_start_time = time.time()
|
402 |
outputs = model.generate(
|
403 |
**inputs,
|
404 |
+
max_new_tokens=40, # Reduced for faster inference
|
405 |
min_length=20,
|
406 |
do_sample=False,
|
407 |
repetition_penalty=2.0,
|