AnilNiraula committed on
Commit
5d661b6
·
verified ·
1 Parent(s): 67a27e0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -19
app.py CHANGED
@@ -8,6 +8,7 @@ import pandas as pd
8
  import re
9
  import numpy as np
10
  import json
 
11
 
12
  # Set up logging
13
  logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
@@ -17,7 +18,7 @@ logger = logging.getLogger(__name__)
17
  device = torch.device("cpu")
18
  logger.info(f"Using device: {device}")
19
 
20
- # Load dataset at startup
21
  csv_path = "flat-ui__data-Sun Jul 06 2025.csv"
22
  try:
23
  df = pd.read_csv(csv_path)
@@ -30,6 +31,20 @@ except Exception as e:
30
  logger.error(f"Error loading dataset: {e}")
31
  df = None
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  # Response cache with financial data entries
34
  response_cache = {
35
  "hi": "Hello! I'm FinChat, your financial advisor. How can I help with investing?",
@@ -162,6 +177,9 @@ response_cache = {
162
  "what is the average annual growth rate of the s&p 500 from 2011 to 2016?": (
163
  "The S&P 500’s average annual growth rate from 2011 to 2016 was approximately 12.7%, including dividends, driven by post-financial crisis recovery."
164
  ),
 
 
 
165
  "what will my return be in 10 years if i invest $5000 into s&p 500 right now?": (
166
  "Assuming a 10% average annual return, a $5,000 investment in the S&P 500 would grow to approximately $12,974 in 10 years with annual compounding. "
167
  "This is based on the historical average return of 10–12% (1927–2025). Future returns vary and are not guaranteed. Consult a financial planner."
@@ -225,26 +243,24 @@ prompt_prefix = (
225
  "1. This uses the historical average return of 10–12% (1927–2025).\n"
226
  "2. Future returns vary and are not guaranteed.\n\n"
227
  "Example 3:\n"
228
- "Q: What is the average return rate of the S&P 500 in the past 10 years?\n"
229
- "A: The S&P 500’s average annual return rate from 2015 to 2024 was approximately 12.2%, including dividends.\n"
230
- "1. This period includes strong recovery years (e.g., 26.89% in 2021) and declines (e.g., -19.44% in 2022).\n"
231
  "2. Dividends contribute significantly to total returns.\n\n"
232
  "Q: "
233
  )
234
  prefix_tokens = tokenizer(prompt_prefix, return_tensors="pt", truncation=True, max_length=512).to(device)
235
 
236
- # Substring matching for cache
237
  def get_closest_cache_key(message, cache_keys):
238
  message = message.lower().strip()
239
- for key in cache_keys:
240
- if key in message:
241
- return key
242
- return None
243
 
244
  # Parse period from user input
245
  def parse_period(query):
246
- # Match specific year ranges (e.g., "2000 to 2008", "2011–2016")
247
- match = re.search(r'(\d{4})\s*(?:to|-|–)\s*(\d{4})', query, re.IGNORECASE)
248
  if match:
249
  start_year, end_year = map(int, match.groups())
250
  return start_year, end_year, None
@@ -255,10 +271,10 @@ def parse_period(query):
255
  end_year = start_year + duration - 1
256
  return start_year, end_year, duration
257
  # Match general duration queries (e.g., "past 10 years", "3-year growth rate")
258
- match = re.search(r'past\s*(\d+)-year|\b(\d+)-year.*growth\s*rate', query, re.IGNORECASE)
259
  if match:
260
  duration = int(match.group(1) or match.group(2))
261
- max_year = df['Date'].dt.year.max() if df is not None else 2025
262
  start_year = max_year - duration + 1
263
  end_year = max_year
264
  return start_year, end_year, duration
@@ -266,16 +282,16 @@ def parse_period(query):
266
 
267
  # Calculate average growth rate
268
  def calculate_growth_rate(start_year, end_year, duration=None):
269
- if df is None or start_year is None or end_year is None:
270
  return None, "Data not available or invalid period."
271
- df_period = df[(df['Date'].dt.year >= start_year) & (df['Date'].dt.year <= end_year)]
272
  if df_period.empty:
273
  return None, f"No data available for {start_year} to {end_year}."
274
  avg_return = df_period['Return'].mean()
275
  if duration:
276
- response = f"The S&P 500’s {duration}-year average annual growth rate from {start_year} to {end_year} was approximately {avg_return:.1f}%, including dividends."
277
  else:
278
- response = f"The S&P 500’s average annual growth rate from {start_year} to {end_year} was approximately {avg_return:.1f}%, including dividends."
279
  return avg_return, response
280
 
281
  # Parse investment return query
@@ -289,7 +305,7 @@ def parse_investment_query(query):
289
 
290
  # Calculate future value
291
  def calculate_future_value(amount, years):
292
- if df is None or amount is None or years is None:
293
  return None, "Data not available or invalid input."
294
  avg_annual_return = 10.0 # Historical S&P 500 average (1927–2025)
295
  future_value = amount * (1 + avg_annual_return / 100) ** years
@@ -385,7 +401,7 @@ def chat_with_model(user_input, history=None, is_processing=False):
385
  gen_start_time = time.time()
386
  outputs = model.generate(
387
  **inputs,
388
- max_new_tokens=50,
389
  min_length=20,
390
  do_sample=False,
391
  repetition_penalty=2.0,
 
8
  import re
9
  import numpy as np
10
  import json
11
+ import difflib
12
 
13
  # Set up logging
14
  logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
 
18
  device = torch.device("cpu")
19
  logger.info(f"Using device: {device}")
20
 
21
+ # Load dataset and precompute period data
22
  csv_path = "flat-ui__data-Sun Jul 06 2025.csv"
23
  try:
24
  df = pd.read_csv(csv_path)
 
31
  logger.error(f"Error loading dataset: {e}")
32
  df = None
33
 
34
+ # Precompute yearly aggregates for faster lookups
35
+ if df is not None:
36
+ df_yearly = df.groupby(df['Date'].dt.year).agg({
37
+ 'SP500': 'mean',
38
+ 'Return': 'mean',
39
+ 'Real Return': 'mean',
40
+ 'Dividend': 'mean',
41
+ 'Earnings': 'mean',
42
+ 'PE10': 'mean'
43
+ }).reset_index()
44
+ df_yearly = df_yearly.rename(columns={'Date': 'Year'})
45
+ else:
46
+ df_yearly = None
47
+
48
  # Response cache with financial data entries
49
  response_cache = {
50
  "hi": "Hello! I'm FinChat, your financial advisor. How can I help with investing?",
 
177
  "what is the average annual growth rate of the s&p 500 from 2011 to 2016?": (
178
  "The S&P 500’s average annual growth rate from 2011 to 2016 was approximately 12.7%, including dividends, driven by post-financial crisis recovery."
179
  ),
180
+ "what was the average annual return of the s&p 500 between 2010 and 2020?": (
181
+ "The S&P 500’s average annual return from 2010 to 2020 was approximately 13.6%, including dividends, driven by post-financial crisis recovery."
182
+ ),
183
  "what will my return be in 10 years if i invest $5000 into s&p 500 right now?": (
184
  "Assuming a 10% average annual return, a $5,000 investment in the S&P 500 would grow to approximately $12,974 in 10 years with annual compounding. "
185
  "This is based on the historical average return of 10–12% (1927–2025). Future returns vary and are not guaranteed. Consult a financial planner."
 
243
  "1. This uses the historical average return of 10–12% (1927–2025).\n"
244
  "2. Future returns vary and are not guaranteed.\n\n"
245
  "Example 3:\n"
246
+ "Q: What was the average annual return of the S&P 500 between 2010 and 2020?\n"
247
+ "A: The S&P 500’s average annual return from 2010 to 2020 was approximately 13.6%, including dividends.\n"
248
+ "1. This period includes strong recovery post-financial crisis.\n"
249
  "2. Dividends contribute significantly to total returns.\n\n"
250
  "Q: "
251
  )
252
  prefix_tokens = tokenizer(prompt_prefix, return_tensors="pt", truncation=True, max_length=512).to(device)
253
 
254
+ # Substring matching for cache with fuzzy matching
255
  def get_closest_cache_key(message, cache_keys):
256
  message = message.lower().strip()
257
+ matches = difflib.get_close_matches(message, cache_keys, n=1, cutoff=0.8)
258
+ return matches[0] if matches else None
 
 
259
 
260
  # Parse period from user input
261
  def parse_period(query):
262
+ # Match specific year ranges (e.g., "between 2010 and 2020", "2000–2008")
263
+ match = re.search(r'(?:between|from)\s*(\d{4})\s*(?:and|to|-|–)\s*(\d{4})', query, re.IGNORECASE)
264
  if match:
265
  start_year, end_year = map(int, match.groups())
266
  return start_year, end_year, None
 
271
  end_year = start_year + duration - 1
272
  return start_year, end_year, duration
273
  # Match general duration queries (e.g., "past 10 years", "3-year growth rate")
274
+ match = re.search(r'past\s*(\d+)-year|\b(\d+)-year.*(?:return|growth\s*rate)', query, re.IGNORECASE)
275
  if match:
276
  duration = int(match.group(1) or match.group(2))
277
+ max_year = df_yearly['Year'].max() if df_yearly is not None else 2025
278
  start_year = max_year - duration + 1
279
  end_year = max_year
280
  return start_year, end_year, duration
 
282
 
283
  # Calculate average growth rate
284
  def calculate_growth_rate(start_year, end_year, duration=None):
285
+ if df_yearly is None or start_year is None or end_year is None:
286
  return None, "Data not available or invalid period."
287
+ df_period = df_yearly[(df_yearly['Year'] >= start_year) & (df_yearly['Year'] <= end_year)]
288
  if df_period.empty:
289
  return None, f"No data available for {start_year} to {end_year}."
290
  avg_return = df_period['Return'].mean()
291
  if duration:
292
+ response = f"The S&P 500’s {duration}-year average annual return from {start_year} to {end_year} was approximately {avg_return:.1f}%, including dividends."
293
  else:
294
+ response = f"The S&P 500’s average annual return from {start_year} to {end_year} was approximately {avg_return:.1f}%, including dividends."
295
  return avg_return, response
296
 
297
  # Parse investment return query
 
305
 
306
  # Calculate future value
307
  def calculate_future_value(amount, years):
308
+ if df_yearly is None or amount is None or years is None:
309
  return None, "Data not available or invalid input."
310
  avg_annual_return = 10.0 # Historical S&P 500 average (1927–2025)
311
  future_value = amount * (1 + avg_annual_return / 100) ** years
 
401
  gen_start_time = time.time()
402
  outputs = model.generate(
403
  **inputs,
404
+ max_new_tokens=40, # Reduced for faster inference
405
  min_length=20,
406
  do_sample=False,
407
  repetition_penalty=2.0,