James McCool committed on
Commit
3cfe4c4
·
1 Parent(s): 579a535

Refactor DataFrame optimization in app.py to enhance memory efficiency. Disable categorical conversion for specific columns to avoid issues with exposure_spread, while implementing smarter handling for numeric types. Introduce a new function to prepare DataFrames for exposure_spread, ensuring compatibility and improved performance during calculations.

Browse files
Files changed (1) hide show
  1. app.py +44 -8
app.py CHANGED
@@ -131,13 +131,29 @@ def chunk_name_matching(portfolio_names, csv_names, chunk_size=1000):
131
 
132
  def optimize_dataframe_dtypes(df):
133
  """Optimize DataFrame data types for memory efficiency"""
 
 
 
 
 
134
  for col in df.columns:
135
- if df[col].dtype == 'object':
136
- # Only convert to category if there are many duplicates AND it's not a player column
137
- # Player columns need to stay as object for mapping operations
138
- excluded_cols = ['salary', 'median', 'Own', 'Finish_percentile', 'Dupes', 'Stack', 'Size', 'Win%', 'Lineup Edge', 'Weighted Own', 'Geomean', 'Diversity']
139
- if col not in excluded_cols and df[col].nunique() / len(df) < 0.3:
140
- df[col] = df[col].astype('category')
 
 
 
 
 
 
 
 
 
 
 
141
  return df
142
 
143
  def create_memory_efficient_mappings(projections_df, site_var, type_var, sport_var):
@@ -284,6 +300,11 @@ def calculate_lineup_metrics(df, player_columns, map_dict, type_var, sport_var,
284
  """Centralized function to calculate salary, median, and ownership efficiently"""
285
  df = df.copy() # Work on a copy to avoid modifying original
286
 
 
 
 
 
 
287
  # Vectorized calculations
288
  df['salary'] = calculate_salary_vectorized(df[player_columns], player_columns, map_dict, type_var, sport_var)
289
  df['median'] = calculate_median_vectorized(df[player_columns], player_columns, map_dict, type_var, sport_var)
@@ -358,6 +379,17 @@ def create_team_filter_mask(df, player_columns, team_map, teams_to_filter, focus
358
 
359
  return mask
360
 
 
 
 
 
 
 
 
 
 
 
 
361
  def create_position_export_dict(column_name, csv_file, site_var, type_var, sport_var):
362
  try:
363
  # Remove any numbers from the column name to get the position
@@ -2037,7 +2069,9 @@ if selected_tab == 'Manage Portfolio':
2037
  exp_submitted = st.form_submit_button("Export")
2038
  if reg_submitted:
2039
  st.session_state['settings_base'] = False
2040
- parsed_frame = exposure_spread(st.session_state['working_frame'], st.session_state['exposure_player'], exposure_target, ignore_stacks, remove_teams_exposure, specific_replacements, specific_columns, st.session_state['projections_df'], sport_var, type_var, salary_max, stacking_sports)
 
 
2041
 
2042
  # Use consolidated calculation function
2043
  parsed_frame = calculate_lineup_metrics(
@@ -2056,7 +2090,9 @@ if selected_tab == 'Manage Portfolio':
2056
  st.session_state['export_merge'] = st.session_state['working_frame'].copy()
2057
  elif exp_submitted:
2058
  st.session_state['settings_base'] = False
2059
- parsed_frame = exposure_spread(st.session_state['export_base'], st.session_state['exposure_player'], exposure_target, ignore_stacks, remove_teams_exposure, specific_replacements, specific_columns, st.session_state['projections_df'], sport_var, type_var, salary_max, stacking_sports)
 
 
2060
 
2061
  # Use consolidated calculation function for export
2062
  parsed_frame = calculate_lineup_metrics(
 
131
 
132
  def optimize_dataframe_dtypes(df):
133
  """Optimize DataFrame data types for memory efficiency"""
134
+ # For now, disable categorical conversion entirely to avoid issues with exposure_spread and other operations
135
+ # This maintains compatibility while still providing other memory optimizations
136
+ # Future enhancement: implement smarter categorical handling that preserves mutability
137
+
138
+ # Only optimize numeric columns to more efficient dtypes
139
  for col in df.columns:
140
+ if df[col].dtype == 'float64':
141
+ # Convert float64 to float32 if possible without significant precision loss
142
+ try:
143
+ if df[col].max() < 3.4e+38 and df[col].min() > -3.4e+38: # float32 range
144
+ df[col] = df[col].astype('float32')
145
+ except:
146
+ pass
147
+ elif df[col].dtype == 'int64':
148
+ # Convert int64 to smaller int types if possible
149
+ try:
150
+ if df[col].max() <= 32767 and df[col].min() >= -32768:
151
+ df[col] = df[col].astype('int16')
152
+ elif df[col].max() <= 2147483647 and df[col].min() >= -2147483648:
153
+ df[col] = df[col].astype('int32')
154
+ except:
155
+ pass
156
+
157
  return df
158
 
159
  def create_memory_efficient_mappings(projections_df, site_var, type_var, sport_var):
 
300
  """Centralized function to calculate salary, median, and ownership efficiently"""
301
  df = df.copy() # Work on a copy to avoid modifying original
302
 
303
+ # Ensure player columns are object type to avoid categorical issues with exposure_spread
304
+ for col in player_columns:
305
+ if df[col].dtype.name == 'category':
306
+ df[col] = df[col].astype('object')
307
+
308
  # Vectorized calculations
309
  df['salary'] = calculate_salary_vectorized(df[player_columns], player_columns, map_dict, type_var, sport_var)
310
  df['median'] = calculate_median_vectorized(df[player_columns], player_columns, map_dict, type_var, sport_var)
 
379
 
380
  return mask
381
 
382
def prepare_dataframe_for_exposure_spread(df, player_columns):
    """Return a copy of ``df`` whose categorical player columns are object dtype.

    The input frame is never modified.  Names in ``player_columns`` that are
    absent from the frame are ignored, and player columns that are not
    categorical are carried over unchanged.

    Args:
        df: Source DataFrame.
        player_columns: Iterable of column names to check.

    Returns:
        A new DataFrame intended to be passed to exposure_spread.
    """
    prepared = df.copy()

    for name in player_columns:
        # Skip names that do not exist in this frame.
        if name not in prepared.columns:
            continue
        # Only categorical columns need converting back to plain objects.
        if prepared[name].dtype.name != 'category':
            continue
        prepared[name] = prepared[name].astype('object')

    return prepared
392
+
393
  def create_position_export_dict(column_name, csv_file, site_var, type_var, sport_var):
394
  try:
395
  # Remove any numbers from the column name to get the position
 
2069
  exp_submitted = st.form_submit_button("Export")
2070
  if reg_submitted:
2071
  st.session_state['settings_base'] = False
2072
+ # Prepare DataFrame for exposure_spread to avoid categorical issues
2073
+ working_frame_prepared = prepare_dataframe_for_exposure_spread(st.session_state['working_frame'], st.session_state['player_columns'])
2074
+ parsed_frame = exposure_spread(working_frame_prepared, st.session_state['exposure_player'], exposure_target, ignore_stacks, remove_teams_exposure, specific_replacements, specific_columns, st.session_state['projections_df'], sport_var, type_var, salary_max, stacking_sports)
2075
 
2076
  # Use consolidated calculation function
2077
  parsed_frame = calculate_lineup_metrics(
 
2090
  st.session_state['export_merge'] = st.session_state['working_frame'].copy()
2091
  elif exp_submitted:
2092
  st.session_state['settings_base'] = False
2093
+ # Prepare DataFrame for exposure_spread to avoid categorical issues
2094
+ export_base_prepared = prepare_dataframe_for_exposure_spread(st.session_state['export_base'], st.session_state['player_columns'])
2095
+ parsed_frame = exposure_spread(export_base_prepared, st.session_state['exposure_player'], exposure_target, ignore_stacks, remove_teams_exposure, specific_replacements, specific_columns, st.session_state['projections_df'], sport_var, type_var, salary_max, stacking_sports)
2096
 
2097
  # Use consolidated calculation function for export
2098
  parsed_frame = calculate_lineup_metrics(