James McCool
committed on
Commit
·
91e473e
1
Parent(s):
5de7ed9
Refactor name matching logic and update dependencies
Browse files
- Replaced the fuzzywuzzy library with rapidfuzz for improved performance in name matching operations.
- Removed the deprecated find_name_mismatches function and introduced a new get_contest_names function to streamline the retrieval of unique player names from contest data.
- Enhanced the load_contest_file function to utilize the new name matching logic, ensuring consistent player name handling across the application.
- Maintained existing functionality while improving code clarity and efficiency.
- app.py +1 -1
- global_func/find_name_mismatches.py +0 -99
- global_func/get_contest_names.py +26 -0
- global_func/load_contest_file.py +20 -1
app.py
CHANGED
|
@@ -2,7 +2,7 @@ import streamlit as st
|
|
| 2 |
st.set_page_config(layout="wide")
|
| 3 |
import numpy as np
|
| 4 |
import pandas as pd
|
| 5 |
-
from
|
| 6 |
from collections import Counter
|
| 7 |
from pymongo.mongo_client import MongoClient
|
| 8 |
from pymongo.server_api import ServerApi
|
|
|
|
| 2 |
st.set_page_config(layout="wide")
|
| 3 |
import numpy as np
|
| 4 |
import pandas as pd
|
| 5 |
+
from rapidfuzz import process, fuzz
|
| 6 |
from collections import Counter
|
| 7 |
from pymongo.mongo_client import MongoClient
|
| 8 |
from pymongo.server_api import ServerApi
|
global_func/find_name_mismatches.py
DELETED
|
@@ -1,99 +0,0 @@
|
|
| 1 |
-
import streamlit as st
|
| 2 |
-
from fuzzywuzzy import process
|
| 3 |
-
|
| 4 |
-
def find_name_mismatches(contest_df, projections_df, ownership_df, fpts_df):
|
| 5 |
-
|
| 6 |
-
name_columns = [col for col in contest_df.columns if not col in ['BaseName', 'EntryCount']]
|
| 7 |
-
|
| 8 |
-
if 'player_names' not in projections_df.columns:
|
| 9 |
-
st.error("No 'player_names' column found in projections file")
|
| 10 |
-
return contest_df, projections_df
|
| 11 |
-
|
| 12 |
-
# Get unique player names from portfolio and projections
|
| 13 |
-
portfolio_players = set()
|
| 14 |
-
for col in name_columns:
|
| 15 |
-
portfolio_players.update(contest_df[col].unique())
|
| 16 |
-
projection_players = set(projections_df['player_names'].unique())
|
| 17 |
-
portfolio_players_list = list(portfolio_players)
|
| 18 |
-
projection_players_list = list(projection_players)
|
| 19 |
-
|
| 20 |
-
# Find players in portfolio that are missing from projections
|
| 21 |
-
players_missing_from_projections = list(projection_players - portfolio_players)
|
| 22 |
-
|
| 23 |
-
# Automatically handle 90%+ matches before starting interactive process
|
| 24 |
-
auto_matches = {}
|
| 25 |
-
players_to_process = []
|
| 26 |
-
for player in players_missing_from_projections:
|
| 27 |
-
if not isinstance(player, str):
|
| 28 |
-
st.warning(f"Skipping non-string value: {player}")
|
| 29 |
-
continue
|
| 30 |
-
closest_matches = process.extract(player, portfolio_players_list, limit=1)
|
| 31 |
-
if closest_matches[0][1] >= 95: # If high confidence match found
|
| 32 |
-
match_name = closest_matches[0][0]
|
| 33 |
-
auto_matches[player] = match_name
|
| 34 |
-
st.success(f"Automatically matched '{player}' with '{match_name}' ({closest_matches[0][1]}% match)")
|
| 35 |
-
elif closest_matches[0][1] >= 75:
|
| 36 |
-
players_to_process.append(player)
|
| 37 |
-
else:
|
| 38 |
-
st.warning(f"No match found for '{player}'")
|
| 39 |
-
|
| 40 |
-
if players_to_process:
|
| 41 |
-
st.warning(f"Found {len(players_to_process)} players that need manual matching")
|
| 42 |
-
|
| 43 |
-
# Create a form for batch processing
|
| 44 |
-
with st.form("name_matching_form"):
|
| 45 |
-
# Create tabs for each player
|
| 46 |
-
tabs = st.tabs([f"Player {i+1}" for i in range(len(players_to_process))])
|
| 47 |
-
|
| 48 |
-
# Dictionary to store selections
|
| 49 |
-
selections = {}
|
| 50 |
-
|
| 51 |
-
# Populate each tab
|
| 52 |
-
for i, player in enumerate(players_to_process):
|
| 53 |
-
with tabs[i]:
|
| 54 |
-
st.write(f"**Projection Name:** {player}")
|
| 55 |
-
|
| 56 |
-
# Find the top 3 closest matches
|
| 57 |
-
closest_matches = process.extract(player, portfolio_players_list, limit=3)
|
| 58 |
-
|
| 59 |
-
# Create radio buttons for selection
|
| 60 |
-
options = [f"{match[0]} ({match[1]}%)" for match in closest_matches]
|
| 61 |
-
options.append("None of these")
|
| 62 |
-
|
| 63 |
-
selections[player] = st.radio(
|
| 64 |
-
f"Select correct match:",
|
| 65 |
-
options,
|
| 66 |
-
key=f"radio_{player}"
|
| 67 |
-
)
|
| 68 |
-
|
| 69 |
-
# Submit button for the entire form
|
| 70 |
-
submitted = st.form_submit_button("Apply All Changes")
|
| 71 |
-
|
| 72 |
-
if submitted:
|
| 73 |
-
# Process automatic matches
|
| 74 |
-
for projection_name, contest_name in auto_matches.items():
|
| 75 |
-
for col in name_columns:
|
| 76 |
-
contest_df[col] = contest_df[col].replace(contest_name, projection_name)
|
| 77 |
-
ownership_df['Player'] = ownership_df['Player'].replace(contest_name, projection_name)
|
| 78 |
-
fpts_df['Player'] = fpts_df['Player'].replace(contest_name, projection_name)
|
| 79 |
-
|
| 80 |
-
# Process manual selections
|
| 81 |
-
for projection_name, selection in selections.items():
|
| 82 |
-
if selection != "None of these":
|
| 83 |
-
selected_name = selection.split(" (")[0]
|
| 84 |
-
for col in name_columns:
|
| 85 |
-
contest_df[col] = contest_df[col].replace(selected_name, projection_name)
|
| 86 |
-
ownership_df['Player'] = ownership_df['Player'].replace(selected_name, projection_name)
|
| 87 |
-
fpts_df['Player'] = fpts_df['Player'].replace(selected_name, projection_name)
|
| 88 |
-
st.success(f"Replaced '{selected_name}' with '{projection_name}'")
|
| 89 |
-
st.success("All changes applied successfully!")
|
| 90 |
-
return contest_df, projections_df, ownership_df, fpts_df
|
| 91 |
-
else:
|
| 92 |
-
st.success("All players have been automatically matched!")
|
| 93 |
-
# Apply automatic matches
|
| 94 |
-
for projection_name, contest_name in auto_matches.items():
|
| 95 |
-
for col in name_columns:
|
| 96 |
-
contest_df[col] = contest_df[col].replace(contest_name, projection_name)
|
| 97 |
-
ownership_df['Player'] = ownership_df['Player'].replace(contest_name, projection_name)
|
| 98 |
-
fpts_df['Player'] = fpts_df['Player'].replace(contest_name, projection_name)
|
| 99 |
-
return contest_df, projections_df, ownership_df, fpts_df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
global_func/get_contest_names.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import numpy as np
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import time
|
| 5 |
+
from rapidfuzz import process, fuzz
|
| 6 |
+
|
| 7 |
+
def get_contest_names(contest_frame):
|
| 8 |
+
"""
|
| 9 |
+
Get all unique names from the contest dataframe's player columns.
|
| 10 |
+
|
| 11 |
+
Args:
|
| 12 |
+
contest_frame: DataFrame containing contest data
|
| 13 |
+
|
| 14 |
+
Returns:
|
| 15 |
+
list: List of unique player names
|
| 16 |
+
"""
|
| 17 |
+
# Get columns that contain player names (excluding non-player columns)
|
| 18 |
+
player_columns = [col for col in contest_frame.columns
|
| 19 |
+
if col not in ['BaseName', 'EntryCount']]
|
| 20 |
+
|
| 21 |
+
# Get all unique values from these columns
|
| 22 |
+
unique_names = contest_frame[player_columns].values.flatten()
|
| 23 |
+
unique_names = pd.unique(unique_names) # Remove duplicates
|
| 24 |
+
unique_names = unique_names[~pd.isna(unique_names)] # Remove any NaN values
|
| 25 |
+
|
| 26 |
+
return list(unique_names)
|
global_func/load_contest_file.py
CHANGED
|
@@ -1,5 +1,7 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
import pandas as pd
|
|
|
|
|
|
|
| 3 |
|
| 4 |
def load_contest_file(upload, helper = None, sport = None):
|
| 5 |
if sport == 'MLB':
|
|
@@ -52,6 +54,23 @@ def load_contest_file(upload, helper = None, sport = None):
|
|
| 52 |
df_helper = helper_df[['Player', 'Salary', 'Team']]
|
| 53 |
|
| 54 |
print('Made it through helper')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
|
| 56 |
# Create separate dataframes for different player attributes
|
| 57 |
if helper is not None:
|
|
@@ -95,7 +114,7 @@ def load_contest_file(upload, helper = None, sport = None):
|
|
| 95 |
cleaned_df = cleaned_df[['BaseName', 'EntryCount', 'Guy', 'Dude', 'Pooba', 'Bub', 'Chief', 'Buddy']]
|
| 96 |
elif sport == 'GOLF':
|
| 97 |
cleaned_df = cleaned_df[['BaseName', 'EntryCount', 'Guy', 'Dude', 'Pooba', 'Bub', 'Chief', 'Buddy']]
|
| 98 |
-
|
| 99 |
print('Made it through check_lineups')
|
| 100 |
|
| 101 |
# Get unique entry names
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
import pandas as pd
|
| 3 |
+
from get_contest_names import get_contest_names
|
| 4 |
+
from rapidfuzz import process, fuzz
|
| 5 |
|
| 6 |
def load_contest_file(upload, helper = None, sport = None):
|
| 7 |
if sport == 'MLB':
|
|
|
|
| 54 |
df_helper = helper_df[['Player', 'Salary', 'Team']]
|
| 55 |
|
| 56 |
print('Made it through helper')
|
| 57 |
+
|
| 58 |
+
contest_names = df.Player.unique()
|
| 59 |
+
helper_names = helper_df.Player.unique()
|
| 60 |
+
|
| 61 |
+
contest_match_dict = {}
|
| 62 |
+
for names in helper_names:
|
| 63 |
+
match = process.extractOne(
|
| 64 |
+
names,
|
| 65 |
+
contest_names,
|
| 66 |
+
score_cutoff = 85
|
| 67 |
+
)
|
| 68 |
+
if match:
|
| 69 |
+
contest_match_dict[names] = match[0]
|
| 70 |
+
else:
|
| 71 |
+
contest_match_dict[names] = names
|
| 72 |
+
|
| 73 |
+
df_helper['Player'] = df_helper['Player'].map(contest_match_dict)
|
| 74 |
|
| 75 |
# Create separate dataframes for different player attributes
|
| 76 |
if helper is not None:
|
|
|
|
| 114 |
cleaned_df = cleaned_df[['BaseName', 'EntryCount', 'Guy', 'Dude', 'Pooba', 'Bub', 'Chief', 'Buddy']]
|
| 115 |
elif sport == 'GOLF':
|
| 116 |
cleaned_df = cleaned_df[['BaseName', 'EntryCount', 'Guy', 'Dude', 'Pooba', 'Bub', 'Chief', 'Buddy']]
|
| 117 |
+
|
| 118 |
print('Made it through check_lineups')
|
| 119 |
|
| 120 |
# Get unique entry names
|