James McCool
Refactor import statements across multiple files to replace 'fuzzywuzzy' with 'rapidfuzz' for improved performance and consistency in string matching functionality. Additionally, clean up unused imports in app.py and related global functions to enhance code clarity and maintainability.
d9db89f
import numpy as np
import pandas as pd
import streamlit as st
from rapidfuzz import process
from rapidfuzz import utils
def find_csv_mismatches(csv_df, projections_df):
    """Reconcile player names in an uploaded CSV with the projections table.

    Names found in ``csv_df['Name']`` but not in
    ``projections_df['player_names']`` are fuzzy-matched against the
    projection names. Names whose best candidate scores 100 are renamed
    automatically; the rest are queued in ``st.session_state`` and presented
    one at a time with a radio selector and a confirm button.

    Args:
        csv_df: DataFrame expected to contain a ``Name`` column.
        projections_df: DataFrame expected to contain a ``player_names``
            column.

    Returns:
        A copy of ``csv_df`` with the automatic renames applied. If either
        required column is missing, an error is shown and the unmodified copy
        is returned. When the confirm button is pressed the updated frame is
        stored in ``st.session_state['csv_file']`` and ``st.rerun()`` is
        called.
    """
    # Work on copies so the caller's dataframes are never mutated.
    csv_df = csv_df.copy()
    projections_df = projections_df.copy()

    if 'Name' not in csv_df.columns:
        st.error("No 'Name' column found in CSV file")
        return csv_df
    if 'player_names' not in projections_df.columns:
        st.error("No 'player_names' column found in projections file")
        return csv_df

    # Unique player names on each side (NaN entries carry no name to match).
    csv_players = set(csv_df['Name'].dropna().unique())
    projection_players = set(projections_df['player_names'].dropna().unique())

    # Candidates for fuzzy matching are the *projection* names, because the
    # CSV is rewritten to use them. Sorted so ties are broken
    # deterministically; non-strings are dropped because they cannot be
    # string-matched.
    projection_players_list = sorted(
        name for name in projection_players if isinstance(name, str)
    )

    # Players in the CSV that are missing from projections.
    players_missing_from_projections = list(csv_players - projection_players)

    # Automatically handle 100% matches before starting the interactive
    # process. default_process (lower-case, strip non-alphanumerics) is passed
    # explicitly so names differing only by case/punctuation can score 100
    # regardless of the installed rapidfuzz version's default.
    players_to_process = []
    for player in players_missing_from_projections:
        if not isinstance(player, str):
            st.warning(f"Skipping non-string value: {player}")
            continue

        closest_matches = process.extract(
            player,
            projection_players_list,
            processor=utils.default_process,
            limit=1,
        )
        # An empty candidate list yields no matches; queue the player instead
        # of indexing into an empty result.
        if closest_matches and closest_matches[0][1] == 100:
            match_name = closest_matches[0][0]
            # Update CSV DataFrame to use the projection name.
            csv_df.loc[csv_df['Name'] == player, 'Name'] = match_name
            st.success(f"Automatically matched '{player}' with '{match_name}' (100% match)")
        else:
            players_to_process.append(player)

    # Initialize the interactive queue once per session.
    # NOTE(review): after the queue has been completed the keys stay set, so a
    # later upload in the same session is not queued again - confirm that this
    # is intended by the caller.
    if 'csv_current_player_index' not in st.session_state:
        st.session_state.csv_current_player_index = 0
        st.session_state.csv_players_to_process = players_to_process

    if not players_missing_from_projections:
        st.success("All CSV players found in projections!")
        return csv_df

    st.warning("Players in CSV but missing from projections")

    # Display remaining players.
    queue = st.session_state.csv_players_to_process
    position = st.session_state.csv_current_player_index
    remaining_players = queue[position:]
    st.info(f"Remaining players to process ({len(remaining_players)}):\n" +
            "\n".join(f"- {player}" for player in remaining_players))

    if position < len(queue):
        current_player = queue[position]

        # Find the top 3 closest matches.
        closest_matches = process.extract(
            current_player,
            projection_players_list,
            processor=utils.default_process,
            limit=3,
        )

        st.write(f"**Missing Player {position + 1} of {len(queue)}:** {current_player}")

        # Map each displayed label back to its candidate name so the selection
        # never has to be parsed out of the label text (names may themselves
        # contain " (").
        label_to_name = {
            f"{match[0]} ({match[1]}%)": match[0] for match in closest_matches
        }
        options = list(label_to_name)
        options.append("None of these")

        selected_option = st.radio(
            "Select correct match:",
            options,
            key=f"csv_radio_{current_player}"
        )

        if st.button("Confirm Selection", key="csv_confirm"):
            if selected_option in label_to_name:
                selected_name = label_to_name[selected_option]
                # Update CSV DataFrame.
                csv_df.loc[csv_df['Name'] == current_player, 'Name'] = selected_name
                st.success(f"Replaced '{current_player}' with '{selected_name}'")
                st.session_state['csv_file'] = csv_df

            # Move to next player.
            st.session_state.csv_current_player_index += 1
            st.rerun()
    else:
        st.success("All players have been processed!")
        # Reset the index for future runs.
        st.session_state.csv_current_player_index = 0
        st.session_state.csv_players_to_process = []

    return csv_df