James McCool committed
Commit dd908a8 · 1 Parent(s): 42199ca
Add player similarity score calculation to predict_dupes function
This update introduces a new function, calculate_player_similarity_score, which computes a similarity score for each lineup based on player selections. The scores are normalized to a 0-1 scale, where higher values indicate more unique lineups. Additionally, player_columns are dynamically defined based on the portfolio structure to ensure accurate processing of player data.
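A minimal usage sketch of the new function, assuming it is importable from the repo root; the portfolio contents, the 'P1'/'P2' column names, and the player names are invented for illustration:

```python
import pandas as pd

from global_func.predict_dupes import calculate_player_similarity_score

# Hypothetical four-lineup portfolio with two player slots per lineup
portfolio = pd.DataFrame({
    'P1': ['LeBron James', 'LeBron James', 'Nikola Jokic', 'Luka Doncic'],
    'P2': ['Stephen Curry', 'Kevin Durant', 'Kevin Durant', 'Jayson Tatum'],
})
player_columns = ['P1', 'P2']

# Lineups sharing fewer players with the rest of the set score closer to 1
portfolio['Similarity Score'] = calculate_player_similarity_score(portfolio, player_columns)
print(portfolio.sort_values('Similarity Score', ascending=False))
```

Note that because the scores are min-max normalized within the portfolio, the lineup most similar to the field maps to 0 and the most unique maps to 1 (unless all scores are equal, in which case the raw average distances are returned).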
- global_func/predict_dupes.py +69 -0
global_func/predict_dupes.py
CHANGED
```diff
@@ -37,6 +37,67 @@ def calculate_weighted_ownership(row_ownerships):
     # Convert back to percentage form to match input format
     return weighted * 10000
 
+def calculate_player_similarity_score(portfolio, player_columns):
+    """
+    Calculate a similarity score that measures how different each row is from all other rows
+    based on actual player selection (not ownership values).
+    Higher scores indicate more unique/different lineups.
+
+    Args:
+        portfolio: DataFrame containing the portfolio data
+        player_columns: List of column names containing player names
+
+    Returns:
+        Series: Similarity scores for each row
+    """
+    # Extract player data and create a matrix where each row represents a lineup
+    player_data = portfolio[player_columns].fillna('')
+
+    # Get all unique players across all lineups
+    all_players = set()
+    for col in player_columns:
+        all_players.update(player_data[col].unique())
+    all_players = sorted(list(all_players))
+
+    # Create a binary matrix: 1 if player is in lineup, 0 if not
+    binary_matrix = np.zeros((len(portfolio), len(all_players)))
+
+    for i, row in player_data.iterrows():
+        for j, player in enumerate(all_players):
+            if player in row.values:
+                binary_matrix[i, j] = 1
+
+    # Calculate Jaccard distance between all pairs of lineups
+    # Jaccard distance = 1 - (intersection / union)
+    similarity_scores = []
+
+    for i in range(len(portfolio)):
+        distances = []
+        for j in range(len(portfolio)):
+            if i != j:
+                # Calculate intersection and union
+                intersection = np.sum((binary_matrix[i] == 1) & (binary_matrix[j] == 1))
+                union = np.sum((binary_matrix[i] == 1) | (binary_matrix[j] == 1))
+
+                # Avoid division by zero
+                if union == 0:
+                    jaccard_distance = 1.0  # Completely different if both are empty
+                else:
+                    jaccard_distance = 1 - (intersection / union)
+
+                distances.append(jaccard_distance)
+
+        # Average distance to all other lineups
+        avg_distance = np.mean(distances) if distances else 0
+        similarity_scores.append(avg_distance)
+
+    # Normalize to 0-1 scale where 1 = most unique/different
+    similarity_scores = np.array(similarity_scores)
+    if similarity_scores.max() > similarity_scores.min():
+        similarity_scores = (similarity_scores - similarity_scores.min()) / (similarity_scores.max() - similarity_scores.min())
+
+    return similarity_scores
+
 def predict_dupes(portfolio, maps_dict, site_var, type_var, Contest_Size, strength_var, sport_var):
     if strength_var == 'Weak':
         dupes_multiplier = .75
@@ -54,6 +115,7 @@ def predict_dupes(portfolio, maps_dict, site_var, type_var, Contest_Size, strength_var, sport_var):
         dup_count_columns = ['CPT_Own_percent_rank', 'FLEX1_Own_percent_rank', 'FLEX2_Own_percent_rank', 'FLEX3_Own_percent_rank', 'FLEX4_Own_percent_rank']
         own_columns = ['CPT_Own', 'FLEX1_Own', 'FLEX2_Own', 'FLEX3_Own', 'FLEX4_Own']
         calc_columns = ['own_product', 'own_average', 'own_sum', 'avg_own_rank', 'dupes_calc', 'low_own_count', 'own_ratio', 'Ref_Proj', 'Max_Proj', 'Min_Proj', 'Avg_Ref', 'own_ratio']
+        player_columns = [col for col in portfolio.columns[:5] if col not in ['salary', 'median', 'Own']]
         flex_ownerships = pd.concat([
             portfolio.iloc[:,1].map(maps_dict['own_map']),
             portfolio.iloc[:,2].map(maps_dict['own_map']),
@@ -95,6 +157,7 @@ def predict_dupes(portfolio, maps_dict, site_var, type_var, Contest_Size, strength_var, sport_var):
         dup_count_columns = [f'player_{i}_percent_rank' for i in range(1, num_players + 1)]
         own_columns = [f'player_{i}_own' for i in range(1, num_players + 1)]
         calc_columns = ['own_product', 'own_average', 'own_sum', 'avg_own_rank', 'dupes_calc', 'low_own_count', 'own_ratio', 'Ref_Proj', 'Max_Proj', 'Min_Proj', 'Avg_Ref', 'own_ratio']
+        player_columns = [col for col in portfolio.columns[:num_players] if col not in ['salary', 'median', 'Own']]
         for i in range(1, num_players + 1):
             portfolio[f'player_{i}_percent_rank'] = portfolio.iloc[:,i-1].map(maps_dict['own_percent_rank'])
             portfolio[f'player_{i}_own'] = portfolio.iloc[:,i-1].map(maps_dict['own_map']) / 100
@@ -118,6 +181,7 @@ def predict_dupes(portfolio, maps_dict, site_var, type_var, Contest_Size, strength_var, sport_var):
         dup_count_columns = ['CPT_Own_percent_rank', 'FLEX1_Own_percent_rank', 'FLEX2_Own_percent_rank', 'FLEX3_Own_percent_rank', 'FLEX4_Own_percent_rank', 'FLEX5_Own_percent_rank']
         own_columns = ['CPT_Own', 'FLEX1_Own', 'FLEX2_Own', 'FLEX3_Own', 'FLEX4_Own', 'FLEX5_Own']
         calc_columns = ['own_product', 'own_average', 'own_sum', 'avg_own_rank', 'dupes_calc', 'low_own_count', 'Ref_Proj', 'Max_Proj', 'Min_Proj', 'Avg_Ref', 'own_ratio']
+        player_columns = [col for col in portfolio.columns[:6] if col not in ['salary', 'median', 'Own']]
         flex_ownerships = pd.concat([
             portfolio.iloc[:,1].map(maps_dict['own_map']),
             portfolio.iloc[:,2].map(maps_dict['own_map']),
@@ -162,6 +226,7 @@ def predict_dupes(portfolio, maps_dict, site_var, type_var, Contest_Size, strength_var, sport_var):
         dup_count_columns = ['CPT_Own_percent_rank', 'FLEX1_Own_percent_rank', 'FLEX2_Own_percent_rank', 'FLEX3_Own_percent_rank', 'FLEX4_Own_percent_rank', 'FLEX5_Own_percent_rank']
         own_columns = ['CPT_Own', 'FLEX1_Own', 'FLEX2_Own', 'FLEX3_Own', 'FLEX4_Own', 'FLEX5_Own']
         calc_columns = ['own_product', 'own_average', 'own_sum', 'avg_own_rank', 'dupes_calc', 'low_own_count', 'Ref_Proj', 'Max_Proj', 'Min_Proj', 'Avg_Ref', 'own_ratio']
+        player_columns = [col for col in portfolio.columns[:6] if col not in ['salary', 'median', 'Own']]
         flex_ownerships = pd.concat([
             portfolio.iloc[:,1].map(maps_dict['own_map']),
             portfolio.iloc[:,2].map(maps_dict['own_map']),
@@ -206,6 +271,7 @@ def predict_dupes(portfolio, maps_dict, site_var, type_var, Contest_Size, strength_var, sport_var):
         dup_count_columns = [f'player_{i}_percent_rank' for i in range(1, num_players + 1)]
         own_columns = [f'player_{i}_own' for i in range(1, num_players + 1)]
         calc_columns = ['own_product', 'own_average', 'own_sum', 'avg_own_rank', 'dupes_calc', 'low_own_count', 'Ref_Proj', 'Max_Proj', 'Min_Proj', 'Avg_Ref', 'own_ratio']
+        player_columns = [col for col in portfolio.columns[:num_players] if col not in ['salary', 'median', 'Own']]
         for i in range(1, num_players + 1):
             portfolio[f'player_{i}_percent_rank'] = portfolio.iloc[:,i-1].map(maps_dict['own_percent_rank'])
             portfolio[f'player_{i}_own'] = portfolio.iloc[:,i-1].map(maps_dict['own_map']) / 100
@@ -263,6 +329,9 @@ def predict_dupes(portfolio, maps_dict, site_var, type_var, Contest_Size, strength_var, sport_var):
     portfolio['Lineup Edge'] = portfolio['Lineup Edge'] - portfolio['Lineup Edge'].mean()
     portfolio['Weighted Own'] = portfolio[own_columns].apply(calculate_weighted_ownership, axis=1)
    portfolio['Geomean'] = np.power((portfolio[own_columns] * 100).product(axis=1), 1 / len(own_columns))
+
+    portfolio['Similarity Score'] = calculate_player_similarity_score(portfolio, player_columns)
+
     portfolio = portfolio.drop(columns=dup_count_columns)
     portfolio = portfolio.drop(columns=own_columns)
     portfolio = portfolio.drop(columns=calc_columns)
```
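The pairwise Jaccard loop in the new function is quadratic in the number of lineups, with each comparison running in Python. A sketch of an equivalent vectorized form, not part of the commit, where binary_matrix is the same 0/1 lineup-by-player matrix the function builds:

```python
import numpy as np

def average_jaccard_distance(binary_matrix: np.ndarray) -> np.ndarray:
    """Vectorized equivalent of the pairwise loop in calculate_player_similarity_score."""
    n = binary_matrix.shape[0]
    # Pairwise intersection sizes: players shared by each pair of lineups
    intersection = binary_matrix @ binary_matrix.T
    # Union sizes via inclusion-exclusion: |A| + |B| - |A & B|
    sizes = binary_matrix.sum(axis=1)
    union = sizes[:, None] + sizes[None, :] - intersection
    # Jaccard distance; empty-vs-empty pairs (union == 0) count as fully different,
    # matching the loop's explicit union == 0 branch
    ratio = np.divide(intersection, union,
                      out=np.zeros_like(union, dtype=float), where=union > 0)
    distance = 1.0 - ratio
    np.fill_diagonal(distance, 0.0)  # the loop skips i == j
    return distance.sum(axis=1) / max(n - 1, 1)
```

This produces the same average distances as the committed loop before normalization, replacing the nested Python iteration with three array operations.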