# DFS_Portfolio_Manager/global_func/recalc_diversity.py
# Last commit 872a007 by James McCool: "introduced a button to recalculate diversity"
# (repository viewer residue: raw / history / blame links; file size 2.32 kB)
import streamlit as st
import numpy as np
import pandas as pd
import time
import math
from difflib import SequenceMatcher
def recalc_diversity(portfolio, player_columns):
    """
    Score each lineup by how different it is from the rest of the portfolio.

    Every row is reduced to the set of players found in ``player_columns``.
    A row's raw score is its mean Jaccard distance
    (1 - intersection / union) to every other row. The raw scores are then
    min-max scaled to the 0-1 range; if every row has the same raw score the
    unscaled values are returned as they are.

    Args:
        portfolio: pandas DataFrame with one lineup per row.
        player_columns: list of column labels in ``portfolio`` that hold
            player names.

    Returns:
        1-D float NumPy array with one score per row, in row order. Higher
        means more diverse. A portfolio with fewer than two rows has nothing
        to compare against, so it gets all-zero scores.
    """
    n_rows = len(portfolio)
    if n_rows < 2:
        # No other lineup to compare with; avoids dividing by (n_rows - 1) == 0
        # and calling max()/min() on an empty array.
        return np.zeros(n_rows, dtype=float)

    # Blank out missing cells using the ORIGINAL frame's null mask. Casting
    # to str first would turn NaN/None into the literal text "nan"/"None",
    # which would then be counted as a real player shared between lineups.
    frame = portfolio[player_columns]
    player_data = frame.astype(str).where(frame.notna(), '').to_numpy()

    # Map every distinct non-blank player name to a column index.
    unique_players = sorted(
        {val for row in player_data for val in row if val.strip()}
    )
    player_to_id = {player: idx for idx, player in enumerate(unique_players)}

    # Presence matrix: 1.0 where the lineup contains the player. float64 is
    # used so the dot product below cannot wrap around the way int8 does
    # once two lineups share more than 127 players.
    binary_matrix = np.zeros((n_rows, len(player_to_id)), dtype=np.float64)
    for i, row in enumerate(player_data):
        for val in row:
            col = player_to_id.get(val)
            if col is not None:
                binary_matrix[i, col] = 1.0

    # Pairwise set sizes: |A & B| from the dot product, |A | B| by
    # inclusion-exclusion on the per-row player counts.
    intersection_matrix = binary_matrix @ binary_matrix.T
    row_sums = binary_matrix.sum(axis=1)
    union_matrix = row_sums[:, np.newaxis] + row_sums - intersection_matrix

    # Pairs whose union is empty (both lineups blank) keep similarity 0,
    # i.e. distance 1, because `where` skips them and `out` starts at zero.
    jaccard_similarity = np.divide(
        intersection_matrix,
        union_matrix,
        out=np.zeros_like(intersection_matrix, dtype=float),
        where=union_matrix != 0,
    )
    jaccard_distance = 1 - jaccard_similarity

    # Drop self-comparisons, then average over the other (n_rows - 1) rows.
    np.fill_diagonal(jaccard_distance, 0)
    diversity_scores = jaccard_distance.sum(axis=1) / (n_rows - 1)

    # Min-max normalise to 0-1 when there is any spread to normalise.
    lowest = diversity_scores.min()
    score_range = diversity_scores.max() - lowest
    if score_range > 0:
        diversity_scores = (diversity_scores - lowest) / score_range
    return diversity_scores