|
import streamlit as st |
|
import numpy as np |
|
import pandas as pd |
|
import time |
|
import math |
|
from difflib import SequenceMatcher |
|
|
|
def recalc_diversity(portfolio, player_columns):
    """
    Score each row of ``portfolio`` by how different its players are from
    the players of every other row.

    Each row is reduced to the set of non-blank player names found in
    ``player_columns``. A row's raw score is its mean Jaccard distance
    (1 - intersection / union) to all other rows. The raw scores are then
    min-max scaled to [0, 1]; if every raw score is identical they are
    returned unscaled.

    Args:
        portfolio: pandas DataFrame with one lineup per row.
        player_columns: list of column labels that hold player names.

    Returns:
        1-D float NumPy array with one score per row, in row order.
        An empty portfolio yields an empty array, and a single-row portfolio
        yields ``[0.0]`` because there is nothing to compare the row against.
    """
    n_rows = len(portfolio)
    if n_rows < 2:
        # The mean over "other rows" is undefined here (division by zero for
        # one row, max() of an empty array for none), so return neutral zeros.
        return np.zeros(n_rows, dtype=float)

    frame = portfolio[player_columns]
    # Record missing cells BEFORE stringifying: astype(str) turns NaN/None
    # into the literal text 'nan'/'None', which would otherwise be counted
    # as a real player shared by every lineup with an empty slot.
    missing_mask = frame.isna().to_numpy()
    names = frame.astype(str).to_numpy()

    # One set of player names per row; blank / whitespace-only cells and
    # missing cells are ignored. Names are kept exactly as written.
    lineups = [
        {
            name
            for name, is_missing in zip(row, row_missing)
            if not is_missing and isinstance(name, str) and name.strip() != ''
        }
        for row, row_missing in zip(names, missing_mask)
    ]

    # Sorted so that column ids are deterministic across runs.
    player_to_id = {
        player: idx
        for idx, player in enumerate(sorted(set().union(*lineups)))
    }

    # int32 rather than int8: the matrix product below accumulates in the
    # input dtype, and int8 would overflow past 127 shared players.
    binary_matrix = np.zeros((n_rows, len(player_to_id)), dtype=np.int32)
    for i, lineup in enumerate(lineups):
        if lineup:
            binary_matrix[i, [player_to_id[player] for player in lineup]] = 1

    # Pairwise |A & B| and |A | B| for all rows at once.
    intersection = binary_matrix @ binary_matrix.T
    lineup_sizes = binary_matrix.sum(axis=1)
    union = lineup_sizes[:, np.newaxis] + lineup_sizes - intersection

    # Two empty lineups have union 0; their similarity is left at 0 so the
    # pair counts as fully distant.
    jaccard_similarity = np.divide(
        intersection,
        union,
        out=np.zeros(intersection.shape, dtype=float),
        where=union != 0,
    )
    jaccard_distance = 1 - jaccard_similarity

    # A row is not compared with itself.
    np.fill_diagonal(jaccard_distance, 0)
    diversity = jaccard_distance.sum(axis=1) / (n_rows - 1)

    # Min-max scale; skipped when all scores are equal to avoid 0/0.
    lowest = diversity.min()
    spread = diversity.max() - lowest
    if spread > 0:
        diversity = (diversity - lowest) / spread

    return diversity