def recalc_diversity(portfolio: "pd.DataFrame", player_columns: "list[str]") -> "np.ndarray":
    """Score each lineup by how different its players are from all other lineups.

    Every row of ``portfolio`` is treated as a set of player names taken from
    ``player_columns``. The score of a row is its mean Jaccard distance
    (``1 - |A & B| / |A | B|``) to every other row, min-max normalised to the
    0-1 range across the portfolio.

    Args:
        portfolio: DataFrame with one lineup per row.
        player_columns: Labels of the columns in ``portfolio`` that hold
            player names.

    Returns:
        A 1-D float array with one score per row, in row order. Higher means
        the lineup shares fewer players with the rest. When every row has the
        same mean distance the un-normalised means are returned unchanged.
        Portfolios with fewer than two rows have nothing to compare against
        and yield zeros (an empty array for an empty portfolio).

    Notes:
        * Missing values and blank / whitespace-only cells are ignored; they
          are never counted as a player.
        * Names are compared exactly as stored (no trimming or case folding).
        * Two lineups that are both empty are treated as fully distant
          (distance 1), because their union is empty.
        * Memory use is quadratic in the number of rows (pairwise matrices).
    """
    # Function-scope import keeps the block usable on its own; after the first
    # import this is only a sys.modules lookup.
    import numpy as np

    frame = portfolio[player_columns]
    n_rows = len(frame)
    if n_rows < 2:
        # No peers to compare with: avoid the 0/0 mean and min()/max() on an
        # empty array.
        return np.zeros(n_rows, dtype=float)

    # Record missing cells BEFORE stringifying: astype(str) turns NaN/None
    # into the literal text 'nan'/'None', which would then look like a player
    # shared by every lineup with an empty slot.
    missing = frame.isna().to_numpy()
    names = np.where(missing, '', frame.astype(str).to_numpy()).astype(str)

    # Cells that actually contain a player name.
    present = np.char.strip(names) != ''
    row_idx = np.nonzero(present)[0]

    # Map each distinct name to a column id; `player_idx` lines up with
    # `row_idx` because boolean indexing and nonzero() both walk row-major.
    players, player_idx = np.unique(names[present], return_inverse=True)

    # int32 rather than int8: the dot product keeps the input dtype, and int8
    # silently wraps once two lineups share more than 127 players.
    membership = np.zeros((n_rows, players.size), dtype=np.int32)
    membership[row_idx, player_idx.ravel()] = 1

    # Pairwise |A & B| and |A | B| for every pair of lineups.
    intersection = membership @ membership.T
    lineup_sizes = membership.sum(axis=1)
    union = lineup_sizes[:, np.newaxis] + lineup_sizes - intersection

    # Pairs with an empty union keep similarity 0 (so distance 1).
    similarity = np.divide(
        intersection,
        union,
        out=np.zeros(union.shape, dtype=float),
        where=union != 0,
    )
    distance = 1.0 - similarity

    # Drop the self-comparison, then average over the remaining n_rows - 1.
    np.fill_diagonal(distance, 0.0)
    diversity = distance.sum(axis=1) / (n_rows - 1)

    # Min-max normalise; skipped when all scores are equal to avoid 0/0.
    lowest = diversity.min()
    spread = diversity.max() - lowest
    if spread > 0:
        diversity = (diversity - lowest) / spread

    return diversity