File size: 2,322 Bytes
872a007
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import streamlit as st
import numpy as np
import pandas as pd
import time
import math
from difflib import SequenceMatcher

def recalc_diversity(portfolio, player_columns):
    """
    Score each row of ``portfolio`` by how different its players are from
    the players of every other row.

    Each row is reduced to the set of non-blank values found in
    ``player_columns``. The pairwise Jaccard distance
    (``1 - |A & B| / |A | B|``) is computed between all rows, and each row's
    score is its mean distance to the other rows. Scores are then min-max
    normalized to the 0-1 range.

    Args:
        portfolio: pandas DataFrame with one lineup per row.
        player_columns: list of column labels in ``portfolio`` that hold
            player names.

    Returns:
        1-D float NumPy array with one score per row (higher means the row
        shares fewer players with the rest). When every row has the same
        mean distance the un-normalized means are returned as-is. Portfolios
        with fewer than two rows have nothing to compare against and yield
        an all-zero array of length ``len(portfolio)``.
    """
    n_rows = len(portfolio)
    if n_rows < 2:
        # No "other" rows exist: avoid dividing by (n_rows - 1) == 0 and
        # calling max()/min() on an empty array.
        return np.zeros(n_rows, dtype=float)

    player_frame = portfolio[player_columns]
    # Record missing cells BEFORE the string cast: astype(str) turns NaN/None
    # into the literal text 'nan'/'None', which would otherwise be counted as
    # a player shared by every row with an empty slot.
    missing = player_frame.isna().values
    names = player_frame.astype(str).values

    # Single pass: assign each distinct player a column id and remember which
    # (row, player) cells must be switched on.
    player_to_id = {}
    row_idx = []
    col_idx = []
    for i, (row_names, row_missing) in enumerate(zip(names, missing)):
        for name, is_missing in zip(row_names, row_missing):
            if is_missing or not name.strip():
                continue
            row_idx.append(i)
            col_idx.append(player_to_id.setdefault(name, len(player_to_id)))

    # int32 rather than int8: the dot product below accumulates in the input
    # dtype, so int8 would wrap once two rows share more than 127 players.
    membership = np.zeros((n_rows, len(player_to_id)), dtype=np.int32)
    if row_idx:
        membership[row_idx, col_idx] = 1

    # |A & B| for every pair of rows, and |A | B| = |A| + |B| - |A & B|.
    intersection = np.dot(membership, membership.T)
    row_sizes = membership.sum(axis=1)
    union = row_sizes[:, np.newaxis] + row_sizes - intersection

    # Pairs whose union is empty (both rows blank) keep similarity 0.
    with np.errstate(divide='ignore', invalid='ignore'):
        jaccard_similarity = np.divide(
            intersection,
            union,
            out=np.zeros(intersection.shape, dtype=float),
            where=union != 0,
        )

    jaccard_distance = 1.0 - jaccard_similarity

    # Drop the self-comparison, then average over the remaining rows.
    np.fill_diagonal(jaccard_distance, 0)
    diversity_scores = jaccard_distance.sum(axis=1) / (n_rows - 1)

    # Min-max normalize to 0-1; skipped when all scores are equal.
    lowest = diversity_scores.min()
    score_range = diversity_scores.max() - lowest
    if score_range > 0:
        diversity_scores = (diversity_scores - lowest) / score_range

    return diversity_scores