James McCool committed
Commit · ebc0082
1 Parent(s): d4f6599
Implement chunked processing for player similarity score calculation to enhance memory efficiency in predict_dupes.py
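For scale (illustrative numbers, not from the commit): the previous version materialized full n_rows × n_rows Jaccard matrices (intersection_matrix, union_matrix), so a 50,000-lineup portfolio costs about 50,000 × 50,000 × 8 bytes ≈ 20 GB per float64 matrix. The chunked version only ever holds a chunk_size × n_rows slice at a time; at the default chunk_size=1000 that is about 1,000 × 50,000 × 8 bytes ≈ 0.4 GB.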
global_func/predict_dupes.py CHANGED · +30 -24
@@ -52,53 +52,59 @@ def calculate_weighted_ownership_wrapper(row_ownerships):
     ownership_array = row_ownerships.values.reshape(1, -1)
     return calculate_weighted_ownership_vectorized(ownership_array)[0]
 
-def calculate_player_similarity_score(portfolio, player_columns):
+def calculate_player_similarity_score_chunked(portfolio, player_columns, chunk_size=1000):
     """
-    …
+    Memory-efficient version that processes similarities in chunks
     """
-    # …
+    # Same setup as before
     player_data = portfolio[player_columns].astype(str).fillna('').values
 
-    # Get all unique players and create a mapping to numeric IDs
     all_players = set()
     for row in player_data:
         for val in row:
             if isinstance(val, str) and val.strip() != '':
                 all_players.add(val)
 
-    # Create player ID mapping
     player_to_id = {player: idx for idx, player in enumerate(sorted(all_players))}
 
-    # Convert each row to a binary vector (1 if player is present, 0 if not)
     n_players = len(all_players)
     n_rows = len(portfolio)
     binary_matrix = np.zeros((n_rows, n_players), dtype=np.int8)
 
-    # Vectorized binary matrix creation
     for i, row in enumerate(player_data):
         for val in row:
             if isinstance(val, str) and str(val).strip() != '' and str(val) in player_to_id:
                 binary_matrix[i, player_to_id[str(val)]] = 1
 
-    # …
-    intersection_matrix = np.dot(binary_matrix, binary_matrix.T)
-    row_sums = np.sum(binary_matrix, axis=1)
-    union_matrix = row_sums[:, np.newaxis] + row_sums - intersection_matrix
-    …
+    # Process similarities in chunks to avoid massive matrices
+    similarity_scores = np.zeros(n_rows)
+
+    for i in range(0, n_rows, chunk_size):
+        end_i = min(i + chunk_size, n_rows)
+        chunk_binary = binary_matrix[i:end_i]
+
+        # Calculate similarities for this chunk only
+        intersection = np.dot(chunk_binary, binary_matrix.T)
+        chunk_row_sums = np.sum(chunk_binary, axis=1)
+        all_row_sums = np.sum(binary_matrix, axis=1)
+
+        union = chunk_row_sums[:, np.newaxis] + all_row_sums - intersection
+
+        with np.errstate(divide='ignore', invalid='ignore'):
+            jaccard_sim = np.divide(intersection, union,
+                                    out=np.zeros_like(intersection, dtype=float),
+                                    where=union != 0)
+
+        jaccard_dist = 1 - jaccard_sim
+
+        # Exclude self-comparison and calculate average
+        for j in range(len(jaccard_dist)):
+            actual_idx = i + j
+            jaccard_dist[j, actual_idx] = 0  # Exclude self
+
+        similarity_scores[i:end_i] = np.sum(jaccard_dist, axis=1) / (n_rows - 1)
 
-    # Normalize
+    # Normalize
     score_range = similarity_scores.max() - similarity_scores.min()
     if score_range > 0:
         similarity_scores = (similarity_scores - similarity_scores.min()) / score_range
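The union line in the chunk loop relies on the standard set identity |A ∪ B| = |A| + |B| - |A ∩ B|: the row sums give each lineup's player count, the dot product against binary_matrix.T gives pairwise intersection counts, and the Jaccard distance is then 1 - intersection/union. Below is a minimal usage sketch; it assumes the function ends by returning the normalized similarity_scores array (the return statement falls outside this hunk), and the DataFrame columns and player names are made up for illustration.

import pandas as pd
from global_func.predict_dupes import calculate_player_similarity_score_chunked

# Hypothetical three-lineup portfolio; column and player names are illustrative.
portfolio = pd.DataFrame({
    'QB':  ['Mahomes',   'Allen',     'Mahomes'],
    'RB1': ['McCaffrey', 'McCaffrey', 'Barkley'],
    'WR1': ['Hill',      'Hill',      'Hill'],
})

# Average Jaccard distance of each lineup to all others, computed in
# 2-row chunks to exercise the chunked code path.
scores = calculate_player_similarity_score_chunked(portfolio, ['QB', 'RB1', 'WR1'], chunk_size=2)

# After min-max normalization the scores lie in [0, 1]; a higher score
# means the lineup shares fewer players with the rest of the portfolio.
print(scores)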