Spaces:
Sleeping
Sleeping
James McCool
committed on
Commit
·
ebc0082
1
Parent(s):
d4f6599
Implement chunked processing for player similarity score calculation to enhance memory efficiency in predict_dupes.py
Browse files- global_func/predict_dupes.py +30 -24
global_func/predict_dupes.py
CHANGED
|
@@ -52,53 +52,59 @@ def calculate_weighted_ownership_wrapper(row_ownerships):
|
|
| 52 |
ownership_array = row_ownerships.values.reshape(1, -1)
|
| 53 |
return calculate_weighted_ownership_vectorized(ownership_array)[0]
|
| 54 |
|
| 55 |
-
def
|
| 56 |
"""
|
| 57 |
-
|
| 58 |
"""
|
| 59 |
-
#
|
| 60 |
player_data = portfolio[player_columns].astype(str).fillna('').values
|
| 61 |
|
| 62 |
-
# Get all unique players and create a mapping to numeric IDs
|
| 63 |
all_players = set()
|
| 64 |
for row in player_data:
|
| 65 |
for val in row:
|
| 66 |
if isinstance(val, str) and val.strip() != '':
|
| 67 |
all_players.add(val)
|
| 68 |
|
| 69 |
-
# Create player ID mapping
|
| 70 |
player_to_id = {player: idx for idx, player in enumerate(sorted(all_players))}
|
| 71 |
|
| 72 |
-
# Convert each row to a binary vector (1 if player is present, 0 if not)
|
| 73 |
n_players = len(all_players)
|
| 74 |
n_rows = len(portfolio)
|
| 75 |
binary_matrix = np.zeros((n_rows, n_players), dtype=np.int8)
|
| 76 |
|
| 77 |
-
# Vectorized binary matrix creation
|
| 78 |
for i, row in enumerate(player_data):
|
| 79 |
for val in row:
|
| 80 |
if isinstance(val, str) and str(val).strip() != '' and str(val) in player_to_id:
|
| 81 |
binary_matrix[i, player_to_id[str(val)]] = 1
|
| 82 |
|
| 83 |
-
#
|
| 84 |
-
|
| 85 |
-
row_sums = np.sum(binary_matrix, axis=1)
|
| 86 |
-
union_matrix = row_sums[:, np.newaxis] + row_sums - intersection_matrix
|
| 87 |
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
|
| 101 |
-
# Normalize
|
| 102 |
score_range = similarity_scores.max() - similarity_scores.min()
|
| 103 |
if score_range > 0:
|
| 104 |
similarity_scores = (similarity_scores - similarity_scores.min()) / score_range
|
|
|
|
| 52 |
ownership_array = row_ownerships.values.reshape(1, -1)
|
| 53 |
return calculate_weighted_ownership_vectorized(ownership_array)[0]
|
| 54 |
|
| 55 |
+
def calculate_player_similarity_score_chunked(portfolio, player_columns, chunk_size=1000):
|
| 56 |
"""
|
| 57 |
+
Memory-efficient version that processes similarities in chunks
|
| 58 |
"""
|
| 59 |
+
# Same setup as before
|
| 60 |
player_data = portfolio[player_columns].astype(str).fillna('').values
|
| 61 |
|
|
|
|
| 62 |
all_players = set()
|
| 63 |
for row in player_data:
|
| 64 |
for val in row:
|
| 65 |
if isinstance(val, str) and val.strip() != '':
|
| 66 |
all_players.add(val)
|
| 67 |
|
|
|
|
| 68 |
player_to_id = {player: idx for idx, player in enumerate(sorted(all_players))}
|
| 69 |
|
|
|
|
| 70 |
n_players = len(all_players)
|
| 71 |
n_rows = len(portfolio)
|
| 72 |
binary_matrix = np.zeros((n_rows, n_players), dtype=np.int8)
|
| 73 |
|
|
|
|
| 74 |
for i, row in enumerate(player_data):
|
| 75 |
for val in row:
|
| 76 |
if isinstance(val, str) and str(val).strip() != '' and str(val) in player_to_id:
|
| 77 |
binary_matrix[i, player_to_id[str(val)]] = 1
|
| 78 |
|
| 79 |
+
# Process similarities in chunks to avoid massive matrices
|
| 80 |
+
similarity_scores = np.zeros(n_rows)
|
|
|
|
|
|
|
| 81 |
|
| 82 |
+
for i in range(0, n_rows, chunk_size):
|
| 83 |
+
end_i = min(i + chunk_size, n_rows)
|
| 84 |
+
chunk_binary = binary_matrix[i:end_i]
|
| 85 |
+
|
| 86 |
+
# Calculate similarities for this chunk only
|
| 87 |
+
intersection = np.dot(chunk_binary, binary_matrix.T)
|
| 88 |
+
chunk_row_sums = np.sum(chunk_binary, axis=1)
|
| 89 |
+
all_row_sums = np.sum(binary_matrix, axis=1)
|
| 90 |
+
|
| 91 |
+
union = chunk_row_sums[:, np.newaxis] + all_row_sums - intersection
|
| 92 |
+
|
| 93 |
+
with np.errstate(divide='ignore', invalid='ignore'):
|
| 94 |
+
jaccard_sim = np.divide(intersection, union,
|
| 95 |
+
out=np.zeros_like(intersection, dtype=float),
|
| 96 |
+
where=union != 0)
|
| 97 |
+
|
| 98 |
+
jaccard_dist = 1 - jaccard_sim
|
| 99 |
+
|
| 100 |
+
# Exclude self-comparison and calculate average
|
| 101 |
+
for j in range(len(jaccard_dist)):
|
| 102 |
+
actual_idx = i + j
|
| 103 |
+
jaccard_dist[j, actual_idx] = 0 # Exclude self
|
| 104 |
+
|
| 105 |
+
similarity_scores[i:end_i] = np.sum(jaccard_dist, axis=1) / (n_rows - 1)
|
| 106 |
|
| 107 |
+
# Normalize
|
| 108 |
score_range = similarity_scores.max() - similarity_scores.min()
|
| 109 |
if score_range > 0:
|
| 110 |
similarity_scores = (similarity_scores - similarity_scores.min()) / score_range
|