James McCool committed on
Commit
ebc0082
·
1 Parent(s): d4f6599

Implement chunked processing for player similarity score calculation to enhance memory efficiency in predict_dupes.py

Browse files
Files changed (1) hide show
  1. global_func/predict_dupes.py +30 -24
global_func/predict_dupes.py CHANGED
@@ -52,53 +52,59 @@ def calculate_weighted_ownership_wrapper(row_ownerships):
52
  ownership_array = row_ownerships.values.reshape(1, -1)
53
  return calculate_weighted_ownership_vectorized(ownership_array)[0]
54
 
55
- def calculate_player_similarity_score_vectorized(portfolio, player_columns):
56
  """
57
- Vectorized version of calculate_player_similarity_score using NumPy operations.
58
  """
59
- # Extract player data and convert to string array
60
  player_data = portfolio[player_columns].astype(str).fillna('').values
61
 
62
- # Get all unique players and create a mapping to numeric IDs
63
  all_players = set()
64
  for row in player_data:
65
  for val in row:
66
  if isinstance(val, str) and val.strip() != '':
67
  all_players.add(val)
68
 
69
- # Create player ID mapping
70
  player_to_id = {player: idx for idx, player in enumerate(sorted(all_players))}
71
 
72
- # Convert each row to a binary vector (1 if player is present, 0 if not)
73
  n_players = len(all_players)
74
  n_rows = len(portfolio)
75
  binary_matrix = np.zeros((n_rows, n_players), dtype=np.int8)
76
 
77
- # Vectorized binary matrix creation
78
  for i, row in enumerate(player_data):
79
  for val in row:
80
  if isinstance(val, str) and str(val).strip() != '' and str(val) in player_to_id:
81
  binary_matrix[i, player_to_id[str(val)]] = 1
82
 
83
- # Vectorized Jaccard distance calculation
84
- intersection_matrix = np.dot(binary_matrix, binary_matrix.T)
85
- row_sums = np.sum(binary_matrix, axis=1)
86
- union_matrix = row_sums[:, np.newaxis] + row_sums - intersection_matrix
87
 
88
- # Calculate Jaccard distance: 1 - (intersection / union)
89
- with np.errstate(divide='ignore', invalid='ignore'):
90
- jaccard_similarity = np.divide(intersection_matrix, union_matrix,
91
- out=np.zeros_like(intersection_matrix, dtype=float),
92
- where=union_matrix != 0)
93
-
94
- jaccard_distance = 1 - jaccard_similarity
95
-
96
- # Exclude self-comparison and calculate average distance for each row
97
- np.fill_diagonal(jaccard_distance, 0)
98
- row_counts = n_rows - 1
99
- similarity_scores = np.sum(jaccard_distance, axis=1) / row_counts
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
- # Normalize to 0-1 scale
102
  score_range = similarity_scores.max() - similarity_scores.min()
103
  if score_range > 0:
104
  similarity_scores = (similarity_scores - similarity_scores.min()) / score_range
 
52
  ownership_array = row_ownerships.values.reshape(1, -1)
53
  return calculate_weighted_ownership_vectorized(ownership_array)[0]
54
 
55
def calculate_player_similarity_score_chunked(portfolio, player_columns, chunk_size=1000):
    """
    Compute a 0-1 uniqueness score for every lineup (row) in ``portfolio``.

    Each row is converted to a binary player-membership vector; a row's raw
    score is its average Jaccard distance to every other row, then the raw
    scores are min-max normalized to [0, 1]. Pairwise similarities are
    evaluated ``chunk_size`` rows at a time so peak memory is
    O(chunk_size * n_rows) instead of O(n_rows^2).

    Parameters
    ----------
    portfolio : pandas.DataFrame
        One lineup per row; ``player_columns`` hold the player names.
    player_columns : sequence of column labels
        Columns of ``portfolio`` containing player names.
    chunk_size : int, default 1000
        Number of rows whose similarities are computed per iteration.

    Returns
    -------
    numpy.ndarray
        Length ``len(portfolio)`` array of normalized uniqueness scores.
    """
    # fillna BEFORE astype(str): astype(str) turns NaN into the literal
    # string 'nan', which would otherwise be counted as a real player.
    player_data = portfolio[player_columns].fillna('').astype(str).values

    n_rows = len(portfolio)
    # Degenerate portfolios (0 or 1 lineup) have no "other rows" to compare
    # against; avoid the division by n_rows - 1 below.
    if n_rows <= 1:
        return np.zeros(n_rows)

    # Collect the distinct, non-blank player names.
    all_players = set()
    for row in player_data:
        for val in row:
            if isinstance(val, str) and val.strip() != '':
                all_players.add(val)

    player_to_id = {player: idx for idx, player in enumerate(sorted(all_players))}

    # Binary membership matrix: binary_matrix[i, j] == 1 iff row i uses player j.
    n_players = len(all_players)
    binary_matrix = np.zeros((n_rows, n_players), dtype=np.int8)
    for i, row in enumerate(player_data):
        for val in row:
            if isinstance(val, str) and str(val).strip() != '' and str(val) in player_to_id:
                binary_matrix[i, player_to_id[str(val)]] = 1

    # Loop-invariant per-row player counts, hoisted out of the chunk loop
    # (the original recomputed this full-matrix sum on every chunk).
    all_row_sums = np.sum(binary_matrix, axis=1)

    # Process similarities chunk-by-chunk to avoid materializing the full
    # n_rows x n_rows similarity matrix at once.
    similarity_scores = np.zeros(n_rows)
    for start in range(0, n_rows, chunk_size):
        end = min(start + chunk_size, n_rows)
        chunk_binary = binary_matrix[start:end]

        # |A ∩ B| for every (chunk row, portfolio row) pair.
        intersection = np.dot(chunk_binary, binary_matrix.T)
        # |A ∪ B| = |A| + |B| - |A ∩ B|
        union = all_row_sums[start:end][:, np.newaxis] + all_row_sums - intersection

        # Jaccard similarity; empty-vs-empty rows (union == 0) stay 0.
        with np.errstate(divide='ignore', invalid='ignore'):
            jaccard_sim = np.divide(intersection, union,
                                    out=np.zeros_like(intersection, dtype=float),
                                    where=union != 0)
        jaccard_dist = 1 - jaccard_sim

        # Zero out each chunk row's self-comparison in one vectorized
        # fancy-index assignment (replaces the per-row Python loop).
        chunk_len = end - start
        jaccard_dist[np.arange(chunk_len), np.arange(start, end)] = 0

        # Average distance to the other n_rows - 1 lineups.
        similarity_scores[start:end] = np.sum(jaccard_dist, axis=1) / (n_rows - 1)

    # Min-max normalize to the 0-1 scale (all-equal scores stay unchanged).
    score_range = similarity_scores.max() - similarity_scores.min()
    if score_range > 0:
        similarity_scores = (similarity_scores - similarity_scores.min()) / score_range

    # NOTE(review): the diff hunk ends at the normalization; the unchanged
    # `return` below is presumed from context — confirm against the full file.
    return similarity_scores