sunga25 committed on
Commit
e8874dd
·
verified ·
1 Parent(s): 0986083

Update src/main.py

Browse files
Files changed (1) hide show
  1. src/main.py +435 -0
src/main.py CHANGED
@@ -0,0 +1,435 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.optim as optim
6
+ from torch.utils.data import DataLoader, TensorDataset
7
+ from torch.optim.lr_scheduler import ReduceLROnPlateau
8
+ from sklearn.model_selection import KFold
9
+ from sklearn.preprocessing import StandardScaler, LabelEncoder
10
+ from sklearn.cluster import DBSCAN
11
+ from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
12
+ from sklearn.linear_model import LinearRegression
13
+ from sklearn.metrics import mean_squared_error
14
+ from sklearn.base import BaseEstimator, TransformerMixin
15
+ import dask.dataframe as dd
16
+ import optuna
17
+ import numpy as np
18
+ import pandas as pd
19
+ import pytorch_lightning as pl
20
+ import json
21
+
22
# Configure the root logger once at import time: timestamp, level, message.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Seed passed to the seeded estimators/splitters, and the number of CV folds.
RANDOM_SEED = 42
N_SPLITS = 5

# Relative paths of the artefacts this script writes and later reads back.
MODEL_CONFIG_PATH = 'model_config.json'
BEST_MODEL_PATH = 'best_model.pt'
FINAL_MODEL_PATH = 'final_model.pt'
33
+
34
class AdvancedFeatureEngineering(BaseEstimator, TransformerMixin):
    """Stateless transformer that adds rank/Elo difference features.

    Args:
        numeric_cols: Names of the columns that ``transform`` coerces to a
            numeric dtype (unparseable entries become NaN).
    """

    def __init__(self, numeric_cols):
        self.numeric_cols = numeric_cols

    def fit(self, X, y=None):
        """Learn nothing from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with derived features added.

        Adds ``rank_difference`` and ``elo_difference`` (the Elo gap is
        doubled), adds ``elo_differenceDelta`` and its absolute value when
        both delta columns are present, and rewrites ``date`` as the number
        of days since the earliest date in ``X``.

        Raises:
            KeyError: If a rank/Elo column or one of ``numeric_cols`` is
                missing from ``X``.
        """
        X = X.copy()

        # Coerce BEFORE doing arithmetic: subtracting string-typed rank or
        # Elo columns would raise, whereas coerced values degrade to NaN.
        for col in self.numeric_cols:
            X[col] = pd.to_numeric(X[col], errors='coerce')

        # Rank gap and (double-weighted) Elo gap between winner and loser.
        X['rank_difference'] = X['winner_rank'] - X['loser_rank']
        X['elo_difference'] = (X['winner_eloRating'] - X['loser_eloRating']) * 2

        # The delta features are optional: only derived when both inputs exist.
        if 'winner_eloRatingDelta' in X.columns and 'loser_eloRatingDelta' in X.columns:
            X['elo_differenceDelta'] = (X['winner_eloRatingDelta'] - X['loser_eloRatingDelta']) * 2
            X['elo_differenceDelta_abs'] = np.abs(X['elo_differenceDelta'])

        # Dates become "days since the earliest date". This step is
        # deliberately best-effort: any failure leaves the column as NaN
        # instead of aborting the whole transform.
        try:
            X['date'] = pd.to_datetime(X['date'], format='%Y%m%d', errors='coerce')
            min_date = X['date'].min()
            X['date'] = (X['date'] - min_date).dt.days
        except Exception as e:
            logging.warning(f"Error converting dates: {str(e)}. Setting 'date' to NaN.")
            X['date'] = np.nan

        return X
67
+
68
def load_data(first_number=2, last_number=15):
    """Read ``PlayerMatches{i}.csv`` for each ``i`` in the inclusive range and merge them.

    Files that are missing, fail to load, or lack any core column are skipped
    with a warning. Duplicate rows are dropped per file and again after
    concatenation.

    Raises:
        ValueError: If no file yields data, or the merged frame is empty.
    """
    required = ['date', 'tournament', 'winner_name', 'winner_rank', 'winner_eloRating',
                'loser_name', 'loser_rank', 'loser_eloRating']
    extras = ['level', 'bestOf', 'surface', 'indoor', 'speed', 'round',
              'winner_seed', 'winner_country_name', 'winner_country_id',
              'winner_eloRatingDelta', 'loser_seed', 'loser_country_name',
              'loser_country_id', 'loser_eloRatingDelta', 'score', 'outcome', 'loser_entry']
    wanted = required + extras

    # Explicit dtypes handed to the CSV reader: two free-text columns stay
    # as objects, every other listed column is read as float64.
    column_dtypes = {'loser_entry': 'object', 'outcome': 'object'}
    column_dtypes.update(dict.fromkeys(
        ['winner_rank', 'loser_rank', 'winner_eloRating', 'loser_eloRating',
         'bestOf', 'speed', 'winner_eloRatingDelta', 'loser_eloRatingDelta',
         'indoor', 'winner_seed', 'loser_seed'],
        'float64',
    ))

    frames = []
    for number in range(first_number, last_number + 1):
        csv_name = f'PlayerMatches{number}.csv'
        try:
            lazy = dd.read_csv(csv_name, low_memory=False, assume_missing=True, dtype=column_dtypes)
            absent = [name for name in required if name not in lazy.columns]
            if absent:
                logging.warning(f"Missing core columns in {csv_name}: {absent}. Skipping this file.")
                continue
            present = [name for name in wanted if name in lazy.columns]
            # Materialise the de-duplicated selection as a pandas frame.
            frames.append(lazy[present].drop_duplicates().compute())
            logging.info(f"Loaded {csv_name}")
        except FileNotFoundError:
            logging.warning(f"{csv_name} not found. Skipping this file.")
        except Exception as err:
            logging.warning(f"An error occurred while loading {csv_name}: {str(err)}. Skipping this file.")

    if len(frames) == 0:
        raise ValueError("No valid data found.")

    merged = pd.concat(frames, ignore_index=True).drop_duplicates()
    if merged.empty:
        raise ValueError("The combined dataframe is empty.")

    return merged
120
+
121
def determine_column_types(df):
    """Split the dataframe's columns into numeric and categorical names.

    The four rank/Elo columns are always reported as numeric. Each optional
    candidate column present in ``df`` is reported as numeric when its dtype
    is numeric, or when it holds text and every non-null value parses as a
    number. Every remaining column except ``date`` is categorical.

    Args:
        df: Dataframe whose columns are to be classified.

    Returns:
        Tuple ``(numeric_cols, categorical_cols)`` of column-name lists.
    """
    numeric_cols = ['winner_rank', 'loser_rank', 'winner_eloRating', 'loser_eloRating']
    potential_numeric_cols = ['bestOf', 'speed', 'winner_eloRatingDelta', 'loser_eloRatingDelta',
                              'indoor', 'winner_seed', 'loser_seed']

    for col in potential_numeric_cols:
        if col not in df.columns:
            continue

        values = df[col]
        if pd.api.types.is_numeric_dtype(values):
            numeric_cols.append(col)
        elif pd.api.types.is_object_dtype(values) or pd.api.types.is_string_dtype(values):
            # Parse rather than use str.isnumeric(): isnumeric() rejects
            # decimals and negatives such as "1.5" or "-3". Missing values
            # are ignored so they cannot decide the outcome.
            non_null = values.dropna()
            if pd.to_numeric(non_null, errors='coerce').notna().all():
                numeric_cols.append(col)

    categorical_cols = [col for col in df.columns if col not in numeric_cols and col != 'date']

    return numeric_cols, categorical_cols
134
+
135
def preprocess_data(df, numeric_cols, categorical_cols):
    """Encode categoricals, derive features and impute missing values.

    Works on a copy, so the frame passed in by the caller is left untouched.

    Args:
        df: Raw match dataframe.
        numeric_cols: Columns coerced to numeric and median-imputed.
        categorical_cols: Columns label-encoded to integer codes.

    Returns:
        Tuple ``(processed_df, label_encoders)`` where ``label_encoders`` maps
        each categorical column name to its fitted ``LabelEncoder``.
    """
    logging.info(f"Shape before preprocessing: {df.shape}")

    # Never mutate the caller's dataframe.
    df = df.copy()

    label_encoders = {}
    for col in categorical_cols:
        le = LabelEncoder()
        # Casting to str first turns missing values into the literal 'nan',
        # which then receives its own integer class.
        df[col] = le.fit_transform(df[col].astype(str))
        label_encoders[col] = le

    feature_engineer = AdvancedFeatureEngineering(numeric_cols)
    df = feature_engineer.fit_transform(df)

    logging.info(f"Shape after feature engineering: {df.shape}")

    # Report remaining gaps before imputing them.
    for col in df.columns:
        nan_count = df[col].isna().sum()
        if nan_count > 0:
            logging.warning(f"Column {col} has {nan_count} NaN values")

    for col in numeric_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')
        median = df[col].median()
        if pd.isna(median):
            # An all-NaN column has no median; without a fallback the NaNs
            # would survive imputation and abort training later on.
            logging.warning(f"Column {col} has no numeric values; filling with 0.0")
            median = 0.0
        df[col] = df[col].fillna(median)

    # Safety net only: label-encoded columns normally contain no NaN.
    for col in categorical_cols:
        df[col] = df[col].fillna(-1)

    # 'rank_difference' was derived before imputation, so rows whose ranks
    # were missing still carry NaN there and are dropped rather than imputed.
    if df['rank_difference'].isna().any():
        logging.error("NaN values found in 'rank_difference' after preprocessing. Identifying rows with NaN values...")
        missing_rank_rows = df[df['rank_difference'].isna()]
        logging.info(f"Rows with missing 'rank_difference':\n{missing_rank_rows[['winner_rank', 'loser_rank']]}")
        df = df.dropna(subset=['rank_difference'])
        logging.info(f"Shape after dropping NaN rows in 'rank_difference': {df.shape}")

    return df, label_encoders
173
+
174
class JointEmbeddedModel(pl.LightningModule):
    """Regression network combining per-column embeddings with numeric inputs.

    Each categorical column gets its own ``nn.Embedding``; the embedded
    vectors are concatenated with the numeric features and passed through
    two hidden layers (ReLU + dropout) to a single output unit.

    Args:
        categorical_dims: Number of embedding rows for each categorical column.
        numerical_dim: Number of numeric input features.
        embedding_dim: Width of every embedding vector.
        hidden_dim: Width of the first hidden layer (the second is half of it).
        dropout_rate: Dropout probability applied after each hidden layer.
        learning_rate: Learning rate given to the AdamW optimizer.
    """

    def __init__(self, categorical_dims, numerical_dim, embedding_dim, hidden_dim, dropout_rate=0.3, learning_rate=1e-3):
        super().__init__()
        self.embeddings = nn.ModuleList([nn.Embedding(dim, embedding_dim) for dim in categorical_dims])
        self.fc1 = nn.Linear(len(categorical_dims) * embedding_dim + numerical_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim // 2)
        self.fc3 = nn.Linear(hidden_dim // 2, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)
        self.criterion = nn.MSELoss()
        self.learning_rate = learning_rate

    def forward(self, x_cat, x_num):
        """Return one prediction per row as a 1-D tensor.

        Column ``i`` of ``x_cat`` is looked up in embedding ``i``.
        """
        embedded = [emb(x_cat[:, i]) for i, emb in enumerate(self.embeddings)]
        x = torch.cat(embedded + [x_num], dim=1)
        x = self.dropout(self.relu(self.fc1(x)))
        x = self.dropout(self.relu(self.fc2(x)))
        # Squeeze only the trailing unit dimension. A bare squeeze() would
        # also remove the batch dimension for a batch of one, yielding a
        # 0-d tensor whose shape no longer matches the target.
        return self.fc3(x).squeeze(-1)

    def training_step(self, batch, batch_idx):
        """Compute and log the MSE loss for one ``(x_cat, x_num, y)`` batch."""
        x_cat, x_num, y = batch
        y_hat = self(x_cat, x_num)
        loss = self.criterion(y_hat, y)
        self.log('train_loss', loss)
        return loss

    def configure_optimizers(self):
        """Return AdamW plus a plateau scheduler keyed on ``train_loss``."""
        optimizer = optim.AdamW(self.parameters(), lr=self.learning_rate, weight_decay=1e-4)
        # Halve the learning rate when the monitored loss stops improving.
        scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)
        return {'optimizer': optimizer, 'lr_scheduler': scheduler, 'monitor': 'train_loss'}
205
+
206
def create_dataloader(X, y, batch_size=64, shuffle=True):
    """Wrap categorical/numeric features and targets in a ``DataLoader``.

    Args:
        X: Pair ``(x_cat, x_num)``; ``x_cat`` is converted to a ``long``
            tensor and ``x_num`` to a ``float32`` tensor.
        y: Targets, converted to a ``float32`` tensor.
        batch_size: Number of samples per batch.
        shuffle: Whether to reshuffle the samples every epoch. Defaults to
            ``True`` (the previous fixed behaviour); pass ``False`` for
            evaluation loaders that need a stable order.

    Returns:
        ``DataLoader`` yielding ``(x_cat, x_num, y)`` batches.
    """
    x_cat, x_num = X
    x_cat = torch.tensor(x_cat, dtype=torch.long)
    x_num = torch.tensor(x_num, dtype=torch.float32)
    y = torch.tensor(y, dtype=torch.float32)
    dataset = TensorDataset(x_cat, x_num, y)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
214
+
215
def ensemble_predictions(models, X):
    """Average the ``predict(X)`` outputs of every model, element-wise."""
    per_model = []
    for estimator in models:
        per_model.append(estimator.predict(X))
    # Collapse the model axis, leaving one averaged value per sample.
    return np.mean(per_model, axis=0)
219
+
220
def save_model_config(config, path):
    """Serialise ``config`` as JSON into the file at ``path`` (overwriting it)."""
    with open(path, 'w') as handle:
        json.dump(config, handle)
224
+
225
def load_model_config(path):
    """Parse the JSON file at ``path`` and return the decoded configuration."""
    with open(path, 'r') as handle:
        config = json.load(handle)
    return config
229
+
230
def objective(trial):
    """Optuna objective: train one candidate model and return its validation MSE.

    Reads the module-level names ``categorical_dims``, ``numerical_dim``,
    ``X_train``, ``y_train``, ``X_val`` and ``y_val``, which the calling
    script must set before running the study.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean squared error of the trained model on the validation split.

    Raises:
        ValueError: If the validation targets contain NaN.
        optuna.TrialPruned: If the trained model produces NaN predictions.
    """
    embedding_dim = trial.suggest_int('embedding_dim', 16, 128)
    hidden_dim = trial.suggest_int('hidden_dim', 64, 512)
    learning_rate = trial.suggest_float('learning_rate', 1e-6, 1e-2, log=True)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)

    # Bad targets are a data problem, not a property of this trial: fail
    # loudly, and before spending time on training.
    if np.isnan(y_val).any():
        raise ValueError("Validation targets or predictions contain NaN values.")

    model = JointEmbeddedModel(categorical_dims, numerical_dim, embedding_dim, hidden_dim, dropout_rate, learning_rate)
    dataloader = create_dataloader(X_train, y_train, batch_size=batch_size)

    trainer = pl.Trainer(
        max_epochs=20,
        accelerator='gpu' if torch.cuda.is_available() else 'cpu',  # Use GPU if available
        devices=1,
        logger=False,
        enable_checkpointing=False
    )
    trainer.fit(model, dataloader)

    # Score in eval mode and without gradients so dropout does not add noise
    # to the metric; build the inputs on whichever device the model is on.
    device = next(model.parameters()).device
    model.eval()
    with torch.no_grad():
        val_predictions = model(
            torch.tensor(X_val[0], dtype=torch.long, device=device),
            torch.tensor(X_val[1], dtype=torch.float32, device=device),
        )
    # reshape(-1) keeps a single-row validation split one-dimensional.
    val_predictions = val_predictions.detach().cpu().numpy().reshape(-1)

    if np.isnan(val_predictions).any():
        # A diverged candidate should discard this trial only, not abort
        # the whole study.
        raise optuna.TrialPruned("Validation predictions contain NaN values.")

    val_loss = mean_squared_error(y_val, val_predictions)

    return val_loss
257
+
258
def analyze_winning_streaks(model, X, df_subset, eps=0.5, min_samples=5, threshold=0.5,
                            output_path='winning_streak_analysis.csv'):
    """Cluster match embeddings and summarise easy/hard draws per winner.

    Every match is labelled with a DBSCAN cluster of its standardised,
    concatenated embeddings and with the model's predicted rank difference.
    A match is an "easy draw" when the actual rank difference exceeds the
    prediction by more than ``threshold`` and a "hard draw" in the opposite
    case. Results are aggregated per ``winner_name`` and written to CSV.

    Args:
        model: Trained model exposing ``embeddings`` and a
            ``model(x_cat, x_num)`` forward call.
        X: Pair ``(x_cat, x_num)`` of feature arrays, row-aligned with
            ``df_subset``.
        df_subset: Dataframe with ``winner_name`` and ``rank_difference``
            columns. It is copied, so the caller's frame is not modified.
        eps: DBSCAN neighbourhood radius.
        min_samples: DBSCAN minimum neighbourhood size.
        threshold: Margin separating easy/hard draws from neutral ones.
        output_path: Destination CSV file for the per-winner summary.

    Returns:
        Per-winner dataframe sorted by ``hard_draw_ratio`` in descending order.
    """
    # Use the device the model actually lives on. Sending the inputs to CUDA
    # while the model is still on the CPU raises a device-mismatch error.
    device = next(model.parameters()).device
    model.eval()
    x_cat, x_num = X

    cat_tensor = torch.tensor(x_cat, dtype=torch.long, device=device)
    num_tensor = torch.tensor(x_num, dtype=torch.float32, device=device)

    with torch.no_grad():
        embedded = [emb(cat_tensor[:, i]) for i, emb in enumerate(model.embeddings)]
        embeddings = torch.cat(embedded, dim=1).cpu().numpy()
        # reshape(-1) guarantees one value per row even for a single match.
        outputs = model(cat_tensor, num_tensor).cpu().numpy().reshape(-1)

    # Standardise so that eps is interpreted on a comparable scale.
    scaler = StandardScaler()
    embeddings = scaler.fit_transform(embeddings)

    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    labels = dbscan.fit_predict(embeddings)

    # Annotate a private copy instead of adding columns to the caller's frame.
    df_subset = df_subset.copy()
    df_subset['cluster'] = labels
    df_subset['predicted_rank_difference'] = outputs

    df_subset['easy_draw'] = (df_subset['rank_difference'] - df_subset['predicted_rank_difference']) > threshold
    df_subset['hard_draw'] = (df_subset['predicted_rank_difference'] - df_subset['rank_difference']) > threshold

    # Counting 'cluster' yields the number of matches per winner.
    results = df_subset.groupby('winner_name').agg({
        'cluster': 'count',
        'easy_draw': 'sum',
        'hard_draw': 'sum'
    }).reset_index()

    results['easy_draw_ratio'] = results['easy_draw'] / results['cluster']
    results['hard_draw_ratio'] = results['hard_draw'] / results['cluster']

    results.sort_values('hard_draw_ratio', ascending=False, inplace=True)
    results.to_csv(output_path, index=False)

    logging.info(f"Analysis results saved to {output_path}")

    return results
297
+
298
def _predict_numpy(model, x_cat, x_num):
    """Run ``model`` in eval mode without gradients; return a flat numpy array."""
    device = next(model.parameters()).device
    model.eval()
    with torch.no_grad():
        predictions = model(
            torch.tensor(x_cat, dtype=torch.long, device=device),
            torch.tensor(x_num, dtype=torch.float32, device=device),
        )
    return predictions.detach().cpu().numpy().reshape(-1)


if __name__ == "__main__":
    # NOTE: this stays module-level script code on purpose. objective() reads
    # X_train, y_train, X_val, y_val, categorical_dims and numerical_dim as
    # module globals, so they must be assigned at this level.
    try:
        # Apply the seed to Python, NumPy and torch so that weight
        # initialisation and batch shuffling honour RANDOM_SEED.
        pl.seed_everything(RANDOM_SEED)

        df = load_data()
        logging.info(f"Data loaded successfully. Shape: {df.shape}")

        numeric_columns, categorical_columns = determine_column_types(df)
        logging.info(f"Numeric columns: {numeric_columns}")
        logging.info(f"Categorical columns: {categorical_columns}")

        df, label_encoders = preprocess_data(df, numeric_columns, categorical_columns)
        logging.info(f"Data preprocessed. Shape after preprocessing: {df.shape}")

        # Ensure all numeric columns are properly handled
        for col in numeric_columns:
            if not pd.api.types.is_numeric_dtype(df[col]):
                raise ValueError(f"Column {col} contains non-numeric data after preprocessing")

        if df.shape[0] < N_SPLITS:
            raise ValueError(f"Not enough samples ({df.shape[0]}) for {N_SPLITS}-fold cross-validation.")

        X_cat = df[categorical_columns].values
        X_num = df[numeric_columns].values.astype(float)
        y = df['rank_difference'].values.astype(float)

        # The target must be complete; abort instead of training on NaN.
        if np.isnan(y).any():
            raise ValueError("Target variable contains NaN values.")

        logging.info(f"Shape of X_cat: {X_cat.shape}")
        logging.info(f"Shape of X_num: {X_num.shape}")
        logging.info(f"Shape of y: {y.shape}")

        # Identical for every fold, so computed once.
        categorical_dims = [len(label_encoders[col].classes_) for col in categorical_columns]
        numerical_dim = len(numeric_columns)

        kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_SEED)
        scores = []
        best_val_loss = float('inf')

        for train_index, val_index in kf.split(X_cat):
            X_cat_train, X_cat_val = X_cat[train_index], X_cat[val_index]
            X_num_train, X_num_val = X_num[train_index], X_num[val_index]
            y_train, y_val = y[train_index], y[val_index]

            # Additional NaN checks for validation and training sets
            if np.isnan(X_cat_train).any() or np.isnan(X_num_train).any() or np.isnan(y_train).any():
                raise ValueError("Training data contains NaN values.")
            if np.isnan(X_cat_val).any() or np.isnan(X_num_val).any() or np.isnan(y_val).any():
                raise ValueError("Validation data contains NaN values.")

            X_train = (X_cat_train, X_num_train)
            X_val = (X_cat_val, X_num_val)

            try:
                study = optuna.create_study(direction='minimize')
                study.optimize(objective, n_trials=100)

                best_params = study.best_params
                logging.info(f"Best Hyperparameters: {best_params}")

                model_config = {
                    'categorical_dims': categorical_dims,
                    'numerical_dim': numerical_dim,
                    'embedding_dim': best_params['embedding_dim'],
                    'hidden_dim': best_params['hidden_dim'],
                    'dropout_rate': best_params['dropout_rate'],
                    'learning_rate': best_params['learning_rate']
                }

                model = JointEmbeddedModel(**model_config)
                dataloader = create_dataloader(X_train, y_train, batch_size=best_params['batch_size'])

                trainer = pl.Trainer(
                    max_epochs=100,
                    accelerator='gpu' if torch.cuda.is_available() else 'cpu',
                    devices=1,
                    logger=False,
                    enable_checkpointing=False
                )
                trainer.fit(model, dataloader)

                # Evaluate with dropout disabled so the score is deterministic.
                val_predictions = _predict_numpy(model, X_val[0], X_val[1])
                if np.isnan(val_predictions).any():
                    raise ValueError("Validation predictions contain NaN values.")

                val_loss = mean_squared_error(y_val, val_predictions)
                scores.append(val_loss)

                # Persist weights and their matching configuration together,
                # and only when this fold beats every earlier one, so that
                # BEST_MODEL_PATH really holds the best model rather than
                # whichever fold happened to run last.
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    save_model_config(model_config, MODEL_CONFIG_PATH)
                    torch.save(model.state_dict(), BEST_MODEL_PATH)
                    logging.info(f"New best fold saved with validation MSE: {val_loss:.4f}")

            except Exception as e:
                # A failing fold is logged and skipped; remaining folds still run.
                logging.error(f"An error occurred during optimization: {str(e)}")
                logging.error("Exception details:", exc_info=True)

        if scores:
            logging.info(f"Cross-Validation MSE: {np.mean(scores):.4f}")
        else:
            # np.mean([]) would only yield NaN plus a RuntimeWarning.
            logging.error("All cross-validation folds failed; no Cross-Validation MSE available.")

        # Train ensemble models and evaluate
        ensemble_models = [
            RandomForestRegressor(n_estimators=300, random_state=RANDOM_SEED),
            GradientBoostingRegressor(n_estimators=300, random_state=RANDOM_SEED),
            LinearRegression()
        ]

        # Build the stacked feature matrix once instead of three times.
        X_all = np.hstack((X_cat, X_num))

        # Check for NaNs in ensemble training data
        if np.isnan(X_all).any() or np.isnan(y).any():
            raise ValueError("Ensemble training data contains NaN values.")

        ensemble_models = [estimator.fit(X_all, y) for estimator in ensemble_models]
        ensemble_preds = ensemble_predictions(ensemble_models, X_all)
        ensemble_mse = mean_squared_error(y, ensemble_preds)

        # NOTE(review): this is measured on the same rows the ensemble was
        # fitted on, so despite its label it is an in-sample error.
        logging.info(f"Ensemble Test MSE: {ensemble_mse:.4f}")

        # Load the best model configuration and state for final analysis
        if os.path.exists(BEST_MODEL_PATH) and os.path.exists(MODEL_CONFIG_PATH):
            model_config = load_model_config(MODEL_CONFIG_PATH)
            model = JointEmbeddedModel(**model_config)
            model.load_state_dict(torch.load(BEST_MODEL_PATH))
            model.eval()

            # NOTE(review): the full dataset includes the training rows, so
            # this is not a held-out score either.
            test_predictions = _predict_numpy(model, X_cat, X_num)
            if np.isnan(test_predictions).any():
                raise ValueError("Test predictions contain NaN values.")

            test_mse = mean_squared_error(y, test_predictions)
            logging.info(f"Final Test MSE: {test_mse}")

            winning_streak_analysis = analyze_winning_streaks(model, (X_cat, X_num), df)
            torch.save(model.state_dict(), FINAL_MODEL_PATH)
            logging.info("Script execution completed successfully.")
        else:
            logging.error("Best model or configuration not found. Ensure training is completed before running analysis.")

    except Exception as e:
        # Top-level boundary: report the failure with its traceback.
        logging.error(f"An error occurred during script execution: {str(e)}")
        logging.error("Exception details:", exc_info=True)