Update src/main.py
src/main.py (new file, +435 lines)
import os
import logging
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import DBSCAN
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator, TransformerMixin
import dask.dataframe as dd
import optuna
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import json

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Constants
RANDOM_SEED = 42
N_SPLITS = 5

# Define paths
MODEL_CONFIG_PATH = 'model_config.json'
BEST_MODEL_PATH = 'best_model.pt'
FINAL_MODEL_PATH = 'final_model.pt'

class AdvancedFeatureEngineering(BaseEstimator, TransformerMixin):
    """Custom transformer for advanced feature engineering."""
    def __init__(self, numeric_cols):
        self.numeric_cols = numeric_cols

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        # Feature engineering for rank and elo differences with adjusted weighting
        X['rank_difference'] = X['winner_rank'] - X['loser_rank']
        X['elo_difference'] = (X['winner_eloRating'] - X['loser_eloRating']) * 2  # Further emphasizing ELO ratings

        # Optional feature engineering based on available columns
        if 'winner_eloRatingDelta' in X.columns and 'loser_eloRatingDelta' in X.columns:
            X['elo_differenceDelta'] = (X['winner_eloRatingDelta'] - X['loser_eloRatingDelta']) * 2
            X['elo_differenceDelta_abs'] = np.abs(X['elo_differenceDelta'])

        # Convert dates to numeric days since the earliest date
        try:
            X['date'] = pd.to_datetime(X['date'], format='%Y%m%d', errors='coerce')
            min_date = X['date'].min()
            X['date'] = (X['date'] - min_date).dt.days
        except Exception as e:
            logging.warning(f"Error converting dates: {str(e)}. Setting 'date' to NaN.")
            X['date'] = np.nan

        # Convert numeric columns to float, handling errors
        for col in self.numeric_cols:
            X[col] = pd.to_numeric(X[col], errors='coerce')

        return X

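# --- Hedged usage sketch (added for illustration; never called by the pipeline)
# A minimal demonstration of AdvancedFeatureEngineering on a two-row toy frame.
# The frame and its values are hypothetical and only exercise the core columns.
def _demo_feature_engineering():
    toy = pd.DataFrame({
        'date': ['20230101', '20230115'],
        'winner_rank': [3, 10], 'loser_rank': [25, 4],
        'winner_eloRating': [2100, 1850], 'loser_eloRating': [1900, 2050],
    })
    fe = AdvancedFeatureEngineering(numeric_cols=['winner_rank', 'loser_rank',
                                                  'winner_eloRating', 'loser_eloRating'])
    out = fe.fit_transform(toy)
    # 'rank_difference' -> [-22, 6]; 'elo_difference' -> [400, -400];
    # 'date' -> days since the earliest row: [0, 14].
    return out
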
def load_data(first_number=2, last_number=15):
    """Load and combine player match data from multiple CSV files."""
    dfs = []
    core_columns = ['date', 'tournament', 'winner_name', 'winner_rank', 'winner_eloRating',
                    'loser_name', 'loser_rank', 'loser_eloRating']
    optional_columns = ['level', 'bestOf', 'surface', 'indoor', 'speed', 'round',
                        'winner_seed', 'winner_country_name', 'winner_country_id',
                        'winner_eloRatingDelta', 'loser_seed', 'loser_country_name',
                        'loser_country_id', 'loser_eloRatingDelta', 'score', 'outcome', 'loser_entry']
    all_columns = core_columns + optional_columns

    dtype_dict = {
        'loser_entry': 'object',
        'outcome': 'object',
        'winner_rank': 'float64',
        'loser_rank': 'float64',
        'winner_eloRating': 'float64',
        'loser_eloRating': 'float64',
        'bestOf': 'float64',
        'speed': 'float64',
        'winner_eloRatingDelta': 'float64',
        'loser_eloRatingDelta': 'float64',
        'indoor': 'float64',
        'winner_seed': 'float64',
        'loser_seed': 'float64'
    }

    for i in range(first_number, last_number + 1):
        file_path = f'PlayerMatches{i}.csv'
        try:
            df = dd.read_csv(file_path, low_memory=False, assume_missing=True, dtype=dtype_dict)
            missing_core_columns = [col for col in core_columns if col not in df.columns]
            if missing_core_columns:
                logging.warning(f"Missing core columns in {file_path}: {missing_core_columns}. Skipping this file.")
                continue
            available_columns = [col for col in all_columns if col in df.columns]
            df = df[available_columns].drop_duplicates().compute()
            dfs.append(df)
            logging.info(f"Loaded {file_path}")
        except FileNotFoundError:
            logging.warning(f"{file_path} not found. Skipping this file.")
        except Exception as e:
            logging.warning(f"An error occurred while loading {file_path}: {str(e)}. Skipping this file.")

    if not dfs:
        raise ValueError("No valid data found.")

    combined_df = pd.concat(dfs, ignore_index=True).drop_duplicates()
    if combined_df.empty:
        raise ValueError("The combined dataframe is empty.")

    return combined_df

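# --- Hedged usage sketch ----------------------------------------------------
# load_data() expects files named 'PlayerMatches2.csv' .. 'PlayerMatches15.csv'
# in the working directory; missing files are skipped with a warning. A sketch
# for loading only a subset (the narrower range below is hypothetical):
def _demo_load_subset():
    df = load_data(first_number=2, last_number=3)  # only PlayerMatches2/3.csv
    logging.info(f"Loaded subset with {len(df)} rows")
    return df
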
def determine_column_types(df):
    """Determine numeric and categorical column types in the dataframe."""
    numeric_cols = ['winner_rank', 'loser_rank', 'winner_eloRating', 'loser_eloRating']
    potential_numeric_cols = ['bestOf', 'speed', 'winner_eloRatingDelta', 'loser_eloRatingDelta', 'indoor', 'winner_seed', 'loser_seed']

    for col in potential_numeric_cols:
        if col in df.columns:
            # Cast to str before using the .str accessor so object columns that
            # hold non-string values (e.g. NaN floats) do not raise.
            if pd.api.types.is_numeric_dtype(df[col]) or df[col].astype(str).str.isnumeric().all():
                numeric_cols.append(col)

    categorical_cols = [col for col in df.columns if col not in numeric_cols and col != 'date']

    return numeric_cols, categorical_cols

def preprocess_data(df, numeric_cols, categorical_cols):
    """Preprocess the dataframe, including encoding categorical variables and handling missing values."""
    logging.info(f"Shape before preprocessing: {df.shape}")

    label_encoders = {}
    for col in categorical_cols:
        le = LabelEncoder()
        df[col] = df[col].astype(str)
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le

    feature_engineer = AdvancedFeatureEngineering(numeric_cols)
    df = feature_engineer.fit_transform(df)

    logging.info(f"Shape after feature engineering: {df.shape}")

    # Handle NaN values in numeric and categorical columns
    for col in df.columns:
        nan_count = df[col].isna().sum()
        if nan_count > 0:
            logging.warning(f"Column {col} has {nan_count} NaN values")

    for col in numeric_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')
        df[col] = df[col].fillna(df[col].median())

    for col in categorical_cols:
        df[col] = df[col].fillna(-1)

    # Additional check for NaNs in rank_difference
    if df['rank_difference'].isna().any():
        logging.error("NaN values found in 'rank_difference' after preprocessing. Identifying rows with NaN values...")
        missing_rank_rows = df[df['rank_difference'].isna()]
        logging.info(f"Rows with missing 'rank_difference':\n{missing_rank_rows[['winner_rank', 'loser_rank']]}")
        df.dropna(subset=['rank_difference'], inplace=True)  # Drop rows with NaN in 'rank_difference'
        logging.info(f"Shape after dropping NaN rows in 'rank_difference': {df.shape}")

    return df, label_encoders

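# --- Hedged usage sketch ----------------------------------------------------
# The returned label_encoders allow decoding model inputs back to the original
# category strings. A sketch, assuming 'winner_name' ended up in the
# categorical columns (it does not appear in the numeric lists above):
def _demo_decode_labels(df, label_encoders):
    le = label_encoders['winner_name']
    names = le.inverse_transform(df['winner_name'].astype(int).values[:5])
    logging.info(f"First five decoded winner names: {list(names)}")
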
class JointEmbeddedModel(pl.LightningModule):
    """A PyTorch Lightning module for a neural network with categorical embeddings and numeric inputs."""
    def __init__(self, categorical_dims, numerical_dim, embedding_dim, hidden_dim, dropout_rate=0.3, learning_rate=1e-3):
        super().__init__()
        self.embeddings = nn.ModuleList([nn.Embedding(dim, embedding_dim) for dim in categorical_dims])
        self.fc1 = nn.Linear(len(categorical_dims) * embedding_dim + numerical_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim // 2)
        self.fc3 = nn.Linear(hidden_dim // 2, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)
        self.criterion = nn.MSELoss()
        self.learning_rate = learning_rate

    def forward(self, x_cat, x_num):
        embedded = [emb(x_cat[:, i]) for i, emb in enumerate(self.embeddings)]
        x = torch.cat(embedded + [x_num], dim=1)
        x = self.dropout(self.relu(self.fc1(x)))
        x = self.dropout(self.relu(self.fc2(x)))
        # Squeeze only the feature dimension so a batch of size 1 keeps its batch axis.
        return self.fc3(x).squeeze(-1)

    def training_step(self, batch, batch_idx):
        x_cat, x_num, y = batch
        y_hat = self(x_cat, x_num)
        loss = self.criterion(y_hat, y)
        self.log('train_loss', loss)
        return loss

    def configure_optimizers(self):
        optimizer = optim.AdamW(self.parameters(), lr=self.learning_rate, weight_decay=1e-4)  # AdamW decouples weight decay
        scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)
        # ReduceLROnPlateau needs a monitored metric; Lightning expects the
        # monitor key inside the lr_scheduler dict.
        return {
            'optimizer': optimizer,
            'lr_scheduler': {'scheduler': scheduler, 'monitor': 'train_loss'},
        }

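# --- Hedged usage sketch ----------------------------------------------------
# A quick shape check of the model on random tensors. The dimensions below are
# hypothetical: two categorical features with 10 and 7 levels, 4 numeric
# features, and a batch of 8 rows.
def _demo_model_forward():
    model = JointEmbeddedModel(categorical_dims=[10, 7], numerical_dim=4,
                               embedding_dim=16, hidden_dim=64)
    x_cat = torch.randint(0, 7, (8, 2))   # indices stay below both level counts
    x_num = torch.randn(8, 4)
    out = model(x_cat, x_num)
    assert out.shape == (8,)              # one prediction per row
    return out
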
def create_dataloader(X, y, batch_size=64):
    """Create a DataLoader for training and evaluation."""
    x_cat, x_num = X
    x_cat = torch.tensor(x_cat, dtype=torch.long)
    x_num = torch.tensor(x_num, dtype=torch.float32)
    y = torch.tensor(y, dtype=torch.float32)
    dataset = TensorDataset(x_cat, x_num, y)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

def ensemble_predictions(models, X):
    """Aggregate predictions from an ensemble of models."""
    preds = [model.predict(X) for model in models]
    return np.mean(preds, axis=0)

def save_model_config(config, path):
    """Save the model configuration to a JSON file."""
    with open(path, 'w') as f:
        json.dump(config, f)

def load_model_config(path):
    """Load the model configuration from a JSON file."""
    with open(path, 'r') as f:
        return json.load(f)

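# --- Hedged usage sketch ----------------------------------------------------
# ensemble_predictions() simply averages per-model outputs. A toy check with
# hypothetical constant predictors:
def _demo_ensemble_average():
    class _Const:
        def __init__(self, v):
            self.v = v
        def predict(self, X):
            return np.full(len(X), self.v)
    X = np.zeros((4, 3))
    avg = ensemble_predictions([_Const(1.0), _Const(3.0)], X)
    assert np.allclose(avg, 2.0)          # mean of 1.0 and 3.0
    return avg
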
def objective(trial):
    """Objective function for hyperparameter optimization with Optuna.

    Note: relies on module-level globals (categorical_dims, numerical_dim,
    X_train, y_train, X_val, y_val) that the __main__ block sets before
    calling study.optimize().
    """
    embedding_dim = trial.suggest_int('embedding_dim', 16, 128)
    hidden_dim = trial.suggest_int('hidden_dim', 64, 512)
    learning_rate = trial.suggest_float('learning_rate', 1e-6, 1e-2, log=True)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128, 256])
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)

    model = JointEmbeddedModel(categorical_dims, numerical_dim, embedding_dim, hidden_dim, dropout_rate, learning_rate)
    dataloader = create_dataloader(X_train, y_train, batch_size=batch_size)

    trainer = pl.Trainer(
        max_epochs=20,
        accelerator='gpu' if torch.cuda.is_available() else 'cpu',  # Use GPU if available
        devices=1,
        logger=False,
        enable_checkpointing=False
    )
    trainer.fit(model, dataloader)

    # Evaluate with dropout disabled and no gradient tracking.
    model.eval()
    with torch.no_grad():
        val_predictions = model(torch.tensor(X_val[0], dtype=torch.long),
                                torch.tensor(X_val[1], dtype=torch.float32)).cpu().numpy()
    if np.isnan(y_val).any() or np.isnan(val_predictions).any():
        raise ValueError("Validation targets or predictions contain NaN values.")

    val_loss = mean_squared_error(y_val, val_predictions)

    return val_loss

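# --- Hedged usage sketch ----------------------------------------------------
# The __main__ block below runs study.optimize(objective, n_trials=100) per
# fold. If that is too slow, a smaller pruned study is one option (the trial
# count and pruner choice here are assumptions, not part of the original setup):
def _demo_small_study():
    study = optuna.create_study(direction='minimize',
                                pruner=optuna.pruners.MedianPruner())
    study.optimize(objective, n_trials=10)
    return study.best_params
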
def analyze_winning_streaks(model, X, df_subset, eps=0.5, min_samples=5, threshold=0.5):
    """Analyze winning streaks using the trained model and clustering techniques."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)  # keep the model and its input tensors on the same device
    model.eval()
    x_cat, x_num = X

    with torch.no_grad():
        embedded = [emb(torch.tensor(x_cat[:, i], dtype=torch.long).to(device)) for i, emb in enumerate(model.embeddings)]
        embeddings = torch.cat(embedded, dim=1).cpu().numpy()
        outputs = model(torch.tensor(x_cat, dtype=torch.long).to(device),
                        torch.tensor(x_num, dtype=torch.float32).to(device)).cpu().numpy()

    scaler = StandardScaler()
    embeddings = scaler.fit_transform(embeddings)

    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    labels = dbscan.fit_predict(embeddings)

    df_subset['cluster'] = labels
    df_subset['predicted_rank_difference'] = outputs

    df_subset['easy_draw'] = (df_subset['rank_difference'] - df_subset['predicted_rank_difference']) > threshold
    df_subset['hard_draw'] = (df_subset['predicted_rank_difference'] - df_subset['rank_difference']) > threshold

    results = df_subset.groupby('winner_name').agg({
        'cluster': 'count',
        'easy_draw': 'sum',
        'hard_draw': 'sum'
    }).reset_index()

    results['easy_draw_ratio'] = results['easy_draw'] / results['cluster']
    results['hard_draw_ratio'] = results['hard_draw'] / results['cluster']

    results.sort_values('hard_draw_ratio', ascending=False, inplace=True)
    results.to_csv('winning_streak_analysis.csv', index=False)

    logging.info("Analysis results saved to winning_streak_analysis.csv")

    return results

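# --- Hedged usage sketch ----------------------------------------------------
# DBSCAN's eps default of 0.5 above is a guess. A common heuristic (an
# assumption here, not something this pipeline does) is to inspect sorted
# k-nearest-neighbor distances and pick eps near the "elbow" of the curve:
def _demo_kdistance(embeddings, k=5):
    from sklearn.neighbors import NearestNeighbors
    nn_model = NearestNeighbors(n_neighbors=k).fit(embeddings)
    distances, _ = nn_model.kneighbors(embeddings)
    kdist = np.sort(distances[:, -1])     # distance to the k-th neighbor, sorted
    return kdist                          # plot or scan this for the elbow
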
if __name__ == "__main__":
    try:
        df = load_data()
        logging.info(f"Data loaded successfully. Shape: {df.shape}")

        numeric_columns, categorical_columns = determine_column_types(df)
        logging.info(f"Numeric columns: {numeric_columns}")
        logging.info(f"Categorical columns: {categorical_columns}")

        df, label_encoders = preprocess_data(df, numeric_columns, categorical_columns)
        logging.info(f"Data preprocessed. Shape after preprocessing: {df.shape}")

        # Ensure all numeric columns are properly handled
        for col in numeric_columns:
            if not pd.api.types.is_numeric_dtype(df[col]):
                raise ValueError(f"Column {col} contains non-numeric data after preprocessing")

        if df.shape[0] < N_SPLITS:
            raise ValueError(f"Not enough samples ({df.shape[0]}) for {N_SPLITS}-fold cross-validation.")

        X_cat = df[categorical_columns].values
        X_num = df[numeric_columns].values.astype(float)
        y = df['rank_difference'].values.astype(float)

        # Fail fast if the target contains NaN values
        if np.isnan(y).any():
            raise ValueError("Target variable contains NaN values.")

        logging.info(f"Shape of X_cat: {X_cat.shape}")
        logging.info(f"Shape of X_num: {X_num.shape}")
        logging.info(f"Shape of y: {y.shape}")

        kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_SEED)
        scores = []

        for train_index, val_index in kf.split(X_cat):
            X_cat_train, X_cat_val = X_cat[train_index], X_cat[val_index]
            X_num_train, X_num_val = X_num[train_index], X_num[val_index]
            y_train, y_val = y[train_index], y[val_index]

            # Additional NaN checks for validation and training sets
            if np.isnan(X_cat_train).any() or np.isnan(X_num_train).any() or np.isnan(y_train).any():
                raise ValueError("Training data contains NaN values.")
            if np.isnan(X_cat_val).any() or np.isnan(X_num_val).any() or np.isnan(y_val).any():
                raise ValueError("Validation data contains NaN values.")

            X_train = (X_cat_train, X_num_train)
            X_val = (X_cat_val, X_num_val)

            categorical_dims = [len(label_encoders[col].classes_) for col in categorical_columns]
            numerical_dim = len(numeric_columns)

            try:
                # Note: a full 100-trial study runs inside every fold, so the
                # total cost is N_SPLITS * 100 trials.
                study = optuna.create_study(direction='minimize')
                study.optimize(objective, n_trials=100)  # Increased trials for a finer parameter search

                best_params = study.best_params
                logging.info(f"Best Hyperparameters: {best_params}")

                # Build the model configuration from the best trial
                model_config = {
                    'categorical_dims': categorical_dims,
                    'numerical_dim': numerical_dim,
                    'embedding_dim': best_params['embedding_dim'],
                    'hidden_dim': best_params['hidden_dim'],
                    'dropout_rate': best_params['dropout_rate'],
                    'learning_rate': best_params['learning_rate']
                }

                model = JointEmbeddedModel(**model_config)
                dataloader = create_dataloader(X_train, y_train, batch_size=best_params['batch_size'])

                trainer = pl.Trainer(
                    max_epochs=100,  # Increased max_epochs for longer final training
                    accelerator='gpu' if torch.cuda.is_available() else 'cpu',
                    devices=1,
                    logger=False,
                    enable_checkpointing=False
                )
                trainer.fit(model, dataloader)

                # Evaluate with dropout disabled and no gradient tracking.
                model.eval()
                with torch.no_grad():
                    val_predictions = model(torch.tensor(X_val[0], dtype=torch.long),
                                            torch.tensor(X_val[1], dtype=torch.float32)).cpu().numpy()
                if np.isnan(val_predictions).any():
                    raise ValueError("Validation predictions contain NaN values.")

                val_loss = mean_squared_error(y_val, val_predictions)
                scores.append(val_loss)

                # Persist the configuration and weights only when this fold beats
                # the best score so far, keeping the two files in sync.
                if val_loss <= min(scores):
                    save_model_config(model_config, MODEL_CONFIG_PATH)
                    torch.save(model.state_dict(), BEST_MODEL_PATH)

            except Exception as e:
                logging.error(f"An error occurred during optimization: {str(e)}")
                logging.error("Exception details:", exc_info=True)

        if scores:
            logging.info(f"Cross-Validation MSE: {np.mean(scores):.4f}")
        else:
            logging.warning("No folds completed successfully; skipping cross-validation summary.")

        # Train ensemble models and evaluate
        ensemble_models = [
            RandomForestRegressor(n_estimators=300, random_state=RANDOM_SEED),
            GradientBoostingRegressor(n_estimators=300, random_state=RANDOM_SEED),
            LinearRegression()
        ]

        # Check for NaNs in ensemble training data
        if np.isnan(np.hstack((X_cat, X_num))).any() or np.isnan(y).any():
            raise ValueError("Ensemble training data contains NaN values.")

        ensemble_models = [model.fit(np.hstack((X_cat, X_num)), y) for model in ensemble_models]
        ensemble_preds = ensemble_predictions(ensemble_models, np.hstack((X_cat, X_num)))
        ensemble_mse = mean_squared_error(y, ensemble_preds)

        # Note: the ensemble and the final model below are scored on the same
        # data they were fit on, so these MSEs are in-sample, not held-out.
        logging.info(f"Ensemble Test MSE: {ensemble_mse:.4f}")

        # Load the best model configuration and state for final analysis
        if os.path.exists(BEST_MODEL_PATH) and os.path.exists(MODEL_CONFIG_PATH):
            model_config = load_model_config(MODEL_CONFIG_PATH)
            model = JointEmbeddedModel(**model_config)
            model.load_state_dict(torch.load(BEST_MODEL_PATH, map_location='cpu'))
            model.eval()

            with torch.no_grad():
                test_predictions = model(torch.tensor(X_cat, dtype=torch.long),
                                         torch.tensor(X_num, dtype=torch.float32)).cpu().numpy()
            if np.isnan(test_predictions).any():
                raise ValueError("Test predictions contain NaN values.")

            test_mse = mean_squared_error(y, test_predictions)
            logging.info(f"Final Test MSE: {test_mse}")

            winning_streak_analysis = analyze_winning_streaks(model, (X_cat, X_num), df)
            torch.save(model.state_dict(), FINAL_MODEL_PATH)
            logging.info("Script execution completed successfully.")
        else:
            logging.error("Best model or configuration not found. Ensure training is completed before running analysis.")

    except Exception as e:
        logging.error(f"An error occurred during script execution: {str(e)}")
        logging.error("Exception details:", exc_info=True)