# linguask/src/cross_validate.py
# GitHub Action
# refs/heads/ci-cd/hugging-face
# 8b414b0
from copy import deepcopy
from pathlib import Path
from typing import Optional, Union
import numpy as np
import pandas as pd
import torch.cuda
from sklearn.model_selection import KFold
from src.metrics import MSEMetric
from src.solutions.base_solution import BaseSolution
from src.utils import validate_x, validate_y
class CrossValidation:
    """K-fold cross-validation wrapper around a ``BaseSolution`` predictor.

    ``fit`` trains one deep copy of the predictor per fold and saves it under
    ``saving_dir / "cv_fold_<i>"``. ``predict`` reloads those per-fold weights
    and returns the element-wise mean of the fold predictions.
    """

    # Resolved once when the class is defined; not read by any method below.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Score columns that ``predict`` averages across folds.
    _TARGET_COLUMNS = (
        'cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions',
    )

    def __init__(self, saving_dir: str, n_splits: int = 5):
        """Create the splitter and metric, and make sure ``saving_dir`` exists.

        :param saving_dir: directory where per-fold weights are written by ``fit``
            and read back by ``predict``; created (with parents) if missing
        :param n_splits: number of folds passed to ``KFold``
        """
        _saving_dir = Path(saving_dir)
        self.k_fold = KFold(n_splits=n_splits)
        self.metric = MSEMetric()
        # exist_ok makes a separate is_dir() pre-check unnecessary; a path that
        # exists as a regular file still raises FileExistsError.
        _saving_dir.mkdir(exist_ok=True, parents=True)
        self.saving_dir = _saving_dir
        # Set by ``fit``; ``predict`` deep-copies it as the template for each fold.
        self.base_solution: Optional[BaseSolution] = None

    def fit(self, model: BaseSolution, X: pd.DataFrame, y: pd.DataFrame) -> pd.DataFrame:
        """Train one copy of ``model`` per fold and report per-fold scores.

        Each fold's trained copy is saved to ``saving_dir / "cv_fold_<i>"``.

        :param model: predictor from BaseSolution class; it is deep-copied for
            every fold, so the passed instance itself is never trained
        :param X: Dataframe that has text_id and full_text columns
        :param y: Dataframe that has text_id, cohesion, ... columns; rows are
            selected positionally with the same indices as ``X``, so both
            frames are assumed to be in the same row order
        :return: Dataframe with class scores for each split and overall CV score
            (the last row, labelled ``'overall'``, is the mean over the folds)
        """
        validate_x(X)
        validate_y(y)

        scores = []
        self.base_solution = model

        for ii, (train_ind, test_ind) in enumerate(self.k_fold.split(X)):
            print(f"Training fold={ii}...")

            X_train, X_test = X.iloc[train_ind], X.iloc[test_ind]
            y_train, y_test = y.iloc[train_ind], y.iloc[test_ind]

            # Fresh copy per fold so no state leaks from one fold to the next.
            training_model = deepcopy(model)
            training_model.fit(X_train, y_train, val_X=X_test, val_y=y_test, fold=ii)

            y_pred = training_model.predict(X_test)
            class_rmse = self.metric.evaluate_class_rmse(y_pred, y_test)
            scores.append(class_rmse)

            training_model.save(self.saving_dir / f"cv_fold_{ii}")
            # Drop the reference before the next fold's copy is created.
            del training_model

        _scores = pd.DataFrame(scores)
        mean_values = [_scores.mean(axis='rows').values.tolist()]
        overall = pd.DataFrame(mean_values, columns=_scores.columns, index=['overall'])
        print("\n")

        _scores = pd.concat([_scores, overall], axis='rows')
        return _scores

    def predict(self, X: pd.DataFrame) -> pd.DataFrame:
        """Makes average fold prediction

        :param X: Dataframe that have text_id and full_text columns
        :return: prediction Dataframe that have text_id, cohesion, ... columns;
            it keeps the index of ``X``
        :raises RuntimeError: if ``saving_dir`` contains no entries
        :raises TypeError: if no base solution has been set (``fit`` not called)
        """
        # The previous `list(...) is not []` identity test was always true, so
        # the guard never fired; test emptiness explicitly instead.
        if not any(self.saving_dir.iterdir()):
            raise RuntimeError("Cross validation is not trained yet")
        validate_x(X)

        # Loop-invariant, so checked once before any weights are loaded.
        if self.base_solution is None:
            raise TypeError("No base solution set; call fit() before predict()")

        predictions = []
        for ii in range(self.k_fold.n_splits):
            model_path = self.saving_dir / f"cv_fold_{ii}"
            model = deepcopy(self.base_solution)
            model.load(model_path)
            predictions.append(model.predict(X))

        # Each fold's prediction is reduced to raw values, so rows are assumed
        # to come back in the same order as X. Building the frame on X.index
        # keeps the column-wise concat below row-aligned even when X does not
        # carry a default RangeIndex (a default index would misalign and
        # introduce NaN rows).
        mean_class_predictions = pd.DataFrame(
            {
                column: np.mean([pred[column].values for pred in predictions], axis=0)
                for column in self._TARGET_COLUMNS
            },
            index=X.index,
        )

        # drop() returns a new frame, so the caller's X is left untouched.
        result = X.drop(columns=['full_text'])
        return pd.concat([result, mean_class_predictions], axis='columns')

    def save(self, path: Union[str, Path]):
        """Save every model in ``base_solution.models`` under ``path``.

        Model ``i`` is written to ``path / "cv_fold_<i>"``.

        :param path: target directory; created (with parents) if missing
        :raises TypeError: if the base solution is unset or its ``models`` is empty
        """
        path = Path(path)
        path.mkdir(parents=True, exist_ok=True)

        if not self.base_solution or not self.base_solution.models:
            raise TypeError

        for ii, model in enumerate(self.base_solution.models):
            cv_model_path = path / f"cv_fold_{ii}"
            model.save(cv_model_path)

        print(f"Saved weights successfully to: {path.resolve()}.")

    def load(self, path: Union[str, Path], predictor: BaseSolution):
        """Load per-fold weights from ``path`` into copies of ``predictor``.

        For each fold, a deep copy of ``predictor`` loads ``path / "cv_fold_<i>"``
        and is appended to ``base_solution.models``.

        :param path: directory that contains one ``cv_fold_<i>`` directory per fold
        :param predictor: template predictor that is deep-copied for every fold
        :raises AssertionError: if ``path`` or any fold directory does not exist
        :raises TypeError: if the base solution is unset or its ``models`` is empty
        """
        path = Path(path)
        assert path.is_dir(), f"Weights dir. not exists: {path.resolve()}"

        for ii in range(self.k_fold.n_splits):
            cv_model_path = path / f"cv_fold_{ii}"
            assert cv_model_path.is_dir(), f"Dir. with fold={ii} not exists: {cv_model_path.resolve()}"

            predictor_copy = deepcopy(predictor)
            predictor_copy.load(cv_model_path)

            # NOTE(review): __init__ leaves base_solution as None, so on a fresh
            # instance this raises TypeError on the first fold; it also rejects
            # an empty ``models`` list. Confirm the intended contract before
            # relaxing this check.
            if not self.base_solution or not self.base_solution.models:
                raise TypeError
            self.base_solution.models.append(predictor_copy)

        print(f"Loaded model successfully from: {path.resolve()}.")