import math
import pandas as pd
from sktime.forecasting.base import ForecastingHorizon


def split_x_y(
    data: pd.DataFrame,
    window_length: int,
    n_predict: int,
    freq: str,
):
    '''
    Split a frame into target, lagged feature matrices and a forecasting horizon.

    Parameters
    ----------
    data:
        Frame with a target column named 'y'. Any other columns are treated
        as additional features. The frame's index is used as the time axis.
    window_length:
        Number of lagged copies (shifts 0 .. window_length - 1) built for
        every feature column.
    n_predict:
        Number of steps in the forecasting horizon. When features are
        present, this many leading rows of 'y' and of the index are dropped
        and this many trailing rows of the lagged features become X_forecast.
    freq:
        Frequency string passed through to ForecastingHorizon.

    Returns
    -------
    tuple
        (fh, y, X_train, X_forecast). fh is the horizon 1..n_predict made
        absolute against the last index value. X_train and X_forecast are
        None when 'y' is the only column.

    Raises
    ------
    ValueError
        If feature columns are present and window_length is smaller than 1.
    '''
    datetime_index = data.index
    y = data['y']
    X_train, X_forecast = None, None
    has_X = len(data.columns) > 1

    if has_X:
        if window_length < 1:
            raise ValueError(
                f'window_length must be at least 1 to build feature lags, got {window_length}')

        X = data.drop(columns='y').reset_index(drop=True)

        # Build every lag first and concatenate once: concatenating inside
        # the loop re-copies the growing frame on each iteration.
        lagged_frames = [
            X.shift(n).rename(
                columns={col: f'{col}_-{n_predict + n}' for col in X.columns})
            for n in range(window_length)
        ]
        # Shifting leaves NaN in the first rows of each lag; back-fill them.
        X_lagged = pd.concat(lagged_frames, axis=1).bfill()

        # The last n_predict rows of the lagged features are used for the
        # forecast, everything before them for training.
        X_forecast = X_lagged.iloc[-n_predict:]
        X_train = X_lagged.iloc[:-n_predict]

        # Drop the first n_predict values of y and of the index so that they
        # have the same length as X_train (feature row i pairs with the
        # target n_predict positions later).
        y = y.iloc[n_predict:]
        datetime_index = datetime_index[n_predict:]
        X_train = X_train.set_index(datetime_index)

    fh = ForecastingHorizon(
        list(range(1, n_predict + 1)), is_relative=True, freq=freq)

    # Cutoff is the last index value of the given data,
    # meaning we forecast right after this point in time.
    cutoff = datetime_index[-1]
    fh = fh.to_absolute(cutoff=cutoff)

    if X_forecast is not None:
        # Non-inplace set_index returns a new frame instead of mutating a
        # slice of the lagged feature frame.
        X_forecast = X_forecast.set_index(fh.to_pandas())

    return (fh, y, X_train, X_forecast)


def k_folds(
    data: pd.DataFrame,
    period: int,
    window_length: int,
    n_predict: int,
    freq: str
):
    '''
    Build up to 10 expanding folds of `data`, each prepared with split_x_y.

    The number of folds is k = floor((len(data) - n_predict - 2 * period) / period),
    capped at 10. Fold i is `data` without its last i * period rows, for
    i = k down to 1, so the returned list goes from the shortest fold to the
    longest. The shortest fold therefore still holds at least
    2 * period + n_predict rows.

    Parameters
    ----------
    data:
        Frame passed, after truncation, to split_x_y.
    period:
        Seasonality period in rows; folds differ in length by one period.
    window_length, n_predict, freq:
        Passed through unchanged to split_x_y.

    Returns
    -------
    list
        One split_x_y result tuple (fh, y, X_train, X_forecast) per fold.

    Raises
    ------
    ValueError
        If `data` is too short to produce a single fold.
    '''
    print('[k_folds] ----- START -----')
    k = math.floor((len(data) - n_predict - (2 * period)) / period)
    print('k', k)

    # Make sure k is not larger than 10
    k = min(k, 10)

    # k is negative (not just zero) when data is shorter than
    # 2 * period + n_predict; treat both as "not enough data".
    if k <= 0:
        raise ValueError(
            'Data should at least have length of 2 seasons + n_predict rows '
            'after holding out one period, '
            f'currently length {len(data)}, '
            f'expected length {3 * period + n_predict}')

    folds = [
        split_x_y(data.iloc[:-i * period], window_length, n_predict, freq)
        for i in reversed(range(1, k + 1))
    ]

    print('[k_folds] ----- END -----')
    return folds