Spaces:
Runtime error
Runtime error
import math | |
import pandas as pd | |
from sktime.forecasting.base import ForecastingHorizon | |
def split_x_y(
    data: pd.DataFrame,
    window_length: int,
    n_predict: int,
    freq: str,
):
    """Split a frame into target, lagged exogenous features and a horizon.

    The ``'y'`` column of ``data`` is the target; any other columns are
    treated as exogenous features. When features are present, each one is
    expanded into ``window_length`` lagged copies named
    ``{col}_-{n_predict + n}`` for ``n`` in ``0..window_length - 1``, so that
    a row of the training matrix only contains feature values observed at
    least ``n_predict`` steps before the target it is paired with.

    Args:
        data: Frame with a ``'y'`` column and optional feature columns. Its
            index is used as the time axis; the last index value becomes the
            forecasting cutoff.
        window_length: Number of lagged copies built per feature column.
        n_predict: Number of steps to forecast.
        freq: Frequency string forwarded to ``ForecastingHorizon``.

    Returns:
        Tuple ``(fh, y, X_train, X_forecast)``:
        ``fh`` is the absolute forecasting horizon of ``n_predict`` steps
        after the cutoff; ``y`` is the target (with its first ``n_predict``
        rows dropped when features exist); ``X_train`` holds the lagged
        features indexed like ``y``; ``X_forecast`` holds the last
        ``n_predict`` lagged rows indexed by ``fh``. Both ``X_*`` values are
        ``None`` when ``data`` has no feature columns.
    """
    datetime_index = data.index
    y = data['y']
    X_train, X_forecast = None, None

    if len(data.columns) > 1:
        X = data.drop(columns='y').reset_index(drop=True)

        # Build every lagged copy first and concatenate once: growing a frame
        # with pd.concat inside the loop re-copies all earlier lags each time.
        lagged_frames = [
            X.shift(n).rename(
                columns={col: f'{col}_-{n_predict + n}' for col in X.columns})
            for n in range(window_length)
        ]
        # Shifting leaves NaN in the leading rows of each copy; back-fill them.
        lagged = pd.concat(lagged_frames, axis=1).bfill()

        # The last n_predict lagged rows are the features for the forecast.
        forecast_rows = lagged.iloc[-n_predict:]

        # Drop the first n_predict targets / timestamps so that every target
        # lines up with features observed n_predict steps earlier.
        y = y.iloc[n_predict:]
        datetime_index = datetime_index[n_predict:]
        # Non-inplace set_index: the rows are a slice of `lagged`, and
        # mutating a slice in place is fragile.
        X_train = lagged.iloc[:-n_predict].set_index(datetime_index)
        X_forecast = forecast_rows

    fh = ForecastingHorizon(
        list(range(1, n_predict + 1)), is_relative=True, freq=freq)

    # Cutoff is the last datetime value in the given data,
    # meaning we'll forecast right after this point of time.
    cutoff = datetime_index[-1]
    fh = fh.to_absolute(cutoff=cutoff)

    if X_forecast is not None:
        X_forecast = X_forecast.set_index(fh.to_pandas())

    return (fh, y, X_train, X_forecast)
def k_folds(
    data: pd.DataFrame,
    period: int,
    window_length: int,
    n_predict: int,
    freq: str
):
    """Build up to 10 expanding folds of ``data`` for back-testing.

    Fold ``i`` (for ``i`` from ``k`` down to ``1``) drops the last
    ``i * period`` rows of ``data`` and passes the remainder to
    ``split_x_y``, so folds are returned from the shortest to the longest.
    ``k`` is chosen so that the shortest fold still keeps at least two
    seasonal periods plus ``n_predict`` rows.

    Args:
        data: Full data set; rows are sliced positionally.
        period: Seasonal period, in rows.
        window_length: Forwarded to ``split_x_y``.
        n_predict: Forwarded to ``split_x_y``.
        freq: Forwarded to ``split_x_y``.

    Returns:
        List of the tuples returned by ``split_x_y``, one per fold.

    Raises:
        ValueError: If ``data`` is too short to build even one fold, i.e.
            shorter than ``3 * period + n_predict`` rows.
    """
    print('[k_folds] ----- START -----')
    k = (len(data) - n_predict - 2 * period) // period
    print('k', k)

    # Make sure k is not larger than 10
    k = min(k, 10)

    # k is negative (not just zero) when the data is shorter than
    # 2 * period + n_predict; both cases mean no fold can be built.
    if k <= 0:
        raise ValueError(
            'Data should at least have length of 3 seasons + n_predict rows '
            '(2 seasons + n_predict for the smallest fold plus 1 held-out '
            f'season), currently length {len(data)}, '
            f'expected length {3 * period + n_predict}')

    folds = [
        split_x_y(data.iloc[: -i * period], window_length, n_predict, freq)
        for i in range(k, 0, -1)
    ]
    print('[k_folds] ----- END -----')
    return folds