zhang qiao
Upload folder using huggingface_hub
8cf4695
import math
import pandas as pd
from sktime.forecasting.base import ForecastingHorizon
def split_x_y(
    data: pd.DataFrame,
    window_length: int,
    n_predict: int,
    freq: str,
):
    '''
    Split a time-indexed frame into a target series, lagged feature
    matrices and an absolute forecasting horizon.

    Args:
        data: Frame whose column 'y' is the target. Every other column is
            treated as an additional feature. The index supplies the
            timestamps (presumably a datetime-like index compatible with
            `freq`; this is not checked here).
        window_length: Number of lagged copies to build for each feature
            column. Only used when `data` has feature columns.
        n_predict: Number of steps to forecast.
        freq: Frequency string handed to ForecastingHorizon.

    Returns:
        Tuple `(fh, y, X_train, X_forecast)`:
        - fh: absolute ForecastingHorizon for the `n_predict` steps that
          follow the last timestamp of `data`.
        - y: the target series; its first `n_predict` rows are dropped
          when feature columns are present, otherwise it is unchanged.
        - X_train: lagged features indexed like `y`, or None when `data`
          only contains 'y'.
        - X_forecast: last `n_predict` rows of the lagged features,
          indexed by `fh`, or None when `data` only contains 'y'.
    '''
    datetime_index = data.index
    y = data['y']
    X_train, X_forecast = None, None

    if len(data.columns) > 1:
        # Work on a positional index so shift() and the positional slices
        # below line up regardless of what the original index looks like.
        X = data.drop(columns='y').reset_index(drop=True)

        # ------------------------ #
        # Build lags of the X data #
        # ------------------------ #
        # Lag n of a column is named '<col>_-<n_predict + n>': after the
        # re-alignment further down, that is how many steps the value sits
        # behind the target row it is paired with.
        lagged_frames = [
            X.shift(n).rename(
                columns={col: f'{col}_-{n_predict + n}' for col in X.columns})
            for n in range(window_length)
        ]
        # Concatenate once instead of re-copying a growing frame on every
        # iteration; an empty frame keeps the window_length == 0 case
        # behaving as before.
        if lagged_frames:
            X_train = pd.concat(lagged_frames, axis=1)
        else:
            X_train = pd.DataFrame()

        # shift() leaves NaN in the leading rows; backward fill them with
        # the next available value.
        X_train = X_train.bfill()

        # The last n_predict rows have no matching target yet: they become
        # the features used for the forecast itself.
        X_forecast = X_train[-n_predict:]
        X_train = X_train[:-n_predict]

        # Drop the first n_predict values of y and of the timestamps so
        # that feature row j is paired with the target n_predict steps later.
        y = y[n_predict:]
        datetime_index = datetime_index[n_predict:]
        X_train = X_train.set_index(datetime_index)

    fh = ForecastingHorizon(
        list(range(1, n_predict + 1)), is_relative=True, freq=freq)

    # Cutoff is the last timestamp in the given data, meaning the forecast
    # starts right after this point in time.
    cutoff = datetime_index[-1]
    fh = fh.to_absolute(cutoff=cutoff)

    if X_forecast is not None:
        X_forecast = X_forecast.set_index(fh.to_pandas())

    return (fh, y, X_train, X_forecast)
def k_folds(
    data: pd.DataFrame,
    period: int,
    window_length: int,
    n_predict: int,
    freq: str
):
    '''
    Build up to 10 expanding folds of `data`, each prepared by split_x_y.

    Fold i is `data` with its last `i * period` rows removed; folds are
    returned from the shortest (i = k) to the longest (i = 1). The number
    of folds k is chosen so that the shortest fold still holds two
    seasonality periods plus `n_predict` rows, which is considered enough
    to train a minimal model.

    Args:
        data: Full data set, passed slice by slice to split_x_y.
        period: Seasonality period, in rows.
        window_length: Forwarded to split_x_y.
        n_predict: Number of steps to forecast; forwarded to split_x_y.
        freq: Frequency string; forwarded to split_x_y.

    Returns:
        List of the tuples returned by split_x_y, one per fold.

    Raises:
        ValueError: If `data` is too short to build a single fold.
    '''
    print('[k_folds] ----- START -----')

    k = math.floor((len(data) - n_predict - (2*period)) / period)
    print('k', k)

    # Make sure k is not larger than 10
    k = min(k, 10)

    # k is negative (not just zero) when the data is shorter than
    # 2 * period + n_predict; without this check the function would
    # silently return an empty list in that case.
    # NOTE(review): k >= 1 actually needs 3 * period + n_predict rows, since
    # every fold drops at least one period; the message below reports
    # 2 * period + n_predict.
    if k <= 0:
        raise ValueError(
            'Data should at least have length of 2 seasons + n_predict rows, '
            f'currently length {len(data)}, expected length {2 * period + n_predict}')

    # Shortest fold first: i runs from k down to 1.
    folds = [
        split_x_y(data[: (-i * period)], window_length, n_predict, freq)
        for i in reversed(range(1, k + 1))
    ]

    print('[k_folds] ----- END -----')
    return folds