File size: 3,212 Bytes
8cf4695
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import math

import pandas as pd
from sktime.forecasting.base import ForecastingHorizon


def split_x_y(
    data: pd.DataFrame,
    window_length: int,
    n_predict: int,
    freq: str,
):
    '''
    Split a frame into target, lagged features and a forecasting horizon.

    `data` must contain a 'y' column; every other column is treated as an
    exogenous feature. When features are present, each one is expanded into
    `window_length` lagged copies named `<col>_-<lag>` with
    lag = n_predict + n for n in 0..window_length-1. The leading gaps created
    by shifting are back-filled. The last `n_predict` rows of the lagged
    frame become `X_forecast` (indexed by the forecasting horizon); the
    remaining rows become `X_train` (indexed by `data.index[n_predict:]`),
    and the first `n_predict` values of `y` are dropped so that `y` and
    `X_train` stay aligned.

    Parameters
    ----------
    data : frame holding the 'y' column and optional feature columns.
    window_length : number of lagged copies built per feature column
        (only used when feature columns are present).
    n_predict : number of steps to forecast.
    freq : frequency string passed to ForecastingHorizon.

    Returns
    -------
    (fh, y, X_train, X_forecast); `X_train` and `X_forecast` are None when
    `data` has no feature columns.

    Raises
    ------
    ValueError : if feature columns are present and window_length < 1.
    '''
    datetime_index = data.index
    y = data['y']
    X_train, X_forecast = None, None

    if len(data.columns) > 1:
        if window_length < 1:
            # Without at least one lag there is nothing to build X from; fail
            # here rather than with an opaque length mismatch further down.
            raise ValueError(
                'window_length must be at least 1 when feature columns are '
                f'present, got {window_length}')

        X = data.drop(columns='y').reset_index(drop=True)

        # Build every lag first and concatenate once: growing a frame with
        # concat inside the loop re-copies the accumulated data each time.
        lagged = pd.concat(
            [
                X.shift(n).add_suffix(f'_-{n_predict + n}')
                for n in range(window_length)
            ],
            axis=1,
        ).bfill()

        # The last n_predict rows have no matching y yet: they are the
        # features for the steps to be forecast.
        X_forecast = lagged.iloc[-n_predict:]
        X_train = lagged.iloc[:-n_predict]

        # Row i of the lagged features describes target row i + n_predict,
        # so drop the first n_predict targets / timestamps to stay aligned.
        y = y.iloc[n_predict:]
        datetime_index = datetime_index[n_predict:]

        X_train = X_train.set_index(datetime_index)

    fh = ForecastingHorizon(
        list(range(1, n_predict + 1)), is_relative=True, freq=freq)
    # Cutoff is the last datetime value in the given data,
    # meaning we forecast right after this point in time.
    cutoff = datetime_index[-1]
    fh = fh.to_absolute(cutoff=cutoff)

    if X_forecast is not None:
        X_forecast = X_forecast.set_index(fh.to_pandas())

    return (fh, y, X_train, X_forecast)


def k_folds(
        data: pd.DataFrame,
        period: int,
        window_length: int,
        n_predict: int,
        freq: str
):
    '''
    Build up to 10 growing folds of `data` and run split_x_y on each.

    The number of folds is
        k = (len(data) - n_predict - 2 * period) // period, capped at 10.
    Fold i (i = k .. 1) is `data` with its last `i * period` rows removed, so
    the folds are returned from the shortest to the longest and the smallest
    fold still holds at least 2 seasons + n_predict rows, which is sufficient
    to train a minimal model. This requires `data` to have at least
    3 * period + n_predict rows.

    Parameters
    ----------
    data : frame passed (truncated) to split_x_y.
    period : seasonality period, in rows; must be positive.
    window_length : forwarded to split_x_y.
    n_predict : number of steps to forecast; forwarded to split_x_y.
    freq : frequency string; forwarded to split_x_y.

    Returns
    -------
    List with one split_x_y result per fold.

    Raises
    ------
    ValueError : if `period` is not positive or `data` is too short to
        produce at least one fold.
    '''
    print('[k_folds] ----- START -----')
    if period < 1:
        raise ValueError(f'period must be a positive integer, got {period}')

    k = (len(data) - n_predict - 2 * period) // period
    print('k', k)

    # k is negative (not just zero) when the data is shorter than
    # 2 * period + n_predict, so reject every non-positive value.
    if k <= 0:
        min_length = 3 * period + n_predict
        raise ValueError(
            'Data should at least have length of 3 seasons + n_predict rows '
            '(so the smallest fold keeps 2 seasons + n_predict rows), '
            f'currently length {len(data)}, '
            f'expected length of at least {min_length}')

    # Make sure k is not larger than 10
    k = min(k, 10)

    # Largest cut first, so folds grow from the shortest to the longest.
    folds = [
        split_x_y(data[: (-i * period)], window_length, n_predict, freq)
        for i in range(k, 0, -1)
    ]

    print('[k_folds] ----- END -----')
    return folds