from typing import List, Optional, Union
import logging
from math import sqrt

import pandas as pd
from statsmodels.tsa.tsatools import freq_to_period
from sklearn.metrics import mean_squared_error

from .models import AllModels

logging.basicConfig(level=logging.DEBUG)


class Forecaster:
    '''
    Run one or more registered forecasting models over a time series.

    The entry point is forecast(); it prepares the data, derives the
    prediction horizon (n_predict), initializes the requested models through
    models.AllModels and stores one result dict per model in self.results.
    '''

    def __init__(
        self,
    ) -> None:
        logging.debug('Forecaster init')
        self.models = {}  # model name -> model instance, filled by init_models()

    def fit(self, data):
        '''
        Fit data into the forecaster.

        Currently this only stores ``data`` on ``self.data``. Note that
        forecast() receives its own ``data`` argument and overwrites
        ``self.data``.
        '''
        self.data = data

    def forecast(
        self,
        data: pd.DataFrame,
        models: Union[str, List[str]] = 'all',
        test: bool = False,
        enable_exog: bool = True
    ) -> None:
        '''
        Main function, will perform the entire forecast operation.

        Nothing is returned; one dict per model with the keys "model",
        "result", "evaluate" and "rmse" is appended to ``self.results``.

        Parameters
        ----------
        data : pd.DataFrame, required
            Data for training the model. Must contain "datetime" and "y"
            columns; any additional column is considered an exogenous column
            and is used for multivariate forecasting.
            The data must be cleaned, without any missing value.
            The "datetime" column must hold valid datetime strings whose
            frequency can be inferred.

        models : str or List[str], default='all'
            Selected model(s) to use for forecasting.
            Default is "all", which will use all available models
            registered in models.AllModels.

        test : bool, default=False
            Decide if the forecasting purpose is testing or actual
            prediction. Testing and prediction do not happen at the same
            time. When testing, the last n_predict observations of y are
            held out (see calculate_n_predict() for how n_predict is
            derived) and an RMSE against them is stored per model.

        enable_exog : bool, default=True
            If disabled, exog data will not be used in the model training,
            and the data will be considered as univariate data.
            If enabled, and the data does contain exog data, the data is
            shifted by n_predict steps for multivariate forecasting.
            This will cause a few things:
            1. y column will be remapped to exog data that is n_predict
               units of time ago
            2. n_predict length of the oldest y will be trimmed off
            3. n_predict length of exog values will be used for the
               forecasting

        Raises
        ------
        ValueError
            If no regular frequency can be inferred from the "datetime"
            column (raised by prep_data()).
        '''
        logging.debug('Start forecasting ...')
        self.enable_exog = enable_exog

        # Below properties will be init by prep_data()
        self.data: Optional[pd.DataFrame] = None
        self.y = None
        self.exog = None
        self.freq: Optional[str] = None
        self.period: Optional[int] = None
        self.y_test = None

        self.n_predict: Optional[int] = None  # init by calculate_n_predict()
        self.kwargs = {}
        self.results = []  # Contains all result value

        # Prepare data, including set the datetime index,
        # split y and exog columns
        self.prep_data(data)
        # Calculate n_predict value based on self.period
        self.calculate_n_predict()
        # Init the basic kwargs for models to use
        self.init_kwargs()
        # Shift exog value by n_predict unit of time
        self.shift_exog()

        # Split test set for testing purpose
        if test:
            logging.debug('Testing ...')
            self.train_test_split()

        # ================================ #
        # Train models and make prediction #
        # ================================ #
        self.init_models(models)
        for model_name, model in self.models.items():
            result = {
                'model': model_name,
                'result': None,
                'evaluate': None,
                'rmse': None,
            }
            fcst = model.forecast()

            # A model may return a mapping (with "forecast" and optionally
            # "evaluate" entries) or a bare prediction. Only ask for keys
            # when the object has them, so a plain list/array output does
            # not crash the whole run.
            keys = fcst.keys() if hasattr(fcst, 'keys') else ()

            # Assign the models result to the result dict
            if 'forecast' in keys:
                result['result'] = fcst['forecast']
            else:
                result['result'] = fcst
            if 'evaluate' in keys:
                result['evaluate'] = fcst['evaluate']

            if test:
                mse = mean_squared_error(self.y_test, result['result'])
                result['rmse'] = sqrt(mse)

            self.results.append(result)
        # - END of forecast - #

    def init_models(self, models):
        '''
        Initialize models based on the provided parameter.
        Get self.models ready for forecasting.

        ``models`` is handed to AllModels unchanged; the resulting models
        are initialized with self.y, self.n_predict, self.exog and
        self.kwargs.
        '''
        logging.debug('Init models')
        all_models = AllModels(models)
        self.models = all_models.init_models(
            self.y, self.n_predict, self.exog, **self.kwargs)

    def prep_data(
        self,
        data: pd.DataFrame
    ) -> None:
        '''
        Index a copy of ``data`` by datetime and split it into y and exog.

        Sets self.data, self.freq, self.period, self.y and, when exog
        columns exist and self.enable_exog is true, self.exog. The caller's
        frame is not modified.

        Raises
        ------
        ValueError
            If no regular frequency can be inferred from the "datetime"
            column.
        '''
        logging.debug('Prep data')
        self.data = data.copy()
        self.data.set_index('datetime', inplace=True)
        self.data.index = pd.to_datetime(self.data.index)

        logging.debug('Inferencing freq and period')
        self.freq = pd.infer_freq(self.data.index)
        if self.freq is None:
            # infer_freq returns None when no discernible frequency exists;
            # fail here with a clear message instead of inside
            # freq_to_period.
            raise ValueError(
                'Could not infer a regular frequency from the '
                '"datetime" column')
        self.period = freq_to_period(self.freq)

        self.y = self.data['y']
        if len(self.data.columns) > 1 and self.enable_exog:
            self.exog = self.data.drop(columns='y')

    def calculate_n_predict(self):
        '''
        The n_predict will be the smaller number in 20, self.period value.
        By default, try to only predict 1 seasonal cycle.

        The value is then capped at 20% of the data length and finally
        raised to at least 4 (so the floor wins on very short series).
        '''
        n_predict = min(20, self.period)

        # Set a max prediction size to be 20% of given data size
        max_size = int(len(self.data) * 0.2)
        n_predict = min(n_predict, max_size)

        # Set a min prediction to be 4
        n_predict = max(n_predict, 4)

        self.n_predict = n_predict

    def init_kwargs(self):
        '''
        kwargs will be used for initializing models.
        kwargs contains all necessary information about the data;
        currently that is only the seasonal period.
        '''
        self.kwargs['period'] = self.period

    def train_test_split(self):
        '''
        n_predict length of y value will be split out for testing,
        although each model will probably have its own cross validator.

        The last n_predict rows of y become self.y_test; y and (when
        present) exog are trimmed by the same number of trailing rows.
        '''
        logging.debug('Train test split')
        self.y_test = self.y[-self.n_predict:]
        self.y = self.y[:-self.n_predict]
        if self.exog is not None:
            self.exog = self.exog[:-self.n_predict]

    def shift_exog(self):
        '''
        Shift the exog index forward by n_predict steps of self.freq and
        drop the first n_predict values of y so both stay aligned.

        Does nothing when there is no exog data.
        '''
        if self.exog is None:
            return

        logging.debug('Shifted exog datetime index by n_predict period')
        self.exog.index = self.exog.index.shift(
            self.n_predict, freq=self.freq)

        logging.debug(
            'Trimmed y by n_predict, so it is aligned with shifted exog')
        self.y = self.y[self.n_predict:]