Spaces:

zq13648
/

demand-forecasting

Runtime error

demand-forecasting / src /analytics /ts_analytics.py

zhang qiao

Upload folder using huggingface_hub

8cf4695 about 1 year ago

5.81 kB

	import pandas as pd
	import numpy as np
	import math

	import matplotlib.pyplot as plt
	import seaborn as sns

	from statsmodels.graphics.tsaplots import plot_pacf
	from statsmodels.tsa.seasonal import MSTL, seasonal_decompose
	from statsmodels.tsa.tsatools import freq_to_period

	import statsmodels.api as sm

	from scipy import stats
	from sklearn.preprocessing import MinMaxScaler


	class Ts_Analytics():
	    '''
	    Exploratory analytics helper for a single time-series dataframe.

	    Typical usage: call `analyse` with a dataframe that has a 'datetime'
	    column, optionally create lag features or transform the data, fit the
	    regression with `train_multiple_regression`, then render the plots.
	    The regression and the target plots read the column named 'y'.
	    '''

	    def __init__(self):
	        self.log_transformed = False
	        self.scaler = MinMaxScaler()
	        # Defined up front so a failed frequency inference surfaces as the
	        # intended ValueError rather than an AttributeError.
	        self.freq = None

	    def analyse(
	            self,
	            ts_df: pd.DataFrame,
	            auto_correlations=None):
	        '''
	        Load a time-series dataframe and derive its frequency and period.

	        ts_df: timeseries dataframe with a 'datetime' column; the column is
	            parsed with `pd.to_datetime` and becomes the index of the
	            internal copy (the caller's dataframe is not modified)
	        auto_correlations: optional dictionary mapping a column name to the
	            number of lag columns to create for it; it is copied, so later
	            `set_ar` calls do not touch the caller's dictionary

	        Raises ValueError when no frequency can be determined.
	        '''
	        self.ts_df = ts_df.copy()

	        self.ts_df['datetime'] = pd.to_datetime(self.ts_df['datetime'])
	        self.ts_df.set_index('datetime', inplace=True)

	        # A fresh dict per call: a shared `{}` default would leak lag
	        # settings between instances through `set_ar`.
	        self.ar = {} if auto_correlations is None else dict(auto_correlations)

	        self.__infer_frequency()

	        # Annual maps to 1, quarterly maps to 4, monthly to 12, weekly to 52.
	        # Using statsmodel's freq_to_period function
	        self.period = freq_to_period(self.freq)

	    def set_ar(self, col, ar):
	        '''
	        Set the auto correlation (number of lag columns) for column `col`
	        '''
	        self.ar[col] = ar

	    def set_period(self, period):
	        '''
	        Override the seasonal period; a list selects MSTL in
	        `plot_target_seasonality`, anything else is passed to
	        `seasonal_decompose`
	        '''
	        self.period = period

	    def create_target_lag_columns(self):
	        '''
	        Add `<col>_t-1` ... `<col>_t-n` columns for every entry in `self.ar`.

	        `<col>_t-k` holds the value of `<col>` from k rows earlier, so the
	        index is expected to be in ascending time order. Gaps are forward
	        filled, then the leading rows that have no lag history are dropped.
	        '''
	        print('create_target_lag_columns')

	        lag_cols = []
	        for col, n in self.ar.items():
	            print('create_lag_dfs', col, n)
	            for lag in range(1, n + 1):
	                name = f'{col}_t-{lag}'
	                # Positive shift = past values. A negative shift would copy
	                # future observations into the "lag" features.
	                self.ts_df[name] = self.ts_df[col].shift(lag)
	                lag_cols.append(name)

	        print('drop all null values')
	        self.ts_df.ffill(inplace=True)
	        if lag_cols:
	            # ffill cannot fill the first rows of a lag column; remove them.
	            self.ts_df.dropna(subset=lag_cols, inplace=True)

	    def log_transform(self):
	        '''
	        Replace the data with its base-2 logarithm
	        '''
	        self.log_transformed = True
	        self.ts_df = np.log2(self.ts_df)

	    def exp_transform(self):
	        '''
	        Undo `log_transform` (base-2 exponential)
	        '''
	        self.log_transformed = False
	        # exp2 is the inverse of the log2 used in log_transform
	        self.ts_df = np.exp2(self.ts_df)

	    def train_multiple_regression(self):
	        '''
	        Fit an OLS regression of 'y' on every other column.

	        Stores the fitted model in `self.multiple_regression`, a readable
	        formula in `self.multiple_regression_formula`, a second model fitted
	        on min-max scaled data in `self.std_multiple_regression`, and the
	        squared coefficients of that second model in
	        `self.multiple_regression_beta`.

	        Returns the summary of the model fitted on the unscaled data.
	        '''
	        print('train_multiple_regression')

	        beta_col = 'Beta (influence on "y")'

	        x_cols = self.ts_df.columns.tolist()
	        x_cols.remove('y')

	        X = sm.add_constant(self.ts_df[x_cols])
	        y = self.ts_df['y']

	        # ------------------------------------------------------------------ #
	        # Train an additional model with scaled data, to get the Beta value  #
	        # ------------------------------------------------------------------ #
	        std_ts_df = pd.DataFrame(
	            self.scaler.fit_transform(self.ts_df),
	            columns=self.ts_df.columns)

	        std_X = sm.add_constant(std_ts_df[x_cols])
	        std_y = std_ts_df['y']

	        self.multiple_regression = sm.OLS(y, X).fit()

	        # params is indexed by column label, so positions go through .iloc
	        coef = self.multiple_regression.params
	        terms = ' + '.join(
	            f'{c} * {round(n, 3)}' for c, n in zip(x_cols, coef.iloc[1:]))
	        self.multiple_regression_formula = f'{coef.iloc[0]} + {terms}'

	        self.std_multiple_regression = sm.OLS(std_y, std_X).fit()
	        beta = self.std_multiple_regression.params

	        # Skip the first entry (the constant) and square the rest
	        self.multiple_regression_beta = pd.DataFrame(
	            np.array(beta.iloc[1:]) ** 2, index=x_cols, columns=[beta_col])
	        self.multiple_regression_beta[beta_col] = \
	            self.multiple_regression_beta[beta_col].round(3)

	        return self.multiple_regression.summary()

	    # ===== #
	    # Plots #
	    # ===== #

	    def plot_correlation(self):
	        '''
	        Return a figure with the lower-triangle correlation heatmap of the
	        numeric columns
	        '''
	        corr = self.ts_df.corr(numeric_only=True)
	        # Generate a mask for the upper triangle
	        mask = np.triu(np.ones_like(corr, dtype=bool))
	        fig, ax = plt.subplots(figsize=(8, 8))

	        sns.heatmap(
	            corr,
	            mask=mask,
	            square=True,
	            annot=True,
	            cmap='coolwarm',
	            linewidths=.5,
	            cbar_kws={"shrink": .5},
	            ax=ax)

	        return fig

	    def plot_target_pacf(self):
	        '''
	        Return a figure with the partial autocorrelation plot of 'y'
	        '''
	        fig, ax = plt.subplots(figsize=(12, 4))
	        plot_pacf(self.ts_df['y'], ax=ax)
	        fig.tight_layout()
	        return fig

	    def plot_distributions(self):
	        '''
	        Return a figure with one histogram (plus KDE) per column, laid out
	        on a grid of at most 5 columns
	        '''
	        n_cols = self.ts_df.shape[1]
	        plot_col = min(math.ceil(math.sqrt(n_cols)), 5)
	        plot_row = math.ceil(n_cols / plot_col)

	        # squeeze=False always yields a 2-D axes array; by default a 1x1 or
	        # 1xN grid comes back as a bare Axes / 1-D array.
	        fig, axs = plt.subplots(plot_row, plot_col, squeeze=False)

	        # axs.flat walks the grid row by row, matching the column order
	        for col, ax in zip(self.ts_df.columns, axs.flat):
	            sns.histplot(self.ts_df[col], ax=ax, kde=True)

	        fig.tight_layout()

	        return fig

	    def plot_target_seasonality(self):
	        '''
	        Decompose 'y' into its seasonal components.

	        Uses MSTL when `self.period` is a list, otherwise
	        `seasonal_decompose`. Returns the decomposition result object, not
	        a figure.
	        '''
	        if isinstance(self.period, list):
	            seasonal = MSTL(
	                self.ts_df['y'], periods=self.period).fit()
	        else:
	            seasonal = seasonal_decompose(self.ts_df['y'], period=self.period)
	        return seasonal

	    def plot_beta(self):
	        '''
	        Return a bar chart of the values stored by
	        `train_multiple_regression` in `self.multiple_regression_beta`
	        '''
	        fig, ax = plt.subplots(figsize=(6, 4))

	        beta_plot = sns.barplot(
	            self.multiple_regression_beta['Beta (influence on "y")'], gap=2, ax=ax)
	        beta_plot.set_xticklabels(beta_plot.get_xticklabels(), rotation=45)
	        ax.bar_label(ax.containers[-1], fmt='%.2f', label_type='center')
	        return fig

	    def __infer_frequency(self):
	        '''
	        Set `self.freq` from the datetime index.

	        A previously assigned `self.freq` is kept when inference fails;
	        raises ValueError when no frequency is available at all.
	        '''
	        # Attempt to get the frequency from the provided datetime column
	        freq = pd.infer_freq(self.ts_df.index)
	        if freq is not None:
	            self.freq = freq

	        # Always make sure the frequency is not None. getattr covers
	        # instances on which the attribute was never assigned.
	        if getattr(self, 'freq', None) is None:
	            raise ValueError(
	                'Unable inference freq from datetime column, please make timeseries interval consistent or provide customized frequency.')