# Basics
import re, numpy as np, pandas as pd
import datetime

# Statsmodel package for inference
from statsmodels.tsa.stattools import adfuller, kpss
from statsmodels.tsa.seasonal import seasonal_decompose

# DARTS for timeseries modeling
from darts import TimeSeries
from darts.models import AutoARIMA, ARIMA
from darts.models import ExponentialSmoothing
from darts.models import Prophet
from darts.models import NBEATSModel
from darts.utils.utils import ModelMode
from darts.metrics import mape, mae

# Graphics
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


# Read marriage data into a pandas dataframe
path = '/dbfs/mnt/timeseries/NL_marriages_1995_2019.csv'
marriages = pd.read_csv(path, sep=';', parse_dates=['week_startdate'])
marriages.head(5)


def plot_df(df, x, y, title="", xlabel='', ylabel='Number of marriages', dpi=100):
    plt.figure(figsize=(16,5), dpi=dpi)
    plt.plot(x, y, color='tab:blue')
    plt.gca().set(title=title, xlabel=xlabel, ylabel=ylabel)
    plt.show()
    
plot_df(df=marriages, x=marriages.week_startdate, y=marriages.number_of_marriages, title='Weekly Dutch marriages from 1995-2019')


fig, (axes1, axes2) = plt.subplots(2, 1, figsize=(15,9), dpi= 80)
sns.boxplot(x=marriages['year'], y=marriages['number_of_marriages'], data=marriages, ax=axes1)
sns.boxplot(x=marriages['week'], y=marriages['number_of_marriages'], data=marriages, ax=axes2)
axes1.set_title('Boxplots of marriages per year')
axes2.set_title('Boxplots of marriages per week')
fig.subplots_adjust(hspace=.3)


# seasonal_decompose requires an input with a string containing the dates and the values for marriages
marriages['year_week'] = marriages['year'].astype(str) + '_' + marriages['week'].astype(str)
marriages_short = marriages[['year_week','number_of_marriages']]

# Make a decomposition where the actual values are additive: trend + seasonal + noise
result = seasonal_decompose(marriages_short['number_of_marriages'], period= 52, model='additive')
plt.rcParams["figure.figsize"] = (15,10)
result.plot(observed=True)


# We test for stationarity around a trend: regression = 'ct'

# ADF Test
result = adfuller(marriages['number_of_marriages'], autolag='AIC', regression = 'ct')
print(f'ADF Statistic: {result[0]}')
print(f'p-value: {result[1]}')
print(f'ADF result: the series is {"not " if result[1] > 0.05 else ""}stationary')

# KPSS Test
result = kpss(marriages['number_of_marriages'], regression='ct')
print('\nKPSS Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print(f'KPSS result: the series is {"not " if result[1] < 0.05 else ""}stationary')


# create timeseries object for DARTS
series_week = TimeSeries.from_dataframe(marriages, 'week_startdate', 'number_of_marriages')

# Create train and validation set
train, val = series_week.split_before(pd.Timestamp('20190101'))


model_arima = AutoARIMA(
                        m = 52, # the number of periods in each season
                        seasonal=True, 
                        maxiter = 20, 
                        n_jobs = -1,
                        trace = True, 
                        suppress_warnings=True
                        )
model_arima.fit(train)
prediction_arima = model_arima.predict(len(val))

print("Mean absolute percentage error for Auto-ARIMA: {:.2f}%.".format(
      mape(series_week, prediction_arima)))
print("Mean absolute error for Auto-ARIMA: {:.0f}.".format(
      mae(series_week, prediction_arima)))


plt.rcParams["figure.figsize"] = (20,5)
series_week.plot(label='actual', color='grey' , lw=1.3)
prediction_arima.plot(color='red' , lw=1.3, label='Auto-ARIMA MAPE={:.2f}%'.format(mape(series_week, prediction_arima)))
plt.legend(loc='center right', bbox_to_anchor=(1.15, 0.5))
plt.title('Actual and forecast values of Auto-ARIMA with DARTS')
plt.xlim('20150101', '20200101')
plt.xlabel('')
display()


# Model parameters 
seasonal_periods = 52
trend = ModelMode.ADDITIVE
seasonal = ModelMode.ADDITIVE
damped = True

# Model specification
model_expsmthng = ExponentialSmoothing(  damped = damped
                                       , seasonal_periods = seasonal_periods
                                       , seasonal = seasonal
                                       , trend=trend)

model_expsmthng.fit(train)
prediction_expsmthng = model_expsmthng.predict(52)

print("Mean absolute percentage error for Exponential Smoothing: {:.2f}%.".format(
      mape(series_week, prediction_expsmthng)))
print("Mean absolute error for Exponential Smoothing: {:.0f}.".format(
      mae(series_week, prediction_expsmthng)))


plt.rcParams["figure.figsize"] = (20,5)
series_week.plot(label='actual', color='grey' , lw=1.3)
prediction_expsmthng.plot(color='red' , lw=1.3, label='Exponential smoothing MAPE={:.2f}%'.format(mape(series_week, prediction_expsmthng)))
plt.legend(loc='center right', bbox_to_anchor=(1.2, 0.5))
plt.title('Actual and forecast values of Exponential smoothing with DARTS')
plt.xlim('20150101', '20200101')
plt.xlabel('')
display()


# Model parameters Prophet
import holidays
country_holidays = holidays.NL()

# Model specification
model_prophet = Prophet(weekly_seasonality=True
                        ,country_holidays = 'NL'
                       )
model_prophet.add_seasonality('weekly_seasonality', seasonal_periods = 52, fourier_order = 2, mode='additive')

model_prophet.fit(train)
prediction_prophet = model_prophet.predict(52)

print("Mean absolute percentage error for Facebook Prophet: {:.2f}%.".format(
      mape(series_week, prediction_prophet)))
print("Mean absolute error for Facebook Prophet: {:.0f}.".format(
      mae(series_week, prediction_prophet)))


# Display Prophet predictions
series_week.plot(label='actual', color='grey' , lw=1.3)
prediction_prophet.plot(color='red' , lw=1.3, label='Prophet MAPE={:.2f}%'.format(mape(series_week, prediction_prophet)))
plt.legend(loc='center right', bbox_to_anchor=(1.15, 0.5))
plt.title('Actual and forecast values of using Prophet with DARTS')
plt.xlim('20150101', '20200101')
plt.xlabel('')
display()


# Model parameters NBEATS
input_chunk_length=104
output_chunk_length=52
generic_architecture=True
n_epochs=50
    
# Model specification
model_nbeats = NBEATSModel(input_chunk_length = input_chunk_length
                          ,output_chunk_length = output_chunk_length
                          ,n_epochs=n_epochs
                          ,generic_architecture=generic_architecture
                          )

model_nbeats.fit(train)
prediction_nbeats = model_nbeats.predict(52)

print("Mean absolute percentage error for NBEATS: {:.2f}%.".format(
      mape(series_week, prediction_nbeats)))
print("Mean absolute error for NBEATS: {:.0f}.".format(
      mae(series_week, prediction_nbeats)))


# Display NBEATS predictions
series_week.plot(label='actual', color='grey' , lw=1.3)
prediction_nbeats.plot(color='red' , lw=1.3, label='NBEATS MAPE={:.2f}%'.format(mape(series_week, prediction_nbeats)))
plt.legend(loc='center right', bbox_to_anchor=(1.15, 0.5))
plt.title('Actual and forecast values of NBEATS prediction')
plt.xlim('20150101', '20200101')
plt.xlabel('')
display()


plt.rcParams["figure.figsize"] = (10,5)
expsm_err = mape(series_week, prediction_expsmthng)
prophet_err = mape(series_week, prediction_prophet)
nbeats_err = mape(series_week, prediction_nbeats)
arima_err = mape(series_week, prediction_arima)

# Display predictions of various models
series_week.plot(label='Actual', color='#2CA02C')
prediction_expsmthng.plot(label='Exponential Smoothing MAPE={:.2f}%'.format(expsm_err), lw=2, color='#ACACAC')
prediction_prophet.plot(label='Prophet MAPE={:.2f}%'.format(prophet_err), lw=2, color='#7C3F00')
prediction_nbeats.plot(color='#F5A507', lw=2, label='NBEATS MAPE={:.2f}%'.format(nbeats_err))
prediction_arima.plot(color='#003D7C', lw=2, label='ARIMA MAPE={:.2f}%'.format(arima_err))
plt.ylabel("Number of marriages")
plt.xlabel("")
plt.legend(loc='right', bbox_to_anchor=(1.4, 0.5))
plt.xlim('20190101', '20191231')
display()


# Model parameters 
seasonal_periods = 52
trend = ModelMode.ADDITIVE
seasonal = ModelMode.ADDITIVE
damped = True

# Model specification
model_expsmthng = ExponentialSmoothing(  damped = damped
                                       , seasonal_periods = seasonal_periods
                                       , seasonal = seasonal
                                       , trend=trend)
# Fit model on entire dataset
model_expsmthng.fit(series_week)

# Predict values for 2020 and 2021
prediction_2020_2021_expsmthng = model_expsmthng.predict(104)


# Display predictions 2020 and 2021
series_week.plot(label='actual', color='grey' , lw=1.3)
prediction_2020_2021_expsmthng.plot(color='red' , lw=1.3, label='Predicted marriages')
plt.legend(loc='center right', bbox_to_anchor=(1.15, 0.5))
plt.title('Actual and forecast values of Exponential Smoothing model')
plt.xlabel('')
display()


# Read 2020/2021 marriages actuals into a pandas dataframe
path = '/dbfs/mnt/timeseries/NL_marriages_2020_2021.csv'
marriages_2020_2021 = pd.read_csv(path, sep=';', parse_dates=['month_startdate'])
marriages_2020_2021.head(5)


# Predicted marriages per year for 2020 and 2021
prediction_2020_2021 = prediction_2020_2021_expsmthng.pd_dataframe().reset_index()
prediction_2020_2021['time'] = pd.to_datetime(prediction_2020_2021['time'])
prediction_2020_2021['cumulative'] = prediction_2020_2021['number_of_marriages'].cumsum()

# Actual marriages per year for 2020 and 2021
marriages_2020_2021['cumulative'] = marriages_2020_2021['number_of_marriages'].cumsum()

# Create time series objects for both actual and predicted
series_actual = TimeSeries.from_dataframe(marriages_2020_2021, 'month_startdate', 'cumulative')
series_predicted = TimeSeries.from_dataframe(prediction_2020_2021, 'time', 'cumulative')


# Plot of cumulative differences
series_actual.plot(label='Actual marriages 2020 and 2021', color='grey' , lw=1.3)
series_predicted.plot(color='red' , lw=1.3, label='Predicted marriages')
plt.legend(loc='center right', bbox_to_anchor=(1.15, 0.5))
plt.title('Predicted marriages for 2020 and 2021 compared to actual marriages')
plt.xlabel('Date')
plt.ylabel('Cumulative marriages')
display()


# Cumulative comparison, actuals available until the end of October
date_comparison = pd.to_datetime(datetime.date(2021, 10, 31)) 
actual_cumulative = marriages_2020_2021['cumulative'].max()
predicted_cumulative = round(prediction_2020_2021[prediction_2020_2021['time']<=date_comparison]['cumulative'].max(),2)
diff_act_pred = round(predicted_cumulative - actual_cumulative, 2)

print("The actual marriages in 2020 and 2021 until the 1st of November: ", actual_cumulative)
print("The predicted marriages in 2020 and 2021 until the 1st of November: ", predicted_cumulative)
print("Therefore, the marriages that are probably postponed because of Corona are around: ", diff_act_pred)

	year	week	week_startdate	number_of_marriages
0	1995	1	1995-01-02	616
1	1995	2	1995-01-09	616
2	1995	3	1995-01-16	621
3	1995	4	1995-01-23	675
4	1995	5	1995-01-30	644

	year	month	month_startdate	number_of_marriages
0	2020	1	2020-01-01	2756
1	2020	2	2020-02-01	3772
2	2020	3	2020-03-01	2786
3	2020	4	2020-04-01	2298
4	2020	5	2020-05-01	2969

Setting up our toolbox¶

Load preprocessed data¶

Exploratory data analysis¶

Preparing for timeseries analyses¶

Timeseries modeling¶

DARTS: ARIMA-model¶

DARTS: Exponential Smoothing¶

DARTS: Prophet¶

DARTS: NBEATS¶

And the winner is...¶

Forecast for 2020 and 2021¶

Expand your wardrobe quickly, because around 7000 marriages have to be catched up¶