Here we will extend the concepts from Lecture 18 to ARIMA.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import signal
import warnings
warnings.filterwarnings('ignore')
# statsmodels imports
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.stats.diagnostic import acorr_ljungbox
import warnings
from statsmodels.tools.sm_exceptions import ConvergenceWarning
warnings.filterwarnings('ignore', category=ConvergenceWarning)
plt.rcParams.update({
'figure.figsize': (12, 4),
'axes.spines.top': False,
'axes.spines.right': False,
'font.size': 16,
})
The ARIMA model and GDP¶
Here we’ll use data from FRED (Federal Reserve Economic Data) on the GDP.
# US real GDP (seasonally adjusted, quarterly)
# Source: FRED series GDPC1
gdp = pd.read_csv('GDPC1.csv', parse_dates=['observation_date'],
                  index_col='observation_date').dropna()
y_gdp = np.log(gdp['GDPC1'].values)  # work on the log scale
log_change = np.diff(y_gdp)          # quarter-over-quarter log change (~ growth rate)

fig, ax = plt.subplots(figsize=(14, 4))
ax.plot(gdp.index, y_gdp, linewidth=0.8)
# y-axis label fixed: the plotted series is log(GDP), not raw dollars
ax.set(xlabel='Date', ylabel='log(Billions of Chained 2017 $)',
       title='US GDP')
plt.tight_layout()
plt.show()

fig, ax = plt.subplots(figsize=(14, 4))
# one fewer point than y_gdp because of the differencing
ax.plot(gdp.index[1:], log_change, linewidth=0.8)
# label fixed: this is the log change (approximately the growth rate), not a percent
ax.set(xlabel='Date', ylabel='Log change (~growth rate)',
       title='Rate of change')
plt.tight_layout()
plt.show()

print(f"Series length: {len(y_gdp)} observations")
print(f"Mean: {np.mean(y_gdp):.2f}, Std: {np.std(y_gdp):.2f}")


Series length: 316 observations
Mean: 9.02, Std: 0.70
# ACF/PACF side by side, for the log-GDP level and for its first difference.
for series, label in ((y_gdp, 'GDP'), (log_change, 'Percent change GDP')):
    fig, axes = plt.subplots(1, 2, figsize=(14, 4))
    plot_acf(series, lags=40, ax=axes[0], title=f'ACF - {label}')
    plot_pacf(series, lags=40, ax=axes[1], title=f'PACF - {label}', method='ywm')
    plt.tight_layout()
    plt.show()
Fitting the model¶
First we will fit on all but the last 20 quarters (5 years) of data and forecast those final 5 years, using several different models that incorporate AR, MA, and differencing terms, and comparing them
# ---- 2. Train/test split ----
h = 20                        # forecast horizon: 20 quarters = 5 years
y_train = y_gdp[:-h]          # fit on everything up to the last five years
y_test = y_gdp[-h:]           # hold out the last five years
dates_train = gdp.index[:-h]
dates_test = gdp.index[-h:]

# ---- 3. Model grid ----
specs = [
    (1, 0, 0), (2, 0, 0), (3, 0, 0),  # p, d, q - AR models
    (1, 0, 1), (2, 0, 1), (3, 0, 1),  # ARMA models
    (1, 1, 0), (2, 1, 0), (3, 1, 0),  # ARI (differencing, no MA)
    (1, 1, 1), (2, 1, 1), (3, 1, 1),  # ARIMA d=1
    (1, 2, 1), (2, 2, 1), (3, 2, 1),  # ARIMA d=2
]

# ---- 4. Fit, forecast, score ----
results = []
fits = {}
for (p, d, q) in specs:
    try:
        # Deterministic term paired with each differencing order:
        # 'c' (constant) when undifferenced, 't' (linear trend) for d=1 so
        # the forecast keeps drifting, 'n' (none) for d=2 since a second
        # difference already removes a linear trend.
        if d == 0:
            trend = 'c'
        elif d == 1:
            trend = 't'
        else:
            trend = 'n'
        fit = ARIMA(y_train, order=(p, d, q), trend=trend).fit()
        fcast = fit.get_forecast(steps=h)
        mean = np.asarray(fcast.predicted_mean)
        ci = np.asarray(fcast.conf_int(alpha=0.05))  # 95% interval
        rmse = np.sqrt(np.mean((mean - y_test) ** 2))
        results.append({
            'spec': f'ARIMA({p},{d},{q})',
            'p': p, 'd': d, 'q': q,
            'n_params': p + q,
            'rmse': rmse,
        })
        fits[(p, d, q)] = (mean, ci)
    except Exception as e:
        print(f'ARIMA({p},{d},{q}) failed: {e}')

df = pd.DataFrame(results)

# RMSE vs. complexity
plt.figure()
colors = {0: 'C0', 1: 'C3', 2: 'C2'}  # one color per differencing order d
for d_val in sorted(df['d'].unique()):
    sub = df[df['d'] == d_val]
    plt.scatter(sub['n_params'], sub['rmse'],
                c=colors[d_val], s=80,
                label=f'd = {d_val}', edgecolor='k', linewidth=0.5)
    for _, row in sub.iterrows():
        plt.gca().annotate(row['spec'], (row['n_params'], row['rmse']),
                           xytext=(5, 5), textcoords='offset points')
plt.xlabel('# ARMA parameters (p + q)')
plt.gca().set_xticks(np.arange(1, 5))
plt.ylabel('RMSE')  # plain string; the original f-string had no placeholders
plt.title('Forecast accuracy vs. model complexity')
plt.legend(title='Differencing', loc='upper right')
plt.grid(alpha=0.3)

# Forecast plot, zoomed in on the most recent quarters
plt.figure()
context = 40  # quarters of training history to show for context
plt.plot(dates_train[-context:], y_train[-context:],
         color='black', lw=1.2, label='Training (recent)')
plt.plot(dates_test, y_test, color='black', lw=2, label='Held-out actual')
highlight = {
    (2, 0, 0): ('AR(2)', 'C0'),
    (1, 1, 1): ('ARIMA(1,1,1)', 'C3'),
    (1, 2, 1): ('ARIMA(1,2,1)', 'C2'),
}
for spec, (label, color) in highlight.items():
    if spec not in fits:
        continue  # skip models that failed to fit
    mean, ci = fits[spec]
    plt.plot(dates_test, mean, color=color, lw=1.8, label=label)
    plt.fill_between(dates_test, ci[:, 0], ci[:, 1], color=color, alpha=0.15)
plt.axvline(dates_test[0], color='gray', ls='--', lw=0.8)
plt.xlabel('Date')
plt.ylabel('log(GDP)')
plt.title(f'{h}-quarter forecasts with 95% intervals')
plt.legend(loc='upper left')
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

Changes in linear trends¶
Here our data includes the COVID-19 pandemic years in the training set, where there were strong changes to the linear trend in GDP that can’t be captured by first order differencing. What if we restrict our time points to pre-COVID?
# ---- 2. Train/test split ----
# Truncate the sample to pre-COVID quarters only.
mask = gdp.index <= '2019-12-31'
y_pre = y_gdp[mask]
dates_pre = gdp.index[mask]
h = 20                        # forecast horizon: 20 quarters = 5 years
y_train = y_pre[:-h]
y_test = y_pre[-h:]
dates_train = dates_pre[:-h]
dates_test = dates_pre[-h:]

# ---- 3. Model grid ----
specs = [
    (1, 0, 0), (2, 0, 0), (3, 0, 0),  # AR models
    (1, 0, 1), (2, 0, 1), (3, 0, 1),  # ARMA models
    (1, 1, 0), (2, 1, 0), (3, 1, 0),  # ARI (differencing, no MA)
    (1, 1, 1), (2, 1, 1), (3, 1, 1),  # ARIMA d=1
    (1, 2, 1), (2, 2, 1), (3, 2, 1),  # ARIMA d=2
]

# ---- 4. Fit, forecast, score ----
results = []
fits = {}
for (p, d, q) in specs:
    try:
        # 'c' constant when undifferenced, 't' linear trend for d=1,
        # 'n' none for d=2 (second differencing removes a linear trend).
        if d == 0:
            trend = 'c'
        elif d == 1:
            trend = 't'
        else:
            trend = 'n'
        fit = ARIMA(y_train, order=(p, d, q), trend=trend).fit()
        fcast = fit.get_forecast(steps=h)
        mean = np.asarray(fcast.predicted_mean)
        ci = np.asarray(fcast.conf_int(alpha=0.05))  # 95% interval
        rmse = np.sqrt(np.mean((mean - y_test) ** 2))
        results.append({
            'spec': f'ARIMA({p},{d},{q})',
            'p': p, 'd': d, 'q': q,
            'n_params': p + q,
            'rmse': rmse,
        })
        fits[(p, d, q)] = (mean, ci)
    except Exception as e:
        print(f'ARIMA({p},{d},{q}) failed: {e}')

df = pd.DataFrame(results)

# RMSE vs. complexity
plt.figure()
colors = {0: 'C0', 1: 'C3', 2: 'C2'}  # one color per differencing order d
for d_val in sorted(df['d'].unique()):
    sub = df[df['d'] == d_val]
    plt.scatter(sub['n_params'], sub['rmse'],
                c=colors[d_val], s=80,
                label=f'd = {d_val}', edgecolor='k', linewidth=0.5)
    for _, row in sub.iterrows():
        plt.gca().annotate(row['spec'], (row['n_params'], row['rmse']),
                           xytext=(5, 5), textcoords='offset points')
plt.xlabel('# ARMA parameters (p + q)')
plt.gca().set_xticks(np.arange(1, 5))
plt.ylabel('RMSE')  # plain string; the original f-string had no placeholders
plt.title('Forecast accuracy vs. model complexity')
plt.legend(title='Differencing', loc='upper right')
plt.grid(alpha=0.3)

# Forecast plot, zoomed in on the most recent quarters
plt.figure()
context = 40  # quarters of training history to show for context
plt.plot(dates_train[-context:], y_train[-context:],
         color='black', lw=1.2, label='Training (recent)')
plt.plot(dates_test, y_test, color='black', lw=2, label='Held-out actual')
highlight = {
    (2, 0, 0): ('AR(2)', 'C0'),
    (1, 1, 1): ('ARIMA(1,1,1)', 'C3'),
    (1, 2, 1): ('ARIMA(1,2,1)', 'C2'),
}
for spec, (label, color) in highlight.items():
    if spec not in fits:
        continue  # skip models that failed to fit
    mean, ci = fits[spec]
    plt.plot(dates_test, mean, color=color, lw=1.8, label=label)
    plt.fill_between(dates_test, ci[:, 0], ci[:, 1], color=color, alpha=0.15)
plt.axvline(dates_test[0], color='gray', ls='--', lw=0.8)
plt.xlabel('Date')
plt.ylabel('log(GDP)')
plt.title(f'{h}-quarter forecasts with 95% intervals')
plt.legend(loc='upper left')
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

More problems...¶
Here we see a little less advantage from the model, but it’s possible that economic shifts related to the 2008 financial crisis are at play here — so what if we go even further back, to when the linear trend in GDP was more stable (pre-2008)?
# ---- 2. Train/test split ----
# Truncate the sample to pre-2008 (before the financial crisis).
mask = gdp.index <= '2007-12-31'
y_pre = y_gdp[mask]
dates_pre = gdp.index[mask]
h = 20                        # forecast horizon: 20 quarters = 5 years
y_train = y_pre[:-h]
y_test = y_pre[-h:]
dates_train = dates_pre[:-h]
dates_test = dates_pre[-h:]

# ---- 3. Model grid ----
specs = [
    (1, 0, 0), (2, 0, 0), (3, 0, 0),  # AR models
    (1, 0, 1), (2, 0, 1), (3, 0, 1),  # ARMA models
    (1, 1, 0), (2, 1, 0), (3, 1, 0),  # ARI (differencing, no MA)
    (1, 1, 1), (2, 1, 1), (3, 1, 1),  # ARIMA d=1
    (1, 2, 1), (2, 2, 1), (3, 2, 1),  # ARIMA d=2
]

# ---- 4. Fit, forecast, score ----
results = []
fits = {}
for (p, d, q) in specs:
    try:
        # 'c' constant when undifferenced, 't' linear trend for d=1,
        # 'n' none for d=2 (second differencing removes a linear trend).
        if d == 0:
            trend = 'c'
        elif d == 1:
            trend = 't'
        else:
            trend = 'n'
        fit = ARIMA(y_train, order=(p, d, q), trend=trend).fit()
        fcast = fit.get_forecast(steps=h)
        mean = np.asarray(fcast.predicted_mean)
        ci = np.asarray(fcast.conf_int(alpha=0.05))  # 95% interval
        rmse = np.sqrt(np.mean((mean - y_test) ** 2))
        results.append({
            'spec': f'ARIMA({p},{d},{q})',
            'p': p, 'd': d, 'q': q,
            'n_params': p + q,
            'rmse': rmse,
        })
        fits[(p, d, q)] = (mean, ci)
    except Exception as e:
        print(f'ARIMA({p},{d},{q}) failed: {e}')

df = pd.DataFrame(results)

# RMSE vs. complexity
plt.figure()
colors = {0: 'C0', 1: 'C3', 2: 'C2'}  # one color per differencing order d
for d_val in sorted(df['d'].unique()):
    sub = df[df['d'] == d_val]
    plt.scatter(sub['n_params'], sub['rmse'],
                c=colors[d_val], s=80,
                label=f'd = {d_val}', edgecolor='k', linewidth=0.5)
    for _, row in sub.iterrows():
        plt.gca().annotate(row['spec'], (row['n_params'], row['rmse']),
                           xytext=(5, 5), textcoords='offset points')
plt.xlabel('# ARMA parameters (p + q)')
plt.gca().set_xticks(np.arange(1, 5))
plt.ylabel('RMSE')  # plain string; the original f-string had no placeholders
plt.title('Forecast accuracy vs. model complexity')
plt.legend(title='Differencing', loc='upper right')
plt.grid(alpha=0.3)

# Forecast plot, zoomed in on the most recent quarters
plt.figure()
context = 40  # quarters of training history to show for context
plt.plot(dates_train[-context:], y_train[-context:],
         color='black', lw=1.2, label='Training (recent)')
plt.plot(dates_test, y_test, color='black', lw=2, label='Held-out actual')
highlight = {
    (2, 0, 0): ('AR(2)', 'C0'),
    (1, 1, 1): ('ARIMA(1,1,1)', 'C3'),
    (1, 2, 1): ('ARIMA(1,2,1)', 'C2'),
}
for spec, (label, color) in highlight.items():
    if spec not in fits:
        continue  # skip models that failed to fit
    mean, ci = fits[spec]
    plt.plot(dates_test, mean, color=color, lw=1.8, label=label)
    plt.fill_between(dates_test, ci[:, 0], ci[:, 1], color=color, alpha=0.15)
plt.axvline(dates_test[0], color='gray', ls='--', lw=0.8)
plt.xlabel('Date')
plt.ylabel('log(GDP)')
plt.title(f'{h}-quarter forecasts with 95% intervals')
plt.legend(loc='upper left')
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

What does this mean?¶
So what does this mean about AR, ARMA, and ARIMA models and their use? In each case, it’s very important to understand and plot the underlying data and to look for trends before fitting models. For time series data that have multiple underlying trends or more complex behavior, state space methods (which we will discuss later) may be more helpful. The steps for building any ARIMA models should be:
Plot the data
Possibly transform the data (log transform, differencing, etc)
Identify the dependence orders of the model (inspect ACF, PACF)
Parameter estimation
Diagnostics
Choosing a model
One more example¶
Let’s look at one more example from the Shumway and Stoffer book. This dataset gives the thickness of the yearly varves collected from one location in Massachusetts for 634 years (beginning 11,834 years ago!). Varves are sedimentary deposits of sand and silt that are deposited by melting glaciers during the spring melting seasons (see Example 2.8 of the Shumway-Stoffer book, 5th edition). These deposits can be used as proxies for paleoclimatic parameters, such as temperature, because, in a warm year, more sand and silt are deposited from the receding glacier.
# Glacial varve thickness series (Shumway & Stoffer, Example 2.8).
varve_data = pd.read_csv('varve.csv')
yraw = varve_data['x']

fig, ax = plt.subplots()
ax.plot(yraw)
ax.set(xlabel='Time', ylabel='Thickness', title='Glacial Varve Thickness')
plt.show()
More plot diagnostics¶
From just looking at the plot, it appears that the variance may be nonstationary. We can also look at the QQ-plot, which shows us whether the data come from an underlying distribution (default: normal), and whether they violate the assumption of homoscedasticity. Here, we’ll see that the QQ plot deviates significantly from the line, so we probably do need a variance stabilizing transform (like the logarithm).
from statsmodels.graphics.gofplots import qqplot

# QQ-plot of the raw varve series against a fitted normal distribution.
# qqplot creates its own figure, so the original bare plt.figure() call only
# produced an empty figure (hence the stray "<Figure size 1200x400 with
# 0 Axes>" output fused into the export); it is dropped here.
qqplot(yraw, fit=True, line="45")
plt.show()
If we log-transform the data, this will be better behaved.
# Log transform the data first: stabilizes the variance of the varve series.
ylog = np.log(yraw)

plt.plot(ylog)
plt.xlabel('Time')
plt.ylabel('log(Thickness)')
plt.title('Logarithm of Glacial Varve Thickness')
plt.show()

# qqplot creates its own figure; the bare plt.figure() call before it only
# produced an empty figure (the stray "<Figure ...>" output), so it is dropped.
qqplot(ylog, fit=True, line="45")
plt.show()
ACF and PACF¶
In addition to the log transform, we’re going to difference our data, then look at the ACF and PACF to see what to do next.
# First-difference the log series to stabilize the mean, then inspect
# the ACF and PACF of the result to pick candidate model orders.
ylogdiff = np.diff(ylog)

fig, ax = plt.subplots()
ax.plot(ylogdiff)
ax.set(xlabel='Time', ylabel='diff(log(Thickness))',
       title='Differenced Logarithm of Glacial Varve Thickness')
plt.show()

fig, axes = plt.subplots(1, 2, figsize=(14, 4))
plot_acf(ylogdiff, lags=40, ax=axes[0], title='ACF - diff(log(thickness))')
plot_pacf(ylogdiff, lags=40, ax=axes[1], title='PACF - diff(log(thickness))', method='ywm')
plt.tight_layout()
plt.show()
| Function | AR(p) | MA(q) | ARMA(p,q) |
|---|---|---|---|
| ACF | tails off | cuts off after lag q | tails off |
| PACF | cuts off after lag p | tails off | tails off |
What kind of model should we try?
# ---- 2. Train/test split ----
h = 20                        # forecast horizon: 20 years of varves
y_train = ylog[:-h]
y_test = ylog[-h:]
times = np.arange(len(ylog))  # integer time index (years into the record)
times_train = times[:-h]
times_test = times[-h:]

# ---- 3. Model grid ----
specs = [
    (1, 0, 0), (2, 0, 0), (3, 0, 0),  # AR
    (0, 0, 1), (0, 0, 2), (0, 0, 3),  # MA
    (0, 1, 1), (0, 1, 2), (0, 1, 3),  # IMA (differencing + MA)
    (1, 0, 1), (2, 0, 1), (3, 0, 1),  # ARMA
    (1, 1, 0), (2, 1, 0), (3, 1, 0),  # ARI (differencing + AR)
]

# ---- 4. Fit, forecast, score ----
results = []
fits = {}
for (p, d, q) in specs:
    try:
        # Constant term only for undifferenced models; differencing already
        # removes the mean, so differenced models get no deterministic term.
        trend = 'c' if d == 0 else 'n'
        fit = ARIMA(y_train, order=(p, d, q), trend=trend).fit()
        fcast = fit.get_forecast(steps=h)
        mean = np.asarray(fcast.predicted_mean)
        ci = np.asarray(fcast.conf_int(alpha=0.05))  # 95% interval
        rmse = np.sqrt(np.mean((mean - y_test) ** 2))
        results.append({
            'spec': f'ARIMA({p},{d},{q})',
            'p': p, 'd': d, 'q': q,
            'n_params': p + q,
            'rmse': rmse,
        })
        fits[(p, d, q)] = (mean, ci)
    except Exception as e:
        print(f'ARIMA({p},{d},{q}) failed: {e}')

df = pd.DataFrame(results)

# RMSE vs. complexity
plt.figure()
colors = {0: 'C0', 1: 'C3', 2: 'C2'}  # one color per differencing order d
for d_val in sorted(df['d'].unique()):
    sub = df[df['d'] == d_val]
    plt.scatter(sub['n_params'], sub['rmse'],
                c=colors[d_val], s=80,
                label=f'd = {d_val}', edgecolor='k', linewidth=0.5)
    for _, row in sub.iterrows():
        plt.gca().annotate(row['spec'], (row['n_params'], row['rmse']),
                           xytext=(5, 5), textcoords='offset points')
plt.xlabel('# ARMA parameters (p + q)')
plt.gca().set_xticks(np.arange(1, 5))
plt.ylabel('RMSE')  # plain string; the original f-string had no placeholders
plt.title('Forecast accuracy vs. model complexity')
plt.legend(title='Differencing', loc='upper right')
plt.grid(alpha=0.3)

# Forecast plot, zoomed in on the most recent years
plt.figure()
context = 40  # years of training history to show for context
plt.plot(times_train[-context:], y_train[-context:],
         color='black', lw=1.2, label='Training (recent)')
plt.plot(times_test, y_test, color='black', lw=2, label='Held-out actual')
highlight = {
    (0, 0, 1): ('MA(1)', 'C0'),  # label fixed: (0,0,1) is MA(1), not MA(2)
    (0, 1, 1): ('ARIMA(0,1,1)', 'C3'),
    # NOTE(review): (1,1,1) is not in `specs` above, so it is never fitted
    # and gets skipped below — add it to the grid if it should be shown.
    (1, 1, 1): ('ARIMA(1,1,1)', 'C2'),
}
for spec, (label, color) in highlight.items():
    if spec not in fits:
        continue  # skip models that failed to fit (or were never fitted)
    mean, ci = fits[spec]
    plt.plot(times_test, mean, color=color, lw=1.8, label=label)
    plt.fill_between(times_test, ci[:, 0], ci[:, 1], color=color, alpha=0.15)
plt.axvline(times_test[0], color='gray', ls='--', lw=0.8)
plt.xlabel('Year index')      # was 'Date' (copy-paste from the GDP example)
plt.ylabel('log(Thickness)')  # was 'log(GDP)'
plt.title(f'{h}-year forecasts with 95% intervals')  # yearly varves, not quarters
plt.legend(loc='upper left')
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

# Diagnostics for the candidate ARIMA(0,1,1) model on the log varve series.
p = 0
d = 1
q = 1
# trend='n' made explicit: with d=1 the differencing removes the mean,
# matching the grid search above (the original silently reused the `trend`
# variable left over from the last iteration of the earlier loop).
fit = ARIMA(y_train, order=(p, d, q), trend='n').fit()
fig = plt.figure(figsize=(10, 6))
fit.plot_diagnostics(fig=fig)
plt.tight_layout()
Ljung-Box p-values¶
These diagnostics allow us to look at properties of our residuals, but there is one more statistic that is helpful to look at when choosing - the Ljung-Box-Pierce Q statistic.
Looking at the residual ACF plot lag-by-lag (above called “correlogram”) can miss a problem where every individual lag is just barely under the significance threshold but collectively they’re too large. The Q statistic tests them jointly:
$$Q = n(n+2)\sum_{k=1}^{H}\frac{\hat{\rho}_e^2(k)}{n-k}$$
where $n$ is the number of residuals (time points estimated), $k$ is the lag index, and $H$ is the upper cutoff for lags we’ll pool over.
# Ljung-Box Q statistic on the ARIMA(0,1,1) residuals from the fit above;
# model_df=p+q adjusts the degrees of freedom for the estimated ARMA terms.
acorr_ljungbox(fit.resid, lags=20, model_df=p + q)

# Refit with an added AR(1) term and rerun the same diagnostics.
# (The original export fused `...model_df=p+q)` and `p=1` onto one line.)
p = 1
d = 1
q = 1
fit = ARIMA(y_train, order=(p, d, q), trend='n').fit()  # explicit trend (was a loop leftover)
fig = plt.figure(figsize=(10, 6))
fit.plot_diagnostics(fig=fig)
plt.tight_layout()
acorr_ljungbox(fit.resid, lags=20, model_df=p + q)
fit.summary()