Here we will extend the concepts from Lecture 18 to ARIMA.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import signal
import warnings
warnings.filterwarnings('ignore')
# statsmodels imports
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.stats.diagnostic import acorr_ljungbox
import warnings
from statsmodels.tools.sm_exceptions import ConvergenceWarning
warnings.filterwarnings('ignore', category=ConvergenceWarning)
plt.rcParams.update({
'figure.figsize': (12, 4),
'axes.spines.top': False,
'axes.spines.right': False,
'font.size': 16,
})
The ARIMA model and GDP¶
Here we’ll use data from FRED (Federal Reserve Economic Data) on the GDP.
# US real GDP (seasonally adjusted, quarterly)
# Source: FRED series GDPC1
gdp = pd.read_csv('GDPC1.csv', parse_dates=['observation_date'],
                  index_col='observation_date').dropna()
y_gdp = np.log(gdp['GDPC1'].values)  # work on the log scale
log_change = np.diff(y_gdp)          # quarter-over-quarter log change (~ growth rate)

fig, ax = plt.subplots(figsize=(14, 4))
ax.plot(gdp.index, y_gdp, linewidth=0.8)
# y-axis label fixed: the plotted series is log(GDP), not raw dollars
ax.set(xlabel='Date', ylabel='log(Billions of Chained 2017 $)',
       title='US GDP')
plt.tight_layout()
plt.show()

fig, ax = plt.subplots(figsize=(14, 4))
# one fewer point than y_gdp because of the differencing
ax.plot(gdp.index[1:], log_change, linewidth=0.8)
# label fixed: this is the log change (approximately the growth rate), not a percent
ax.set(xlabel='Date', ylabel='Log change (~growth rate)',
       title='Rate of change')
plt.tight_layout()
plt.show()

print(f"Series length: {len(y_gdp)} observations")
print(f"Mean: {np.mean(y_gdp):.2f}, Std: {np.std(y_gdp):.2f}")


Series length: 316 observations
Mean: 9.02, Std: 0.70
# ACF/PACF side by side, for the log-GDP level and for its first difference.
for series, label in ((y_gdp, 'GDP'), (log_change, 'Percent change GDP')):
    fig, axes = plt.subplots(1, 2, figsize=(14, 4))
    plot_acf(series, lags=40, ax=axes[0], title=f'ACF - {label}')
    plot_pacf(series, lags=40, ax=axes[1], title=f'PACF - {label}', method='ywm')
    plt.tight_layout()
    plt.show()
Fitting the model¶
First we will fit on all but the last 20 quarters (5 years) of data and forecast those final 5 years, using several different models that incorporate AR, MA, and differencing terms, and comparing them
# ---- 2. Train/test split ----
h = 20                        # forecast horizon: 20 quarters = 5 years
y_train = y_gdp[:-h]          # fit on everything up to the last five years
y_test = y_gdp[-h:]           # hold out the last five years
dates_train = gdp.index[:-h]
dates_test = gdp.index[-h:]

# ---- 3. Model grid ----
specs = [
    (1, 0, 0), (2, 0, 0), (3, 0, 0),  # p, d, q - AR models
    (1, 0, 1), (2, 0, 1), (3, 0, 1),  # ARMA models
    (1, 1, 0), (2, 1, 0), (3, 1, 0),  # ARI (differencing, no MA)
    (1, 1, 1), (2, 1, 1), (3, 1, 1),  # ARIMA d=1
    (1, 2, 1), (2, 2, 1), (3, 2, 1),  # ARIMA d=2
]

# ---- 4. Fit, forecast, score ----
results = []
fits = {}
for (p, d, q) in specs:
    try:
        # Deterministic term paired with each differencing order:
        # 'c' (constant) when undifferenced, 't' (linear trend) for d=1 so
        # the forecast keeps drifting, 'n' (none) for d=2 since a second
        # difference already removes a linear trend.
        if d == 0:
            trend = 'c'
        elif d == 1:
            trend = 't'
        else:
            trend = 'n'
        fit = ARIMA(y_train, order=(p, d, q), trend=trend).fit()
        fcast = fit.get_forecast(steps=h)
        mean = np.asarray(fcast.predicted_mean)
        ci = np.asarray(fcast.conf_int(alpha=0.05))  # 95% interval
        rmse = np.sqrt(np.mean((mean - y_test) ** 2))
        results.append({
            'spec': f'ARIMA({p},{d},{q})',
            'p': p, 'd': d, 'q': q,
            'n_params': p + q,
            'rmse': rmse,
        })
        fits[(p, d, q)] = (mean, ci)
    except Exception as e:
        print(f'ARIMA({p},{d},{q}) failed: {e}')

df = pd.DataFrame(results)

# RMSE vs. complexity
plt.figure()
colors = {0: 'C0', 1: 'C3', 2: 'C2'}  # one color per differencing order d
for d_val in sorted(df['d'].unique()):
    sub = df[df['d'] == d_val]
    plt.scatter(sub['n_params'], sub['rmse'],
                c=colors[d_val], s=80,
                label=f'd = {d_val}', edgecolor='k', linewidth=0.5)
    for _, row in sub.iterrows():
        plt.gca().annotate(row['spec'], (row['n_params'], row['rmse']),
                           xytext=(5, 5), textcoords='offset points')
plt.xlabel('# ARMA parameters (p + q)')
plt.gca().set_xticks(np.arange(1, 5))
plt.ylabel('RMSE')  # plain string; the original f-string had no placeholders
plt.title('Forecast accuracy vs. model complexity')
plt.legend(title='Differencing', loc='upper right')
plt.grid(alpha=0.3)

# Forecast plot, zoomed in on the most recent quarters
plt.figure()
context = 40  # quarters of training history to show for context
plt.plot(dates_train[-context:], y_train[-context:],
         color='black', lw=1.2, label='Training (recent)')
plt.plot(dates_test, y_test, color='black', lw=2, label='Held-out actual')
highlight = {
    (2, 0, 0): ('AR(2)', 'C0'),
    (1, 1, 1): ('ARIMA(1,1,1)', 'C3'),
    (1, 2, 1): ('ARIMA(1,2,1)', 'C2'),
}
for spec, (label, color) in highlight.items():
    if spec not in fits:
        continue  # skip models that failed to fit
    mean, ci = fits[spec]
    plt.plot(dates_test, mean, color=color, lw=1.8, label=label)
    plt.fill_between(dates_test, ci[:, 0], ci[:, 1], color=color, alpha=0.15)
plt.axvline(dates_test[0], color='gray', ls='--', lw=0.8)
plt.xlabel('Date')
plt.ylabel('log(GDP)')
plt.title(f'{h}-quarter forecasts with 95% intervals')
plt.legend(loc='upper left')
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

Changes in linear trends¶
Here our data includes the COVID-19 pandemic years in the training set, where there were strong changes to the linear trend in GDP that can’t be captured by first order differencing. What if we restrict our time points to pre-COVID?
# ---- 2. Train/test split ----
# Truncate the sample to pre-COVID quarters only.
mask = gdp.index <= '2019-12-31'
y_pre = y_gdp[mask]
dates_pre = gdp.index[mask]
h = 20                        # forecast horizon: 20 quarters = 5 years
y_train = y_pre[:-h]
y_test = y_pre[-h:]
dates_train = dates_pre[:-h]
dates_test = dates_pre[-h:]

# ---- 3. Model grid ----
specs = [
    (1, 0, 0), (2, 0, 0), (3, 0, 0),  # AR models
    (1, 0, 1), (2, 0, 1), (3, 0, 1),  # ARMA models
    (1, 1, 0), (2, 1, 0), (3, 1, 0),  # ARI (differencing, no MA)
    (1, 1, 1), (2, 1, 1), (3, 1, 1),  # ARIMA d=1
    (1, 2, 1), (2, 2, 1), (3, 2, 1),  # ARIMA d=2
]

# ---- 4. Fit, forecast, score ----
results = []
fits = {}
for (p, d, q) in specs:
    try:
        # 'c' constant when undifferenced, 't' linear trend for d=1,
        # 'n' none for d=2 (second differencing removes a linear trend).
        if d == 0:
            trend = 'c'
        elif d == 1:
            trend = 't'
        else:
            trend = 'n'
        fit = ARIMA(y_train, order=(p, d, q), trend=trend).fit()
        fcast = fit.get_forecast(steps=h)
        mean = np.asarray(fcast.predicted_mean)
        ci = np.asarray(fcast.conf_int(alpha=0.05))  # 95% interval
        rmse = np.sqrt(np.mean((mean - y_test) ** 2))
        results.append({
            'spec': f'ARIMA({p},{d},{q})',
            'p': p, 'd': d, 'q': q,
            'n_params': p + q,
            'rmse': rmse,
        })
        fits[(p, d, q)] = (mean, ci)
    except Exception as e:
        print(f'ARIMA({p},{d},{q}) failed: {e}')

df = pd.DataFrame(results)

# RMSE vs. complexity
plt.figure()
colors = {0: 'C0', 1: 'C3', 2: 'C2'}  # one color per differencing order d
for d_val in sorted(df['d'].unique()):
    sub = df[df['d'] == d_val]
    plt.scatter(sub['n_params'], sub['rmse'],
                c=colors[d_val], s=80,
                label=f'd = {d_val}', edgecolor='k', linewidth=0.5)
    for _, row in sub.iterrows():
        plt.gca().annotate(row['spec'], (row['n_params'], row['rmse']),
                           xytext=(5, 5), textcoords='offset points')
plt.xlabel('# ARMA parameters (p + q)')
plt.gca().set_xticks(np.arange(1, 5))
plt.ylabel('RMSE')  # plain string; the original f-string had no placeholders
plt.title('Forecast accuracy vs. model complexity')
plt.legend(title='Differencing', loc='upper right')
plt.grid(alpha=0.3)

# Forecast plot, zoomed in on the most recent quarters
plt.figure()
context = 40  # quarters of training history to show for context
plt.plot(dates_train[-context:], y_train[-context:],
         color='black', lw=1.2, label='Training (recent)')
plt.plot(dates_test, y_test, color='black', lw=2, label='Held-out actual')
highlight = {
    (2, 0, 0): ('AR(2)', 'C0'),
    (1, 1, 1): ('ARIMA(1,1,1)', 'C3'),
    (1, 2, 1): ('ARIMA(1,2,1)', 'C2'),
}
for spec, (label, color) in highlight.items():
    if spec not in fits:
        continue  # skip models that failed to fit
    mean, ci = fits[spec]
    plt.plot(dates_test, mean, color=color, lw=1.8, label=label)
    plt.fill_between(dates_test, ci[:, 0], ci[:, 1], color=color, alpha=0.15)
plt.axvline(dates_test[0], color='gray', ls='--', lw=0.8)
plt.xlabel('Date')
plt.ylabel('log(GDP)')
plt.title(f'{h}-quarter forecasts with 95% intervals')
plt.legend(loc='upper left')
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

More problems...¶
Here we see a little less advantage from the model, but it’s possible that economic shifts related to the 2008 financial crisis are at play here — so what if we go even further back, to when the linear trend in GDP was more stable (pre-2008)?
# ---- 2. Train/test split ----
# Truncate the sample to pre-2008 (before the financial crisis).
mask = gdp.index <= '2007-12-31'
y_pre = y_gdp[mask]
dates_pre = gdp.index[mask]
h = 20                        # forecast horizon: 20 quarters = 5 years
y_train = y_pre[:-h]
y_test = y_pre[-h:]
dates_train = dates_pre[:-h]
dates_test = dates_pre[-h:]

# ---- 3. Model grid ----
specs = [
    (1, 0, 0), (2, 0, 0), (3, 0, 0),  # AR models
    (1, 0, 1), (2, 0, 1), (3, 0, 1),  # ARMA models
    (1, 1, 0), (2, 1, 0), (3, 1, 0),  # ARI (differencing, no MA)
    (1, 1, 1), (2, 1, 1), (3, 1, 1),  # ARIMA d=1
    (1, 2, 1), (2, 2, 1), (3, 2, 1),  # ARIMA d=2
]

# ---- 4. Fit, forecast, score ----
results = []
fits = {}
for (p, d, q) in specs:
    try:
        # 'c' constant when undifferenced, 't' linear trend for d=1,
        # 'n' none for d=2 (second differencing removes a linear trend).
        if d == 0:
            trend = 'c'
        elif d == 1:
            trend = 't'
        else:
            trend = 'n'
        fit = ARIMA(y_train, order=(p, d, q), trend=trend).fit()
        fcast = fit.get_forecast(steps=h)
        mean = np.asarray(fcast.predicted_mean)
        ci = np.asarray(fcast.conf_int(alpha=0.05))  # 95% interval
        rmse = np.sqrt(np.mean((mean - y_test) ** 2))
        results.append({
            'spec': f'ARIMA({p},{d},{q})',
            'p': p, 'd': d, 'q': q,
            'n_params': p + q,
            'rmse': rmse,
        })
        fits[(p, d, q)] = (mean, ci)
    except Exception as e:
        print(f'ARIMA({p},{d},{q}) failed: {e}')

df = pd.DataFrame(results)

# RMSE vs. complexity
plt.figure()
colors = {0: 'C0', 1: 'C3', 2: 'C2'}  # one color per differencing order d
for d_val in sorted(df['d'].unique()):
    sub = df[df['d'] == d_val]
    plt.scatter(sub['n_params'], sub['rmse'],
                c=colors[d_val], s=80,
                label=f'd = {d_val}', edgecolor='k', linewidth=0.5)
    for _, row in sub.iterrows():
        plt.gca().annotate(row['spec'], (row['n_params'], row['rmse']),
                           xytext=(5, 5), textcoords='offset points')
plt.xlabel('# ARMA parameters (p + q)')
plt.gca().set_xticks(np.arange(1, 5))
plt.ylabel('RMSE')  # plain string; the original f-string had no placeholders
plt.title('Forecast accuracy vs. model complexity')
plt.legend(title='Differencing', loc='upper right')
plt.grid(alpha=0.3)

# Forecast plot, zoomed in on the most recent quarters
plt.figure()
context = 40  # quarters of training history to show for context
plt.plot(dates_train[-context:], y_train[-context:],
         color='black', lw=1.2, label='Training (recent)')
plt.plot(dates_test, y_test, color='black', lw=2, label='Held-out actual')
highlight = {
    (2, 0, 0): ('AR(2)', 'C0'),
    (1, 1, 1): ('ARIMA(1,1,1)', 'C3'),
    (1, 2, 1): ('ARIMA(1,2,1)', 'C2'),
}
for spec, (label, color) in highlight.items():
    if spec not in fits:
        continue  # skip models that failed to fit
    mean, ci = fits[spec]
    plt.plot(dates_test, mean, color=color, lw=1.8, label=label)
    plt.fill_between(dates_test, ci[:, 0], ci[:, 1], color=color, alpha=0.15)
plt.axvline(dates_test[0], color='gray', ls='--', lw=0.8)
plt.xlabel('Date')
plt.ylabel('log(GDP)')
plt.title(f'{h}-quarter forecasts with 95% intervals')
plt.legend(loc='upper left')
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

What does this mean?¶
So what does this mean about AR, ARMA, and ARIMA models and their use? In each case, it’s very important to understand and plot the underlying data and to look for trends before fitting models. For time series data that have multiple underlying trends or more complex behavior, state space methods (which we will discuss later) may be more helpful. The steps for building any ARIMA models should be:
Plot the data
Possibly transform the data (log transform, differencing, etc)
Identify the dependence orders of the model (inspect ACF, PACF)
Parameter estimation
Diagnostics
Choosing a model
One more example¶
Let’s look at one more example from the Shumway and Stoffer book. This dataset gives the thickness of the yearly varves collected from one location in Massachusetts for 634 years (beginning 11,834 years ago!). Varves are sedimentary deposits of sand and silt that are deposited by melting glaciers during the spring melting seasons (see Example 2.8 of the Shumway-Stoffer book, 5th edition). These deposits can be used as proxies for paleoclimatic parameters, such as temperature, because, in a warm year, more sand and silt are deposited from the receding glacier.
# Glacial varve thickness series (Shumway & Stoffer, Example 2.8).
varve_data = pd.read_csv('varve.csv')
yraw = varve_data['x']

fig, ax = plt.subplots()
ax.plot(yraw)
ax.set(xlabel='Time', ylabel='Thickness', title='Glacial Varve Thickness')
plt.show()
More plot diagnostics¶
From just looking at the plot, it appears that the variance may be nonstationary. We can also look at the QQ-plot, which shows us whether the data come from an underlying distribution (default: normal), and whether they violate the assumption of homoscedasticity. Here, we’ll see that the QQ plot deviates significantly from the line, so we probably do need a variance stabilizing transform (like the logarithm).
from statsmodels.graphics.gofplots import qqplot

# QQ-plot of the raw varve series against a fitted normal distribution.
# qqplot creates its own figure, so the original bare plt.figure() call only
# produced an empty figure (hence the stray "<Figure size 1200x400 with
# 0 Axes>" output fused into the export); it is dropped here.
qqplot(yraw, fit=True, line="45")
plt.show()
If we log-transform the data, this will be better behaved.
# Log transform the data first: stabilizes the variance of the varve series.
ylog = np.log(yraw)

plt.plot(ylog)
plt.xlabel('Time')
plt.ylabel('log(Thickness)')
plt.title('Logarithm of Glacial Varve Thickness')
plt.show()

# qqplot creates its own figure; the bare plt.figure() call before it only
# produced an empty figure (the stray "<Figure ...>" output), so it is dropped.
qqplot(ylog, fit=True, line="45")
plt.show()
ACF and PACF¶
In addition to the log transform, we’re going to difference our data, then look at the ACF and PACF to see what to do next.
# First-difference the log series to stabilize the mean, then inspect
# the ACF and PACF of the result to pick candidate model orders.
ylogdiff = np.diff(ylog)

fig, ax = plt.subplots()
ax.plot(ylogdiff)
ax.set(xlabel='Time', ylabel='diff(log(Thickness))',
       title='Differenced Logarithm of Glacial Varve Thickness')
plt.show()

fig, axes = plt.subplots(1, 2, figsize=(14, 4))
plot_acf(ylogdiff, lags=40, ax=axes[0], title='ACF - diff(log(thickness))')
plot_pacf(ylogdiff, lags=40, ax=axes[1], title='PACF - diff(log(thickness))', method='ywm')
plt.tight_layout()
plt.show()
| Function | AR(p) | MA(q) | ARMA(p,q) |
|---|---|---|---|
| ACF | tails off | cuts off after lag q | tails off |
| PACF | cuts off after lag p | tails off | tails off |
What kind of model should we try?
# ---- 2. Train/test split ----
h = 20                        # forecast horizon: 20 years of varves
y_train = ylog[:-h]
y_test = ylog[-h:]
times = np.arange(len(ylog))  # integer time index (years into the record)
times_train = times[:-h]
times_test = times[-h:]

# ---- 3. Model grid ----
specs = [
    (1, 0, 0), (2, 0, 0), (3, 0, 0),  # AR
    (0, 0, 1), (0, 0, 2), (0, 0, 3),  # MA
    (0, 1, 1), (0, 1, 2), (0, 1, 3),  # IMA (differencing + MA)
    (1, 0, 1), (2, 0, 1), (3, 0, 1),  # ARMA
    (1, 1, 0), (2, 1, 0), (3, 1, 0),  # ARI (differencing + AR)
]

# ---- 4. Fit, forecast, score ----
results = []
fits = {}
for (p, d, q) in specs:
    try:
        # Constant term only for undifferenced models; differencing already
        # removes the mean, so differenced models get no deterministic term.
        trend = 'c' if d == 0 else 'n'
        fit = ARIMA(y_train, order=(p, d, q), trend=trend).fit()
        fcast = fit.get_forecast(steps=h)
        mean = np.asarray(fcast.predicted_mean)
        ci = np.asarray(fcast.conf_int(alpha=0.05))  # 95% interval
        rmse = np.sqrt(np.mean((mean - y_test) ** 2))
        results.append({
            'spec': f'ARIMA({p},{d},{q})',
            'p': p, 'd': d, 'q': q,
            'n_params': p + q,
            'rmse': rmse,
        })
        fits[(p, d, q)] = (mean, ci)
    except Exception as e:
        print(f'ARIMA({p},{d},{q}) failed: {e}')

df = pd.DataFrame(results)

# RMSE vs. complexity
plt.figure()
colors = {0: 'C0', 1: 'C3', 2: 'C2'}  # one color per differencing order d
for d_val in sorted(df['d'].unique()):
    sub = df[df['d'] == d_val]
    plt.scatter(sub['n_params'], sub['rmse'],
                c=colors[d_val], s=80,
                label=f'd = {d_val}', edgecolor='k', linewidth=0.5)
    for _, row in sub.iterrows():
        plt.gca().annotate(row['spec'], (row['n_params'], row['rmse']),
                           xytext=(5, 5), textcoords='offset points')
plt.xlabel('# ARMA parameters (p + q)')
plt.gca().set_xticks(np.arange(1, 5))
plt.ylabel('RMSE')  # plain string; the original f-string had no placeholders
plt.title('Forecast accuracy vs. model complexity')
plt.legend(title='Differencing', loc='upper right')
plt.grid(alpha=0.3)

# Forecast plot, zoomed in on the most recent years
plt.figure()
context = 40  # years of training history to show for context
plt.plot(times_train[-context:], y_train[-context:],
         color='black', lw=1.2, label='Training (recent)')
plt.plot(times_test, y_test, color='black', lw=2, label='Held-out actual')
highlight = {
    (0, 0, 1): ('MA(1)', 'C0'),  # label fixed: (0,0,1) is MA(1), not MA(2)
    (0, 1, 1): ('ARIMA(0,1,1)', 'C3'),
    # NOTE(review): (1,1,1) is not in `specs` above, so it is never fitted
    # and gets skipped below — add it to the grid if it should be shown.
    (1, 1, 1): ('ARIMA(1,1,1)', 'C2'),
}
for spec, (label, color) in highlight.items():
    if spec not in fits:
        continue  # skip models that failed to fit (or were never fitted)
    mean, ci = fits[spec]
    plt.plot(times_test, mean, color=color, lw=1.8, label=label)
    plt.fill_between(times_test, ci[:, 0], ci[:, 1], color=color, alpha=0.15)
plt.axvline(times_test[0], color='gray', ls='--', lw=0.8)
plt.xlabel('Year index')      # was 'Date' (copy-paste from the GDP example)
plt.ylabel('log(Thickness)')  # was 'log(GDP)'
plt.title(f'{h}-year forecasts with 95% intervals')  # yearly varves, not quarters
plt.legend(loc='upper left')
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

# Diagnostics for the candidate ARIMA(0,1,1) model on the log varve series.
p = 0
d = 1
q = 1
# trend='n' made explicit: with d=1 the differencing removes the mean,
# matching the grid search above (the original silently reused the `trend`
# variable left over from the last iteration of the earlier loop).
fit = ARIMA(y_train, order=(p, d, q), trend='n').fit()
fig = plt.figure(figsize=(10, 6))
fit.plot_diagnostics(fig=fig)
plt.tight_layout()
Ljung-Box p-values¶
These diagnostics allow us to look at properties of our residuals, but there is one more statistic that is helpful to look at when choosing - the Ljung-Box-Pierce Q statistic.
Looking at the residual ACF plot lag-by-lag (above called “correlogram”) can miss a problem where every individual lag is just barely under the significance threshold but collectively they’re too large. The Q statistic tests them jointly:
$$Q = n(n+2)\sum_{k=1}^{H}\frac{\hat{\rho}_e^2(k)}{n-k}$$
where $n$ is the number of residuals (time points estimated), $k$ is the lag index, and $H$ is the upper cutoff for lags we’ll pool over.
# Ljung-Box Q statistic on the ARIMA(0,1,1) residuals from the fit above;
# model_df=p+q adjusts the degrees of freedom for the estimated ARMA terms.
acorr_ljungbox(fit.resid, lags=20, model_df=p + q)

# Refit with an added AR(1) term and rerun the same diagnostics.
# (The original export fused `...model_df=p+q)` and `p=1` onto one line.)
p = 1
d = 1
q = 1
fit = ARIMA(y_train, order=(p, d, q), trend='n').fit()  # explicit trend (was a loop leftover)
fig = plt.figure(figsize=(10, 6))
fit.plot_diagnostics(fig=fig)
plt.tight_layout()
acorr_ljungbox(fit.resid, lags=20, model_df=p + q)
fit.summary()