Load packages
library(tidyverse)
library(astsa)
library(fpp3)
library(epidatasets)
AR(1) processes
x1 = arima.sim(list(order = c(1,0,0), ar = 0.9), n = 100)
x2 = arima.sim(list(order = c(1,0,0), ar = -0.9), n = 100)
par(mfrow = c(2, 1), mar = c(2, 0.5, 3, 0.5))
plot(x1, main = paste("AR(1), phi = +0.9"), xlab = "", ylab = "")
plot(x2, main = paste("AR(1), phi = -0.9"), xlab = "", ylab = "")
MA(1) processes
x1 = arima.sim(list(order = c(0,0,1), ma = 0.9), n = 100)
x2 = arima.sim(list(order = c(0,0,1), ma = -0.9), n = 100)
par(mfrow = c(2, 1), mar = c(2, 0.5, 3, 0.5))
plot(x1, main = paste("MA(1), phi = +0.9"), xlab = "", ylab = "")
plot(x2, main = paste("MA(1), phi = -0.9"), xlab = "", ylab = "")
Parameter redundancy
x = rnorm(100)
arima(x, order = c(1,0,1), include.mean = FALSE)
##
## Call:
## arima(x = x, order = c(1, 0, 1), include.mean = FALSE)
##
## Coefficients:
## ar1 ma1
## -0.3849 0.2276
## s.e. 0.3622 0.3748
##
## sigma^2 estimated as 0.9105: log likelihood = -137.22, aic = 280.44
Auto-correlation
x1 = arima.sim(list(order = c(2,0,0), ar = c(1.5, -0.75)), n = 500)
x2 = arima.sim(list(order = c(0,0,3), ma = c(0.9, 0.85, 0.8)), n = 500)
par(mfrow = c(2, 2), mar = c(4.5, 4.5, 0.5, 0.5))
xlim = c(1, 25)
ylim = c(-1, 1)
cex = 1.25
acf(x1, xlim = xlim, ylim = ylim)
legend("topright", legend = "AR(2)", bty = "n", cex = cex)
acf(x2, xlim = xlim, ylim = ylim)
legend("topright", legend = "MA(3)", bty = "n", cex = cex)
pacf(x1, xlim = xlim, ylim = ylim, ylab = "PACF")
legend("topright", legend = "AR(2)", bty = "n", cex = cex)
pacf(x2, xlim = xlim, ylim = ylim, ylab = "PACF")
legend("topright", legend = "MA(3)", bty = "n", cex = cex)
Auto ARIMA?
set.seed(666)
x = rnorm(1000)
# Older forecast package
forecast::auto.arima(x)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Series: x
## ARIMA(2,0,1) with zero mean
##
## Coefficients:
## ar1 ar2 ma1
## -0.9744 -0.0477 0.9509
## s.e. 0.0429 0.0321 0.0294
##
## sigma^2 = 0.9657: log likelihood = -1400
## AIC=2808.01 AICc=2808.05 BIC=2827.64
# Newer fable package, same result (input must be a tsibble)
dat = tsibble(data.frame(x, Time = 1:length(x)), index = Time)
dat |> model(arima = ARIMA(x ~ pdq())) |> report()
## Series: x
## Model: ARIMA(2,0,1)
##
## Coefficients:
## ar1 ar2 ma1
## -0.9744 -0.0477 0.9509
## s.e. 0.0429 0.0321 0.0294
##
## sigma^2 estimated as 0.9657: log likelihood=-1400
## AIC=2808.01 AICc=2808.05 BIC=2827.64
# Uh oh! This is basically a redundant parametrization of white noise
ARIMA for CAR exports
car = global_economy |> filter(Code == "CAF")
# Small bit of exploratory analysis
car |>
ggplot(aes(x = Year, y = Exports)) +
geom_line() + geom_point() +
labs(title = "Central African Republic (CAR) exports",
y = "Exports (% of GDP)") + theme_bw()
car |>
ggplot(aes(x = Year, y = difference(Exports))) +
geom_line() + geom_point() + theme_bw()
## Warning: Removed 1 row containing missing values (`geom_line()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
par(mar = c(4.5, 4.5, 0.5, 0.5))
xlim = c(1, 15)
ylim = c(-0.5, 0.5)
acf(diff(car$Exports), xlim = xlim, ylim = ylim)
pacf(diff(car$Exports), xlim = xlim, ylim = ylim, ylab = "PACF")
# Now go and fit ARIMA models
car_fit = car |>
model(arima210 = ARIMA(Exports ~ pdq(2,1,0)),
arima013 = ARIMA(Exports ~ pdq(0,1,3)),
auto = ARIMA(Exports))
car_fit |> pivot_longer(!Country, names_to = "Model name",
values_to = "Orders")
# Glance at various metrics across the three models
car_fit |> glance() |> select(.model:BIC) |> arrange(AICc)
# Print coefficients, plot roots for the AR model
car_fit |> select(arima210) |> coef()
car_fit |> select(arima210) |> gg_arma()
# Plot residuals from the AR model
car_fit |> select(arima210) |> gg_tsresiduals()
# Make 5-year horizon forecasts from the AR model
car_fit |>
select(Country, arima210) |>
forecast(h = 5) |>
autoplot(car) + labs(title = "ARIMA(2,1,0)") + theme_bw()
# Make 5-year horizon forecasts from a random walk
car |>
model(rw = ARIMA(Exports ~ pdq(0,1,0) + 0)) |>
forecast(h = 5) |>
autoplot(car) + labs(title = "Random walk") + theme_bw()
General warning …
# WARNING!!! The ARIMA() function may actually still do some automatic model
# selection outside of the specified orders p,d,q. This can be dangerous because
# it may decide to do something that you didn't realize! It will decide whether
# to include a constant c or seasonal orders P,D,Q based on the data. To shut
# this off, you have to FULLY specify the model
# Here's an example:
oh_no = car |>
model(model1 = ARIMA(Exports ~ pdq(2)),
model2 = ARIMA(Exports ~ 0 + pdq(2,0,0) + PDQ(0,0,0)))
# Thought you clearly specified as AR(2) model in the first line, right? WRONG.
# Take a look. You'll see it decided to optimize over the intercept, difference
# order, and MA order ...
oh_no |> select(model1) |> report()
## Series: Exports
## Model: ARIMA(2,1,2)
##
## Coefficients:
## ar1 ar2 ma1 ma2
## -0.6741 -0.7142 0.2468 0.4831
## s.e. 0.1821 0.2037 0.2531 0.2576
##
## sigma^2 estimated as 6.416: log likelihood=-132.1
## AIC=274.2 AICc=275.37 BIC=284.41
# The second specification is the one that always returns the AR(2) model
oh_no |> select(model2) |> report()
## Series: Exports
## Model: ARIMA(2,0,0)
##
## Coefficients:
## ar1 ar2
## 0.6059 0.3876
## s.e. 0.1212 0.1217
##
## sigma^2 estimated as 7.327: log likelihood=-141.13
## AIC=288.25 AICc=288.7 BIC=294.43
SARIMA for US employment
leisure = us_employment |>
filter(Title == "Leisure and Hospitality", year(Month) > 2000) |>
mutate(Employed = Employed/1000) |>
select(Month, Employed)
# Small bit of exploratory analysis
leisure |>
ggplot(aes(x = Month, y = Employed)) +
geom_line() + geom_point() +
labs(title = "US employment: leisure and hospitality",
y = "Employed (millions of people)") + theme_bw()
leisure |>
ggplot(aes(x = Month, y = difference(Employed, lag = 12))) +
geom_line() + geom_point() + theme_bw()
## Warning: Removed 12 rows containing missing values (`geom_line()`).
## Warning: Removed 12 rows containing missing values (`geom_point()`).
leisure |>
ggplot(aes(x = Month, y = difference(difference(Employed, lag = 12)))) +
geom_line() + geom_point() + theme_bw()
## Warning: Removed 13 rows containing missing values (`geom_line()`).
## Warning: Removed 13 rows containing missing values (`geom_point()`).
par(mar = c(4.5, 4.5, 0.5, 0.5))
xlim = c(1, 25)
ylim = c(-0.3, 0.2)
acf(diff(diff(leisure$Employed, lag = 12)), xlim = xlim, ylim = ylim)
pacf(diff(diff(leisure$Employed, lag = 12)), xlim = xlim, ylim = ylim,
ylab = "PACF")
# Now go and fit SARIMA models
leisure_fit = leisure |>
model(arima210110 = ARIMA(Employed ~ pdq(2,1,0) + PDQ(1,1,0, period = 12)),
arima012011 = ARIMA(Employed ~ pdq(0,1,2) + PDQ(0,1,1, period = 12)),
auto = ARIMA(Employed))
leisure_fit |> pivot_longer(everything(), names_to = "Model name",
values_to = "Orders")
# Glance at various metrics across the three models
leisure_fit |> glance() |> select(.model:BIC) |> arrange(AICc)
# Print coefficients, plot roots for the AR model
leisure_fit |> select(arima210110) |> coef()
leisure_fit |> select(arima210110) |> gg_arma()
# Plot residuals from the AR model
leisure_fit |> select(arima210110) |> gg_tsresiduals()
# Make 3-year horizon forecasts from the AR model
leisure_fit |>
select(arima210110) |>
forecast(h = 36) |>
autoplot(leisure) + labs(title = "ARIMA(2,1,0)(1,1,0)[12]") + theme_bw()