Load packages

library(tidyverse)
library(astsa)
library(fpp3)
library(epidatasets)

AR(1) processes

x1 = arima.sim(list(order = c(1,0,0), ar = 0.9), n = 100)
x2 = arima.sim(list(order = c(1,0,0), ar = -0.9), n = 100)

par(mfrow = c(2, 1), mar = c(2, 0.5, 3, 0.5))
plot(x1, main = paste("AR(1), phi = +0.9"), xlab = "", ylab = "")
plot(x2, main = paste("AR(1), phi = -0.9"), xlab = "", ylab = "")

MA(1) processes

x1 = arima.sim(list(order = c(0,0,1), ma = 0.9), n = 100)
x2 = arima.sim(list(order = c(0,0,1), ma = -0.9), n = 100)

par(mfrow = c(2, 1), mar = c(2, 0.5, 3, 0.5))
plot(x1, main = paste("MA(1), phi = +0.9"), xlab = "", ylab = "")
plot(x2, main = paste("MA(1), phi = -0.9"), xlab = "", ylab = "")

Parameter redundancy

x = rnorm(100)
arima(x, order = c(1,0,1), include.mean = FALSE)
## 
## Call:
## arima(x = x, order = c(1, 0, 1), include.mean = FALSE)
## 
## Coefficients:
##           ar1     ma1
##       -0.3849  0.2276
## s.e.   0.3622  0.3748
## 
## sigma^2 estimated as 0.9105:  log likelihood = -137.22,  aic = 280.44

Auto-correlation

x1 = arima.sim(list(order = c(2,0,0), ar = c(1.5, -0.75)), n = 500)
x2 = arima.sim(list(order = c(0,0,3), ma = c(0.9, 0.85, 0.8)), n = 500)

par(mfrow = c(2, 2), mar = c(4.5, 4.5, 0.5, 0.5))
xlim = c(1, 25)
ylim = c(-1, 1)
cex = 1.25
acf(x1, xlim = xlim, ylim = ylim)
legend("topright", legend = "AR(2)", bty = "n", cex = cex) 
acf(x2, xlim = xlim, ylim = ylim)
legend("topright", legend = "MA(3)", bty = "n", cex = cex) 
pacf(x1, xlim = xlim, ylim = ylim, ylab = "PACF") 
legend("topright", legend = "AR(2)", bty = "n", cex = cex) 
pacf(x2, xlim = xlim, ylim = ylim, ylab = "PACF")
legend("topright", legend = "MA(3)", bty = "n", cex = cex) 

Auto ARIMA?

set.seed(666)
x = rnorm(1000)         

# Older forecast package
forecast::auto.arima(x)  
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## Series: x 
## ARIMA(2,0,1) with zero mean 
## 
## Coefficients:
##           ar1      ar2     ma1
##       -0.9744  -0.0477  0.9509
## s.e.   0.0429   0.0321  0.0294
## 
## sigma^2 = 0.9657:  log likelihood = -1400
## AIC=2808.01   AICc=2808.05   BIC=2827.64
# Newer fable package, same result (input must be a tsibble)
dat = tsibble(data.frame(x, Time = 1:length(x)), index = Time)
dat |> model(arima = ARIMA(x ~ pdq())) |> report()
## Series: x 
## Model: ARIMA(2,0,1) 
## 
## Coefficients:
##           ar1      ar2     ma1
##       -0.9744  -0.0477  0.9509
## s.e.   0.0429   0.0321  0.0294
## 
## sigma^2 estimated as 0.9657:  log likelihood=-1400
## AIC=2808.01   AICc=2808.05   BIC=2827.64
# Uh oh! This is basically a redundant parametrization of white noise

ARIMA for CAR exports

car = global_economy |> filter(Code == "CAF")

# Small bit of exploratory analysis
car |>
  ggplot(aes(x = Year, y = Exports)) + 
  geom_line() + geom_point() +
  labs(title = "Central African Republic (CAR) exports",
       y = "Exports (% of GDP)") + theme_bw() 

car |> 
  ggplot(aes(x = Year, y = difference(Exports))) + 
  geom_line() + geom_point() + theme_bw() 
## Warning: Removed 1 row containing missing values (`geom_line()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).

par(mar = c(4.5, 4.5, 0.5, 0.5))
xlim = c(1, 15)
ylim = c(-0.5, 0.5)
acf(diff(car$Exports), xlim = xlim, ylim = ylim)

pacf(diff(car$Exports), xlim = xlim, ylim = ylim, ylab = "PACF")

# Now go and fit ARIMA models
car_fit = car |>
  model(arima210 = ARIMA(Exports ~ pdq(2,1,0)),
        arima013 = ARIMA(Exports ~ pdq(0,1,3)),
        auto = ARIMA(Exports))

car_fit |> pivot_longer(!Country, names_to = "Model name",
                         values_to = "Orders")
# Glance at various metrics across the three models
car_fit |> glance() |> select(.model:BIC) |> arrange(AICc) 
# Print coefficients, plot roots for the AR model
car_fit |> select(arima210) |> coef()
car_fit |> select(arima210) |> gg_arma()

# Plot residuals from the AR model
car_fit |> select(arima210) |> gg_tsresiduals() 

# Make 5-year horizon forecasts from the AR model
car_fit |>  
  select(Country, arima210) |>
  forecast(h = 5) |> 
  autoplot(car) + labs(title = "ARIMA(2,1,0)") + theme_bw()

# Make 5-year horizon forecasts from a random walk
car |>
  model(rw = ARIMA(Exports ~ pdq(0,1,0) + 0)) |>
  forecast(h = 5) |> 
  autoplot(car) + labs(title = "Random walk") + theme_bw()

General warning …

# WARNING!!! The ARIMA() function may actually still do some automatic model 
# selection outside of the specified orders p,d,q. This can be dangerous because
# it may decide to do something that you didn't realize! It will decide whether
# to include a constant c or seasonal orders P,D,Q based on the data. To shut 
# this off, you have to FULLY specify the model

# Here's an example:
oh_no = car |>
  model(model1 = ARIMA(Exports ~ pdq(2)),
        model2 = ARIMA(Exports ~ 0 + pdq(2,0,0) + PDQ(0,0,0)))

# Thought you clearly specified as AR(2) model in the first line, right? WRONG. 
# Take a look. You'll see it decided to optimize over the intercept, difference
# order, and MA order ...
oh_no |> select(model1) |> report()
## Series: Exports 
## Model: ARIMA(2,1,2) 
## 
## Coefficients:
##           ar1      ar2     ma1     ma2
##       -0.6741  -0.7142  0.2468  0.4831
## s.e.   0.1821   0.2037  0.2531  0.2576
## 
## sigma^2 estimated as 6.416:  log likelihood=-132.1
## AIC=274.2   AICc=275.37   BIC=284.41
# The second specification is the one that always returns the AR(2) model
oh_no |> select(model2) |> report()
## Series: Exports 
## Model: ARIMA(2,0,0) 
## 
## Coefficients:
##          ar1     ar2
##       0.6059  0.3876
## s.e.  0.1212  0.1217
## 
## sigma^2 estimated as 7.327:  log likelihood=-141.13
## AIC=288.25   AICc=288.7   BIC=294.43

SARIMA for US employment

leisure = us_employment |>
  filter(Title == "Leisure and Hospitality", year(Month) > 2000) |>
  mutate(Employed = Employed/1000) |>
  select(Month, Employed)

# Small bit of exploratory analysis
leisure |>
  ggplot(aes(x = Month, y = Employed)) + 
  geom_line() + geom_point() +
  labs(title = "US employment: leisure and hospitality",
       y = "Employed (millions of people)") + theme_bw() 

leisure |> 
  ggplot(aes(x = Month, y = difference(Employed, lag = 12))) + 
  geom_line() + geom_point() + theme_bw() 
## Warning: Removed 12 rows containing missing values (`geom_line()`).
## Warning: Removed 12 rows containing missing values (`geom_point()`).

leisure |> 
  ggplot(aes(x = Month, y = difference(difference(Employed, lag = 12)))) + 
  geom_line() + geom_point() + theme_bw() 
## Warning: Removed 13 rows containing missing values (`geom_line()`).
## Warning: Removed 13 rows containing missing values (`geom_point()`).

par(mar = c(4.5, 4.5, 0.5, 0.5))
xlim = c(1, 25)
ylim = c(-0.3, 0.2)
acf(diff(diff(leisure$Employed, lag = 12)), xlim = xlim, ylim = ylim)

pacf(diff(diff(leisure$Employed, lag = 12)), xlim = xlim, ylim = ylim, 
     ylab = "PACF")

# Now go and fit SARIMA models
leisure_fit = leisure |>
  model(arima210110 = ARIMA(Employed ~ pdq(2,1,0) + PDQ(1,1,0, period = 12)),
        arima012011 = ARIMA(Employed ~ pdq(0,1,2) + PDQ(0,1,1, period = 12)),
        auto = ARIMA(Employed))

leisure_fit |> pivot_longer(everything(), names_to = "Model name",
                            values_to = "Orders")
# Glance at various metrics across the three models
leisure_fit |> glance() |> select(.model:BIC) |> arrange(AICc) 
# Print coefficients, plot roots for the AR model
leisure_fit |> select(arima210110) |> coef()
leisure_fit |> select(arima210110) |> gg_arma()

# Plot residuals from the AR model
leisure_fit |> select(arima210110) |> gg_tsresiduals() 

# Make 3-year horizon forecasts from the AR model
leisure_fit |>  
  select(arima210110) |>
  forecast(h = 36) |> 
  autoplot(leisure) + labs(title = "ARIMA(2,1,0)(1,1,0)[12]") + theme_bw()