Data description

In this exercise, we implement a Bayesian Gaussian AR(p) model with a conjugate prior for the Canadian Lynx time series. Conjugacy leads to closed-form posterior inference, so when we implement a standard Gibbs sampler we can compare its performance against the exact derivations of predictive densities, posterior densities, Bayes factors and posterior model probabilities.

We use the famous Canadian Lynx data, a yearly time series (from 1845 to 1935) of the number (in thousands) of lynx pelts purchased by the Hudson’s Bay Company of Canada. We extract it from astsa, David Stoffer’s R package for time series analysis.

More elaborate time series models were considered by Lopes and Salazar (2006) Bayesian model uncertainty in smooth transition autoregressions, Journal of Time Series Analysis, 27, 99-117. See also their references. The paper can be found at http://hedibert.org/wp-content/uploads/2013/12/lopes-salazar-2006a.pdf

library("mvtnorm")
library("astsa")

# Standardize the Lynx pelt series (zero mean, unit variance)
y1 = Lynx
y1 = y1-mean(y1)
y1 = y1/sqrt(var(y1))
n  = length(y1)

# Time series plot, ACF and PACF
layout(matrix(c(1,1,2,3), 2, 2, byrow = TRUE))
ts.plot(y1,main="Canadian Lynx",ylab="")
acf(y1,main="")
pacf(y1,main="")

Maximum likelihood inference

We will use the function ar from the R package stats to fit AR(p) models, for \(p=0,1,\ldots,10\), to the standardized Canadian lynx data and compare their fits via Akaike’s information criterion (AIC). More specifically, we model the data \(\{y_1,\ldots,y_n\}\) as \[ y_t = \beta_0 + \sum_{j=1}^p \beta_j y_{t-j} + \varepsilon_t, \] where the \(\varepsilon_t\)s are i.i.d. \(N(0,\sigma^2)\).

Matrix notation

I prefer to work with matrix notation, where \[ y|X,\beta,\sigma^2 \sim N(X\beta,\sigma^2 I_{n-p}), \] with \(y=(y_{p+1},\ldots,y_n)'\) and \(X=(x_{p+1},\ldots,x_n)'\), for \(x_t'=(1,y_{t-1},y_{t-2},\ldots,y_{t-p})\). Notice that, for simplicity, inference is based on the last \((n-p)\) observations, with \(y_1,\ldots,y_p\) treated as fixed initial states.

Once the matrix notation is established, one can notice that the model becomes a standard Gaussian multiple linear regression.
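
As a quick sanity check, the response vector \(y\) and design matrix \(X\) described above can also be built in one shot with base R’s embed(); this is only an equivalent sketch of the construction (the scripts below build \(X\) column by column in a loop), shown here for \(p=3\):

# Equivalent construction of y and X via embed() (sketch only; not used below)
p     = 3
Z     = embed(y1, p+1)    # row t: (y_t, y_{t-1}, ..., y_{t-p}), for t = p+1, ..., n
y.emb = Z[, 1]            # response: y_{p+1}, ..., y_n
X.emb = cbind(1, Z[, -1]) # intercept plus the p lags
dim(X.emb)                # (n-p) x (p+1)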

pmax   = 10
fit.y1 = ar(y1,order.max=pmax,aic=TRUE)
fit.y1
## 
## Call:
## ar(x = y1, aic = TRUE, order.max = pmax)
## 
## Coefficients:
##       1        2        3  
##  1.1741  -0.4277  -0.2302  
## 
## Order selected 3  sigma^2 estimated as  0.1797
aic.y1 = fit.y1$aic
ps     = 0:pmax
py1    = ps[aic.y1==min(aic.y1)]

plot(0:pmax,aic.y1,type="b",xlab="AR order",ylab="AIC",ylim=c(0,40),axes=FALSE)
axis(2);box();axis(1,at=0:pmax)
title(paste("AIC minimum at p=",py1,sep=""))
text(0,1,round(aic.y1[1]),col=2)
text(1,1,round(aic.y1[2]),col=2)

Bayesian inference with conjugate prior

Bayesian inference can be performed in closed form when a conjugate prior is used for the model coefficients and the variance. More specifically, we assume the conjugate normal-inverse-gamma prior for \((\beta,\sigma^2)\): \[ \beta|\sigma^2 \sim N(b_0,\sigma^2B_0) \ \ \ \mbox{and} \ \ \ \sigma^2 \sim IG(\nu_0/2,\nu_0\sigma_0^2/2). \]
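
To see how informative this prior is for \(\sigma^2\), the short check below (not part of the original analysis) computes the prior mean and standard deviation of \(\sigma^2\) implied by the hyperparameters used in the code that follows, \(\nu_0=18\) and \(\sigma_0^2\approx 0.0167\):

# Prior moments of sigma2 ~ IG(a,b) with a = nu0/2 and b = nu0*sig20/2:
# mean = b/(a-1) and variance = b^2/((a-1)^2*(a-2))
nu0   = 18
sig20 = 0.01666667
a     = nu0/2
b     = nu0*sig20/2
c(prior.mean = b/(a-1), prior.sd = sqrt(b^2/((a-1)^2*(a-2))))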

Prior predictive \(p(y|X)\)

Conjugacy allows for a closed-form prior predictive density, which is the key ingredient for Bayesian model comparison: \[ p(y|X) = \int p(y|X,\beta,\sigma^2)p(\beta|\sigma^2)p(\sigma^2)d\beta d\sigma^2. \] One can show that the distribution of \(y|X\) is Student’s \(t\), a scale mixture of normals (SMN): \[ y|X \sim t_{\nu_0}\left(X b_0,\sigma_0^2(I_{n-p}+XB_0X')\right). \] We can then compare different model orders via their prior predictive densities, for instance \(p(y|X,p=2)\) against \(p(y|X,p=5)\).
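
For the record, this is just the usual two-step marginalization behind the result above: \[\begin{eqnarray*} y|X,\sigma^2 &\sim& N\left(Xb_0,\sigma^2(I_{n-p}+XB_0X')\right),\\ y|X &\sim& t_{\nu_0}\left(Xb_0,\sigma_0^2(I_{n-p}+XB_0X')\right), \end{eqnarray*}\] where the first line integrates \(\beta\) out of \(y=X\beta+\varepsilon\) with \(\beta|\sigma^2\sim N(b_0,\sigma^2B_0)\), and the second integrates \(\sigma^2\) against its \(IG(\nu_0/2,\nu_0\sigma_0^2/2)\) prior.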

Joint posterior \(p(\beta,\sigma^2|y,X)\)

The conjugate prior leads to the following posterior distribution for \((\beta,\sigma^2)\): \[ \beta|\sigma^2,y,X \sim N(b_1,\sigma^2B_1) \ \ \ \mbox{and} \ \ \ \sigma^2|y,X \sim IG(\nu_1/2,\nu_1\sigma_1^2/2), \] where \[\begin{eqnarray*} B_1^{-1} &=& B_0^{-1} + X'X\\ B_1^{-1}b_1 &=& B_0^{-1}b_0 + X'y\\ \nu_1 &=& \nu_0 + (n-p)\\ \nu_1\sigma_1^2 &=& \nu_0\sigma_0^2 + (y-X b_1)'y + (b_0-b_1)'B_0^{-1}b_0. \end{eqnarray*}\]
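
These updates map directly into code. The helper below is a small sketch (the function name nig.update is ours; the scripts that follow keep these computations inline) returning \((b_1,B_1,\nu_1,\sigma_1^2)\):

# Sketch: normal-inverse-gamma posterior update for y|X,beta,sigma2 ~ N(X beta, sigma2 I)
nig.update = function(y, X, b0, B0, nu0, sig20){
  iB0   = solve(B0)
  B1    = solve(iB0 + t(X)%*%X)
  b1    = B1%*%(iB0%*%b0 + t(X)%*%y)
  nu1   = nu0 + length(y)
  sig21 = c(nu0*sig20 + sum((y - X%*%b1)*y) + t(b0-b1)%*%iB0%*%b0)/nu1
  list(b1=b1, B1=B1, nu1=nu1, sig21=sig21)
}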

preds = NULL
for (p in 2:pmax){
  y    = y1[(p+1):n]
  X    = cbind(1,y1[p:(n-1)])
  for (i in 2:p)
    X = cbind(X,y1[(p-i+1):(n-i)])

  # Prior hyperparameters
  b0       = rep(0,p+1)
  B0       = diag(25,p+1)
  nu0      = 18
  sig20    = 0.01666667
  nu0sig20 = nu0*sig20
  iB0      = solve(B0)
  iB0b0    = iB0%*%b0
  
  # posterior sufficient statistics
  B1       = solve(iB0+t(X)%*%X)
  tcB1     = t(chol(B1))
  b1       = B1%*%(iB0b0+t(X)%*%y)
  nu1      = nu0 + (n-p)
  
  # Log prior predictive density: y|X ~ t_{nu0}(X b0, sig20*(I + X B0 X'))
  m = X%*%b0
  V = sig20*(diag(1,n-p)+X%*%B0%*%t(X))
  predy = dmvt(y,m,V,df=nu0,log=TRUE)
  preds = c(preds,predy)
}
ps = 2:pmax

par(mfrow=c(1,2))
plot(2:pmax,aic.y1[3:(pmax+1)],type="b",xlab="AR order",ylab="AIC",axes=FALSE)
axis(2);box();axis(1,at=2:pmax)
title(paste("AIC minimum at p=",py1,sep=""))
plot(2:pmax,preds,axes=FALSE,type="b",ylab="Log predictive",xlab="AR order")
axis(2);box();axis(1,at=2:pmax)
title(paste("Predictive maximum at p=",ps[preds==max(preds)],sep=""))

Posterior model probability \(Pr(p=k|y,X)\)

Also, assuming a uniform prior over the competing AR orders, the posterior model probabilities follow directly from the prior predictive densities: \[ Pr(p=k|y,X) = \frac{p(y|X,p=k)}{\sum_{l=2}^{10} p(y|X,p=l)}, \qquad k=2,3,\ldots,10, \] since only the orders \(p=2,\ldots,10\) were entertained in the predictive computations above.

# Computing posterior model probabilities
pmp = exp(preds)
pmp = pmp/sum(pmp)
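
Exponentiating log predictive densities directly can underflow when they are very negative; a numerically safer variant (a minimal sketch, yielding the same probabilities) subtracts the maximum log predictive first:

# Log-sum-exp version of the posterior model probabilities
pmp.stable = exp(preds - max(preds))
pmp.stable = pmp.stable/sum(pmp.stable)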

par(mfrow=c(1,1))
plot(2:pmax,pmp,axes=FALSE,type="h",ylab="Probability",lwd=3,xlab="AR order")
axis(2);box();axis(1,at=2:pmax)
title("Posterior model probabilities")

Bayesian AR(3) model

We now compare closed-form posterior quantities with Gibbs-based ones.

Closed-form posterior inference

p    = 3
y    = y1[(p+1):n]
X    = cbind(1,y1[p:(n-1)])
for (i in 2:p)
  X = cbind(X,y1[(p-i+1):(n-i)])

# prior hyperparameters
b0       = rep(0,p+1)
B0       = diag(25,p+1)
nu0      = 18
sig20    = 0.01666667

# posterior hyperparameters
nu0sig20 = nu0*sig20
iB0      = solve(B0)
iB0b0    = iB0%*%b0
B1       = solve(iB0+t(X)%*%X)
tcB1     = t(chol(B1))
b1       = B1%*%(iB0b0+t(X)%*%y)
nu1      = nu0 + (n-p)
sig21    = ((nu0sig20+sum((y-X%*%b1)*y)+t(b0-b1)%*%iB0b0)/nu1)[1]
se       = sqrt(nu1/(nu1-2)*sig21*diag(B1))

Let us now compare the Bayesian posterior summaries with their maximum likelihood (OLS) counterparts.

tab = round(cbind(b1,se,b1/se,2*(1-pt(abs(b1/se),df=nu1))),5)
colnames(tab)=c("Mean","StDev","t","tail")
rownames(tab)=c("Intercept","beta1","beta2","beta3")
tab
##               Mean   StDev        t    tail
## Intercept -0.00895 0.04064 -0.22022 0.82613
## beta1      1.16788 0.09610 12.15277 0.00000
## beta2     -0.41754 0.14404 -2.89882 0.00455
## beta3     -0.23755 0.09526 -2.49366 0.01419
# Posterior mean and standard deviation for sigma2 ~ IG(nu1/2, nu1*sig21/2)
Esig2 = (nu1*sig21/2)/(nu1/2-1)
SDsig2 = sqrt((nu1*sig21/2)^2/((nu1/2-1)^2*(nu1/2-2)))
round(c(Esig2,SDsig2),4)
## [1] 0.1453 0.0204
fit.ols = lm(y~X-1)
summary(fit.ols)
## 
## Call:
## lm(formula = y ~ X - 1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.60722 -0.23822 -0.04092  0.19346  1.08512 
## 
## Coefficients:
##     Estimate Std. Error t value Pr(>|t|)    
## X1 -0.008918   0.044684   -0.20  0.84229    
## X2  1.171865   0.106088   11.05  < 2e-16 ***
## X3 -0.423173   0.159110   -2.66  0.00937 ** 
## X4 -0.234507   0.105162   -2.23  0.02842 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4191 on 84 degrees of freedom
## Multiple R-squared:  0.8323, Adjusted R-squared:  0.8243 
## F-statistic: 104.2 on 4 and 84 DF,  p-value: < 2.2e-16
beta.ols = fit.ols$coef
sig.ols = summary(fit.ols)$sig

Gibbs sampler

This exercise illustrates the performance of a Markov chain Monte Carlo (MCMC) algorithm in approximating posterior quantities. Since closed-form quantities are available, we can check the quality of the Monte Carlo approximation directly.
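
Concretely, at iteration \(j\) the sampler coded below draws \[\begin{eqnarray*} \sigma^{2(j)} &\sim& IG\left(\nu_1/2,\left[\nu_0\sigma_0^2+(y-X\beta^{(j-1)})'(y-X\beta^{(j-1)})\right]/2\right),\\ \beta^{(j)}|\sigma^{2(j)} &\sim& N\left(b_1,\sigma^{2(j)}B_1\right), \end{eqnarray*}\] with \(b_1\), \(B_1\) and \(\nu_1\) as computed above; the normal draw is implemented via the Cholesky factor of \(B_1\) (stored in tcB1), which is considerably faster than calling rmvnorm at every iteration, as the timing comments below indicate.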

set.seed(12345)
burnin = 10000
M      = 10000
niter  = burnin+M
draws  = matrix(0,niter,p+2)
beta.d = beta.ols
runtime = system.time({
  for (iter in 1:niter){
    # draw sigma2 given the current beta
    sig2.d = 1/rgamma(1,nu1/2,(nu0sig20+sum((y-X%*%beta.d)^2))/2)
    # draw beta given sigma2 from N(b1,sigma2*B1) via the Cholesky factor of B1
    #beta.d = as.vector(rmvnorm(1,b1,sig2.d*B1))
    beta.d = b1+sqrt(sig2.d)*tcB1%*%rnorm(p+1)
    draws[iter,] = c(beta.d,sig2.d)
  }})
draws = draws[(burnin+1):niter,]
runtime
##    user  system elapsed 
##   0.364   0.050   0.415
# runtime with burnin=M=10,000 and using Cholesky = 0.432 seconds
# runtime with burnin=M=10,000 and using rmvnorm  = 7.728 seconds

Trace plots and autocorrelation functions

pnames = c("beta0","beta1","beta2","beta3","sigma2")
par(mfrow=c(2,p+2))
ind = seq(1,M,by=10)   # thin the retained draws for cleaner trace plots
for (i in 1:(p+2))
  plot(ind,draws[ind,i],xlab="Iterations",ylab="",main=pnames[i],type="l")
for (i in 1:(p+2))
  acf(draws[,i],main="")

Comparing Exact and Gibbs-based posterior quantities

Esig2 = c((nu1*sig21/2)/(nu1/2-1),mean(draws[,p+2]))
SDsig2 = c(sqrt((nu1*sig21/2)^2/((nu1/2-1)^2*(nu1/2-2))),
           sqrt(var(draws[,p+2])))

tab = round(rbind(cbind(b1,apply(draws[,1:(p+1)],2,mean),se,
            sqrt(apply(draws[,1:(p+1)],2,var))),c(Esig2,SDsig2)),4)

colnames(tab)=c("Mean Exact","Mean Gibbs","StDev Exact","StDev Gibbs")
rownames(tab)=c("Intercept","beta1","beta2","beta3","sigma2")
tab
##           Mean Exact Mean Gibbs StDev Exact StDev Gibbs
## Intercept    -0.0089    -0.0086      0.0406      0.0412
## beta1         1.1679     1.1674      0.0961      0.0978
## beta2        -0.4175    -0.4173      0.1440      0.1467
## beta3        -0.2376    -0.2378      0.0953      0.0971
## sigma2        0.1453     0.1505      0.0204      0.0216

Marginal posterior densities

par(mfrow=c(2,2))
for (i in 2:(p+1)){
  xxx = seq(min(draws[,i]),max(draws[,i]),length=1000)
  s0  = sqrt(sig20*B0[i,i])
  t0  = (xxx-b0[i])/s0
  s   = sqrt(sig21*B1[i,i])
  t   = (xxx-b1[i])/s
  plot(density(draws[,i]),xlab="",main=paste("beta ",i-1,sep=""))
  #xxx = seq(quantile(draws[,i],0.001),quantile(draws[,i],0.999),length=1000)
  lines(xxx,dt(t,df=nu1)/s,col=2)
  lines(xxx,dt(t0,df=nu0)/s0,col=4)
  abline(v=beta.ols[i],col=3)
}

sig2.d = draws[,p+2]
plot(density(sig2.d),xlab="",ylab="Posterior density",main="sigma2")
xxx = seq(min(sig2.d),max(sig2.d),length=1000)
#xxx = seq(0.001,0.05,length=1000)
lines(xxx,dgamma(1/xxx,nu1/2,nu1*sig21/2)/xxx^2,col=2)
abline(v=sig.ols^2,col=3)
legend("topright",legend=c("Exact posterior","Gibbs posterior","Prior","OLS"),col=c(2,1,4,3),lty=1,bty="n",lwd=2)
lines(xxx,dgamma(1/xxx,nu0/2,nu0*sig20/2)/xxx^2,col=4)

Roots of the AR(p) polynomial

The roots \(z\) of the AR(\(p\)) polynomial \[ p(z) = 1-\beta_1z-\beta_2z^2-\cdots-\beta_pz^p \] determine whether \(y_t\) is stationary or not. When all roots satisfy \(|z|>1\), the AR(\(p\)) process is stationary. If one or more roots have \(|z|=1\), the process has a unit root and is difference-stationary, while if one or more roots satisfy \(|z|<1\) the process is explosive (non-stationary). Based on the Bayesian AR(3) model, with high posterior probability there are no explosive or unit roots, so \(y_t\) can be regarded as a stationary time series, as far as AR(\(p\)) models are concerned.

roots.ols = abs(polyroot(c(1,-beta.ols[2:(p+1)])))

roots = matrix(0,M,p)
for (i in 1:M)
  roots[i,] = abs(polyroot(c(1,-draws[i,2:(p+1)])))

par(mfrow=c(1,3))
hist(log(roots[,1]),xlab="log|root1|",ylab="Posterior density",main="",prob=TRUE)
title(paste("Pr(log|root1|<0)=",round(mean(log(roots[,1])<0),4),sep=""))
abline(v=log(roots.ols[1]),col=2,lty=2)
hist(log(roots[,2]),xlab="log|root2|",ylab="Posterior density",main="",prob=TRUE)
title(paste("Pr(log|root2|<0)=",round(mean(log(roots[,2])<0),4),sep=""))
abline(v=log(roots.ols[2]),col=2,lty=2)
hist(log(roots[,3]),xlab="log|root3|",ylab="Posterior density",main="",prob=TRUE)
title(paste("Pr(log|root3|<0)=",round(mean(log(roots[,3])<0),4),sep=""))
abline(v=log(roots.ols[3]),col=2,lty=2)
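
Since the absolute roots are stored draw by draw, one extra summary (not shown in the plots above) is the joint posterior probability that all three roots lie outside the unit circle, i.e. that the AR(3) process is stationary:

# Posterior probability that all roots lie outside the unit circle (stationarity)
mean(apply(roots > 1, 1, all))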

Out-of-sample forecasting

Posterior predictive densities are relatively easy to sample from: \[ p(y_{n+1},\ldots,y_{n+h}|y^n) = \int p(y_{n+1},\ldots,y_{n+h}|\beta,\sigma^2,y^n)p(\beta,\sigma^2|y^n)d\beta d\sigma^2, \] where \(y^n=\{y_1,\ldots,y_n\}\) is the observed time series. The joint density factors recursively: \[ p(y_{n+1},\ldots,y_{n+h}|\beta,\sigma^2,y^n) = \prod_{k=1}^h p(y_{n+k}|y^{n+k-1},\beta,\sigma^2). \] Hence, for \(k=1,\ldots,h\), we sample \(\{y_{n+k}^{(j)}\}_{j=1}^M\) as follows: \[\begin{eqnarray*} y_{n+k}^{(j)} &\sim& N\left(\beta_0^{(j)}+\sum_{l=1}^p \beta_l^{(j)} y_{n+k-l}^{(j)},\sigma^{2(j)}\right). \end{eqnarray*}\] Obviously, \(y_{n+k-l}^{(j)}=y_{n+k-l}\) whenever \(k\leq l\), since those values are observed. Similarly, for all \(k>p\), the draws \(y_{n+k}^{(j)}\) are based exclusively on out-of-sample draws.

# Initial regressor for forecasting y_{n+1}: (1, y_n, y_{n-1}, ..., y_{n-p+1})
yh = y1[n:(n-p+1)]
Y = matrix(c(1,yh),M,p+1,byrow=TRUE)
h = 20
yfut = matrix(0,h,M)
for (k in 1:h){
  # one-step-ahead draw for each posterior draw, then shift the lag columns
  yh1 = apply(draws[,1:(p+1)]*Y,1,sum)+rnorm(M,0,sqrt(draws[,p+2]))
  yfut[k,] = yh1
  Y   = cbind(1,yh1,Y[,2:p])
}
qs = t(apply(yfut,1,quantile,c(0.025,0.25,0.5,0.75,0.975)))

plot(1845:1935,y1,type="l",xlim=c(1845,1935+h),main="Canadian Lynx",ylab="",xlab="Year",ylim=range(y1,qs),lwd=2)
abline(v=1935,lty=2)
cols = c(4,3,2,3,4)
for (i in 1:5)
  lines(1936:(1935+h),qs[,i],col=cols[i],lwd=2)