In this exercise, we implement a Bayesian Gaussian AR(p) model with a conjugate prior for the Canadian Lynx time series. Conjugacy leads to closed-form posterior inference, so when we implement a standard Gibbs sampler we can compare its output to the exact derivations of predictive densities, posterior densities, Bayes factors and posterior model probabilities.
We use the famous Canadian Lynx data, a yearly time series (from 1845 to 1935) that counts the number (in thousands) of lynx pelts purchased by the Hudson’s Bay Company of Canada. We extract it from David Stoffer’s R package for time series analysis, astsa.
More serious time series models were considered by Lopes and Salazar (2006) Bayesian model uncertainty in smooth transition autoregressions, Journal of Time Series Analysis, 27, 99-117. See also their references. The paper can be found at http://hedibert.org/wp-content/uploads/2013/12/lopes-salazar-2006a.pdf
library("mvtnorm")
library("astsa")
y1 = Lynx                # Canadian lynx pelts series from astsa
y1 = y1-mean(y1)         # center the series
y1 = y1/sqrt(var(y1))    # scale to unit variance
n = length(y1)
layout(matrix(c(1,1,2,3), 2, 2, byrow = TRUE))
ts.plot(y1,main="Canadian Lynx",ylab="")
acf(y1,main="")
pacf(y1,main="")
We will use the function ar from the R package stats to fit AR(p) models, for \(p=0,1,\ldots,10\), to the standardized Canadian lynx data and compare their fit via Akaike’s information criterion (AIC). More specifically, for \(t=p+1,\ldots,n\), we model the data \(\{y_1,\ldots,y_n\}\) as \[ y_t = \beta_0 + \sum_{j=1}^p \beta_j y_{t-j} + \varepsilon_t, \] where the \(\varepsilon_t\)s are i.i.d. \(N(0,\sigma^2)\).
I prefer to work with the matrix notation, where \[ y|X,\beta,\sigma^2 \sim N(X\beta,\sigma^2 I_{n-p}), \] where \(y=(y_{p+1},\ldots,y_n)\) and \(X=(x_{p+1},\ldots,x_n)'\), for \(x_t'=(1,y_{t-1},y_{t-2},\ldots,y_{t-p})\). Notice that, for simplification, we are performing the inference based on the last \((n-p)\) observations, so \(y_1,\ldots,y_p\) are used as initial states.
Once the matrix notation is established, one can notice that the model becomes a standard Gaussian multiple linear regression.
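As a quick illustration of this notation, the response vector and design matrix can be assembled from the standardized series; this is a minimal sketch of ours (the helper name build.ar.design is hypothetical; the original code further below builds X inline inside a loop):
# Sketch: build (y, X) for the AR(p) regression on a series z
# Rows of X are x_t' = (1, y_{t-1}, ..., y_{t-p}), for t = p+1,...,n
build.ar.design = function(z, p){
  E = embed(as.numeric(z), p+1)        # columns: z_t, z_{t-1}, ..., z_{t-p}
  list(y = E[,1], X = cbind(1, E[,-1]))
}
# e.g. str(build.ar.design(y1, 3))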
pmax = 10
fit.y1 = ar(y1,order.max=pmax,aic=TRUE)
fit.y1
##
## Call:
## ar(x = y1, aic = TRUE, order.max = pmax)
##
## Coefficients:
## 1 2 3
## 1.1741 -0.4277 -0.2302
##
## Order selected 3 sigma^2 estimated as 0.1797
aic.y1 = fit.y1$aic
ps = 0:pmax
py1 = ps[aic.y1==min(aic.y1)]
plot(0:pmax,aic.y1,type="b",xlab="AR order",ylab="AIC",ylim=c(0,40),axes=FALSE)
axis(2);box();axis(1,at=0:pmax)
title(paste("AIC minimum at p=",py1,sep=""))
text(0,1,round(aic.y1[1]),col=2)
text(1,1,round(aic.y1[2]),col=2)
Bayesian inference can be performed in closed form when a conjugate prior is used for the model coefficients and variance. More specifically, we assume the normal-inverse-gamma prior for \((\beta,\sigma^2)\): \[ \beta|\sigma^2 \sim N(b_0,\sigma^2B_0) \ \ \ \mbox{and} \ \ \ \sigma^2 \sim IG(\nu_0/2,\nu_0\sigma_0^2/2). \]
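Before using this prior, it is worth checking what the hyperparameters adopted below (\(b_0=0\), \(B_0=25I\), \(\nu_0=18\) and \(\sigma_0^2\approx 0.0167\)) imply marginally for \(\sigma^2\); the quick check below is our own addition:
# Implied prior moments of sigma2 under IG(nu0/2, nu0*sig20/2)
nu0 = 18
sig20 = 0.01666667
prior.mean = (nu0*sig20/2)/(nu0/2-1)    # = nu0*sig20/(nu0-2), about 0.019
prior.sd   = prior.mean/sqrt(nu0/2-2)   # about 0.007
c(prior.mean,prior.sd)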
Conjugacy allows for closed-form derivation of the prior predictive density, which is useful for Bayesian model comparison: \[ p(y|X) = \int p(y|X,\beta,\sigma^2)p(\beta|\sigma^2)p(\sigma^2)\,d\beta\, d\sigma^2. \] One can show that the distribution of \(y|X\) is Student’s \(t\), a scale mixture of normals (SMN): \[ y|X \sim t_{\nu_0}\left(X b_0,\sigma_0^2(I_{n-p}+XB_0X')\right). \] We can then compare different model orders by their prior predictives, for instance by comparing \(p(y|X,p=2)\) and \(p(y|X,p=5)\).
The conjugate prior leads to the following posterior distribution for \((\beta,\sigma^2)\): \[ \beta|\sigma^2,y,X \sim N(b_1,\sigma^2B_1) \ \ \ \mbox{and} \ \ \ \sigma^2|y,X \sim IG(\nu_1/2,\nu_1\sigma_1^2/2), \] where \[\begin{eqnarray*} B_1^{-1} &=& B_0^{-1} + X'X\\ B_1^{-1}b_1 &=& B_0^{-1}b_0 + X'y\\ \nu_1 &=& \nu_0 + (n-p)\\ \nu_1\sigma_1^2 &=& \nu_0\sigma_0^2 + (y-X b_1)'y + (b_0-b_1)'B_0^{-1}b_0. \end{eqnarray*}\]
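For reference, the updating equations above, together with the standard closed-form expression for \(p(y|X)\) under this prior, can be packaged into a small function. This is a sketch of our own (the name conj.posterior is not part of the original code); its log predictive should agree, up to numerical error, with the dmvt-based computation inside the loop below:
# Sketch: conjugate update (b1, B1, nu1, sig21) and closed-form log p(y|X), with
# log p(y|X) = -(d/2)log(pi) + lgamma(nu1/2) - lgamma(nu0/2)
#              + (nu0/2)log(nu0*sig20) - (nu1/2)log(nu1*sig21)
#              + (1/2)(log|B1| - log|B0|),  where d = n-p
conj.posterior = function(y, X, b0, B0, nu0, sig20){
  iB0 = solve(B0)
  B1  = solve(iB0 + t(X)%*%X)
  b1  = B1%*%(iB0%*%b0 + t(X)%*%y)
  nu1 = nu0 + length(y)
  nu1sig21 = nu0*sig20 + sum((y-X%*%b1)*y) + sum((b0-b1)*(iB0%*%b0))
  logpred = -0.5*length(y)*log(pi) + lgamma(nu1/2) - lgamma(nu0/2) +
            0.5*nu0*log(nu0*sig20) - 0.5*nu1*log(nu1sig21) +
            0.5*(determinant(B1)$modulus - determinant(B0)$modulus)
  list(b1=b1, B1=B1, nu1=nu1, sig21=nu1sig21/nu1, logpred=as.numeric(logpred))
}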
preds = NULL
for (p in 2:pmax){
y = y1[(p+1):n]
X = cbind(1,y1[p:(n-1)])
for (i in 2:p)
X = cbind(X,y1[(p-i+1):(n-i)])
# Prior hyperparameters
b0 = rep(0,p+1)
B0 = diag(25,p+1)
nu0 = 18
sig20 = 0.01666667
nu0sig20 = nu0*sig20
iB0 = solve(B0)
iB0b0 = iB0%*%b0
# Posterior hyperparameters
B1 = solve(iB0+t(X)%*%X)
tcB1 = t(chol(B1))
b1 = B1%*%(iB0b0+t(X)%*%y)
nu1 = nu0 + (n-p)
# Prior predictive: y|X ~ t_nu0(X b0, sig20*(I + X B0 X'))
m = X%*%b0
V = sig20*(diag(1,n-p)+X%*%B0%*%t(X))
predy = dmvt(y,m,V,df=nu0,log=TRUE)
preds = c(preds,predy)
}
ps = 2:pmax
par(mfrow=c(1,2))
plot(2:pmax,aic.y1[3:(pmax+1)],type="b",xlab="AR order",ylab="AIC",axes=FALSE)
axis(2);box();axis(1,at=2:pmax)
title(paste("AIC minimum at p=",py1,sep=""))
plot(2:pmax,preds,axes=FALSE,type="b",ylab="Log predictive",xlab="AR order")
axis(2);box();axis(1,at=2:pmax)
title(paste("Predictive maximum at p=",ps[preds==max(preds)],sep=""))
Also, assuming a uniform prior over the candidate AR orders, we can show that the posterior model probabilities are: \[ Pr(p=k|y,X) = \frac{p(y|X,p=k)}{\sum_{l} p(y|X,p=l)}, \] where the sum runs over the orders under consideration (here \(k=2,\ldots,10\)).
# Computing posterior model probabilities
pmp = exp(preds-max(preds))   # subtract the maximum log predictive for numerical stability
pmp = pmp/sum(pmp)
par(mfrow=c(1,1))
plot(2:pmax,pmp,axes=FALSE,type="h",ylab="Probability",lwd=3,xlab="AR order")
axis(2);box();axis(1,at=2:pmax)
title("Posterior model probabilities")
We now compare closed-form posterior quantities with Gibbs-based ones.
p = 3
y = y1[(p+1):n]
X = cbind(1,y1[p:(n-1)])
for (i in 2:p)
X = cbind(X,y1[(p-i+1):(n-i)])
# prior hyperparameters
b0 = rep(0,p+1)
B0 = diag(25,p+1)
nu0 = 18
sig20 = 0.01666667
# posterior hyperparameters
nu0sig20 = nu0*sig20
iB0 = solve(B0)
iB0b0 = iB0%*%b0
B1 = solve(iB0+t(X)%*%X)
tcB1 = t(chol(B1))
b1 = B1%*%(iB0b0+t(X)%*%y)
nu1 = nu0 + (n-p)
sig21 = ((nu0sig20+sum((y-X%*%b1)*y)+t(b0-b1)%*%iB0b0)/nu1)[1]
se = sqrt(nu1/(nu1-2)*sig21*diag(B1))
Let’s compare the maximum likelihood and Bayesian summaries
tab = round(cbind(b1,se,b1/se,2*(1-pt(abs(b1/se),df=nu1))),5)
colnames(tab)=c("Mean","StDev","t","tail")
rownames(tab)=c("Intercept","beta1","beta2","beta3")
tab
## Mean StDev t tail
## Intercept -0.00895 0.04064 -0.22022 0.82613
## beta1 1.16788 0.09610 12.15277 0.00000
## beta2 -0.41754 0.14404 -2.89882 0.00455
## beta3 -0.23755 0.09526 -2.49366 0.01419
# Posterior mean and standard deviation for sigma2
Esig2 = (nu1*sig21/2)/(nu1/2-1)
SDsig2 = sqrt((nu1*sig21/2)^2/((nu1/2-1)^2*(nu1/2-2)))
c(Esig2,SDsig2)
## [1] 0.14534150 0.02035189
fit.ols = lm(y~X-1)
summary(fit.ols)
##
## Call:
## lm(formula = y ~ X - 1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.60722 -0.23822 -0.04092 0.19346 1.08512
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## X1 -0.008918 0.044684 -0.20 0.84229
## X2 1.171865 0.106088 11.05 < 2e-16 ***
## X3 -0.423173 0.159110 -2.66 0.00937 **
## X4 -0.234507 0.105162 -2.23 0.02842 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4191 on 84 degrees of freedom
## Multiple R-squared: 0.8323, Adjusted R-squared: 0.8243
## F-statistic: 104.2 on 4 and 84 DF, p-value: < 2.2e-16
beta.ols = fit.ols$coef
sig.ols = summary(fit.ols)$sig
This exercise illustrates the performance of a Markov chain Monte Carlo (MCMC) algorithm in approximating posterior quantities. Since we have the closed-form quantities, we will be able to check the quality of the Monte Carlo approximation.
set.seed(12345)
burnin = 10000
M = 10000
niter = burnin+M
draws = matrix(0,niter,p+2)
beta.d = beta.ols
runtime = system.time({
for (iter in 1:niter){
# draw sigma2 given the current beta
sig2.d = 1/rgamma(1,nu1/2,(nu0sig20+sum((y-X%*%beta.d)^2))/2)
# draw beta given sigma2 as N(b1, sigma2*B1), via the Cholesky factor of B1
#beta.d = as.vector(rmvnorm(1,b1,sig2.d*B1))
beta.d = b1+sqrt(sig2.d)*tcB1%*%rnorm(p+1)
draws[iter,] = c(beta.d,sig2.d)
}})
draws = draws[(burnin+1):niter,]
runtime
## user system elapsed
## 0.364 0.050 0.415
# runtime with burnin=M=10,000 and using Cholesky = 0.432 seconds
# runtime with burnin=M=10,000 and using rmvnorm = 7.728 seconds
parnames = c("beta0","beta1","beta2","beta3","sigma2")
par(mfrow=c(2,p+2))
ind = seq(1,M,by=10)
for (i in 1:(p+2))
plot(ind,draws[ind,i],xlab="Iterations",ylab="",main=parnames[i],type="l")
for (i in 1:(p+2))
acf(draws[,i],main="")
Esig2 = c((nu1*sig21/2)/(nu1/2-1),mean(draws[,p+2]))
SDsig2 = c(sqrt((nu1*sig21/2)^2/((nu1/2-1)^2*(nu1/2-2))),
sqrt(var(draws[,p+2])))
tab = round(rbind(cbind(b1,apply(draws[,1:(p+1)],2,mean),se,
sqrt(apply(draws[,1:(p+1)],2,var))),c(Esig2,SDsig2)),4)
colnames(tab)=c("Mean Exact","Mean Gibbs","StDev Exact","StDev Gibbs")
rownames(tab)=c("Intercept","beta1","beta2","beta3","sigma2")
tab
## Mean Exact Mean Gibbs StDev Exact StDev Gibbs
## Intercept -0.0089 -0.0086 0.0406 0.0412
## beta1 1.1679 1.1674 0.0961 0.0978
## beta2 -0.4175 -0.4173 0.1440 0.1467
## beta3 -0.2376 -0.2378 0.0953 0.0971
## sigma2 0.1453 0.1505 0.0204 0.0216
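To gauge the Monte Carlo error behind the Gibbs columns above, one can also report effective sample sizes and approximate Monte Carlo standard errors of the posterior means. The short check below relies on the coda package, which is not used elsewhere in this exercise (an assumption on our part):
# Effective sample sizes and approximate MC standard errors of the posterior means
library("coda")
ess  = effectiveSize(as.mcmc(draws))
mcse = apply(draws,2,sd)/sqrt(ess)
round(cbind(ess,mcse),4)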
par(mfrow=c(2,2))
for (i in 2:(p+1)){
xxx = seq(min(draws[,i]),max(draws[,i]),length=1000)
s0 = sqrt(sig20*B0[i,i])
t0 = (xxx-b0[i])/s0
s = sqrt(sig21*B1[i,i])
t = (xxx-b1[i])/s
plot(density(draws[,i]),xlab="",main=paste("beta ",i-1,sep=""))
#xxx = seq(quantile(draws[,i],0.001),quantile(draws[,i],0.999),length=1000)
lines(xxx,dt(t,df=nu1)/s,col=2)
lines(xxx,dt(t0,df=nu0)/s0,col=4)
abline(v=beta.ols[i],col=3)
}
sig2.d = draws[,p+2]
plot(density(sig2.d),xlab="",ylab="Posterior density",main="sigma2")
xxx = seq(min(sig2.d),max(sig2.d),length=1000)
#xxx = seq(0.001,0.05,length=1000)
lines(xxx,dgamma(1/xxx,nu1/2,nu1*sig21/2)/xxx^2,col=2)
abline(v=sig.ols^2,col=3)
legend("topright",legend=c("Exact posterior","Gibbs posterior","Prior","OLS"),col=c(2,1,4,3),lty=1,bty="n",lwd=2)
lines(xxx,dgamma(1/xxx,nu0/2,nu0*sig20/2)/xxx^2,col=4)
The roots \(z\) of the AR(\(p\)) characteristic polynomial \[ p(z) = 1-\beta_1z-\beta_2z^2-\cdots-\beta_pz^p \] determine whether \(y_t\) is stationary or not. When all roots satisfy \(|z|>1\), the AR(\(p\)) process is stationary. If one or more roots have \(|z|=1\), the AR(\(p\)) is difference-stationary, and when one or more roots have \(|z|<1\) the process is non-stationary (explosive). Based on the Bayesian AR(3) model, with high probability there are no explosive or unit roots, so \(y_t\) is a stationary time series, as far as AR(\(p\)) models are concerned.
roots.ols = abs(polyroot(c(1,-beta.ols[2:(p+1)])))
roots = matrix(0,M,p)
for (i in 1:M)
roots[i,] = abs(polyroot(c(1,-draws[i,2:(p+1)])))
par(mfrow=c(1,3))
for (j in 1:p){
hist(log(roots[,j]),xlab=paste("log|root",j,"|",sep=""),ylab="Posterior density",main="",prob=TRUE)
title(paste("Pr(log|root",j,"|<0)=",round(mean(log(roots[,j])<0),4),sep=""))
abline(v=log(roots.ols[j]),col=2,lty=2)
}
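A single summary of the stationarity claim above is the posterior probability that all three roots lie outside the unit circle; this extra line is our own addition, computed from the same matrix of roots:
# Posterior probability that min|root| > 1, i.e. that the AR(3) is stationary
mean(apply(roots,1,min)>1)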
Posterior predictive densities are relatively easy to sample from: \[ p(y_{n+1},\ldots,y_{n+h}|y^n) = \int p(y_{n+1},\ldots,y_{n+h}|\beta,\sigma^2,y^n)p(\beta,\sigma^2|y^n)\,d\beta\, d\sigma^2, \] where \(y^n=\{y_1,\ldots,y_n\}\) is the observed time series. We can write the densities recursively: \[ p(y_{n+1},\ldots,y_{n+h}|\beta,\sigma^2,y^n) = \prod_{k=1}^h p(y_{n+k}|y^{n+k-1},\beta,\sigma^2). \] For \(k=1,\ldots,h\), and using the posterior draws \((\beta^{(j)},\sigma^{2(j)})\), sample \(\{y_{n+k}^{(j)}\}_{j=1}^M\) as follows: \[\begin{eqnarray*} y_{n+k}^{(j)} &\sim& N\left(\beta_0^{(j)}+\sum_{l=1}^p y_{n+k-l}^{(j)}\beta_l^{(j)},\sigma^{2(j)}\right). \end{eqnarray*}\] Obviously, \(y_{n+k-l}^{(j)}=y_{n+k-l}\) whenever \(k\le l\), i.e. whenever the lag falls inside the observed sample. Similarly, for all \(k>p\), the draws \(y_{n+k}^{(j)}\) are based exclusively on out-of-sample draws.
yh = y1[n:(n-p+1)]   # last p observations (y_n, ..., y_{n-p+1}) used to start the recursion
Y = matrix(c(1,yh),M,p+1,byrow=TRUE)
h = 20
yfut = matrix(0,h,M)
for (t in 1:h){
yh1 = apply(draws[,1:(p+1)]*Y,1,sum)+rnorm(M,0,sqrt(draws[,p+2]))
yfut[t,] = yh1
Y = cbind(1,yh1,Y[,2:p])
}
qs = t(apply(yfut,1,quantile,c(0.025,0.25,0.5,0.75,0.975)))
plot(1845:1935,y1,type="l",xlim=c(1845,1935+h),main="Canadian Lynx",ylab="",xlab="Year",ylim=range(y1,qs),lwd=2)
abline(v=1935,lty=2)
cols = c(4,3,2,3,4)
for (i in 1:5)
lines(1936:(1935+h),qs[,i],col=cols[i],lwd=2)