#####################################################################################
#
# Regression with correlated errors
#
#####################################################################################
#
# Source:
# http://www2.stat.duke.edu/~pdh10/FCBS/Replication/icecore.RData
#
# Peter Hoff (2009) A First Course in Bayesian Statistical Methods,
# Section 10.5 Combining the Metropolis and Gibbs algorithms
# Pages 187-193
#
# Analyses of ice cores from East Antarctica have allowed scientists
# to deduce historical atmospheric conditions of the last few hundred
# thousand years (Petit et al, 1999). The first plot of Figure 10.7
# plots time-series of temperature and carbon dioxide concentration
# on a standardized scale (centered and scaled to have a mean of zero
# and a variance of one). The data include 200 values of temperature
# measured at roughly equal time intervals, with time between consecutive
# measurements being approximately 2,000 years. For each value of temperature
# there is a CO2 concentration value corresponding to a date that is roughly
# 1,000 years previous to the temperature value, on average. Temperature
# is recorded in terms of its difference from the current present temperature
# in degrees Celsius, and CO2 concentration is recorded in parts per million
# by volume.
#
# The plot indicates that the temporal history of temperature and CO2 follow
# very similar patterns. The second plot in Figure 10.7 indicates that CO2
# concentration at a given time point is predictive of temperature following
# that time point. One way to quantify this is by fitting a linear regression
# model for temperature (y) as a function of CO2 (x). Ordinary least squares
# regression gives an estimated model of E[y|x] = -23.02 + 0.08x with a
# nominal standard error of 0.0038 for the slope term.
# The validity of this standard error relies on the error terms in the
# regression model being independent and identically distributed, and standard
# confidence intervals further rely on the errors being normally distributed.
#
# These two assumptions are examined in the two residual diagnostic plots in
# Figure 10.8. The first plot, a histogram of the residuals, indicates no
# serious deviation from normality. The second plot gives the autocorrelation
# function of the residuals, and indicates a nontrivial correlation of 0.52
# between residuals at consecutive time points. Such a positive correlation
# generally means that there is less information in the data, and less evidence
# for a relationship between the two variables, than is assumed by the ordinary
# least squares regression analysis.
#
#####################################################################################

library(MASS)

# Loading and transforming the data ----
load("/Users/hedibert/Downloads/icecore.RData")
ind  <- 101:200
time <- icecore[ind, 1]
x    <- icecore[ind, 2]  # CO2 concentration (ppmv)
y    <- icecore[ind, 3]  # temperature difference (deg C)

# Standardized versions (mean 0, variance 1) for the joint time-series plot
x1 <- (x - mean(x)) / sd(x)
y1 <- (y - mean(y)) / sd(y)

X <- cbind(1, x)  # design matrix: intercept + CO2
n <- nrow(X)

pdf(file = "icecoredata.pdf", width = 12, height = 7)

# Figure 10.7 (right panel): temperature against CO2
par(mfrow = c(1, 1))
plot(x, y, xlab = "CO2 (ppmv)", ylab = "temperature difference (deg C)")

# OLS regression of temperature on CO2
ols <- lm(y ~ x)
summary(ols)
abline(ols$coef, col = 2)

# Figure 10.8: residual diagnostics (normality and serial correlation)
par(mfrow = c(1, 2))
hist(ols$residuals, xlab = "residual", main = "")
acf(ols$residuals, main = "")

# Figure 10.7 (left panel): standardized temperature and CO2 over time
par(mfrow = c(1, 1))
plot(time, y1, ylim = c(-2.5, 3), type = "l", lwd = 2, xlab = "year",
     ylab = "standardized measurement")
lines(time, x1, col = 2, lwd = 2)
legend("topright", legend = c("temp", "CO2"), col = 1:2, lty = 1, lwd = 2,
       bty = "n")

# Constructing the likelihood function ----

# AR(1) correlation matrix of order n: C1[i, j] = rho^|i - j|.
#
# @param n   Matrix dimension (number of observations).
# @param rho Lag-1 correlation parameter.
# @return    An n x n matrix with rho^|i-j| entries.
#
# Vectorized with outer() instead of filling the matrix row by row.
C1 <- function(n, rho) {
  rho^abs(outer(seq_len(n), seq_len(n), "-"))
}

# Log-likelihood (up to an additive constant) of the regression model with
# correlated errors:
#   -0.5 * ( n*log(sig2) - log|iCrho| + e' iCrho e / sig2 ),  e = y - X beta,
# where iCrho is the INVERSE of the correlation matrix C1(n, rho); its
# log-determinant is obtained from the singular values of iCrho.
#
# NOTE(review): uses the global n defined above rather than nrow(X) — fine in
# this script, but the function is not self-contained.
loglike <- function(y, X, beta, sig2, iCrho) {
  e  <- y - X %*% beta
  e2 <- t(e) %*% iCrho %*% e
  -0.5 * (n * log(sig2) - sum(log(svd(iCrho)$d)) + e2 / sig2)
}

# Prior
# distribution
# Normal prior beta ~ N(b0, B0); inverse-gamma prior on sig2 with
# hyperparameters (nu0, s02).
b0 = c(0,0)
B0 = diag(1000,2)
nu0 = 1
s02 = 1
nu0s02 = nu0*s02
iB0 = solve(B0)           # prior precision of beta
iB0b0 = iB0%*%b0
nu1 = nu0 + n             # posterior degrees of freedom for sig2

# MCMC scheme
M0 = 1000                 # burn-in draws
M = 1000                  # draws kept
L = 25                    # thinning interval
# Initial values
beta = as.vector(ols$coef)
sig2 = summary(ols)$sig^2
rho = acf(ols$res,plot=FALSE)$acf[2]   # lag-1 residual autocorrelation
iCmat = solve(C1(n,rho))               # inverse correlation matrix at current rho
tXiCmat = t(X)%*%iCmat
niter = M0+L*M
pars = matrix(0,niter,4)  # columns: beta[1], beta[2], sig2, rho — TODO confirm
system.time({
for (iter in 1:niter){
# sampling beta
# Full-conditional N(mean, var) draw — NOTE(review): var/mean mask base
# functions of the same name within the loop.
var = solve(tXiCmat%*%X/sig2+iB0)
mean = var%*%(tXiCmat%*%y/sig2+iB0b0)
beta = mvrnorm(1,mean,var)
# sampling sig2
# Full-conditional inverse-gamma draw: 1/Gamma(nu1/2, nu1s12/2)
e = y-X%*%beta
nu1s12 = nu0s02 + t(e)%*%iCmat%*%e
sig2 = 1/rgamma(1,nu1/2,nu1s12/2)
# sampling rho
# Metropolis step: uniform(0,1) proposal for rho; accept is the
# log acceptance probability min(0, log-likelihood ratio)
rho1 = runif(1)
iCmat1 = solve(C1(n,rho1))
num = loglike(y,X,beta,sig2,iCmat1)
den = loglike(y,X,beta,sig2,iCmat)
accept = min(0,num-den)
# NOTE(review): SOURCE is truncated here — the accept/reject update of
# (rho, iCmat, tXiCmat) and the storage of draws into pars are missing.
if (log(runif(1))