#===================================================================================================
# BAYESIAN MULTIPLE LINEAR REGRESSION ANALYSIS
#===================================================================================================
#
# Hedibert Freitas Lopes, PhD
# Associate Professor of Econometrics and Statistics
# The University of Chicago Booth School of Business
# 5807 South Woodlawn Avenue
# Chicago, IL, 60637
#
# This version: October 22nd 2009.
# Course: 41913-01 Bayesian Econometrics, Fall 2009.
#===================================================================================================
#
# MODEL: The standard Bayesian approach to multiple linear regression is
#
#    (y|theta,X) ~ N(X*beta,sig2*In)
#
# where y=(y1,...,yn), theta=(beta,sig2), X=(x1,...,xn)' is the (n x q) design matrix and q=p+1.
#
# PRIOR: The prior distribution of theta is NIG(b0,B0,n0,S0), i.e.
#
#    beta|sig2 ~ N(b0,sig2*B0)   and   sig2 ~ IG(n0/2,n0*S0/2)
#
# for known hyperparameters b0, B0, n0 and S0.
#
# PREDICTIVE DISTRIBUTION:
#
#    (y|X) ~ t(n0,X*b0,S0*(In+X*B0*X'))
#
# CONDITIONAL AND MARGINAL POSTERIOR DISTRIBUTIONS:
#
#    (beta|sig2,y,X) ~ N(b1,sig2*B1)
#    (sig2|beta,y,X) ~ IG(n1/2,n1*S11/2)
#
#    (beta|y,X) ~ t(n1,b1,S1*B1)
#    (sig2|y,X) ~ IG(n1/2,n1*S1/2)
#
# where
#
#    iB1*b1 = iB0*b0 + X'y
#    iB1    = iB0 + X'X
#    n1     = n0 + n
#    n1*S1  = n0*S0 + (y-X*b1)'y + (b0-b1)'*iB0*b0
#    n1*S11 = n0*S0 + (y-X*beta)'(y-X*beta)
#
# ORDINARY LEAST SQUARES ANALYSIS:
#
#    betahat = iXtX*X'y
#    sig2hat = Se/(n-q)
#
# where Se=e'e, e=y-X*betahat and iXtX is the inverse of X'X.
#
# The conditional and unconditional sampling distributions of betahat and sig2hat are
#
#    betahat|sig2 ~ N(beta,sig2*iXtX)
#    sig2hat|sig2 ~ G((n-q)/2,(n-q)/(2*sig2)), i.e. (n-q)*sig2hat/sig2 ~ chi2(n-q)
#    betahat      ~ t(n-q,beta,sig2hat*iXtX)
#
#===================================================================================================
rm(list=ls())

# Log density of sig2 when 1/sig2 ~ G(a,b), i.e. sig2 ~ IG(a,b), via change of variables
ldigamma = function(sig2,a,b){
  dgamma(1/sig2,a,b,log=TRUE)-2*log(sig2)
}

# Log density of the q-variate normal N(m,V), given log(det(V)) and solve(V)
ldmnorm = function(x,q,m,ldv,iv){
  -0.5*(q*log(2*pi)+ldv+t(x-m)%*%iv%*%(x-m))
}

# Log density of the multivariate normal N(m,C), with the scalar case handled separately
ldmnorm1 = function(x,m,C){
  if(length(x)==1){
    -0.5*(log(2*pi)+log(C)+(x-m)^2/C)
  }else{
    -0.5*(length(x)*log(2*pi)+log(det(C))+t(x-m)%*%solve(C)%*%(x-m))
  }
}

# Log density of the multivariate Student's t with nu df, location mu and scale matrix V,
# given log(det(V)) and solve(V)
ldmt = function(x,nu,mu,ldv,iv){
  d  = length(x)
  ll = log(nu+t(x-mu)%*%iv%*%(x-mu))
  lgamma((nu+d)/2)-lgamma(nu/2)+0.5*nu*log(nu)-0.5*d*log(pi)-0.5*ldv-0.5*(nu+d)*ll
}
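# --------------------------------------------------------------------------------
# Quick sanity check of the helpers above (added; not part of the original
# analysis). For d=1, zero location and unit scale, ldmt should reduce to R's
# dt(,log=TRUE), and ldmnorm1 should reduce to dnorm(,log=TRUE). The test
# points (0.3, 5 df, etc.) are arbitrary. Enable by switching if(0) to if(1).
# --------------------------------------------------------------------------------
if(0){
  print(c(ldmt(0.3,5,0,0,1),dt(0.3,5,log=TRUE)))                 # should agree
  print(c(ldmnorm1(0.3,0.1,2),dnorm(0.3,0.1,sqrt(2),log=TRUE)))  # should agree
}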
#===================================================================================================
# DATASET 1: 1993-4 U.S. COLLEGE TUITION
# Source: http://www.stat.duke.edu/data-sets/mw/teaching-data/univ-data
#===================================================================================================
# tuition  : College tuition ("out-of-state" rate for those with in-state discount).
# pcttop25 : Percent of new students from the top 25% of high school class.
# sfratio  : Student to faculty ratio.
# faccomp  : Average faculty compensation.
# accrate  : Fraction of applicants accepted for admission.
# graduat  : Percent of students who graduate.
# pctphd   : Percent of faculty with Ph.D.'s.
# fulltime : Percent of undergraduates who are full time students.
# alumni   : Percent of alumni who donate.
# numenrl  : Number of new students enrolled.
# public   : Is the college a public or private institution? public=0, private=1.
#===================================================================================================
if(0){
  names = c("constant","pcttop25","sfratio ","faccomp ","accrate ","graduate","perctphd",
            "fulltime","alumni ","numenrl ","public ")
  data  = read.table("college.txt",header=TRUE)
  n     = nrow(data)
  # Keep only the rows with no missing values
  missing = rep(0,n)
  data1   = NULL
  for (i in 1:n){
    missing[i] = sum(is.na(data[i,]))
    if (missing[i]==0){
      data1 = rbind(data1,data[i,])
    }
  }
  n = nrow(data1)
  y = matrix(data1[,1],n,1)
  q = ncol(data1)
  p = q-1
  X = matrix(1,n,q)
  for (i in 2:q) X[,i] = data1[,i]
  # Random subsample of 100 colleges, keeping the constant, pcttop25, graduate and public
  set.seed(1245)
  ind   = sample(1:n,replace=F,size=100,prob=rep(1/n,n))
  vars  = c(1,2,6,11)
  names = names[vars]
  X = X[ind,vars]
  y = y[ind]
  n = nrow(X)
  q = ncol(X)
  p = q-1
}

#===================================================================================================
# DATASET 2: Data on maximum compressive strength parallel to the grain y[i], the density x[i]
#            and the resin-adjusted density z[i] for n=42 specimens of radiata pine.
# Source: Han and Carlin (2001) MCMC methods for computing Bayes factors: a comparative review.
#         Journal of the American Statistical Association, 96, 1122-1132.
#===================================================================================================
data = matrix(c(1,3040,29.2,25.4,22,3840,30.7,30.7,2,2470,24.7,22.2,23,3800,32.7,32.6,
                3,3610,32.3,32.2,24,4600,32.6,32.5,4,3480,31.3,31.0,25,1900,22.1,20.8,
                5,3810,31.5,30.9,26,2530,25.3,23.1,6,2330,24.5,23.9,27,2920,30.8,29.8,
                7,1800,19.9,19.2,28,4990,38.9,38.1,8,3110,27.3,27.2,29,1670,22.1,21.3,
                9,3160,27.1,26.3,30,3310,29.2,28.5,10,2310,24.0,23.9,31,3450,30.1,29.2,
                11,4360,33.8,33.2,32,3600,31.4,31.4,12,1880,21.5,21.0,33,2850,26.7,25.9,
                13,3670,32.2,29.0,34,1590,22.1,21.4,14,1740,22.5,22.0,35,3770,30.3,29.8,
                15,2250,27.5,23.8,36,3850,32.0,30.6,16,2650,25.6,25.3,37,2480,23.2,22.6,
                17,4970,34.5,34.2,38,3570,30.3,30.3,18,2620,26.2,25.7,39,2620,29.9,23.8,
                19,2900,26.7,26.4,40,1890,20.8,18.4,20,1670,21.1,20.0,41,3030,33.2,29.4,
                21,2540,24.1,23.9,42,3030,28.2,28.2),21,8,byrow=T)
data  = rbind(data[,1:4],data[,5:8])
names = c("constant","density","resin-adjusted density")
y = data[,2]
X = cbind(1,data[,3]-mean(data[,3]),data[,4]-mean(data[,4]))
n = nrow(X)
q = ncol(X)
p = q-1

# --------------------------------------------------------------------------------
# OLS quantities
# --------------------------------------------------------------------------------
Xty     = t(X)%*%y
XtX     = t(X)%*%X
iXtX    = solve(XtX)
bhat    = iXtX%*%Xty
Se      = t(y-X%*%bhat)%*%(y-X%*%bhat)
sig2hat = Se/(n-q)

# --------------------------------------------------------------------------------
# Prior hyperparameters
# --------------------------------------------------------------------------------
b0    = c(3000,185,185)
B0    = diag(c(10^6,10^4,10^4))
iB0   = solve(B0)
iB0b0 = iB0%*%b0
n0    = 6
S0    = 60000
n0S0  = n0*S0

# --------------------------------------------------------------------------------
# All 2^p models include the constant term
# --------------------------------------------------------------------------------
In      = diag(1,n)
l       = 0
nmodels = 2^p
Model   = matrix(0,nmodels,p)
for (i1 in 1:0)
  for (i2 in 1:0){
    l = l+1
    Model[l,] = c(i1,i2)
  }
Model = cbind(1,Model)
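# --------------------------------------------------------------------------------
# Added sketch: the nested loops above are hard-coded for p=2 predictors. For
# arbitrary p, the same set of models (constant always included) can be
# enumerated with expand.grid; note that the row order may differ from the
# loops above. Enable by switching if(0) to if(1).
# --------------------------------------------------------------------------------
if(0){
  Model = cbind(1,as.matrix(expand.grid(rep(list(1:0),p))))
  dimnames(Model) = NULL
  nmodels = nrow(Model)
}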
# --------------------------------------------------------------------------------
# Computing the true value of p(y) under each model, using the predictive
# (y|X1) ~ t(n0,X1*b01,S0*(In+X1*B01*X1'))
# --------------------------------------------------------------------------------
log.pred.true = NULL
for (i in 1:nmodels){
  ind = Model[i,]==1
  q1  = sum(ind)
  X1  = matrix(X[,ind],n,q1)
  B01 = B0[ind,ind]
  b01 = b0[ind]
  v   = S0*(In+X1%*%B01%*%t(X1))
  cv  = chol(v)
  ldv = 2*sum(log(diag(cv)))
  iv  = solve(v)
  log.pred.true = c(log.pred.true,ldmt(y,n0,X1%*%b01,ldv,iv))
}
nvar = apply(Model,1,sum)
ord  = order(log.pred.true,decreasing=TRUE)
top  = Model[ord,]
for (i in 1:nmodels){
  names1 = NULL
  for (j in 1:q)
    names1 = paste(names1,ifelse(top[i,j]==1,names[j]," "),sep=" ")
  print(paste(names1,round(log.pred.true[ord[i]],4),sep=" "))
}
# ---------------------------------------------
# DATASET 1: True marginal likelihoods
# ---------------------------------------------
# constant pcttop25 graduate public  -937.2816
# constant pcttop25          public  -939.8830
# constant          graduate public  -941.1353
# constant pcttop25 graduate         -951.3476
# constant          graduate         -952.4882
# constant                   public  -955.9858
# constant pcttop25                  -961.2662
# constant                           -977.8107
# ---------------------------------------------

# ------------------------------------------------
# DATASET 2: True marginal likelihoods
# ------------------------------------------------
# constant         resin-adjusted density  -312.6480
# constant density resin-adjusted density  -318.7411
# constant density                         -321.5050
# constant                                 -359.1343
# ------------------------------------------------
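# --------------------------------------------------------------------------------
# Added sketch: under a uniform prior over the 2^p models, the exact log
# predictives above translate into posterior model probabilities via
# Pr(Mi|y) = p(y|Mi)/sum_j p(y|Mj). Subtracting the maximum before
# exponentiating (log-sum-exp) avoids underflow, since the log predictives
# are in the -300 to -1000 range.
# --------------------------------------------------------------------------------
post.model = exp(log.pred.true-max(log.pred.true))
post.model = post.model/sum(post.model)
print(cbind(Model,round(post.model,4)))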
# --------------------------------------------------------------------------------
# Computing p(y) based on prior draws, posterior draws and the candidate's formula
# --------------------------------------------------------------------------------
set.seed(1255)
delta = 0.1
nrit  = 10
M     = 1000
nsim  = 10
lp.hm = matrix(0,nsim,nmodels)
lp.mc = matrix(0,nsim,nmodels)
lp.ch = matrix(0,nsim,nmodels)
lp.lm = matrix(0,nsim,nmodels)
lp.nr = matrix(0,nsim,nmodels)
log.pred.true1 = NULL
for (ii in 1:nmodels){
  # Likelihood summaries
  ind   = top[ii,]==1
  q1    = sum(ind)
  X1    = matrix(X[,ind],n,q1)
  X1tX1 = t(X1)%*%X1
  X1ty  = t(X1)%*%y
  # Prior hyperparameters
  B01    = B0[ind,ind]
  b01    = b0[ind]
  iB0b01 = iB0b0[ind]
  iB01   = iB0[ind,ind]
  # Posterior hyperparameters
  B1    = solve(iB01+X1tX1)
  b1    = B1%*%(iB0b01+X1ty)
  n1    = n0 + n
  n1S1  = n0S0 + t(y-X1%*%b1)%*%y + t(b01-b1)%*%iB0b01
  sig21 = (n1S1/2)/(n1/2-1)   # posterior mean of sig2
  # True p(y) based on the candidate's formula p(y)=p(y|th)p(th)/p(th|y), th=(beta,sig2),
  # evaluated at th=(b1,sig21)
  loglike  = sum(dnorm(y,X1%*%b1,sqrt(sig21),log=TRUE))
  logprior = ldmnorm1(b1,b01,sig21[1,1]*B01) + ldigamma(sig21,n0/2,n0S0/2)
  logpost1 = ldmnorm1(b1,b1,sig21[1,1]*B1)
  logpost2 = ldigamma(sig21,n1/2,n1S1/2)
  log.pred.true1 = c(log.pred.true1,loglike+logprior-logpost1-logpost2)
  # Replications
  for (sim in 1:nsim){
    # Prior draws
    sig2p = 1/rgamma(M,n0/2,n0S0/2)
    zp = NULL; for (i in 1:q1){ zp = cbind(zp,rnorm(M,0,sqrt(sig2p))) }
    betap = matrix(b01,M,q1,byrow=T)+zp%*%chol(B01)
    # Posterior draws
    sig2s = 1/rgamma(M,n1/2,n1S1/2)
    zs = NULL; for (i in 1:q1){ zs = cbind(zs,rnorm(M,0,sqrt(sig2s))) }
    betas = matrix(b1,M,q1,byrow=T)+zs%*%chol(B1)
    # Estimating p(y)
    loglikep  = -0.5*n*log(2*pi*sig2p)
    loglikes  = -0.5*n*log(2*pi*sig2s)
    logpriors = rep(0,M)
    posts     = 0.0
    for (i in 1:M){
      SSE = sum((y-X1%*%betas[i,])^2)
      logpriors[i] = ldmnorm1(betas[i,],b01,sig2s[i]*B01) + ldigamma(sig2s[i],n0/2,n0S0/2)
      loglikep[i]  = loglikep[i] - 0.5*sum((y-X1%*%betap[i,])^2)/sig2p[i]
      loglikes[i]  = loglikes[i] - 0.5*SSE/sig2s[i]
      # Rao-Blackwellization: p(sig2|beta,y) = IG((n0+n)/2,(n0*S0+SSE)/2)
      posts = posts + exp(ldigamma(sig21,(n0+n)/2,(n0S0+SSE)/2))
    }
    logposts = loglikes+logpriors
    likes    = exp(loglikes)
    lp.mc[sim,ii] = log(mean(exp(loglikep-max(loglikep))))+max(loglikep)
    lp.hm[sim,ii] = log(1/mean(1/exp(loglikes-max(loglikes))))+max(loglikes)
    lp.ch[sim,ii] = loglike + logprior - logpost1 - log(posts/M)
    # Laplace-Metropolis: th=(beta,sig2) has dimension d=q1+1, hence (d/2)*log(2*pi)
    lp.lm[sim,ii] = max(logposts)+0.5*(q1+1)*log(2*pi)+0.5*log(det(var(cbind(betas,sig2s))))
    # Newton-Raftery iterative estimator, on the raw scale, started at the
    # Laplace-Metropolis estimate
    pynr = exp(lp.lm[sim,ii])
    for (i in 1:nrit){
      summ = delta*pynr+(1-delta)*likes
      pynr = sum(likes/summ)/sum(1/summ)
    }
    lp.nr[sim,ii] = log(pynr)
  }
}

# --------------------------------------------------------------------------------
# Comparing the estimators to the exact log predictives
# --------------------------------------------------------------------------------
par(mfrow=c(1,3))
ts.plot(t(lp.mc),col=1:nsim,xlab="Models",ylab="Log Predictive",ylim=range(lp.mc,log.pred.true1))
lines(log.pred.true1,col=1,type="b",pch=16)
title("Monte Carlo estimator \n sampling from the prior")
ts.plot(t(lp.hm),col=1:nsim,xlab="Models",ylab="Log Predictive",ylim=range(lp.hm,log.pred.true1))
lines(log.pred.true1,col=1,type="b",pch=16)
title("Harmonic mean estimator \n sampling from the posterior")
ts.plot(t(lp.ch),col=1:nsim,xlab="Models",ylab="Log Predictive",ylim=range(lp.ch,log.pred.true1))
lines(log.pred.true1,col=1,type="b",pch=16)
title("Chib's estimator \n Rao-Blackwellization of p(sig2|y)")

par(mfrow=c(2,2))
L = min(lp.hm,lp.ch,lp.lm,lp.nr,log.pred.true1)
U = max(lp.hm,lp.ch,lp.lm,lp.nr,log.pred.true1)
ts.plot(t(lp.hm),col=1:nsim,xlab="Models",ylab="Log Predictive",ylim=c(L,U))
lines(log.pred.true1,col=1,type="b",pch=16)
title("Harmonic mean estimator \n sampling from the posterior")
ts.plot(t(lp.nr),col=1:nsim,xlab="Models",ylab="Log Predictive",ylim=c(L,U))
lines(log.pred.true1,col=1,type="b",pch=16)
title(paste("Newton-Raftery estimator \n delta=",delta," and ",nrit," iterations",sep=""))
ts.plot(t(lp.lm),col=1:nsim,xlab="Models",ylab="Log Predictive",ylim=c(L,U))
lines(log.pred.true1,col=1,type="b",pch=16)
title("Laplace-Metropolis estimator")
ts.plot(t(lp.ch),col=1:nsim,xlab="Models",ylab="Log Predictive",ylim=c(L,U))
lines(log.pred.true1,col=1,type="b",pch=16)
title("Chib's estimator \n Rao-Blackwellization of p(sig2|y)")
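# --------------------------------------------------------------------------------
# Added sketch: a numerical companion to the plots above. Root mean squared
# error of each estimator across the nsim replications, per model, against the
# exact log predictive. Smaller is better; the harmonic mean estimator is
# usually the most unstable of the five.
# --------------------------------------------------------------------------------
rmse = function(est,true){
  sqrt(apply((est-matrix(true,nsim,nmodels,byrow=TRUE))^2,2,mean))
}
print(round(rbind(mc=rmse(lp.mc,log.pred.true1),
                  hm=rmse(lp.hm,log.pred.true1),
                  ch=rmse(lp.ch,log.pred.true1),
                  lm=rmse(lp.lm,log.pred.true1),
                  nr=rmse(lp.nr,log.pred.true1)),4))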