######################################################################
#
# Bayesian Econometrics
# PhD Program in Business Economics
# Spring 2018
#
# By Hedibert Freitas Lopes
#
######################################################################
#
# Bayesian inference via Gibbs sampler for the multiple linear
# regression with Gaussian errors and conditionally conjugate priors.
# Dealing with sparsity in Gaussian linear models via ridge, Laplace
# and horseshoe priors.
#
######################################################################
rm(list=ls())

# Gibbs sampler for y|phi,sig2 ~ N(X*phi,sig2*I) with priors
# phi ~ N(phi0,Phi0) and sig2 ~ IG(c0,d0).
# Full conditionals:
#   phi|sig2,y ~ N(V*(X'y/sig2 + Phi0^{-1}*phi0), V), with V = (X'X/sig2 + Phi0^{-1})^{-1}
#   sig2|phi,y ~ IG(c0 + n/2, d0 + (y-X*phi)'(y-X*phi)/2)
bayesreg = function(y,X,phi0,Phi0,c0,d0,burnin,M,thin){
  n         = length(y)
  p         = ncol(X)
  iPhi0     = solve(Phi0)
  iPhi0phi0 = iPhi0%*%phi0
  XtX       = t(X)%*%X
  Xty       = t(X)%*%y
  phiols    = solve(XtX)%*%Xty
  sig2ols   = sum((y-X%*%phiols)^2)/n
  niter     = burnin+M*thin
  draws     = matrix(0,niter,p+1)
  phi       = phi0
  sig2      = d0/(c0+1)
  for (iter in 1:niter){
    # Sampling from p(phi|sig2,y,X)
    var  = solve(XtX/sig2+iPhi0)
    mean = var%*%(Xty/sig2+iPhi0phi0)
    phi  = mean + t(chol(var))%*%rnorm(p)
    # Sampling from p(sig2|phi,y)
    e    = y-X%*%phi
    c1   = c0 + n/2
    d1   = d0 + sum(e^2)/2
    sig2 = 1/rgamma(1,c1,d1)
    # Storing the Gibbs output for posterior inference
    draws[iter,] = c(phi,sig2)
  }
  ind   = seq(burnin+1,niter,by=thin)
  phis  = draws[ind,1:p]
  sig2s = draws[ind,p+1]
  return(list(phis=phis,sig2s=sig2s,phiols=phiols,sig2ols=sig2ols))
}

# Same Gibbs sampler, but without computing or returning the OLS quantities
# (used below when X'X is rank deficient and OLS is not available).
bayesreg1 = function(y,X,phi0,Phi0,c0,d0,burnin,M,thin){
  n         = length(y)
  p         = ncol(X)
  iPhi0     = solve(Phi0)
  iPhi0phi0 = iPhi0%*%phi0
  XtX       = t(X)%*%X
  Xty       = t(X)%*%y
  niter     = burnin+M*thin
  draws     = matrix(0,niter,p+1)
  phi       = phi0
  sig2      = d0/(c0+1)
  for (iter in 1:niter){
    # Sampling from p(phi|sig2,y,X)
    var  = solve(XtX/sig2+iPhi0)
    mean = var%*%(Xty/sig2+iPhi0phi0)
    phi  = mean + t(chol(var))%*%rnorm(p)
    # Sampling from p(sig2|phi,y)
    e    = y-X%*%phi
    c1   = c0 + n/2
    d1   = d0 + sum(e^2)/2
    sig2 = 1/rgamma(1,c1,d1)
    # Storing the Gibbs output for posterior inference
    draws[iter,] = c(phi,sig2)
  }
  ind   = seq(burnin+1,niter,by=thin)
  phis  = draws[ind,1:p]
  sig2s = draws[ind,p+1]
  return(list(phis=phis,sig2s=sig2s))
}

#################################################################################
# Example 1: Simulated data from a Gaussian AR(10) time series model
#################################################################################
# Simulating the time series following an AR(10)
set.seed(1234)
lag    = 10
phi.t  = c(0.987,0.974,-0.098,-0.051,-0.155,0.045,-0.072-0.036,0.179,0.025,0.138,-0.288)
sig2.t = (0.054)^2
n      = 1000
y      = rep(0,2*n)
for (t in (lag+1):(2*n))
  y[t] = rnorm(1,c(1,y[(t-1):(t-lag)])%*%phi.t,sqrt(sig2.t))
y = y[(n+1):(2*n)]
par(mfrow=c(1,1))
ts.plot(y)

# Setting up X matrix for the AR(30) model
# Model: y(t)|y(t-1),...,y(t-30) ~ N(phi(0)+phi(1)*y(t-1)+...+phi(30)*y(t-30),sig2)
# Prior: p(phi,sig2) = p(phi)*p(sig2), where
#        phi  = (phi(0),...,phi(30))'
#        phi  ~ N(phi0,Phi0)
#        sig2 ~ IG(c0,d0)
phi.t = c(phi.t,rep(0,20))
p     = 30
y1    = y[(p+1):n]
X     = 1
for (i in 1:p)
  X = cbind(X,y[(p-i+1):(n-i)])
phi0 = rep(0,1+p)
Phi0 = diag(1,1+p)
c0   = 0.01
d0   = 0.01

# Gibbs sampler
burnin = 10000
M      = 1000
thin   = 100
run    = bayesreg(y1,X,phi0,Phi0,c0,d0,burnin,M,thin)

# Trace plots
if (p>10){ind=c(1,trunc(seq(2,(1+p),length=10)))}else{ind=1:(1+p)}
par(mfrow=c(2,6))
for (i in ind){
  ts.plot(run$phis[,i],xlab="iterations",ylab="",main=paste("phi",i-1,sep=""))
  abline(h=phi.t[i],col=3,lwd=3)
}
ts.plot(run$sig2s,xlab="iterations",ylab="",main="sigma2")
abline(h=sig2.t,col=3,lwd=3)
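# As a complement to the trace plots, one possible quantitative convergence check
# is the effective sample size of each chain -- a minimal sketch, assuming the
# coda package is installed (not required by the rest of the script):
# install.packages("coda")
library(coda)
ess = effectiveSize(as.mcmc(cbind(run$phis,run$sig2s)))   # ESS per parameter
summary(ess)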
# Marginal posteriors
par(mfrow=c(2,6))
for (i in ind){
  xx     = run$phis[,i]
  xrange = range(xx,run$phiols[i])
  breaks = seq(min(xx),max(xx),length=20)
  hist(run$phis[,i],main=paste("phi",i-1,sep=""),xlab="",breaks=breaks,xlim=xrange)
  abline(v=run$phiols[i],col=2,lwd=2)
  points(phi.t[i],0,col=3,pch=16,cex=2)
}
hist(sqrt(run$sig2s),main="sigma",xlab="")
abline(v=sqrt(run$sig2ols),col=2,lwd=2)
points(sqrt(sig2.t),0,col=3,pch=16,cex=2)

######################################################################################
# Example 2: Canadian lynx dataset
######################################################################################
#
# Annual numbers of lynx trappings for 1821–1934 in Canada.
# Taken from Brockwell & Davis (1991), this appears to be the series considered
# by Campbell & Walker (1977).
#
# Brockwell and Davis (1991) Time Series and Forecasting Methods. Second edition.
# Springer. Series G (page 557).
#
# Becker, Chambers and Wilks (1988) The New S Language. Wadsworth & Brooks/Cole.
#
# Campbell and Walker (1977) A survey of statistical work on the Mackenzie River
# series of annual Canadian lynx trappings for the years 1821–1934 and a new
# analysis. Journal of the Royal Statistical Society, Series A, 140, 411-431.
# doi: 10.2307/2345277.
#
#################################################################################
data(lynx)
y = lynx
y = (y-mean(y))/sqrt(var(y))
n = length(y)
par(mfrow=c(1,1))
ts.plot(y,type="b",xlab="Year",ylab="Number of lynx trappings (standardized)")

# Setting up X matrix for the AR(20) model
# Model: y(t)|y(t-1),...,y(t-20) ~ N(phi(0)+phi(1)*y(t-1)+...+phi(20)*y(t-20),sig2)
# Prior: p(phi,sig2) = p(phi)*p(sig2), where
#        phi  = (phi(0),...,phi(20))'
#        phi  ~ N(phi0,Phi0)
#        sig2 ~ IG(c0,d0)
p  = 20
y1 = y[(p+1):n]
X  = 1
for (i in 1:p)
  X = cbind(X,y[(p-i+1):(n-i)])
phi0 = rep(0,1+p)
Phi0 = diag(1,1+p)
c0   = 0.01
d0   = 0.01

# Gibbs sampler
burnin = 10000
M      = 1000
thin   = 100
run    = bayesreg(y1,X,phi0,Phi0,c0,d0,burnin,M,thin)

# Trace plots
par(mfrow=c(3,7))
for (i in 2:(p+1)){
  ts.plot(run$phis[,i],xlab="iterations",ylab="",main=paste("phi",i-1,sep=""))
  abline(h=0,col=2,lwd=3)
}
ts.plot(run$sig2s,xlab="iterations",ylab="",main="sigma2")

# Marginal posteriors
par(mfrow=c(3,7))
for (i in 2:(p+1)){
  xx     = run$phis[,i]
  breaks = seq(min(xx),max(xx),length=20)
  hist(run$phis[,i],main=paste("phi",i-1,sep=""),xlab="",breaks=breaks)
  abline(v=run$phiols[i],col=2,lwd=2)
  points(0,0,col=3,pch=16,cex=2)
}
hist(sqrt(run$sig2s),main="sigma",xlab="")
abline(v=sqrt(run$sig2ols),col=2,lwd=2)

######################################################################################
# Example 3: beauty and course evaluations
######################################################################################
#
# Hahn, He and Lopes (2018) Efficient sampling for Gaussian linear regression
# with arbitrary priors. Journal of Computational and Graphical Statistics.
#
# The data are course evaluations from the University of Texas at Austin
# between 2000 and 2002. The scores are on a 1 to 5 scale, with larger numbers
# being better. In addition to the course evaluations, information concerning
# the class and the instructor was collected.
#
# To quote Hamermesh and Parker (2005):
# "We chose professors at all levels of the academic hierarchy, obtaining professorial
# staffs from a number of departments that had posted all faculty members' pictures on
# their departmental websites. An additional ten faculty members' pictures were obtained
# from miscellaneous departments around the University. The average evaluation score
# for each undergraduate course that the faculty member taught during the academic
# years 2000-2002 is included.
# This sample selection criterion resulted in 463 courses, with the number of
# courses taught by the sample members ranging from 1 to 13. The classes ranged
# in size from 8 to 581 students, while the number of students completing the
# instructional ratings ranged from 5 to 380. Underlying the 463 sample
# observations are 16,957 completed evaluations from 25,547 registered students."
#
# For additional details on how the beauty scores were constructed and on how to
# interpret the regression results, see Hamermesh and Parker (2005) Beauty in the
# classroom: instructors' pulchritude and putative pedagogical productivity.
# Economics of Education Review, Volume 24, Number 4, 369-376.
#
# Data: http://faculty.chicagobooth.edu/richard.hahn/teaching/hamermesh.txt
#
# The model we fit allows for fixed effects for each of 95 instructors.
# We include additive effects for the following factors:
#
# 1. Class size (number of students),
# 2. Language in which the professor earned his or her degree,
# 3. Whether or not the instructor was a minority,
# 4. Gender,
# 5. Beauty rating, and
# 6. Age.
#
# Each of these variables was included in the model via dummy variables according
# to the following breakdown:
#   class size: below 31, 31 to 60, 61 to 150, or 151 to 600 (four levels by quartile),
#   language:   English or non-English (two levels),
#   minority:   ethnic minority or non-minority (two levels),
#   gender:     male or female (two levels),
#   beauty:     four levels by quartile, and
#   age:        below 43, 43 to 47, 48 to 56 and 57 to 73 (four levels by quartile).
#
# Finally, we include up to three-way interactions between age, beauty, and gender.
# We include an intercept in our model so that individual effects can be interpreted
# as deviations from the average. Three predictors are then dropped because they are
# collinear with the intercept (and each other); there were no highly beautiful
# males between 42 and 46, and no moderately beautiful instructors of either gender
# in that same age group. Even after dropping these columns, the model is numerically
# singular, with only 98 out of 130 singular values greater than 1e-14.
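#
# One possible sanity check of this numerical singularity (a sketch, to be run
# only after the design matrix X is built below):
#   sum(svd(X)$d > 1e-14)   # counts the numerically nonzero singular values of X
#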
#
# Sample frequencies:
#
#   female  male
#      195   268
#
#   teaching  tenure track  tenured
#        102           108      253
#
#   minority  not minority
#         64           399
#
#   class size
#    (7,30]  (30,60]  (60,150]  (150,600]
#       245      104        83         31
#
#   beauty rate
#   (1,3.17]  (3.17,4.33]  (4.33,5.5]  (5.5,8.17]
#        117          166          80        100
#
######################################################################################

# Reading and preparing the dataset
evals = read.csv("http://faculty.chicagobooth.edu/richard.hahn/teaching/hamermesh.txt")
temp  = unique(evals[,c("rank","ethnicity","gender","age","bty.avg","language","pic.outfit","pic.color")])
temp  = cbind(temp,as.factor(1:95))
colnames(temp)[9] = "prof"
evals = merge(evals,temp)
evals$age.fac    = cut(evals$age,c(28,42,46,56,73))
evals$class.size = cut(evals$cls.students,c(7,30,60,150,600))
evals$bty        = cut(evals$bty.avg,c(1,3.167,4.333,5.50,8.167))
score = as.matrix(evals$score)

# Exploratory data analysis
par(mfrow=c(1,1))
plot(table(evals$prof),xlab="Professor",ylab="Frequency")
abline(h=1,lty=2)
abline(h=2,lty=2)
abline(h=10,lty=2)

# Frequencies
table(evals$rank)
table(evals$ethnicity)
table(evals$gender)
table(evals$class.size)
table(evals$bty)

par(mfrow=c(2,3))
boxplot(score~rank,data=evals,xlab="Rank",ylab="score")
boxplot(score~ethnicity,data=evals,xlab="Minority",ylab="score")
boxplot(score~gender,data=evals,xlab="Gender",ylab="score")
boxplot(score~age.fac,data=evals,xlab="Age",ylab="score")
boxplot(score~class.size,data=evals,xlab="Class size",ylab="score")
boxplot(score~bty,data=evals,xlab="Beauty rate",ylab="score")

# Regression matrix
X = model.matrix(score~class.size+rank+language+ethnicity+(gender+bty+age.fac)^3-1,data=evals)
# Dropping columns that are identically zero (empty factor combinations)
ind = NULL
for (i in 1:ncol(X)){
  if (sum(X[,i])>0){
    ind = c(ind,i)
  }
}
X = X[,ind]
p = dim(X)[2]
var.names = c("intercept",colnames(X))
X = cbind(1,X)

# Setting up the Gaussian linear regression model
# Model: y|X ~ N(X*phi,sig2*I)
# Prior: p(phi,sig2) = p(phi)*p(sig2), where
#        phi  = (phi(1),...,phi(p))'
#        phi  ~ N(phi0,Phi0)
#        sig2 ~ IG(c0,d0)
p    = ncol(X)-1
y    = score
phi0 = rep(0,1+p)
Phi0 = diag(1,1+p)
c0   = 1
d0   = 1

# Gibbs sampler (Gaussian/ridge-type prior)
burnin = 10000
M      = 10000
thin   = 1
run    = bayesreg1(y,X,phi0,Phi0,c0,d0,burnin,M,thin)

par(mfrow=c(1,1),mar=c(5,9,4,2))
boxplot.matrix(run$phis,outline=FALSE,xlab="Regression coefficient",ylab="",axes=FALSE,horizontal=TRUE)
axis(1);axis(2,at=1:(p+1),lab=var.names,las=2,cex.axis=0.5)
abline(v=0,col=2,lwd=3)

# Laplace and horseshoe priors via the bayeslm package
#install.packages("bayeslm")
#packageDescription("bayeslm")
library(bayeslm)
set.seed(23748243)
Nsamples  = burnin+M
X1        = X[,2:ncol(X)]
block_vec = rep(1,ncol(X1))
fit.blasso = bayeslm(y,X1,prior="laplace",block_vec=block_vec,N=Nsamples,icept=TRUE,
                     standardize=FALSE,singular=TRUE,verb=TRUE,thinning=thin,
                     scale_sigma_prior=FALSE,cc=rep(1,p))
fit.hs     = bayeslm(y,X1,prior="horseshoe",block_vec=block_vec,N=Nsamples,icept=TRUE,
                     standardize=FALSE,singular=TRUE,thinning=thin,verb=TRUE,
                     scale_sigma_prior=FALSE,cc=rep(1,p))

# Posterior quantiles
qbeta.ridge  = t(apply(run$phis,2,quantile,c(0.05,0.5,0.95)))
qbeta.hs     = t(apply(fit.hs$beta[(burnin+1):Nsamples,],2,quantile,c(0.05,0.5,0.95)))
qbeta.blasso = t(apply(fit.blasso$beta[(burnin+1):Nsamples,],2,quantile,c(0.05,0.5,0.95)))

par(mfrow=c(1,1),mar=c(5,9,4,2))
yrange = range(qbeta.ridge,qbeta.hs,qbeta.blasso)
plot(qbeta.ridge[,2],1:(p+1),
     xlab="Regression coefficient (median and 90% credibility interval)",
     ylab="",axes=FALSE,pch=16,cex=0.5,xlim=yrange)
axis(1);axis(2,at=1:(p+1),lab=var.names,las=2,cex.axis=0.5)
abline(v=0,lty=2)
for (i in 1:(p+1)){
  segments(qbeta.ridge[i,1],i,qbeta.ridge[i,3],i)
  segments(qbeta.blasso[i,1],i-0.2,qbeta.blasso[i,3],i-0.2,col=2)
  segments(qbeta.hs[i,1],i-0.4,qbeta.hs[i,3],i-0.4,col=4)
}
points(qbeta.blasso[,2],1:(p+1)-0.2,col=2,pch=16,cex=0.5)
points(qbeta.hs[,2],1:(p+1)-0.4,col=4,pch=16,cex=0.5)
legend("topright",legend=c("Ridge prior","Lasso prior","Horseshoe prior"),col=c(1,2,4),lty=1)

# Coefficients whose 90% credibility interval excludes zero
var.names[(qbeta.ridge[,1]>0)|(qbeta.ridge[,3]<0)]
var.names[(qbeta.blasso[,1]>0)|(qbeta.blasso[,3]<0)]
var.names[(qbeta.hs[,1]>0)|(qbeta.hs[,3]<0)]

#######################################################################################
# Example 4: wages
#######################################################################################
#
# Basic reference:
# Blackburn and Neumark (1992) Unobserved ability, efficiency wages, and
# interindustry wage differentials. Quarterly Journal of Economics, 107, 1421-1436.
#
# Summary: Data on monthly earnings, education, several demographic
# variables, and IQ scores for 935 men in 1980.
#
#  1. wage     monthly earnings
#  2. hours    average weekly hours
#  3. iq       IQ score
#  4. kww      knowledge of world work score
#  5. educ     years of education
#  6. exper    years of work experience
#  7. tenure   years with current employer
#  8. age      age in years
#  9. married  =1 if married
# 10. black    =1 if black
# 11. south    =1 if live in south
# 12. urban    =1 if live in SMSA
# 13. sibs     number of siblings
# 14. brthord  birth order
# 15. meduc    mother's education
# 16. feduc    father's education
# 17. lwage    natural log of wage
#
# Source: Wooldridge (2012)
#         Introductory Econometrics: A Modern Approach (5th edition)
#         South-Western, Cengage Learning
#
#######################################################################################
data = read.table("http://hedibert.org/wp-content/uploads/2014/02/wage2-wooldridge.txt",header=FALSE)
n    = nrow(data)
colnames(data) = c("wage","hours","iq","kww","educ","exper","tenure","age","married",
                   "black","south","urban","sibs","brthord","meduc","feduc","lwage")

# Removing rows with missing covariates
miss = rep(0,n)
for (i in 1:n)
  miss[i] = sum(is.na(data[i,]))
table(miss)
data = data[miss==0,]

# Regression matrix
X = model.matrix(lwage~iq*educ+kww+hours*exper*tenure*age+sibs+brthord+meduc+feduc+(married+black+south+urban+meduc+feduc)^3-1,data=data)
p = ncol(X)
y = data[,ncol(data)]

# Ridge, Laplace and horseshoe priors via the bayeslm package
set.seed(23748243)
burnin    = 10000
M         = 10000
thin      = 1
Nsamples  = burnin+M
block_vec = rep(1,ncol(X))
fit.ridge  = bayeslm(y,X,prior="ridge",block_vec=block_vec,N=Nsamples,icept=TRUE,
                     standardize=FALSE,singular=TRUE,verb=TRUE,thinning=thin,
                     scale_sigma_prior=FALSE,cc=rep(1,p))
fit.blasso = bayeslm(y,X,prior="laplace",block_vec=block_vec,N=Nsamples,icept=TRUE,
                     standardize=FALSE,singular=TRUE,verb=TRUE,thinning=thin,
                     scale_sigma_prior=FALSE,cc=rep(1,p))
fit.hs     = bayeslm(y,X,prior="horseshoe",block_vec=block_vec,N=Nsamples,icept=TRUE,
                     standardize=FALSE,singular=TRUE,thinning=thin,verb=TRUE,
                     scale_sigma_prior=FALSE,cc=rep(1,p))

# Boxplots of the posterior draws (top row: full scale; bottom row: zoomed in)
par(mfrow=c(2,3))
boxplot.matrix(fit.ridge$beta[(burnin+1):Nsamples,2:(p+1)],outline=FALSE,axes=FALSE)
axis(2);axis(1,at=1:p);box();abline(h=0,col=2)
boxplot.matrix(fit.blasso$beta[(burnin+1):Nsamples,2:(p+1)],outline=FALSE,axes=FALSE)
axis(2);axis(1,at=1:p);box();abline(h=0,col=2)
boxplot.matrix(fit.hs$beta[(burnin+1):Nsamples,2:(p+1)],outline=FALSE,axes=FALSE)
axis(2);axis(1,at=1:p);box();abline(h=0,col=2)
boxplot.matrix(fit.ridge$beta[(burnin+1):Nsamples,2:(p+1)],outline=FALSE,axes=FALSE,ylim=c(-0.025,0.025))
axis(2);axis(1,at=1:p);box();abline(h=0,col=2)
boxplot.matrix(fit.blasso$beta[(burnin+1):Nsamples,2:(p+1)],outline=FALSE,axes=FALSE,ylim=c(-0.025,0.025))
axis(2);axis(1,at=1:p);box();abline(h=0,col=2)
boxplot.matrix(fit.hs$beta[(burnin+1):Nsamples,2:(p+1)],outline=FALSE,axes=FALSE,ylim=c(-0.025,0.025))
axis(2);axis(1,at=1:p);box();abline(h=0,col=2)

# Posterior quantiles
qbeta.ridge  = t(apply(fit.ridge$beta[(burnin+1):Nsamples,],2,quantile,c(0.05,0.5,0.95)))
qbeta.hs     = t(apply(fit.hs$beta[(burnin+1):Nsamples,],2,quantile,c(0.05,0.5,0.95)))
qbeta.blasso = t(apply(fit.blasso$beta[(burnin+1):Nsamples,],2,quantile,c(0.05,0.5,0.95)))

var.names = c("intercept",colnames(X))
var.names[(qbeta.ridge[,1]>0)|(qbeta.ridge[,3]<0)]
var.names[(qbeta.blasso[,1]>0)|(qbeta.blasso[,3]<0)]
var.names[(qbeta.hs[,1]>0)|(qbeta.hs[,3]<0)]
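# One possible follow-up (a sketch, assuming X, y and the three bayeslm fits above
# are still in memory): compare the posterior medians under the shrinkage priors
# with ordinary least squares, and count how many coefficients each prior "selects",
# i.e. how many 90% credibility intervals exclude zero. OLS may return NA for
# collinear columns of X.
ols = lm(y~X)
round(cbind(ols=coef(ols),ridge=qbeta.ridge[,2],lasso=qbeta.blasso[,2],horseshoe=qbeta.hs[,2]),4)
c(ridge     = sum((qbeta.ridge[,1]>0)|(qbeta.ridge[,3]<0)),
  lasso     = sum((qbeta.blasso[,1]>0)|(qbeta.blasso[,3]<0)),
  horseshoe = sum((qbeta.hs[,1]>0)|(qbeta.hs[,3]<0)))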