######################################################################################
#
# Hahn, He and Lopes (2018) Efficient sampling for Gaussian linear regression with
# arbitrary priors. Journal of Computational and Graphical Statistics (to appear).
#
# Section 4. Empirical illustration: beauty and course evaluations
#
# The data are course evaluations from the University of Texas at Austin
# between 2000 and 2002. The evaluations are on a 1 to 5 scale, with larger numbers
# being better. In addition to the course evaluations, information concerning
# the class and the instructor was collected.
#
# To quote Hamermesh and Parker [2005]:
# "We chose professors at all levels of the academic hierarchy, obtaining professorial
# staffs from a number of departments that had posted all faculty members' pictures on
# their departmental websites. An additional ten faculty members' pictures were obtained
# from miscellaneous departments around the University. The average evaluation score
# for each undergraduate course that the faculty member taught during the academic
# years 2000-2002 is included. This sample selection criterion resulted in 463 courses,
# with the number of courses taught by the sample members ranging from 1 to 13.
# The classes ranged in size from 8 to 581 students, while the number of students
# completing the instructional ratings ranged from 5 to 380. Underlying the 463
# sample observations are 16,957 completed evaluations from 25,547 registered students."
#
# For additional details on how the beauty scores were constructed and on how to
# interpret the regression results, see Hamermesh and Parker (2005) Beauty in the
# classroom: instructors' pulchritude and putative pedagogical productivity.
# Economics of Education Review, Volume 24, Number 4, 369-376.
#
# Data can be downloaded from Richard's Chicago Booth webpage:
#
#   http://faculty.chicagobooth.edu/richard.hahn/teaching/hamermesh.txt
#
# The model we fit allows for fixed effects for each of 95 instructors.
# We include additive effects for the following factors:
#
#   1. Class size (number of students),
#   2. Language in which the professor earned his or her degree,
#   3. Whether or not the instructor was a minority,
#   4. Gender,
#   5. Beauty rating, and
#   6. Age.
#
# Each of these variables enters the model via dummy variables according to the
# following breakdown:
#
#   class size: below 31, 31 to 60, 61 to 150, or 151 to 600 (four levels by quartile),
#   language:   English or non-English (two levels),
#   minority:   ethnic minority or non-minority (two levels),
#   gender:     male or female (two levels),
#   beauty:     four levels by quartile, and
#   age:        below 43, 43 to 46, 47 to 56, and 57 to 73 (four levels by quartile).
#
# Finally, we include up to three-way interactions between age, beauty, and gender.
# We include an intercept in our model so that individual effects can be interpreted
# as deviations from the average. Three predictors are then dropped because they are
# collinear with the intercept (and each other); there were no highly beautiful
# males between 42 and 46, and no moderately beautiful instructors of either gender
# in that same age group. Even after dropping these columns, the model is numerically
# singular, with only 98 out of 130 singular values greater than 1e-14.
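# A quick illustration (not part of the original script) of how the quartile-based
# factors described above are constructed with cut(). The breakpoints are the age
# breakpoints used later in the script; the ages below are hypothetical toy values.
toy.ages = c(30, 43, 50, 65)              # hypothetical values, not from the data
cut(toy.ages, c(28, 42, 46, 56, 73))      # yields the four age levels (28,42], (42,46], (46,56], (56,73]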
# ######################################################################################

install.packages("bayeslm")
packageDescription("bayeslm")
library(bayeslm)

# Read and prepare the dataset
evals = read.csv('hamermesh.txt')
evals[1:5,]
dim(evals)

# Create a professor identifier (a factor with 95 levels), one per unique instructor profile
temp = unique(evals[,c("rank", "ethnicity", "gender", "age", "bty.avg", "language", "pic.outfit", "pic.color")])
temp[1:5,]
dim(temp)
temp = cbind(temp, as.factor(1:95))
colnames(temp)[9] = "prof"
temp[1:5,]
dim(temp)
evals = merge(evals, temp)
evals[1:5,]
dim(evals)

# Discretize age, class size and beauty into the quartile-based factors described above
evals$age.fac    = cut(evals$age, c(28, 42, 46, 56, 73))
evals$class.size = cut(evals$cls.students, c(7, 30, 60, 150, 600))
evals$bty        = cut(evals$bty.avg, c(1, 3.167, 4.333, 5.50, 8.167))
evals[1:5,]
dim(evals)
score = as.matrix(evals$score)

pdf(file="bayeslm-demo.pdf", width=12, height=6)

# Exploratory data analysis
par(mfrow=c(1,1))
plot(table(evals$prof), xlab="Professor", ylab="Frequency")
abline(h=1, lty=2)
abline(h=2, lty=2)
abline(h=10, lty=2)

par(mfrow=c(2,3))
boxplot(score~rank, data=evals)
boxplot(score~ethnicity, data=evals)
boxplot(score~gender, data=evals)
boxplot(score~age.fac, data=evals)
boxplot(score~class.size, data=evals)
boxplot(score~bty, data=evals)

# Regression matrix
X1 = model.matrix(score~prof+class.size+rank+language+ethnicity+(gender+bty+age.fac)^3-1, data=evals)

# Three columns of X1 are all zero because no observations fall in those interaction cells;
# remove the constant (all-zero) columns
ind = NULL
for (i in 1:ncol(X1)){
  if (sum(X1[,i]) > 0){
    ind = c(ind, i)
  }
}
X1 = X1[,ind]
p = dim(X1)[2]
block_vec = rep(1, p)
var_names = c("intercept", colnames(X1))

# Posterior sampling for the linear regression under ridge, Laplace (Bayesian lasso)
# and horseshoe priors
set.seed(23748243)
Nsamples = 10001
thin = 10
fit.ridge  = bayeslm(score, X1, prior="ridge",     block_vec=block_vec, N=Nsamples, icept=TRUE,
                     standardize=FALSE, singular=TRUE, verb=TRUE, thinning=thin,
                     scale_sigma_prior=FALSE, cc=rep(1,p))
fit.blasso = bayeslm(score, X1, prior="laplace",   block_vec=block_vec, N=Nsamples, icept=TRUE,
                     standardize=FALSE, singular=TRUE, verb=TRUE, thinning=thin,
                     scale_sigma_prior=FALSE, cc=rep(1,p))
fit.hs     = bayeslm(score, X1, prior="horseshoe", block_vec=block_vec, N=Nsamples, icept=TRUE,
                     standardize=FALSE, singular=TRUE, verb=TRUE, thinning=thin,
                     scale_sigma_prior=FALSE, cc=rep(1,p))

names = c("Class size: (30,60]", "Class size: (60,150]", "Class size: (150,600]",
          "Rank: tenure track", "Rank: tenured", "Minority: No",
          "Beauty: (3.17,4.33]", "Beauty: (4.33,5.5]", "Beauty: (5.5,8.17]")

# Posterior densities of selected coefficients (the first 99 draws are discarded as burn-in)
par(mfrow=c(3,3))
l = 0
for (i in c(97, 98, 99, 100, 101, 103, 105, 106, 107)){
  l = l + 1
  den.hs     = density(fit.hs$beta[100:Nsamples, i])
  den.blasso = density(fit.blasso$beta[100:Nsamples, i])
  den.ridge  = density(fit.ridge$beta[100:Nsamples, i])
  limy = range(den.hs$y, den.blasso$y, den.ridge$y)
  limx = range(den.hs$x, den.blasso$x, den.ridge$x)
  plot(den.hs, lwd=2, bty="l", xlab="", main=names[l], ylab="Posterior density", ylim=limy, xlim=limx)
  lines(den.blasso, col=4)
  lines(den.ridge, col=2)
  abline(v=0, lty=2)
}

# Posterior quantiles (2.5th, 50th and 97.5th percentiles) of each coefficient
qbeta.hs     = t(apply(fit.hs$beta[100:Nsamples,],     2, quantile, c(0.025, 0.5, 0.975)))
qbeta.blasso = t(apply(fit.blasso$beta[100:Nsamples,], 2, quantile, c(0.025, 0.5, 0.975)))
qbeta.ridge  = t(apply(fit.ridge$beta[100:Nsamples,],  2, quantile, c(0.025, 0.5, 0.975)))
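# Optional checks (not part of the original script). First, the numerical-rank claim in
# the header can be verified from the design matrix with an intercept column appended;
# the second number below should reproduce the "98 out of 130" figure quoted above.
sv = svd(cbind(1, X1))$d                                  # singular values of [1, X1]
c(n_columns = length(sv), n_above_tol = sum(sv > 1e-14))
# Second, a rough sparsity summary: the number of marginal 95% intervals excluding zero
# under each prior. excl0() is a small helper introduced here, not a bayeslm function.
excl0 = function(q) sum(q[,1] > 0 | q[,3] < 0)
c(horseshoe = excl0(qbeta.hs), blasso = excl0(qbeta.blasso), ridge = excl0(qbeta.ridge))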
# Compare the marginal posterior medians across priors
lims = range(qbeta.hs[,2], qbeta.blasso[,2], qbeta.ridge[,2])
par(mfrow=c(1,3))
plot(qbeta.ridge[,2], qbeta.hs[,2], xlim=lims, ylim=lims, xlab="Ridge", ylab="Horseshoe", pch=16)
abline(h=0, lty=2)
abline(v=0, lty=2)
abline(0, 1, lty=2)
plot(qbeta.ridge[,2], qbeta.blasso[,2], xlim=lims, ylim=lims, xlab="Ridge", ylab="Bayesian lasso", pch=16)
abline(h=0, lty=2)
abline(v=0, lty=2)
abline(0, 1, lty=2)
plot(qbeta.hs[,2], qbeta.blasso[,2], xlim=lims, ylim=lims, xlab="Horseshoe", ylab="Bayesian lasso", pch=16)
abline(h=0, lty=2)
abline(v=0, lty=2)
abline(0, 1, lty=2)

# Compare the marginal 95% credible intervals across priors
par(mfrow=c(1,1))
limy = range(qbeta.hs, qbeta.blasso, qbeta.ridge)
p1 = nrow(qbeta.hs)   # number of coefficients summarized (rows of the quantile matrices)
plot(qbeta.hs[,2], ylim=limy, ylab="Posterior 2.5th, 50th and 97.5th quantiles",
     xlab="Regressor", pch=16, cex=0.5)
abline(h=0, lty=2)
for (i in 1:p1){
  segments(i, qbeta.hs[i,1], i, qbeta.hs[i,3])
  segments(i+0.2, qbeta.blasso[i,1], i+0.2, qbeta.blasso[i,3], col=2)
  segments(i+0.4, qbeta.ridge[i,1], i+0.4, qbeta.ridge[i,3], col=3)
  points(i+0.2, qbeta.blasso[i,2], col=2, pch=16, cex=0.5)
  points(i+0.4, qbeta.ridge[i,2], col=3, pch=16, cex=0.5)
}
legend("topright", legend=c("Horseshoe", "Bayesian lasso", "Ridge"), col=1:3, lty=1)

dev.off()
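# A compact console comparison (not in the original script): posterior medians of the
# nine coefficients shown in the density panels, side by side for the three priors.
# The column indices are the same ones used in the density-plot loop above.
idx = c(97, 98, 99, 100, 101, 103, 105, 106, 107)
med.compare = cbind(Horseshoe  = qbeta.hs[idx, 2],
                    BayesLasso = qbeta.blasso[idx, 2],
                    Ridge      = qbeta.ridge[idx, 2])
rownames(med.compare) = names
round(med.compare, 3)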