######################################################################################
#
# Hahn, He and Lopes (2018) Efficient sampling for Gaussian linear regression with
# arbitrary priors. Journal of Computational and Graphical Statistics (to appear).
#
# Section 4. Empirical illustration: beauty and course evaluations
#
# The data are course evaluations from the University of Texas at Austin
# between 2000 and 2002. The evaluations are on a 1 to 5 scale, with larger numbers
# being better. In addition to the course evaluations, information concerning
# the class and the instructor was collected.
#
# To quote Hamermesh and Parker [2005]:
# "We chose professors at all levels of the academic hierarchy, obtaining professorial
# staffs from a number of departments that had posted all faculty members' pictures on
# their departmental websites. An additional ten faculty members' pictures were obtained
# from miscellaneous departments around the University. The average evaluation score
# for each undergraduate course that the faculty member taught during the academic
# years 2000-2002 is included. This sample selection criterion resulted in 463 courses,
# with the number of courses taught by the sample members ranging from 1 to 13.
# The classes ranged in size from 8 to 581 students, while the number of students
# completing the instructional ratings ranged from 5 to 380. Underlying the 463
# sample observations are 16,957 completed evaluations from 25,547 registered students."
#
# For additional details on how the beauty scores were constructed and on how to
# interpret the regression results, see Hamermesh and Parker (2005) Beauty in the
# classroom: instructors' pulchritude and putative pedagogical productivity.
# Economics of Education Review, Volume 24, Number 4, 369-376.
#
# Data can be downloaded from Richard's Chicago Booth webpage:
#
#   http://faculty.chicagobooth.edu/richard.hahn/teaching/hamermesh.txt
#
# The model we fit allows for fixed effects for each of 95 instructors.
# We include additive effects for the following factors:
#
#   1. Class size (number of students),
#   2. Language in which the professor earned his or her degree,
#   3. Whether or not the instructor was a minority,
#   4. Gender,
#   5. Beauty rating, and
#   6. Age.
#
# Each of these variables enters the model via dummy variables according to the
# following breakdown:
#
#   class size: below 31, 31 to 60, 61 to 150, or 151 to 600 (four levels by quartile),
#   language:   English or non-English (two levels),
#   minority:   ethnic minority or non-minority (two levels),
#   gender:     male or female (two levels),
#   beauty:     four levels by quartile, and
#   age:        below 43, 43 to 46, 47 to 56, and 57 to 73 (four levels by quartile).
#
# Finally, we include up to three-way interactions between age, beauty, and gender.
# We include an intercept in our model so that individual effects can be interpreted
# as deviations from the average. Three predictors are then dropped because they are
# collinear with the intercept (and each other); there were no highly beautiful
# males between 42 and 46, and no moderately beautiful instructors of either gender
# in that same age group. Even after dropping these columns, the model is numerically
# singular, with only 98 out of 130 singular values greater than 1e-14.
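# A quick illustration (not part of the original script) of how the quartile-based
# factors described above are constructed with cut(). The breakpoints are the age
# breakpoints used later in the script; the ages below are hypothetical toy values.
toy.ages = c(30, 43, 50, 65)              # hypothetical values, not from the data
cut(toy.ages, c(28, 42, 46, 56, 73))      # yields the four age levels (28,42], (42,46], (46,56], (56,73]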
# ######################################################################################

install.packages("bayeslm")
packageDescription("bayeslm")
library(bayeslm)

# Read and prepare the dataset
evals = read.csv('hamermesh.txt')
evals[1:5,]
dim(evals)

# Create a professor identifier (a factor with 95 levels), one per unique instructor profile
temp = unique(evals[,c("rank", "ethnicity", "gender", "age", "bty.avg", "language", "pic.outfit", "pic.color")])
temp[1:5,]
dim(temp)
temp = cbind(temp, as.factor(1:95))
colnames(temp)[9] = "prof"
temp[1:5,]
dim(temp)
evals = merge(evals, temp)
evals[1:5,]
dim(evals)

# Discretize age, class size and beauty into the quartile-based factors described above
evals$age.fac    = cut(evals$age, c(28, 42, 46, 56, 73))
evals$class.size = cut(evals$cls.students, c(7, 30, 60, 150, 600))
evals$bty        = cut(evals$bty.avg, c(1, 3.167, 4.333, 5.50, 8.167))
evals[1:5,]
dim(evals)
score = as.matrix(evals$score)

pdf(file="bayeslm-demo.pdf", width=12, height=6)

# Exploratory data analysis
par(mfrow=c(1,1))
plot(table(evals$prof), xlab="Professor", ylab="Frequency")
abline(h=1, lty=2)
abline(h=2, lty=2)
abline(h=10, lty=2)

par(mfrow=c(2,3))
boxplot(score~rank, data=evals)
boxplot(score~ethnicity, data=evals)
boxplot(score~gender, data=evals)
boxplot(score~age.fac, data=evals)
boxplot(score~class.size, data=evals)
boxplot(score~bty, data=evals)

# Regression matrix
X1 = model.matrix(score~prof+class.size+rank+language+ethnicity+(gender+bty+age.fac)^3-1, data=evals)

# Three columns of X1 are all zero because no observations fall in those interaction cells;
# remove the constant (all-zero) columns
ind = NULL
for (i in 1:ncol(X1)){
  if (sum(X1[,i]) > 0){
    ind = c(ind, i)
  }
}
X1 = X1[,ind]
p = dim(X1)[2]
block_vec = rep(1, p)
var_names = c("intercept", colnames(X1))

# Posterior sampling for the linear regression under ridge, Laplace (Bayesian lasso)
# and horseshoe priors
set.seed(23748243)
Nsamples = 10001
thin = 10
fit.ridge  = bayeslm(score, X1, prior="ridge",     block_vec=block_vec, N=Nsamples, icept=TRUE,
                     standardize=FALSE, singular=TRUE, verb=TRUE, thinning=thin,
                     scale_sigma_prior=FALSE, cc=rep(1,p))
fit.blasso = bayeslm(score, X1, prior="laplace",   block_vec=block_vec, N=Nsamples, icept=TRUE,
                     standardize=FALSE, singular=TRUE, verb=TRUE, thinning=thin,
                     scale_sigma_prior=FALSE, cc=rep(1,p))
fit.hs     = bayeslm(score, X1, prior="horseshoe", block_vec=block_vec, N=Nsamples, icept=TRUE,
                     standardize=FALSE, singular=TRUE, verb=TRUE, thinning=thin,
                     scale_sigma_prior=FALSE, cc=rep(1,p))

names = c("Class size: (30,60]", "Class size: (60,150]", "Class size: (150,600]",
          "Rank: tenure track", "Rank: tenured", "Minority: No",
          "Beauty: (3.17,4.33]", "Beauty: (4.33,5.5]", "Beauty: (5.5,8.17]")

# Posterior densities of selected coefficients (the first 99 draws are discarded as burn-in)
par(mfrow=c(3,3))
l = 0
for (i in c(97, 98, 99, 100, 101, 103, 105, 106, 107)){
  l = l + 1
  den.hs     = density(fit.hs$beta[100:Nsamples, i])
  den.blasso = density(fit.blasso$beta[100:Nsamples, i])
  den.ridge  = density(fit.ridge$beta[100:Nsamples, i])
  limy = range(den.hs$y, den.blasso$y, den.ridge$y)
  limx = range(den.hs$x, den.blasso$x, den.ridge$x)
  plot(den.hs, lwd=2, bty="l", xlab="", main=names[l], ylab="Posterior density", ylim=limy, xlim=limx)
  lines(den.blasso, col=4)
  lines(den.ridge, col=2)
  abline(v=0, lty=2)
}

# Posterior quantiles (2.5th, 50th and 97.5th percentiles) of each coefficient
qbeta.hs     = t(apply(fit.hs$beta[100:Nsamples,],     2, quantile, c(0.025, 0.5, 0.975)))
qbeta.blasso = t(apply(fit.blasso$beta[100:Nsamples,], 2, quantile, c(0.025, 0.5, 0.975)))
qbeta.ridge  = t(apply(fit.ridge$beta[100:Nsamples,],  2, quantile, c(0.025, 0.5, 0.975)))
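# Optional checks (not part of the original script). First, the numerical-rank claim in
# the header can be verified from the design matrix with an intercept column appended;
# the second number below should reproduce the "98 out of 130" figure quoted above.
sv = svd(cbind(1, X1))$d                                  # singular values of [1, X1]
c(n_columns = length(sv), n_above_tol = sum(sv > 1e-14))
# Second, a rough sparsity summary: the number of marginal 95% intervals excluding zero
# under each prior. excl0() is a small helper introduced here, not a bayeslm function.
excl0 = function(q) sum(q[,1] > 0 | q[,3] < 0)
c(horseshoe = excl0(qbeta.hs), blasso = excl0(qbeta.blasso), ridge = excl0(qbeta.ridge))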
# Compare the marginal posterior medians across priors
lims = range(qbeta.hs[,2], qbeta.blasso[,2], qbeta.ridge[,2])
par(mfrow=c(1,3))
plot(qbeta.ridge[,2], qbeta.hs[,2], xlim=lims, ylim=lims, xlab="Ridge", ylab="Horseshoe", pch=16)
abline(h=0, lty=2)
abline(v=0, lty=2)
abline(0, 1, lty=2)
plot(qbeta.ridge[,2], qbeta.blasso[,2], xlim=lims, ylim=lims, xlab="Ridge", ylab="Bayesian lasso", pch=16)
abline(h=0, lty=2)
abline(v=0, lty=2)
abline(0, 1, lty=2)
plot(qbeta.hs[,2], qbeta.blasso[,2], xlim=lims, ylim=lims, xlab="Horseshoe", ylab="Bayesian lasso", pch=16)
abline(h=0, lty=2)
abline(v=0, lty=2)
abline(0, 1, lty=2)

# Compare the marginal 95% credible intervals across priors
par(mfrow=c(1,1))
limy = range(qbeta.hs, qbeta.blasso, qbeta.ridge)
p1 = nrow(qbeta.hs)   # number of coefficients summarized (rows of the quantile matrices)
plot(qbeta.hs[,2], ylim=limy, ylab="Posterior 2.5th, 50th and 97.5th quantiles",
     xlab="Regressor", pch=16, cex=0.5)
abline(h=0, lty=2)
for (i in 1:p1){
  segments(i, qbeta.hs[i,1], i, qbeta.hs[i,3])
  segments(i+0.2, qbeta.blasso[i,1], i+0.2, qbeta.blasso[i,3], col=2)
  segments(i+0.4, qbeta.ridge[i,1], i+0.4, qbeta.ridge[i,3], col=3)
  points(i+0.2, qbeta.blasso[i,2], col=2, pch=16, cex=0.5)
  points(i+0.4, qbeta.ridge[i,2], col=3, pch=16, cex=0.5)
}
legend("topright", legend=c("Horseshoe", "Bayesian lasso", "Ridge"), col=1:3, lty=1)

dev.off()
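# A compact console comparison (not in the original script): posterior medians of the
# nine coefficients shown in the density panels, side by side for the three priors.
# The column indices are the same ones used in the density-plot loop above.
idx = c(97, 98, 99, 100, 101, 103, 105, 106, 107)
med.compare = cbind(Horseshoe  = qbeta.hs[idx, 2],
                    BayesLasso = qbeta.blasso[idx, 2],
                    Ridge      = qbeta.ridge[idx, 2])
rownames(med.compare) = names
round(med.compare, 3)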