######################################################################
#
# Bayesian Econometrics
# PhD Program in Business Economics
# Spring 2018
#
# By Hedibert Freitas Lopes
#
######################################################################
#
# Bayesian inference via Gibbs sampler for the multiple linear
# regression with Gaussian errors and conditionally conjugate priors.
# Dealing with sparsity in Gaussian linear models via ridge, Laplace
# and horseshoe priors.
#
######################################################################
rm(list=ls())

# Gibbs sampler for y|phi,sig2 ~ N(X*phi,sig2*I) with priors
# phi ~ N(phi0,Phi0) and sig2 ~ IG(c0,d0).
# Full conditionals:
#   phi|sig2,y ~ N(V*(X'y/sig2 + Phi0^{-1}*phi0), V), with V = (X'X/sig2 + Phi0^{-1})^{-1}
#   sig2|phi,y ~ IG(c0 + n/2, d0 + (y-X*phi)'(y-X*phi)/2)
bayesreg = function(y,X,phi0,Phi0,c0,d0,burnin,M,thin){
  n         = length(y)
  p         = ncol(X)
  iPhi0     = solve(Phi0)
  iPhi0phi0 = iPhi0%*%phi0
  XtX       = t(X)%*%X
  Xty       = t(X)%*%y
  phiols    = solve(XtX)%*%Xty
  sig2ols   = sum((y-X%*%phiols)^2)/n
  niter     = burnin+M*thin
  draws     = matrix(0,niter,p+1)
  phi       = phi0
  sig2      = d0/(c0+1)
  for (iter in 1:niter){
    # Sampling from p(phi|sig2,y,X)
    var  = solve(XtX/sig2+iPhi0)
    mean = var%*%(Xty/sig2+iPhi0phi0)
    phi  = mean + t(chol(var))%*%rnorm(p)
    # Sampling from p(sig2|phi,y)
    e    = y-X%*%phi
    c1   = c0 + n/2
    d1   = d0 + sum(e^2)/2
    sig2 = 1/rgamma(1,c1,d1)
    # Storing the Gibbs output for posterior inference
    draws[iter,] = c(phi,sig2)
  }
  ind   = seq(burnin+1,niter,by=thin)
  phis  = draws[ind,1:p]
  sig2s = draws[ind,p+1]
  return(list(phis=phis,sig2s=sig2s,phiols=phiols,sig2ols=sig2ols))
}

# Same Gibbs sampler, but without computing or returning the OLS quantities
# (used below when X'X is rank deficient and OLS is not available).
bayesreg1 = function(y,X,phi0,Phi0,c0,d0,burnin,M,thin){
  n         = length(y)
  p         = ncol(X)
  iPhi0     = solve(Phi0)
  iPhi0phi0 = iPhi0%*%phi0
  XtX       = t(X)%*%X
  Xty       = t(X)%*%y
  niter     = burnin+M*thin
  draws     = matrix(0,niter,p+1)
  phi       = phi0
  sig2      = d0/(c0+1)
  for (iter in 1:niter){
    # Sampling from p(phi|sig2,y,X)
    var  = solve(XtX/sig2+iPhi0)
    mean = var%*%(Xty/sig2+iPhi0phi0)
    phi  = mean + t(chol(var))%*%rnorm(p)
    # Sampling from p(sig2|phi,y)
    e    = y-X%*%phi
    c1   = c0 + n/2
    d1   = d0 + sum(e^2)/2
    sig2 = 1/rgamma(1,c1,d1)
    # Storing the Gibbs output for posterior inference
    draws[iter,] = c(phi,sig2)
  }
  ind   = seq(burnin+1,niter,by=thin)
  phis  = draws[ind,1:p]
  sig2s = draws[ind,p+1]
  return(list(phis=phis,sig2s=sig2s))
}

#################################################################################
# Example 1: Simulated data from a Gaussian AR(10) time series model
#################################################################################
# Simulating the time series following an AR(10)
set.seed(1234)
lag    = 10
phi.t  = c(0.987,0.974,-0.098,-0.051,-0.155,0.045,-0.072-0.036,0.179,0.025,0.138,-0.288)
sig2.t = (0.054)^2
n      = 1000
y      = rep(0,2*n)
for (t in (lag+1):(2*n))
  y[t] = rnorm(1,c(1,y[(t-1):(t-lag)])%*%phi.t,sqrt(sig2.t))
y = y[(n+1):(2*n)]
par(mfrow=c(1,1))
ts.plot(y)

# Setting up X matrix for the AR(30) model
# Model: y(t)|y(t-1),...,y(t-30) ~ N(phi(0)+phi(1)*y(t-1)+...+phi(30)*y(t-30),sig2)
# Prior: p(phi,sig2) = p(phi)*p(sig2), where
#        phi  = (phi(0),...,phi(30))'
#        phi  ~ N(phi0,Phi0)
#        sig2 ~ IG(c0,d0)
phi.t = c(phi.t,rep(0,20))
p     = 30
y1    = y[(p+1):n]
X     = 1
for (i in 1:p)
  X = cbind(X,y[(p-i+1):(n-i)])
phi0 = rep(0,1+p)
Phi0 = diag(1,1+p)
c0   = 0.01
d0   = 0.01

# Gibbs sampler
burnin = 10000
M      = 1000
thin   = 100
run    = bayesreg(y1,X,phi0,Phi0,c0,d0,burnin,M,thin)

# Trace plots
if (p>10){ind=c(1,trunc(seq(2,(1+p),length=10)))}else{ind=1:(1+p)}
par(mfrow=c(2,6))
for (i in ind){
  ts.plot(run$phis[,i],xlab="iterations",ylab="",main=paste("phi",i-1,sep=""))
  abline(h=phi.t[i],col=3,lwd=3)
}
ts.plot(run$sig2s,xlab="iterations",ylab="",main="sigma2")
abline(h=sig2.t,col=3,lwd=3)
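# As a complement to the trace plots, one possible quantitative convergence check
# is the effective sample size of each chain -- a minimal sketch, assuming the
# coda package is installed (not required by the rest of the script):
# install.packages("coda")
library(coda)
ess = effectiveSize(as.mcmc(cbind(run$phis,run$sig2s)))   # ESS per parameter
summary(ess)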
# Marginal posteriors
par(mfrow=c(2,6))
for (i in ind){
  xx     = run$phis[,i]
  xrange = range(xx,run$phiols[i])
  breaks = seq(min(xx),max(xx),length=20)
  hist(run$phis[,i],main=paste("phi",i-1,sep=""),xlab="",breaks=breaks,xlim=xrange)
  abline(v=run$phiols[i],col=2,lwd=2)
  points(phi.t[i],0,col=3,pch=16,cex=2)
}
hist(sqrt(run$sig2s),main="sigma",xlab="")
abline(v=sqrt(run$sig2ols),col=2,lwd=2)
points(sqrt(sig2.t),0,col=3,pch=16,cex=2)

######################################################################################
# Example 2: Canadian lynx dataset
######################################################################################
#
# Annual numbers of lynx trappings for 1821–1934 in Canada.
# Taken from Brockwell & Davis (1991), this appears to be the series considered
# by Campbell & Walker (1977).
#
# Brockwell and Davis (1991) Time Series and Forecasting Methods. Second edition.
# Springer. Series G (page 557).
#
# Becker, Chambers and Wilks (1988) The New S Language. Wadsworth & Brooks/Cole.
#
# Campbell and Walker (1977) A survey of statistical work on the Mackenzie River
# series of annual Canadian lynx trappings for the years 1821–1934 and a new
# analysis. Journal of the Royal Statistical Society, Series A, 140, 411-431.
# doi: 10.2307/2345277.
#
#################################################################################
data(lynx)
y = lynx
y = (y-mean(y))/sqrt(var(y))
n = length(y)
par(mfrow=c(1,1))
ts.plot(y,type="b",xlab="Year",ylab="Number of lynx trappings (standardized)")

# Setting up X matrix for the AR(20) model
# Model: y(t)|y(t-1),...,y(t-20) ~ N(phi(0)+phi(1)*y(t-1)+...+phi(20)*y(t-20),sig2)
# Prior: p(phi,sig2) = p(phi)*p(sig2), where
#        phi  = (phi(0),...,phi(20))'
#        phi  ~ N(phi0,Phi0)
#        sig2 ~ IG(c0,d0)
p  = 20
y1 = y[(p+1):n]
X  = 1
for (i in 1:p)
  X = cbind(X,y[(p-i+1):(n-i)])
phi0 = rep(0,1+p)
Phi0 = diag(1,1+p)
c0   = 0.01
d0   = 0.01

# Gibbs sampler
burnin = 10000
M      = 1000
thin   = 100
run    = bayesreg(y1,X,phi0,Phi0,c0,d0,burnin,M,thin)

# Trace plots
par(mfrow=c(3,7))
for (i in 2:(p+1)){
  ts.plot(run$phis[,i],xlab="iterations",ylab="",main=paste("phi",i-1,sep=""))
  abline(h=0,col=2,lwd=3)
}
ts.plot(run$sig2s,xlab="iterations",ylab="",main="sigma2")

# Marginal posteriors
par(mfrow=c(3,7))
for (i in 2:(p+1)){
  xx     = run$phis[,i]
  breaks = seq(min(xx),max(xx),length=20)
  hist(run$phis[,i],main=paste("phi",i-1,sep=""),xlab="",breaks=breaks)
  abline(v=run$phiols[i],col=2,lwd=2)
  points(0,0,col=3,pch=16,cex=2)
}
hist(sqrt(run$sig2s),main="sigma",xlab="")
abline(v=sqrt(run$sig2ols),col=2,lwd=2)

######################################################################################
# Example 3: beauty and course evaluations
######################################################################################
#
# Hahn, He and Lopes (2018) Efficient sampling for Gaussian linear regression
# with arbitrary priors. Journal of Computational and Graphical Statistics.
#
# The data are course evaluations from the University of Texas at Austin
# between 2000 and 2002. The scores are on a 1 to 5 scale, with larger numbers
# being better. In addition to the course evaluations, information concerning
# the class and the instructor was collected.
#
# To quote Hamermesh and Parker (2005):
# "We chose professors at all levels of the academic hierarchy, obtaining professorial
# staffs from a number of departments that had posted all faculty members' pictures on
# their departmental websites. An additional ten faculty members' pictures were obtained
# from miscellaneous departments around the University. The average evaluation score
# for each undergraduate course that the faculty member taught during the academic
# years 2000-2002 is included.
# This sample selection criterion resulted in 463 courses, with the number of
# courses taught by the sample members ranging from 1 to 13. The classes ranged
# in size from 8 to 581 students, while the number of students completing the
# instructional ratings ranged from 5 to 380. Underlying the 463 sample
# observations are 16,957 completed evaluations from 25,547 registered students."
#
# For additional details on how the beauty scores were constructed and on how to
# interpret the regression results, see Hamermesh and Parker (2005) Beauty in the
# classroom: instructors' pulchritude and putative pedagogical productivity.
# Economics of Education Review, Volume 24, Number 4, 369-376.
#
# Data: http://faculty.chicagobooth.edu/richard.hahn/teaching/hamermesh.txt
#
# The model we fit allows for fixed effects for each of 95 instructors.
# We include additive effects for the following factors:
#
# 1. Class size (number of students),
# 2. Language in which the professor earned his or her degree,
# 3. Whether or not the instructor was a minority,
# 4. Gender,
# 5. Beauty rating, and
# 6. Age.
#
# Each of these variables was included in the model via dummy variables according
# to the following breakdown:
#   class size: below 31, 31 to 60, 61 to 150, or 151 to 600 (four levels by quartile),
#   language:   English or non-English (two levels),
#   minority:   ethnic minority or non-minority (two levels),
#   gender:     male or female (two levels),
#   beauty:     four levels by quartile, and
#   age:        below 43, 43 to 47, 48 to 56 and 57 to 73 (four levels by quartile).
#
# Finally, we include up to three-way interactions between age, beauty, and gender.
# We include an intercept in our model so that individual effects can be interpreted
# as deviations from the average. Three predictors are then dropped because they are
# collinear with the intercept (and each other); there were no highly beautiful
# males between 42 and 46, and no moderately beautiful instructors of either gender
# in that same age group. Even after dropping these columns, the model is numerically
# singular, with only 98 out of 130 singular values greater than 1e-14.
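#
# One possible sanity check of this numerical singularity (a sketch, to be run
# only after the design matrix X is built below):
#   sum(svd(X)$d > 1e-14)   # counts the numerically nonzero singular values of X
#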
#
# Sample frequencies:
#
#   female  male
#      195   268
#
#   teaching  tenure track  tenured
#        102           108      253
#
#   minority  not minority
#         64           399
#
#   class size
#    (7,30]  (30,60]  (60,150]  (150,600]
#       245      104        83         31
#
#   beauty rate
#   (1,3.17]  (3.17,4.33]  (4.33,5.5]  (5.5,8.17]
#        117          166          80        100
#
######################################################################################

# Reading and preparing the dataset
evals = read.csv("http://faculty.chicagobooth.edu/richard.hahn/teaching/hamermesh.txt")
temp  = unique(evals[,c("rank","ethnicity","gender","age","bty.avg","language","pic.outfit","pic.color")])
temp  = cbind(temp,as.factor(1:95))
colnames(temp)[9] = "prof"
evals = merge(evals,temp)
evals$age.fac    = cut(evals$age,c(28,42,46,56,73))
evals$class.size = cut(evals$cls.students,c(7,30,60,150,600))
evals$bty        = cut(evals$bty.avg,c(1,3.167,4.333,5.50,8.167))
score = as.matrix(evals$score)

# Exploratory data analysis
par(mfrow=c(1,1))
plot(table(evals$prof),xlab="Professor",ylab="Frequency")
abline(h=1,lty=2)
abline(h=2,lty=2)
abline(h=10,lty=2)

# Frequencies
table(evals$rank)
table(evals$ethnicity)
table(evals$gender)
table(evals$class.size)
table(evals$bty)

par(mfrow=c(2,3))
boxplot(score~rank,data=evals,xlab="Rank",ylab="score")
boxplot(score~ethnicity,data=evals,xlab="Minority",ylab="score")
boxplot(score~gender,data=evals,xlab="Gender",ylab="score")
boxplot(score~age.fac,data=evals,xlab="Age",ylab="score")
boxplot(score~class.size,data=evals,xlab="Class size",ylab="score")
boxplot(score~bty,data=evals,xlab="Beauty rate",ylab="score")

# Regression matrix
X = model.matrix(score~class.size+rank+language+ethnicity+(gender+bty+age.fac)^3-1,data=evals)
# Dropping columns that are identically zero (empty factor combinations)
ind = NULL
for (i in 1:ncol(X)){
  if (sum(X[,i])>0){
    ind = c(ind,i)
  }
}
X = X[,ind]
p = dim(X)[2]
var.names = c("intercept",colnames(X))
X = cbind(1,X)

# Setting up the Gaussian linear regression model
# Model: y|X ~ N(X*phi,sig2*I)
# Prior: p(phi,sig2) = p(phi)*p(sig2), where
#        phi  = (phi(1),...,phi(p))'
#        phi  ~ N(phi0,Phi0)
#        sig2 ~ IG(c0,d0)
p    = ncol(X)-1
y    = score
phi0 = rep(0,1+p)
Phi0 = diag(1,1+p)
c0   = 1
d0   = 1

# Gibbs sampler (Gaussian/ridge-type prior)
burnin = 10000
M      = 10000
thin   = 1
run    = bayesreg1(y,X,phi0,Phi0,c0,d0,burnin,M,thin)

par(mfrow=c(1,1),mar=c(5,9,4,2))
boxplot.matrix(run$phis,outline=FALSE,xlab="Regression coefficient",ylab="",axes=FALSE,horizontal=TRUE)
axis(1);axis(2,at=1:(p+1),lab=var.names,las=2,cex.axis=0.5)
abline(v=0,col=2,lwd=3)

# Laplace and horseshoe priors via the bayeslm package
#install.packages("bayeslm")
#packageDescription("bayeslm")
library(bayeslm)
set.seed(23748243)
Nsamples  = burnin+M
X1        = X[,2:ncol(X)]
block_vec = rep(1,ncol(X1))
fit.blasso = bayeslm(y,X1,prior="laplace",block_vec=block_vec,N=Nsamples,icept=TRUE,
                     standardize=FALSE,singular=TRUE,verb=TRUE,thinning=thin,
                     scale_sigma_prior=FALSE,cc=rep(1,p))
fit.hs     = bayeslm(y,X1,prior="horseshoe",block_vec=block_vec,N=Nsamples,icept=TRUE,
                     standardize=FALSE,singular=TRUE,thinning=thin,verb=TRUE,
                     scale_sigma_prior=FALSE,cc=rep(1,p))

# Posterior quantiles
qbeta.ridge  = t(apply(run$phis,2,quantile,c(0.05,0.5,0.95)))
qbeta.hs     = t(apply(fit.hs$beta[(burnin+1):Nsamples,],2,quantile,c(0.05,0.5,0.95)))
qbeta.blasso = t(apply(fit.blasso$beta[(burnin+1):Nsamples,],2,quantile,c(0.05,0.5,0.95)))

par(mfrow=c(1,1),mar=c(5,9,4,2))
yrange = range(qbeta.ridge,qbeta.hs,qbeta.blasso)
plot(qbeta.ridge[,2],1:(p+1),
     xlab="Regression coefficient (median and 90% credibility interval)",
     ylab="",axes=FALSE,pch=16,cex=0.5,xlim=yrange)
axis(1);axis(2,at=1:(p+1),lab=var.names,las=2,cex.axis=0.5)
abline(v=0,lty=2)
for (i in 1:(p+1)){
  segments(qbeta.ridge[i,1],i,qbeta.ridge[i,3],i)
  segments(qbeta.blasso[i,1],i-0.2,qbeta.blasso[i,3],i-0.2,col=2)
  segments(qbeta.hs[i,1],i-0.4,qbeta.hs[i,3],i-0.4,col=4)
}
points(qbeta.blasso[,2],1:(p+1)-0.2,col=2,pch=16,cex=0.5)
points(qbeta.hs[,2],1:(p+1)-0.4,col=4,pch=16,cex=0.5)
legend("topright",legend=c("Ridge prior","Lasso prior","Horseshoe prior"),col=c(1,2,4),lty=1)

# Coefficients whose 90% credibility interval excludes zero
var.names[(qbeta.ridge[,1]>0)|(qbeta.ridge[,3]<0)]
var.names[(qbeta.blasso[,1]>0)|(qbeta.blasso[,3]<0)]
var.names[(qbeta.hs[,1]>0)|(qbeta.hs[,3]<0)]

#######################################################################################
# Example 4: wages
#######################################################################################
#
# Basic reference:
# Blackburn and Neumark (1992) Unobserved ability, efficiency wages, and
# interindustry wage differentials. Quarterly Journal of Economics, 107, 1421-1436.
#
# Summary: Data on monthly earnings, education, several demographic
# variables, and IQ scores for 935 men in 1980.
#
#  1. wage     monthly earnings
#  2. hours    average weekly hours
#  3. iq       IQ score
#  4. kww      knowledge of world work score
#  5. educ     years of education
#  6. exper    years of work experience
#  7. tenure   years with current employer
#  8. age      age in years
#  9. married  =1 if married
# 10. black    =1 if black
# 11. south    =1 if live in south
# 12. urban    =1 if live in SMSA
# 13. sibs     number of siblings
# 14. brthord  birth order
# 15. meduc    mother's education
# 16. feduc    father's education
# 17. lwage    natural log of wage
#
# Source: Wooldridge (2012)
#         Introductory Econometrics: A Modern Approach (5th edition)
#         South-Western, Cengage Learning
#
#######################################################################################
data = read.table("http://hedibert.org/wp-content/uploads/2014/02/wage2-wooldridge.txt",header=FALSE)
n    = nrow(data)
colnames(data) = c("wage","hours","iq","kww","educ","exper","tenure","age","married",
                   "black","south","urban","sibs","brthord","meduc","feduc","lwage")

# Removing rows with missing covariates
miss = rep(0,n)
for (i in 1:n)
  miss[i] = sum(is.na(data[i,]))
table(miss)
data = data[miss==0,]

# Regression matrix
X = model.matrix(lwage~iq*educ+kww+hours*exper*tenure*age+sibs+brthord+meduc+feduc+(married+black+south+urban+meduc+feduc)^3-1,data=data)
p = ncol(X)
y = data[,ncol(data)]

# Ridge, Laplace and horseshoe priors via the bayeslm package
set.seed(23748243)
burnin    = 10000
M         = 10000
thin      = 1
Nsamples  = burnin+M
block_vec = rep(1,ncol(X))
fit.ridge  = bayeslm(y,X,prior="ridge",block_vec=block_vec,N=Nsamples,icept=TRUE,
                     standardize=FALSE,singular=TRUE,verb=TRUE,thinning=thin,
                     scale_sigma_prior=FALSE,cc=rep(1,p))
fit.blasso = bayeslm(y,X,prior="laplace",block_vec=block_vec,N=Nsamples,icept=TRUE,
                     standardize=FALSE,singular=TRUE,verb=TRUE,thinning=thin,
                     scale_sigma_prior=FALSE,cc=rep(1,p))
fit.hs     = bayeslm(y,X,prior="horseshoe",block_vec=block_vec,N=Nsamples,icept=TRUE,
                     standardize=FALSE,singular=TRUE,thinning=thin,verb=TRUE,
                     scale_sigma_prior=FALSE,cc=rep(1,p))

# Boxplots of the posterior draws (top row: full scale; bottom row: zoomed in)
par(mfrow=c(2,3))
boxplot.matrix(fit.ridge$beta[(burnin+1):Nsamples,2:(p+1)],outline=FALSE,axes=FALSE)
axis(2);axis(1,at=1:p);box();abline(h=0,col=2)
boxplot.matrix(fit.blasso$beta[(burnin+1):Nsamples,2:(p+1)],outline=FALSE,axes=FALSE)
axis(2);axis(1,at=1:p);box();abline(h=0,col=2)
boxplot.matrix(fit.hs$beta[(burnin+1):Nsamples,2:(p+1)],outline=FALSE,axes=FALSE)
axis(2);axis(1,at=1:p);box();abline(h=0,col=2)
boxplot.matrix(fit.ridge$beta[(burnin+1):Nsamples,2:(p+1)],outline=FALSE,axes=FALSE,ylim=c(-0.025,0.025))
axis(2);axis(1,at=1:p);box();abline(h=0,col=2)
boxplot.matrix(fit.blasso$beta[(burnin+1):Nsamples,2:(p+1)],outline=FALSE,axes=FALSE,ylim=c(-0.025,0.025))
axis(2);axis(1,at=1:p);box();abline(h=0,col=2)
boxplot.matrix(fit.hs$beta[(burnin+1):Nsamples,2:(p+1)],outline=FALSE,axes=FALSE,ylim=c(-0.025,0.025))
axis(2);axis(1,at=1:p);box();abline(h=0,col=2)

# Posterior quantiles
qbeta.ridge  = t(apply(fit.ridge$beta[(burnin+1):Nsamples,],2,quantile,c(0.05,0.5,0.95)))
qbeta.hs     = t(apply(fit.hs$beta[(burnin+1):Nsamples,],2,quantile,c(0.05,0.5,0.95)))
qbeta.blasso = t(apply(fit.blasso$beta[(burnin+1):Nsamples,],2,quantile,c(0.05,0.5,0.95)))

var.names = c("intercept",colnames(X))
var.names[(qbeta.ridge[,1]>0)|(qbeta.ridge[,3]<0)]
var.names[(qbeta.blasso[,1]>0)|(qbeta.blasso[,3]<0)]
var.names[(qbeta.hs[,1]>0)|(qbeta.hs[,3]<0)]
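# One possible follow-up (a sketch, assuming X, y and the three bayeslm fits above
# are still in memory): compare the posterior medians under the shrinkage priors
# with ordinary least squares, and count how many coefficients each prior "selects",
# i.e. how many 90% credibility intervals exclude zero. OLS may return NA for
# collinear columns of X.
ols = lm(y~X)
round(cbind(ols=coef(ols),ridge=qbeta.ridge[,2],lasso=qbeta.blasso[,2],horseshoe=qbeta.hs[,2]),4)
c(ridge     = sum((qbeta.ridge[,1]>0)|(qbeta.ridge[,3]<0)),
  lasso     = sum((qbeta.blasso[,1]>0)|(qbeta.blasso[,3]<0)),
  horseshoe = sum((qbeta.hs[,1]>0)|(qbeta.hs[,3]<0)))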