##############################################################################################
#
# Bayesian variable selection in multiple linear regression: comparing a few alternatives
#
# 1) BIC
# 2) Mallows' Cp
# 3) Horseshoe prior
# 4) Normal-Gamma prior
#
# Bobby Gramacy's R package monomvn - https://bobby.gramacy.com/r_packages/monomvn/
#
# Griffin and Brown (2010) Inference with normal-gamma prior
# distributions in regression problems. Bayesian Analysis, 5(1), 171-188.
# Carvalho, Polson and Scott (2010) The horseshoe estimator for
# sparse signals. Biometrika, 97(2), 465-480.
#
# By Hedibert F. Lopes
# Professor of Statistics and Econometrics
# Insper - Institute for Education and Research
# This version: March 6th 2021
#
##############################################################################################
#
# Basic references for the data and econometrics:
# Blackburn and Neumark (1992) Unobserved ability, efficiency wages, and interindustry
# wage differentials. Quarterly Journal of Economics, 107, 1421-1436.
# Wooldridge (2012) Introductory Econometrics: A Modern Approach (5th edition).
#
# Monthly earnings, education, demographic variables, and IQ scores for 935 men in 1980.
#
#  1. wage     monthly earnings
#  2. hours    average weekly hours
#  3. iq       IQ score
#  4. kww      knowledge of world work score
#  5. educ     years of education
#  6. exper    years of work experience
#  7. tenure   years with current employer
#  8. age      age in years
#  9. married  =1 if married
# 10. black    =1 if black
# 11. south    =1 if live in south
# 12. urban    =1 if live in SMSA
# 13. sibs     number of siblings
# 14. brthord  birth order
# 15. meduc    mother's education
# 16. feduc    father's education
# 17. lwage    natural log of wage
#
##############################################################################################
rm(list=ls())

#install.packages("leaps")
#install.packages("bayesreg")
#install.packages("monomvn")
library("leaps")
library("bayesreg")
library("monomvn")

# Reading the data and keeping only the complete cases
data = read.table("wage.txt")
n = nrow(data)
nmissing = rep(0,n)
for (i in 1:n)
  nmissing[i] = sum(is.na(data[i,]))
y = data[nmissing==0,17]
X = as.matrix(data[nmissing==0,2:16])

# Adding interactions of educ, exper and tenure with the four dummies
# (married, black, south, urban), for a total of k=27 predictors
X = cbind(X,X[,4]*X[,8:11],X[,5]*X[,8:11],X[,6]*X[,8:11])
k = ncol(X)
n = nrow(X)

# Standardizing the response and the predictors
y = (y-mean(y))/sqrt(var(y))
mX = apply(X,2,mean)
sX = sqrt(apply(X,2,var))
X = X-matrix(mX,n,k,byrow=TRUE)
X = X%*%diag(1/sX)
namesX = c("hours","iq","kww","educ","exper","tenure","age","married","black","south","urban",
           "sibs","brthord","meduc","feduc","educ.married","educ.black","educ.south","educ.urban",
           "exper.married","exper.black","exper.south","exper.urban","tenure.married","tenure.black",
           "tenure.south","tenure.urban")
colnames(X) = namesX

# Regression 1: OLS regression with all predictors
##################################################
ols = lm(y~X-1)
sighat = round(summary(ols)$sigma,3)
se.beta = sighat*sqrt(diag(solve(t(X)%*%X)))
L = ols$coef-se.beta
U = ols$coef+se.beta
par(mfrow=c(1,1))
plot(ols$coef,xlab="Predictor",ylab="OLS coefficient",ylim=range(L,U),cex=0.5,col=2,pch=16)
for (i in 1:k){
  segments(i,L[i],i,U[i],col=2)
  if ((L[i]>0)|(U[i]<0)){
    text(i,max(U),i,cex=0.5)
  }
}
abline(h=0,col=2)

# Regression 2: Exhaustive search over 2^27 (about 134 million) candidate models
# (leaps uses a branch-and-bound algorithm, so not all of them are actually visited)
#####################################################################################
data = data.frame(y=y,X=X)
regs = regsubsets(y~.,data=data,nvmax=k,intercept=FALSE)
regs.summary = summary(regs)
ind = 1:k
k.bic = ind[regs.summary$bic==min(regs.summary$bic)]
k.cp = ind[regs.summary$cp==min(regs.summary$cp)]
k.rsq = ind[regs.summary$rsq==max(regs.summary$rsq)]
k.adjr2 = ind[regs.summary$adjr2==max(regs.summary$adjr2)]
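
# A quick hand check of Mallows' Cp (a sketch added for illustration, not part of
# the original analysis): for a p-variable submodel, Cp = RSS_p/sigma2 + 2p - n,
# where sigma2 is the residual variance of the full model. If regsubsets uses the
# same sigma2 estimate, cp.manual should essentially reproduce regs.summary$cp.
sigma2.full = sum(ols$residuals^2)/(n-k)
cp.manual = regs.summary$rss/sigma2.full + 2*(1:k) - n
max(abs(cp.manual-regs.summary$cp))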
par(mfrow=c(1,3))
plot(regs.summary$bic,pch=16,xlab="Number of predictors",ylab="BIC",main=paste("k=",k.bic,sep=""))
plot(regs.summary$cp,pch=16,xlab="Number of predictors",ylab="Cp",main=paste("k=",k.cp,sep=""))
plot(regs.summary$rsq,pch=16,xlab="Number of predictors",ylab="R2 and adjusted R2",main="")
points(regs.summary$adjr2,pch=16,col=2)
legend("topleft",legend=c(paste("R2: k=",k.rsq,sep=""),paste("R2adj: k=",k.adjr2,sep="")),
       col=1:2,pch=16,bty="n")

par(mfrow=c(1,2))
plot(regs,scale="bic")
plot(regs,scale="adjr2")

# Predictors selected by BIC and by Cp
bic = regs.summary$bic
cp = regs.summary$cp
ii = 1:k
pred.bic = ind[regs.summary$which[ii[bic==min(bic)],]]
pred.cp = ind[regs.summary$which[ii[cp==min(cp)],]]
namesX[pred.bic]
namesX[pred.cp]

# Regression 3: Shrinkage/sparsity priors - here the horseshoe prior
# (ridge and Laplace fits are sketched at the end of this script)
####################################################################
M0 = 10000
M = 10000
skip = 100
data1 = data.frame(y=y,X=X)
fit.h = bayesreg(y~X,data=data1,model="normal",prior="horseshoe",n.samples=M,burnin=M0,thin=skip)
q.h = t(apply(fit.h$beta,1,quantile,c(0.025,0.5,0.975)))
cond.h = ((q.h[,1]>0)|(q.h[,3]<0))
pred.h = ind[cond.h]
k.h = sum(cond.h)

# Comparing OLS and horseshoe coefficient estimates
par(mfrow=c(1,1),mai=c(0.8,0.8,0.5,0.5))
plot(ols$coef,xlab="Predictor",ylab="Coefficients",main="",pch=16,cex=0.5,
     col=0,axes=FALSE,ylim=range(L,U,q.h))
axis(2);box();axis(1,at=1:k)
for (i in 1:k){
  segments(i-0.1,L[i],i-0.1,U[i],lwd=2)
  segments(i+0.2,q.h[i,1],i+0.2,q.h[i,3],col=2,lwd=2)
}
points(-0.1+1:k,ols$coef,pch=16,cex=0.75)
points(0.2+1:k,q.h[,2],pch=16,cex=0.75,col=2)
legend("topleft",legend=c("OLS","HORSESHOE"),col=1:2,lwd=2,bty="n")
abline(h=0,lty=2)

# In-sample MSE of the BIC, Cp and horseshoe fits
c(mean((y-X[,pred.bic]%*%lm(y~X[,pred.bic]-1)$coef)^2),
  mean((y-X[,pred.cp]%*%lm(y~X[,pred.cp]-1)$coef)^2),
  mean((y-X%*%fit.h$mu.beta)^2))

################################################################################
# Training and testing exercise (33-fold CV) - n=663 complete cases, so 33
# test sets of size 20 (the last 3 observations never serve as test cases)
################################################################################
set.seed(12345)
M0 = 1000
M = 10000
skip = 1
ii = 1:k
ind = 1:k
indn = 1:n
mse.in = matrix(0,33,5)
mse.out = matrix(0,33,5)
mae.in = matrix(0,33,5)
mae.out = matrix(0,33,5)
random = sample(1:n,size=n,replace=FALSE)
XX = X[random,]
yy = y[random]
for (t in 1:33){
  # Defining training and testing samples
  ini = 20*(t-1)+1
  fin = 20*t
  test = ini:fin
  train = setdiff(indn,test)
  y1 = yy[train]
  y2 = yy[test]
  X1 = XX[train,]
  X2 = XX[test,]
  data1 = data.frame(y1,X1)

  # OLS fit
  fit.ols = lm(y1~X1-1)

  # Subset selection: using BIC and Cp as criteria
  # (regs.summary must be recomputed from the within-fold fit)
  regs = regsubsets(y1~.,data=data1,nvmax=k,intercept=FALSE)
  regs.summary = summary(regs)
  bic = regs.summary$bic
  pred.bic = ind[regs.summary$which[ii[bic==min(bic)],]]
  fit.bic = lm(y1~X1[,pred.bic]-1)
  cp = regs.summary$cp
  pred.cp = ind[regs.summary$which[ii[cp==min(cp)],]]
  fit.cp = lm(y1~X1[,pred.cp]-1)

  # Bayesian regularization: horseshoe and Normal-Gamma priors,
  # discarding the first M0 draws as burn-in
  fit.hs = blasso(X1,y1,T=M,thin=skip,RJ=FALSE,case="hs",icept=FALSE,verb=0)
  fit.ng = blasso(X1,y1,T=M,thin=skip,RJ=FALSE,case="ng",icept=FALSE,verb=0)
  beta.hs = apply(fit.hs$beta[-(1:M0),],2,mean)
  beta.ng = apply(fit.ng$beta[-(1:M0),],2,mean)

  # Computing MSEs and MAEs
  error.in = cbind(y1-X1%*%fit.ols$coef,y1-X1[,pred.bic]%*%fit.bic$coef,
                   y1-X1[,pred.cp]%*%fit.cp$coef,y1-X1%*%beta.hs,y1-X1%*%beta.ng)
  error.out = cbind(y2-X2%*%fit.ols$coef,y2-X2[,pred.bic]%*%fit.bic$coef,
                    y2-X2[,pred.cp]%*%fit.cp$coef,y2-X2%*%beta.hs,y2-X2%*%beta.ng)
  mse.in[t,] = apply(error.in^2,2,mean)
  mse.out[t,] = apply(error.out^2,2,mean)
  mae.in[t,] = apply(abs(error.in),2,mean)
  mae.out[t,] = apply(abs(error.out),2,mean)
}
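
# A compact numerical summary of the CV exercise (a convenience sketch added here;
# it uses only the objects computed in the loop above).
cv.summary = rbind(MSE.in  = colMeans(mse.in),
                   MSE.out = colMeans(mse.out),
                   MAE.in  = colMeans(mae.in),
                   MAE.out = colMeans(mae.out))
colnames(cv.summary) = c("OLS","OLS(BIC)","OLS(Cp)","Horseshoe","NG")
round(cv.summary,4)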
# Counting, per method, the folds where it attains the smallest error
win.mse.in = apply(mse.in==apply(mse.in,1,min),2,sum)
win.mse.out = apply(mse.out==apply(mse.out,1,min),2,sum)
win.mae.in = apply(mae.in==apply(mae.in,1,min),2,sum)
win.mae.out = apply(mae.out==apply(mae.out,1,min),2,sum)

pdf(file="wage.pdf",width=12,height=9)
methods = c("OLS","OLS(BIC)","OLS(CP)","Horseshoe","NG")

par(mfrow=c(2,2))
boxplot(mse.in,col=2:6,names=methods,ylab="MSE",outline=FALSE,ylim=c(0.66,0.75))
title("33-fold cross validation\nIn-sample")
text(1:5,0.75,win.mse.in)
text(1:5,0.66,round(apply(mse.in,2,mean),3))
boxplot(100*(mse.in[,2:5]/mse.in[,1]-1),col=3:6,names=methods[2:5],ylab="Relative (%) MSE",outline=FALSE)
abline(h=0,lty=2)
title("33-fold cross validation\nIn-sample")
boxplot(mse.out,col=2:6,names=methods,ylab="MSE",outline=FALSE,ylim=c(0.2,1.5))
title("33-fold cross validation\nOut-of-sample")
text(1:5,1.5,win.mse.out)
text(1:5,0.2,round(apply(mse.out,2,mean),3))
boxplot(100*(mse.out[,2:5]/mse.out[,1]-1),col=3:6,names=methods[2:5],ylab="Relative (%) MSE",outline=FALSE)
abline(h=0,lty=2)
title("33-fold cross validation\nOut-of-sample")

par(mfrow=c(2,2))
boxplot(mae.in,col=2:6,names=methods,ylab="MAE",outline=FALSE,ylim=c(0.61,0.66))
title("33-fold cross validation\nIn-sample")
text(1:5,0.66,win.mae.in)
text(1:5,0.61,round(apply(mae.in,2,mean),3))
boxplot(100*(mae.in[,2:5]/mae.in[,1]-1),col=3:6,names=methods[2:5],ylab="Relative (%) MAE",outline=FALSE)
abline(h=0,lty=2)
title("33-fold cross validation\nIn-sample")
boxplot(mae.out,col=2:6,names=methods,ylab="MAE",outline=FALSE,ylim=c(0.3,1.1))
title("33-fold cross validation\nOut-of-sample")
text(1:5,1.1,win.mae.out)
text(1:5,0.3,round(apply(mae.out,2,mean),3))
boxplot(100*(mae.out[,2:5]/mae.out[,1]-1),col=3:6,names=methods[2:5],ylab="Relative (%) MAE",outline=FALSE)
abline(h=0,lty=2)
title("33-fold cross validation\nOut-of-sample")
dev.off()
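
# The "Regression 3" header mentions ridge and Laplace priors alongside the horseshoe,
# but only the horseshoe was fit above. A minimal sketch of those two extra fits,
# mirroring the horseshoe call ("ridge" and "lasso" are bayesreg's names for the
# Gaussian and Laplace shrinkage priors); the resulting in-sample MSEs are directly
# comparable to the ones computed after the horseshoe fit.
data1 = data.frame(y=y,X=X)
fit.r = bayesreg(y~X,data=data1,model="normal",prior="ridge",n.samples=10000,burnin=10000,thin=100)
fit.l = bayesreg(y~X,data=data1,model="normal",prior="lasso",n.samples=10000,burnin=10000,thin=100)
c(ridge=mean((y-X%*%fit.r$mu.beta)^2),lasso=mean((y-X%*%fit.l$mu.beta)^2))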