###################################################################################################
#
# Bayesian multiple linear regression with shrinkage/sparsity-inducing priors
#
# Case: Automated Modeling
# Stine and Foster (2014) Statistics for Business: Decision Making and Analysis, pages 767-772.
#
# By Hedibert F. Lopes
# March 5th, 2021.
#
###################################################################################################
#
#
# n = 198 observations
# p = 26 predictors (15 continuous + 11 dummies)
#
# y: average cost, in dollars, per block
#
# Predictors:
#
#  1. Units            : number of blocks in the production run
#  2. Precision SD     : allowed SD of block dimensions (in mils)
#  3. Weight Final     : kilograms of a finished block
#  4. Weight Rem       : kg removed during production, per block
#  5. Stamp Ops        : number of stamping operations, per block
#  6. Chisel Ops       : number of chiseling operations, per block
#  7. Labor Hours      : direct labor hours, per block (variable cost)
#  8. Machine Hours    : machine hours, per block (variable cost)
#  9. Cost Metal/Kg    : dollars/kilogram of input metal
# 10. Room Temp        : indoor temperature (F) during production
# 11. Breakdowns       : number of production failures during run
# 12. 1/Units          : absorbing remaining fixed costs
# 13. Breakdown/Unit   : absorbing remaining fixed costs
# 14. Total Metal Cost : total cost of metal used, (Weight Final + Weight Rem)*(Cost Metal/Kg)
# 15. Temp Deviation   : (Room Temp - 75)^2
# 16. Detail           : whether or not extra detail work is required (detail/none)
# 17. D(Detail)        : dummy for Detail (1:detail)
# 18. Rush             : whether or not the job was a rush order (rush/none)
# 19. D(Rush)          : dummy for Rush (1:rush)
# 20. Manager          : supervising production manager (Lee, Pat, Randy, Jean or Terry)
# 21. D(Lee)           : dummy for Lee
# 22. D(Pat)           : dummy for Pat
# 23. D(Randy)         : dummy for Randy
# 24. D(Jean)          : dummy for Jean
# 25. D(Terry)         : dummy for Terry
# 26. Music            : type of music played during production (no music, rock, pop, jazz)
# 27. D(No music)      : dummy for no music
# 28. D(Rock)          : dummy for rock
# 29. D(Pop)           : dummy for pop
# 30. D(Jazz)          : dummy for jazz
# 31. Shift            : day or night shift (day/night)
# 32. D(Day)           : dummy for Shift (1:day)
# 33. Plant            : new or old plant
# 34. D(New)           : dummy for Plant (1:new)
#
#
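# Added illustration (not part of the original case study): predictors 14 and 15 above are simple
# transformations of the raw columns. The sketch below shows the formulas on made-up toy values;
# the lowercase variable names are hypothetical and do not refer to columns of the spreadsheet
# read further down.
weight.final  = c(1.2, 0.9, 1.5)    # kg of a finished block (toy values)
weight.rem    = c(0.3, 0.4, 0.2)    # kg removed during production (toy values)
cost.metal.kg = c(2.5, 2.5, 3.0)    # dollars per kg of input metal (toy values)
room.temp     = c(71, 78, 75)       # indoor temperature in F (toy values)
total.metal.cost = (weight.final + weight.rem)*cost.metal.kg   # predictor 14
temp.deviation   = (room.temp - 75)^2                          # predictor 15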
###################################################################################################
#
# R package bayesreg
# Makalic and Schmidt (2016)
# High-Dimensional Bayesian Regularized Regression with the bayesreg Package
# https://arxiv.org/pdf/1611.06649.pdf
#
###################################################################################################
#install.packages("readxl")
#install.packages("bayesreg")
#install.packages("leaps")
library(readxl)
library("bayesreg")
library("leaps")

# Reading the data
data = read_excel("multipleregression-stine-foster-casestudy.xlsx")
namesX = names(data)[1+c(1:15,17,19,28:30,32,34)]
data = data.matrix(data)
#View(data)

# Removing outliers and selecting some of the predictors
y = data[data[,1]<70,1]
X = data[data[,1]<70,1+c(1:15,17,19,28:30,32,34)]
n = nrow(X)
k = ncol(X)

# Standardizing y and the continuous predictors (columns 1 to 15)
ystd = (y-mean(y))/sqrt(var(y))
mX = apply(X,2,mean)
sX = sqrt(apply(X,2,var))
Xstd = X
Xstd[,1:15] = Xstd[,1:15]-matrix(mX[1:15],n,15,byrow=TRUE)
Xstd[,1:15] = Xstd[,1:15]%*%diag(1/sX[1:15])
X = Xstd
y = ystd

pdf(file="multipleregression-stine-foster-casestudy.pdf",width=12,height=8)

# Variability of y and X
par(mfrow=c(1,1))
boxplot(cbind(y,X[,1:15]),outline=FALSE,col=c(2,rep(grey(0.8),k)))

# Correlations between y and X
cors = cor(cbind(y,X))[1,2:(1+k)]
ind = 1:k
ord = ind[order(cors,decreasing=TRUE)]
par(mfrow=c(1,2))
plot(cors,xlab="Predictor",ylab="Correlation with Y")
abline(h=0,col=2)
plot(cors[ord],xlab="Predictor",ylab="Correlation with Y")
abline(h=0,col=2)

# Singular value decomposition of X
par(mfrow=c(1,1))
svdX = svd(X)$d
plot(100*svdX/sum(svdX),xlab="Principal Component",ylab="% variability")

# Regression 1: OLS regression with all predictors
ols = lm(y~X-1)
sighat = round(summary(ols)$sigma,3)
se.beta = sighat*sqrt(diag(solve(t(X)%*%X)))
L = ols$coef-se.beta     # lower bound: coefficient minus one standard error
U = ols$coef+se.beta     # upper bound: coefficient plus one standard error
par(mfrow=c(1,1))
plot(ols$coef,xlab="Predictor",ylab="OLS coefficient",ylim=range(L,U),cex=0.5,col=2,pch=16)
for (i in 1:k){
  segments(i,L[i],i,U[i],col=2)
  if ((L[i]>0)|(U[i]<0)){
    text(i,max(U),i,cex=0.5)
  }
}
abline(h=0,col=2)

# Regression 2: OLS regression with the significant predictors found by Regression 1
ind = 1:k
cond = ((L>0)|(U<0))
pred1 = ind[cond]
k1 = length(pred1)
X1 = X[,pred1]
cbind(pred1,namesX[pred1])
ols1 = lm(y~X1-1)
sighat1 = round(summary(ols1)$sigma,3)
se.beta1 = sighat1*sqrt(diag(solve(t(X1)%*%X1)))
L1 = ols1$coef-2*se.beta1   # lower bound: coefficient minus two standard errors
U1 = ols1$coef+2*se.beta1   # upper bound: coefficient plus two standard errors
par(mfrow=c(1,1))
plot(ols1$coef,xlab="Predictor",ylab="OLS coefficient",ylim=range(L1,U1),cex=0.5,col=2,pch=16)
for (i in 1:k1){
  segments(i,L1[i],i,U1[i],col=2)
  if ((L1[i]>0)|(U1[i]<0)){
    text(i,max(U1),pred1[i],cex=0.5)
  }
}
abline(h=0,col=2)

# Regression 3: OLS regression with the significant predictors found by Regression 2
pred2 = pred1[(L1>0)|(U1<0)]
cbind(pred2,namesX[pred2])

# Regression 4: Exhaustive subset selection (2^22, roughly 4.2 million models)
data = data.frame(y=y,X=X)
regs = regsubsets(y~.,data=data,nvmax=k,intercept=FALSE)
regs.summary = summary(regs)
par(mfrow=c(1,1))
plot(regs.summary$bic,pch=16,xlab="Number of predictors",ylab="BIC")
par(mfrow=c(1,1))
plot(regs,scale="bic")
bic = regs.summary$bic
ii = 1:k
coef(regs,ii[bic==min(bic)])
pred4 = ind[regs.summary$which[ii[bic==min(bic)],]]
namesX[pred4]

pred1
pred2
pred4
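# Added sanity check (not part of the original script): a rough comparison, using R's built-in
# BIC() on no-intercept lm fits, of the subset picked by regsubsets (pred4) against the subset
# surviving the +/- 2 SE screen (pred2). Differences of a few BIC units are usually taken as
# meaningful; this is only an informal cross-check.
bic.pred2 = BIC(lm(y ~ X[,pred2] - 1))
bic.pred4 = BIC(lm(y ~ X[,pred4] - 1))
round(c(BIC.pred2 = bic.pred2, BIC.pred4 = bic.pred4), 2)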
# Regression 5: Shrinkage/sparsity priors: ridge, Laplace & horseshoe
M0   = 1000
M    = 1000
skip = 10
data1 = data.frame(y=y,X=X)
fit.r = bayesreg(y~X,data=data1,model="normal",prior="ridge",n.samples=M,burnin=M0,thin=skip)
fit.l = bayesreg(y~X,data=data1,model="normal",prior="lasso",n.samples=M,burnin=M0,thin=skip)
fit.h = bayesreg(y~X,data=data1,model="normal",prior="horseshoe",n.samples=M,burnin=M0,thin=skip)

# Posterior 95% credible intervals of the regression coefficients
q.r = t(apply(fit.r$beta,1,quantile,c(0.025,0.5,0.975)))
q.l = t(apply(fit.l$beta,1,quantile,c(0.025,0.5,0.975)))
q.h = t(apply(fit.h$beta,1,quantile,c(0.025,0.5,0.975)))
cond.r = ((q.r[,1]>0)|(q.r[,3]<0))
cond.l = ((q.l[,1]>0)|(q.l[,3]<0))
cond.h = ((q.h[,1]>0)|(q.h[,3]<0))
pred5.r = ind[cond.r]
pred5.l = ind[cond.l]
pred5.h = ind[cond.h]
k.r = sum(cond.r)
k.l = sum(cond.l)
k.h = sum(cond.h)

par(mfrow=c(2,2),mai=c(0.5,0.5,0.5,0.5))
plot(ols$coef,xlab="Predictor",ylab="Coefficients",main="",ylim=range(L,U),pch=16,cex=0.5)
for (i in 1:k) segments(i,L[i],i,U[i])
abline(h=0,col=2)
title(paste("OLS\n k=",k1,sep=""))
plot(q.r[,2],xlab="Predictor",ylab="Coefficients",main="",ylim=range(q.r),pch=16,cex=0.5)
for (i in 1:k) segments(i,q.r[i,1],i,q.r[i,3])
abline(h=0,col=2)
title(paste("Ridge prior\n k=",k.r,sep=""))
plot(q.l[,2],xlab="Predictor",ylab="Coefficients",main="",ylim=range(q.l),pch=16,cex=0.5)
for (i in 1:k) segments(i,q.l[i,1],i,q.l[i,3])
abline(h=0,col=2)
title(paste("Laplace prior\n k=",k.l,sep=""))
plot(q.h[,2],xlab="Predictor",ylab="Coefficients",main="",ylim=range(q.h),pch=16,cex=0.5)
for (i in 1:k) segments(i,q.h[i,1],i,q.h[i,3])
abline(h=0,col=2)
title(paste("Horseshoe prior\n k=",k.h,sep=""))

par(mfrow=c(1,1),mai=c(0.8,0.8,0.5,0.5))
plot(ols$coef,xlab="Predictor",ylab="Coefficients",main="",pch=16,cex=0.5,
     col=0,axes=FALSE,ylim=range(L,U,q.r,q.l,q.h))
axis(2);box();axis(1,at=1:k)
for (i in 1:k){
  segments(i-0.1,L[i],i-0.1,U[i],lwd=2)
  segments(i,q.r[i,1],i,q.r[i,3],col=2,lwd=2)
  segments(i+0.1,q.l[i,1],i+0.1,q.l[i,3],col=3,lwd=2)
  segments(i+0.2,q.h[i,1],i+0.2,q.h[i,3],col=4,lwd=2)
}
points(-0.1+1:k,ols$coef,pch=16,cex=0.75)
points(1:k,q.r[,2],pch=16,cex=0.75,col=2)
points(0.1+1:k,q.l[,2],pch=16,cex=0.75,col=3)
points(0.2+1:k,q.h[,2],pch=16,cex=0.75,col=4)
legend("topleft",legend=c("OLS","RIDGE","LASSO","HORSESHOE"),col=1:4,lwd=2,bty="n")
abline(h=0,lty=2)

# Intersection of the predictors selected across the regression models
namesX[Reduce(intersect,list(pred5.r,pred5.l,pred5.h))]
namesX[Reduce(intersect,list(pred2,pred5.r,pred5.l,pred5.h))]
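# Added summary (not part of the original script): how many coefficients each shrinkage prior
# keeps away from zero as the credible level grows, computed from the posterior draws stored by
# bayesreg in fit.r$beta, fit.l$beta and fit.h$beta (one row per coefficient, one column per
# draw), mirroring the 95% rule used above.
count.nonzero = function(beta.draws, level){
  lims = c((1-level)/2, 1-(1-level)/2)
  qs   = t(apply(beta.draws, 1, quantile, lims))
  sum((qs[,1]>0)|(qs[,2]<0))
}
cred.levels = c(0.80, 0.90, 0.95)
counts = sapply(cred.levels, function(l) c(ridge     = count.nonzero(fit.r$beta, l),
                                           lasso     = count.nonzero(fit.l$beta, l),
                                           horseshoe = count.nonzero(fit.h$beta, l)))
colnames(counts) = paste0(100*cred.levels, "%")
counts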
# Training and testing exercise (14-fold cross-validation)
ii   = 1:k
ind  = 1:k
indn = 1:n
mse.in  = matrix(0,14,5)
mse.out = matrix(0,14,5)
random = sample(1:n,size=n,replace=FALSE)
XX = X[random,]
yy = y[random]
for (t in 1:14){
  ini   = 14*(t-1)+1
  fin   = 14*t
  train = setdiff(indn,ini:fin)
  test  = ini:fin
  y1 = yy[train]
  y2 = yy[test]
  X1 = XX[train,]
  X2 = XX[test,]
  data1 = data.frame(y1,X1)
  fit.o = lm(y1~X1-1)
  regs = regsubsets(y1~.,data=data1,nvmax=k,intercept=FALSE)
  regs.summary = summary(regs)
  bic  = regs.summary$bic
  pred = ind[regs.summary$which[ii[bic==min(bic)],]]
  fit.o1 = lm(y1~X1[,pred]-1)
  fit.r = bayesreg(y1~X1,data=data1,model="normal",prior="ridge",n.samples=M,burnin=M0,thin=skip)
  fit.l = bayesreg(y1~X1,data=data1,model="normal",prior="lasso",n.samples=M,burnin=M0,thin=skip)
  fit.h = bayesreg(y1~X1,data=data1,model="normal",prior="horseshoe",n.samples=M,burnin=M0,thin=skip)

  # In-sample MSE
  mse.in[t,] = c(mean((y1 - X1%*%fit.o$coef)^2),
                 mean((y1 - X1[,pred]%*%fit.o1$coef)^2),
                 mean((y1 - X1%*%fit.r$mu.beta)^2),
                 mean((y1 - X1%*%fit.l$mu.beta)^2),
                 mean((y1 - X1%*%fit.h$mu.beta)^2))

  # Out-of-sample MSE
  mse.out[t,] = c(mean((y2 - X2%*%fit.o$coef)^2),
                  mean((y2 - X2[,pred]%*%fit.o1$coef)^2),
                  mean((y2 - X2%*%fit.r$mu.beta)^2),
                  mean((y2 - X2%*%fit.l$mu.beta)^2),
                  mean((y2 - X2%*%fit.h$mu.beta)^2))
}

par(mfrow=c(1,2))
boxplot(mse.in,col=2:6,names=c("OLS","OLS(subset)","Ridge","Lasso","Horseshoe"),ylab="MSE",outline=FALSE)
boxplot(mse.out,col=2:6,names=c("OLS","OLS(subset)","Ridge","Lasso","Horseshoe"),ylab="MSE",outline=FALSE)

par(mfrow=c(1,2))
boxplot(mse.in[,2:5]/mse.in[,1],col=2:5,names=c("OLS(subset)","Ridge","Lasso","Horseshoe"),
        ylab="Relative MSE",outline=FALSE)
abline(h=1,col=6)
title("14-fold cross-validation\nIn-sample")
boxplot(mse.out[,2:5]/mse.out[,1],col=2:5,names=c("OLS(subset)","Ridge","Lasso","Horseshoe"),
        ylab="Relative MSE",outline=FALSE)
abline(h=1,col=6)
title("14-fold cross-validation\nOut-of-sample")

dev.off()
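# Added numerical summary (not part of the original script): average relative MSE across the 14
# folds, with OLS on all predictors as the benchmark (values below 1 favour the competing
# method); complements the boxplots above.
rel.in  = mse.in[,2:5]/mse.in[,1]
rel.out = mse.out[,2:5]/mse.out[,1]
colnames(rel.in) = colnames(rel.out) = c("OLS(subset)","Ridge","Lasso","Horseshoe")
round(rbind("in-sample"=colMeans(rel.in), "out-of-sample"=colMeans(rel.out)), 3)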