##############################################################################################
#
# Bayesian variable selection in multiple linear regression: comparing a few alternatives
#
# 1) BIC
# 2) Mallows' Cp
# 3) Horseshoe prior
# 4) Normal-Gamma prior
#
# Bobby Gramacy's R package monomvn - https://bobby.gramacy.com/r_packages/monomvn/
#
# Griffin and Brown (2010) Inference with normal-gamma prior
# distributions in regression problems. Bayesian Analysis, 5(1), 171-188.
# Carvalho, Polson and Scott (2010) The horseshoe estimator for
# sparse signals. Biometrika, 97(2), 465-480.
#
# By Hedibert F. Lopes
# Professor of Statistics and Econometrics
# Insper - Institute for Education and Research
# This version: March 6th 2021
#
##############################################################################################
#
# Basic references for the data and econometrics:
# Blackburn and Neumark (1992) Unobserved ability, efficiency wages, and interindustry
# wage differentials. Quarterly Journal of Economics, 107, 1421-1436.
# Wooldridge (2012) Introductory Econometrics: A Modern Approach (5th edition).
#
# Monthly earnings, education, demographic variables, and IQ scores for 935 men in 1980.
#
#  1. wage     monthly earnings
#  2. hours    average weekly hours
#  3. iq       IQ score
#  4. kww      knowledge of world work score
#  5. educ     years of education
#  6. exper    years of work experience
#  7. tenure   years with current employer
#  8. age      age in years
#  9. married  =1 if married
# 10. black    =1 if black
# 11. south    =1 if live in south
# 12. urban    =1 if live in SMSA
# 13. sibs     number of siblings
# 14. brthord  birth order
# 15. meduc    mother's education
# 16. feduc    father's education
# 17. lwage    natural log of wage
#
##############################################################################################
rm(list=ls())

#install.packages("leaps")
#install.packages("bayesreg")
#install.packages("monomvn")
library("leaps")
library("bayesreg")
library("monomvn")

# Reading the data and keeping only the complete cases
data = read.table("wage.txt")
n = nrow(data)
nmissing = rep(0,n)
for (i in 1:n)
  nmissing[i] = sum(is.na(data[i,]))
y = data[nmissing==0,17]
X = as.matrix(data[nmissing==0,2:16])

# Adding interactions of educ, exper and tenure with the four dummies
# (married, black, south, urban), for a total of k=27 predictors
X = cbind(X,X[,4]*X[,8:11],X[,5]*X[,8:11],X[,6]*X[,8:11])
k = ncol(X)
n = nrow(X)

# Standardizing the response and the predictors
y = (y-mean(y))/sqrt(var(y))
mX = apply(X,2,mean)
sX = sqrt(apply(X,2,var))
X = X-matrix(mX,n,k,byrow=TRUE)
X = X%*%diag(1/sX)
namesX = c("hours","iq","kww","educ","exper","tenure","age","married","black","south","urban",
           "sibs","brthord","meduc","feduc","educ.married","educ.black","educ.south","educ.urban",
           "exper.married","exper.black","exper.south","exper.urban","tenure.married","tenure.black",
           "tenure.south","tenure.urban")
colnames(X) = namesX

# Regression 1: OLS regression with all predictors
##################################################
ols = lm(y~X-1)
sighat = round(summary(ols)$sigma,3)
se.beta = sighat*sqrt(diag(solve(t(X)%*%X)))
L = ols$coef-se.beta
U = ols$coef+se.beta
par(mfrow=c(1,1))
plot(ols$coef,xlab="Predictor",ylab="OLS coefficient",ylim=range(L,U),cex=0.5,col=2,pch=16)
for (i in 1:k){
  segments(i,L[i],i,U[i],col=2)
  if ((L[i]>0)|(U[i]<0)){
    text(i,max(U),i,cex=0.5)
  }
}
abline(h=0,col=2)

# Regression 2: Exhaustive search over 2^27 (about 134 million) candidate models
# (leaps uses a branch-and-bound algorithm, so not all of them are actually visited)
#####################################################################################
data = data.frame(y=y,X=X)
regs = regsubsets(y~.,data=data,nvmax=k,intercept=FALSE)
regs.summary = summary(regs)
ind = 1:k
k.bic = ind[regs.summary$bic==min(regs.summary$bic)]
k.cp = ind[regs.summary$cp==min(regs.summary$cp)]
k.rsq = ind[regs.summary$rsq==max(regs.summary$rsq)]
k.adjr2 = ind[regs.summary$adjr2==max(regs.summary$adjr2)]
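
# A quick hand check of Mallows' Cp (a sketch added for illustration, not part of
# the original analysis): for a p-variable submodel, Cp = RSS_p/sigma2 + 2p - n,
# where sigma2 is the residual variance of the full model. If regsubsets uses the
# same sigma2 estimate, cp.manual should essentially reproduce regs.summary$cp.
sigma2.full = sum(ols$residuals^2)/(n-k)
cp.manual = regs.summary$rss/sigma2.full + 2*(1:k) - n
max(abs(cp.manual-regs.summary$cp))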
par(mfrow=c(1,3))
plot(regs.summary$bic,pch=16,xlab="Number of predictors",ylab="BIC",main=paste("k=",k.bic,sep=""))
plot(regs.summary$cp,pch=16,xlab="Number of predictors",ylab="Cp",main=paste("k=",k.cp,sep=""))
plot(regs.summary$rsq,pch=16,xlab="Number of predictors",ylab="R2 and adjusted R2",main="")
points(regs.summary$adjr2,pch=16,col=2)
legend("topleft",legend=c(paste("R2: k=",k.rsq,sep=""),paste("R2adj: k=",k.adjr2,sep="")),
       col=1:2,pch=16,bty="n")

par(mfrow=c(1,2))
plot(regs,scale="bic")
plot(regs,scale="adjr2")

# Predictors selected by BIC and by Cp
bic = regs.summary$bic
cp = regs.summary$cp
ii = 1:k
pred.bic = ind[regs.summary$which[ii[bic==min(bic)],]]
pred.cp = ind[regs.summary$which[ii[cp==min(cp)],]]
namesX[pred.bic]
namesX[pred.cp]

# Regression 3: Shrinkage/sparsity priors - here the horseshoe prior
# (ridge and Laplace fits are sketched at the end of this script)
####################################################################
M0 = 10000
M = 10000
skip = 100
data1 = data.frame(y=y,X=X)
fit.h = bayesreg(y~X,data=data1,model="normal",prior="horseshoe",n.samples=M,burnin=M0,thin=skip)
q.h = t(apply(fit.h$beta,1,quantile,c(0.025,0.5,0.975)))
cond.h = ((q.h[,1]>0)|(q.h[,3]<0))
pred.h = ind[cond.h]
k.h = sum(cond.h)

# Comparing OLS and horseshoe coefficient estimates
par(mfrow=c(1,1),mai=c(0.8,0.8,0.5,0.5))
plot(ols$coef,xlab="Predictor",ylab="Coefficients",main="",pch=16,cex=0.5,
     col=0,axes=FALSE,ylim=range(L,U,q.h))
axis(2);box();axis(1,at=1:k)
for (i in 1:k){
  segments(i-0.1,L[i],i-0.1,U[i],lwd=2)
  segments(i+0.2,q.h[i,1],i+0.2,q.h[i,3],col=2,lwd=2)
}
points(-0.1+1:k,ols$coef,pch=16,cex=0.75)
points(0.2+1:k,q.h[,2],pch=16,cex=0.75,col=2)
legend("topleft",legend=c("OLS","HORSESHOE"),col=1:2,lwd=2,bty="n")
abline(h=0,lty=2)

# In-sample MSE of the BIC, Cp and horseshoe fits
c(mean((y-X[,pred.bic]%*%lm(y~X[,pred.bic]-1)$coef)^2),
  mean((y-X[,pred.cp]%*%lm(y~X[,pred.cp]-1)$coef)^2),
  mean((y-X%*%fit.h$mu.beta)^2))

################################################################################
# Training and testing exercise (33-fold CV) - n=663 complete cases, so 33
# test sets of size 20 (the last 3 observations never serve as test cases)
################################################################################
set.seed(12345)
M0 = 1000
M = 10000
skip = 1
ii = 1:k
ind = 1:k
indn = 1:n
mse.in = matrix(0,33,5)
mse.out = matrix(0,33,5)
mae.in = matrix(0,33,5)
mae.out = matrix(0,33,5)
random = sample(1:n,size=n,replace=FALSE)
XX = X[random,]
yy = y[random]
for (t in 1:33){
  # Defining training and testing samples
  ini = 20*(t-1)+1
  fin = 20*t
  test = ini:fin
  train = setdiff(indn,test)
  y1 = yy[train]
  y2 = yy[test]
  X1 = XX[train,]
  X2 = XX[test,]
  data1 = data.frame(y1,X1)

  # OLS fit
  fit.ols = lm(y1~X1-1)

  # Subset selection: using BIC and Cp as criteria
  # (regs.summary must be recomputed from the within-fold fit)
  regs = regsubsets(y1~.,data=data1,nvmax=k,intercept=FALSE)
  regs.summary = summary(regs)
  bic = regs.summary$bic
  pred.bic = ind[regs.summary$which[ii[bic==min(bic)],]]
  fit.bic = lm(y1~X1[,pred.bic]-1)
  cp = regs.summary$cp
  pred.cp = ind[regs.summary$which[ii[cp==min(cp)],]]
  fit.cp = lm(y1~X1[,pred.cp]-1)

  # Bayesian regularization: horseshoe and Normal-Gamma priors,
  # discarding the first M0 draws as burn-in
  fit.hs = blasso(X1,y1,T=M,thin=skip,RJ=FALSE,case="hs",icept=FALSE,verb=0)
  fit.ng = blasso(X1,y1,T=M,thin=skip,RJ=FALSE,case="ng",icept=FALSE,verb=0)
  beta.hs = apply(fit.hs$beta[-(1:M0),],2,mean)
  beta.ng = apply(fit.ng$beta[-(1:M0),],2,mean)

  # Computing MSEs and MAEs
  error.in = cbind(y1-X1%*%fit.ols$coef,y1-X1[,pred.bic]%*%fit.bic$coef,
                   y1-X1[,pred.cp]%*%fit.cp$coef,y1-X1%*%beta.hs,y1-X1%*%beta.ng)
  error.out = cbind(y2-X2%*%fit.ols$coef,y2-X2[,pred.bic]%*%fit.bic$coef,
                    y2-X2[,pred.cp]%*%fit.cp$coef,y2-X2%*%beta.hs,y2-X2%*%beta.ng)
  mse.in[t,] = apply(error.in^2,2,mean)
  mse.out[t,] = apply(error.out^2,2,mean)
  mae.in[t,] = apply(abs(error.in),2,mean)
  mae.out[t,] = apply(abs(error.out),2,mean)
}
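
# A compact numerical summary of the CV exercise (a convenience sketch added here;
# it uses only the objects computed in the loop above).
cv.summary = rbind(MSE.in  = colMeans(mse.in),
                   MSE.out = colMeans(mse.out),
                   MAE.in  = colMeans(mae.in),
                   MAE.out = colMeans(mae.out))
colnames(cv.summary) = c("OLS","OLS(BIC)","OLS(Cp)","Horseshoe","NG")
round(cv.summary,4)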
# Counting, per method, the folds where it attains the smallest error
win.mse.in = apply(mse.in==apply(mse.in,1,min),2,sum)
win.mse.out = apply(mse.out==apply(mse.out,1,min),2,sum)
win.mae.in = apply(mae.in==apply(mae.in,1,min),2,sum)
win.mae.out = apply(mae.out==apply(mae.out,1,min),2,sum)

pdf(file="wage.pdf",width=12,height=9)
methods = c("OLS","OLS(BIC)","OLS(CP)","Horseshoe","NG")

par(mfrow=c(2,2))
boxplot(mse.in,col=2:6,names=methods,ylab="MSE",outline=FALSE,ylim=c(0.66,0.75))
title("33-fold cross validation\nIn-sample")
text(1:5,0.75,win.mse.in)
text(1:5,0.66,round(apply(mse.in,2,mean),3))
boxplot(100*(mse.in[,2:5]/mse.in[,1]-1),col=3:6,names=methods[2:5],ylab="Relative (%) MSE",outline=FALSE)
abline(h=0,lty=2)
title("33-fold cross validation\nIn-sample")
boxplot(mse.out,col=2:6,names=methods,ylab="MSE",outline=FALSE,ylim=c(0.2,1.5))
title("33-fold cross validation\nOut-of-sample")
text(1:5,1.5,win.mse.out)
text(1:5,0.2,round(apply(mse.out,2,mean),3))
boxplot(100*(mse.out[,2:5]/mse.out[,1]-1),col=3:6,names=methods[2:5],ylab="Relative (%) MSE",outline=FALSE)
abline(h=0,lty=2)
title("33-fold cross validation\nOut-of-sample")

par(mfrow=c(2,2))
boxplot(mae.in,col=2:6,names=methods,ylab="MAE",outline=FALSE,ylim=c(0.61,0.66))
title("33-fold cross validation\nIn-sample")
text(1:5,0.66,win.mae.in)
text(1:5,0.61,round(apply(mae.in,2,mean),3))
boxplot(100*(mae.in[,2:5]/mae.in[,1]-1),col=3:6,names=methods[2:5],ylab="Relative (%) MAE",outline=FALSE)
abline(h=0,lty=2)
title("33-fold cross validation\nIn-sample")
boxplot(mae.out,col=2:6,names=methods,ylab="MAE",outline=FALSE,ylim=c(0.3,1.1))
title("33-fold cross validation\nOut-of-sample")
text(1:5,1.1,win.mae.out)
text(1:5,0.3,round(apply(mae.out,2,mean),3))
boxplot(100*(mae.out[,2:5]/mae.out[,1]-1),col=3:6,names=methods[2:5],ylab="Relative (%) MAE",outline=FALSE)
abline(h=0,lty=2)
title("33-fold cross validation\nOut-of-sample")
dev.off()
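
# The "Regression 3" header mentions ridge and Laplace priors alongside the horseshoe,
# but only the horseshoe was fit above. A minimal sketch of those two extra fits,
# mirroring the horseshoe call ("ridge" and "lasso" are bayesreg's names for the
# Gaussian and Laplace shrinkage priors); the resulting in-sample MSEs are directly
# comparable to the ones computed after the horseshoe fit.
data1 = data.frame(y=y,X=X)
fit.r = bayesreg(y~X,data=data1,model="normal",prior="ridge",n.samples=10000,burnin=10000,thin=100)
fit.l = bayesreg(y~X,data=data1,model="normal",prior="lasso",n.samples=10000,burnin=10000,thin=100)
c(ridge=mean((y-X%*%fit.r$mu.beta)^2),lasso=mean((y-X%*%fit.l$mu.beta)^2))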