1) Data and Descriptive Statistics

We will use a dataset with log-wages and several demographic variables from Blackburn and Neumark (1992). Before moving to modeling, we compute some descriptive statistics of the observed variables. From these, one can already spot some interesting patterns, such as lower average log wages for black individuals, lower average log wages for southern US residents, and slightly increasing average log wages with years of education.

library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-16
library(bayeslm)
library(leaps)
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.1. https://CRAN.R-project.org/package=stargazer
mydata = read.table("http://hedibert.org/wp-content/uploads/2014/02/wage2-wooldridge.txt",header=FALSE)
colnames(mydata) = c("wage","hours","iq","kww","educ","exper","tenure","age","married",
                       "black","south","urban","sibs","brthord","meduc","feduc","lwage")
y = mydata[,ncol(mydata)] # lwage vector
X = model.matrix(lwage ~ kww + hours*exper*tenure*age + sibs +
                     (iq+educ+married + black + south + urban)^3-1,data=mydata) 
n = nrow(X) # number of observations
p = ncol(X) # number of variables in X
  
data <- as.data.frame(cbind(y, X))


###########################################################################################################################
# Descriptive Statistics ##################################################################################################
###########################################################################################################################
  
par(mfrow=c(2,3))
boxplot(lwage~kww,data=mydata,xlab="Knowledge of World Work Score",ylab="Log of Wages")
boxplot(lwage~iq,data=mydata,xlab="IQ",ylab="Log of Wages")
boxplot(lwage~educ,data=mydata,xlab="Years of Education",ylab="Log of Wages")
boxplot(lwage~exper,data=mydata,xlab="Work Experience (years)",ylab="Log of Wages")
boxplot(lwage~tenure,data=mydata,xlab="Current Employer (years)",ylab="Log of Wages")
boxplot(lwage~age,data=mydata,xlab="Age",ylab="Log of Wages")

boxplot(lwage~married,data=mydata,xlab="= 1 if married",ylab="Log of Wages")
boxplot(lwage~black,data=mydata,xlab="= 1 if black",ylab="Log of Wages")
boxplot(lwage~south,data=mydata,xlab="= 1 if live in south",ylab="Log of Wages")
boxplot(lwage~urban,data=mydata,xlab="= 1 if live in SMSA",ylab="Log of Wages")
boxplot(lwage~meduc,data=mydata,xlab="Mother's Education",ylab="Log of Wages")
boxplot(lwage~feduc,data=mydata,xlab="Father's Education",ylab="Log of Wages")
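
As a quick numerical complement to the boxplots, the group averages mentioned above can be checked directly; a minimal sketch using the mydata object loaded earlier:

# average log wage by race, region, and years of education
tapply(mydata$lwage, mydata$black, mean)
tapply(mydata$lwage, mydata$south, mean)
round(tapply(mydata$lwage, mydata$educ, mean), 3)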

To further assess the information in this data, we will use a supervised learning approach with log wages as the response variable. Assume \(y|X,\beta,\sigma^2 \sim N(X\beta,\sigma^2 I)\), where the outcome \(y\) is log wages and \(X\) is a matrix of covariates (race, education, tenure, age, etc.).

We are given six different regression models to fit the data; a brief summary of their estimation criteria follows the list:

a. Maximum Likelihood Estimation

b. Ridge Regression

c. Penalized Maximum Likelihood (LASSO)

d. Bayesian Ridge Regression

e. Bayesian LASSO

f. Bayesian Horseshoe
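
For reference, the first three are point estimators that differ only in the penalty added to the least-squares criterion (a standard summary, not specific to this dataset):

\[
\hat{\beta}_{MLE}=\arg\min_{\beta}\;\|y-X\beta\|_2^2,\qquad
\hat{\beta}_{ridge}=\arg\min_{\beta}\;\|y-X\beta\|_2^2+\lambda\|\beta\|_2^2,\qquad
\hat{\beta}_{LASSO}=\arg\min_{\beta}\;\|y-X\beta\|_2^2+\lambda\|\beta\|_1.
\]

The Bayesian versions (d)-(f) replace the penalty by a shrinkage prior on \(\beta\) and summarize the posterior instead of reporting a single optimum; the specific priors are described in the bayeslm section below.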

2) Whole Sample Model Comparison

First, we use the whole sample to assess the models.

a. Maximum Likelihood Estimation (MLE)

For the MLE, we select the best model using Forward Stepwise Selection. We begin with a model with zero covariates and then add, one at a time, the variable that increases \(R^2\) the most, until we reach the maximum number of allowed covariates. We then look at the number of predictors that gives a good fit with a reasonable Adjusted \(R^2\). In our case, we set the maximum number of covariates to 20, since allowing all 58 candidate predictors would be too computationally demanding.
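
Formally, if \(S_k\) denotes the set of predictors selected after \(k\) steps, the next step adds

\[
j^{*}=\arg\max_{j\notin S_k} R^2\left(S_k\cup\{j\}\right),
\]

and the search stops at the chosen maximum model size (20 here).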

regs = regsubsets(y~. ,data = data,  nvmax=20,
                    method="forward", really.big = TRUE)
regs.results <- summary(regs) #store coefficients
  
par(mfrow = c(1,1))
plot(regs.results$rsq,pch=16,xlab="Number of predictors",ylab="Fit")
points(regs.results$adjr2,col=2,pch=16)
legend("bottomright",legend=c("R2","R2adj"),col=1:2,pch=16)

par(mfrow=c(1,3))
plot(regs.results$adjr2,pch=17, xlab="Number of predictors",ylab="Adjusted R2")
plot(regs.results$cp,pch=17,xlab="Number of predictors",ylab="Cp")
plot(regs.results$bic,pch=17,xlab="Number of predictors",ylab="BIC")

f <- paste("y ~",names(coef(regs,7)[2]),"+", names(coef(regs,7)[3]),"+", names(coef(regs,7)[4]),"+", names(coef(regs,7)[5]),"+", 
           names(coef(regs,7)[6]),"+", names(coef(regs,7)[7]),"+", names(coef(regs,7)[8]))

best.mle <-  lm(f, data=data)
    
stargazer(best.mle, type = "text")
## 
## ================================================
##                          Dependent variable:    
##                      ---------------------------
##                                   y             
## ------------------------------------------------
## hours                         -0.025***         
##                                (0.005)          
##                                                 
## `exper:tenure`                0.006***          
##                                (0.002)          
##                                                 
## `hours:age`                   0.001***          
##                               (0.0001)          
##                                                 
## `iq:educ`                     0.0003***         
##                               (0.00003)         
##                                                 
## `exper:tenure:age`           -0.0002***         
##                               (0.0001)          
##                                                 
## `educ:married:urban`          0.017***          
##                                (0.002)          
##                                                 
## `black:south:urban`           -0.295***         
##                                (0.052)          
##                                                 
## Constant                      6.331***          
##                                (0.087)          
##                                                 
## ------------------------------------------------
## Observations                     935            
## R2                              0.275           
## Adjusted R2                     0.269           
## Residual Std. Error       0.360 (df = 927)      
## F Statistic            50.201*** (df = 7; 927)  
## ================================================
## Note:                *p<0.1; **p<0.05; ***p<0.01

From the first plot, we would settle for roughly 5 predictors, where \(R^2\) and Adjusted \(R^2\) are still close to each other. From the second figure, Adjusted \(R^2\) and Cp suggest a very complex model with about 20 predictors, whereas BIC points to 7 predictors as the best specification. Since we are looking for a parsimonious model, we follow BIC and choose the 7-predictor model. Next, we compute the Root Mean Squared Error of the selected model.
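
The BIC-minimizing model size can also be read off programmatically rather than from the plot; a one-line check on the regs.results object from above:

which.min(regs.results$bic) # number of predictors with the lowest BIC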

rmse.mle <- sqrt( c(crossprod(best.mle$residuals))  / length(best.mle$residuals) ) # RMSE
rmse.mle
## [1] 0.3584296

b. Ridge Regression

Now, to fit a ridge model, we use the glmnet R function. The first plot shows the coefficient paths as lambda increases; to choose lambda, we then perform 10-fold cross-validation, shown in the second plot. Even for high values of lambda, the number of non-zero coefficients is always 58. This is expected, because the quadratic ridge penalty shrinks coefficients towards zero but rarely sets them exactly to zero.

set.seed(5555)
ridge.model <- glmnet(X, y, family=c("gaussian"), alpha = 0)
plot(ridge.model,xvar="lambda")

crossvalid.ridge <- cv.glmnet(X, y, alpha= 0)
plot(crossvalid.ridge)
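
To confirm that ridge never drops a predictor along the path, one can inspect the number of non-zero coefficients that glmnet stores for each lambda; a minimal check on the fitted ridge.model:

range(ridge.model$df) # non-zero coefficients per lambda; as noted above, this stays at 58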

Since ridge does not perform variable selection, we keep all 58 regressors and use the lambda with the minimum cross-validated error (lambda.min) to generate our final coefficients.

ridge.coeffs <- predict(ridge.model, type = "coefficients", 
                           s= crossvalid.ridge$lambda.min)
ridge.coeffs
## 59 x 1 sparse Matrix of class "dgCMatrix"
##                                    1
## (Intercept)             5.822234e+00
## kww                     3.408661e-03
## hours                  -4.090187e-03
## exper                   3.220054e-03
## tenure                  4.198868e-03
## age                     5.425541e-03
## sibs                   -1.482688e-03
## iq                      1.255096e-03
## educ                    1.716245e-02
## married                 1.377091e-02
## black                  -1.145432e-02
## south                  -2.429386e-02
## urban                   3.172993e-02
## hours:exper             3.494231e-05
## hours:tenure            2.823491e-05
## exper:tenure            8.871547e-05
## hours:age              -3.337110e-05
## exper:age               6.082166e-05
## tenure:age              8.958263e-05
## iq:educ                 9.138106e-05
## iq:married              2.134127e-04
## iq:black               -7.744555e-05
## iq:south               -1.505530e-04
## iq:urban                2.344815e-04
## educ:married            4.626267e-03
## educ:black             -1.164979e-03
## educ:south              3.147751e-04
## educ:urban              4.244241e-03
## married:black           1.297834e-02
## married:south          -9.412515e-03
## married:urban           8.082607e-03
## black:south            -3.718436e-02
## black:urban            -2.154466e-02
## south:urban            -2.072877e-02
## hours:exper:tenure     -8.054334e-07
## hours:exper:age         1.053233e-06
## hours:tenure:age        4.170209e-07
## exper:tenure:age        7.832262e-07
## iq:educ:married         4.003783e-05
## iq:educ:black          -1.100388e-05
## iq:educ:south           8.061053e-06
## iq:educ:urban           3.237649e-05
## iq:married:black        3.536609e-04
## iq:married:south       -3.467100e-05
## iq:married:urban        2.562410e-05
## iq:black:south         -9.013525e-05
## iq:black:urban         -3.122408e-04
## iq:south:urban         -1.399819e-04
## educ:married:black      8.538672e-04
## educ:married:south      1.267725e-03
## educ:married:urban      2.459828e-03
## educ:black:south       -3.862439e-03
## educ:black:urban       -1.859040e-03
## educ:south:urban        9.850518e-04
## married:black:south    -3.702560e-02
## married:black:urban     2.872654e-02
## married:south:urban    -3.526662e-02
## black:south:urban      -5.934334e-02
## hours:exper:tenure:age -4.715900e-08

We also compute the cross-validated RMSE of the ridge specification:

rmse.ridge <- sqrt(min(crossvalid.ridge$cvm))
rmse.ridge
## [1] 0.363187

c. LASSO

Using glmnet again, we fit a LASSO regression. As the first plot shows, the LASSO has no problem setting several coefficients to zero as lambda increases; depending on lambda, we can obtain models with anywhere from 2 to 56 predictors.

set.seed(5555)
lasso.model <- glmnet(X, y, family=c("gaussian"), alpha = 1)
  
par(mfrow=(c(1,1)))
plot(lasso.model,xvar="lambda")

crossvalid.lasso <- cv.glmnet(X, y, alpha= 1)
plot(crossvalid.lasso)

The second plot (10-fold cross-validation) tells us that the log of the error-minimizing lambda is

## [1] -5.052437

We use this lambda, which minimizes the cross-validated error, to compute the LASSO coefficients. As we can see, many coefficients are set exactly to zero, in contrast to the ridge regression case.

lasso.coeffs <- predict(lasso.model, type = "coefficients", 
                         s= crossvalid.lasso$lambda.min)
lasso.coeffs
## 59 x 1 sparse Matrix of class "dgCMatrix"
##                                    1
## (Intercept)             5.953725e+00
## kww                     3.864081e-03
## hours                  -4.584766e-03
## exper                   8.650051e-03
## tenure                  9.436544e-03
## age                     4.422377e-03
## sibs                    .           
## iq                      .           
## educ                    .           
## married                 .           
## black                   .           
## south                  -2.561893e-02
## urban                   .           
## hours:exper             .           
## hours:tenure            .           
## exper:tenure            .           
## hours:age               .           
## exper:age               .           
## tenure:age              .           
## iq:educ                 2.287094e-04
## iq:married              .           
## iq:black                .           
## iq:south                .           
## iq:urban                .           
## educ:married            9.862594e-03
## educ:black              .           
## educ:south              .           
## educ:urban              1.274030e-02
## married:black           .           
## married:south           .           
## married:urban           .           
## black:south             .           
## black:urban             .           
## south:urban            -2.977473e-03
## hours:exper:tenure      .           
## hours:exper:age         .           
## hours:tenure:age        .           
## exper:tenure:age        .           
## iq:educ:married         2.996603e-05
## iq:educ:black           .           
## iq:educ:south           .           
## iq:educ:urban           .           
## iq:married:black        .           
## iq:married:south        .           
## iq:married:urban        .           
## iq:black:south          .           
## iq:black:urban         -1.218275e-04
## iq:south:urban          .           
## educ:married:black      .           
## educ:married:south      .           
## educ:married:urban      1.495578e-03
## educ:black:south       -8.856096e-03
## educ:black:urban       -1.854923e-03
## educ:south:urban        .           
## married:black:south     .           
## married:black:urban     .           
## married:south:urban    -3.667492e-02
## black:south:urban      -8.325720e-02
## hours:exper:tenure:age  .
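
The sparsity at this lambda can be summarized with a one-line count of the non-zero entries of lasso.coeffs (intercept excluded); a small check:

sum(lasso.coeffs != 0) - 1 # number of predictors selected by the LASSO at lambda.min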

We also compute the cross-validated RMSE of the LASSO specification:

rmse.lasso <- sqrt(min(crossvalid.lasso$cvm))
rmse.lasso
## [1] 0.3631448

(d),(e),(f) - Bayesian Ridge, Bayesian LASSO, and Bayesian Horseshoe

Now, moving to the Bayesian estimation, we use the bayeslm R package to estimate all three models; they differ only in the prior placed on the coefficients. In each case we run the sampler with N = 50,000 draws and a burn-in of 20,000, as set in the code below.
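
For concreteness, the three priors on each coefficient \(\beta_j\) have the following standard forms (written in their textbook parameterization; bayeslm's internal scaling may differ):

\[
\text{ridge: } \beta_j \sim N(0,\tau^2),\qquad
\text{LASSO: } p(\beta_j)\propto \exp(-|\beta_j|/\tau),\qquad
\text{horseshoe: } \beta_j\mid\lambda_j \sim N(0,\lambda_j^2\tau^2),\ \lambda_j\sim C^{+}(0,1).
\]

The Gaussian prior shrinks all coefficients roughly uniformly, the Laplace prior shrinks small coefficients more aggressively, and the horseshoe's heavy-tailed local scales \(\lambda_j\) shrink noise coefficients strongly while leaving large signals nearly untouched.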

set.seed(5555)
  
# Bayesian Ridge
bayes.ridge <- bayeslm(y,X,prior = "ridge", icept = TRUE, 
                             N = 50000, burnin=20000) 
   
# Bayesian LASSO 
bayes.lasso <- bayeslm(y,X,prior = "laplace", icept = TRUE, 
                             N = 50000, burnin=20000)
   
# Bayesian Horseshoe
bayes.horseshoe <- bayeslm(y,X,prior = "horseshoe", icept = TRUE, 
                             N = 50000, burnin=20000)

Summary of Estimates

Below, we show the coefficients for each of the six models:

coeffs.summary <- as.matrix(cbind(ridge.coeffs, lasso.coeffs,
                                  apply(bayes.ridge$beta, 2, mean),
                                  apply(bayes.lasso$beta, 2, mean),
                                  apply(bayes.horseshoe$beta, 2, mean)))

colnames(coeffs.summary) <- c("RIDGE", "LASSO", "Bayes.RIDGE",
                              "Bayes.LASSO", "Bayes.HORSESHOE")
 
# Somewhat roundabout way of aligning the MLE coefficients with the full 59-row coefficient table
  
coeffs.summary <- as.data.frame(coeffs.summary)
   
coeffs.summary$v_id <- rownames(coeffs.summary)
   
coeffs.mle <- as.data.frame(round(best.mle$coef,3))
coeffs.mle$v_id <- rownames(coeffs.mle)
coeffs.mle$v_id <- gsub(pattern = "`", "", coeffs.mle$v_id)
    
coeffs.summary <- merge(coeffs.summary, coeffs.mle, by = "v_id",all.x = TRUE)
    
  
colnames(coeffs.summary) <- c("Predictor","RIDGE", "LASSO", "Bayes.RIDGE",
                            "Bayes.LASSO", "Bayes.HORSESHOE", "MLE")
   
coeffs.summary$MLE[is.na(coeffs.summary$MLE) ] <- 0
   
coeffs.summary <- coeffs.summary[,c("Predictor", "MLE", "RIDGE", "LASSO", "Bayes.RIDGE",
                            "Bayes.LASSO", "Bayes.HORSESHOE")]
  
coeffs.summary[,c("MLE", "RIDGE", "LASSO", 
                    "Bayes.RIDGE", "Bayes.LASSO", "Bayes.HORSESHOE")] <- round(coeffs.summary[,c("MLE", "RIDGE", "LASSO", 
                                                                                           "Bayes.RIDGE", "Bayes.LASSO", "Bayes.HORSESHOE")],4)
coeffs.summary
##                 Predictor    MLE   RIDGE   LASSO Bayes.RIDGE Bayes.LASSO
## 1             (Intercept)  6.331  5.8222  5.9537      6.7790      6.7790
## 2                     age  0.000  0.0054  0.0044      0.0052      0.0043
## 3                   black  0.000 -0.0115  0.0000     -0.0087     -0.0112
## 4             black:south  0.000 -0.0372  0.0000     -0.0407     -0.0432
## 5       black:south:urban -0.295 -0.0593 -0.0833     -0.0572     -0.0538
## 6             black:urban  0.000 -0.0215  0.0000     -0.0245     -0.0212
## 7                    educ  0.000  0.0172  0.0000      0.0170      0.0169
## 8              educ:black  0.000 -0.0012  0.0000     -0.0012     -0.0013
## 9        educ:black:south  0.000 -0.0039 -0.0089     -0.0042     -0.0039
## 10       educ:black:urban  0.000 -0.0019 -0.0019     -0.0020     -0.0018
## 11           educ:married  0.000  0.0046  0.0099      0.0048      0.0045
## 12     educ:married:black  0.000  0.0009  0.0000      0.0009      0.0007
## 13     educ:married:south  0.000  0.0013  0.0000      0.0016      0.0011
## 14     educ:married:urban  0.017  0.0025  0.0015      0.0026      0.0022
## 15             educ:south  0.000  0.0003  0.0000      0.0003      0.0002
## 16       educ:south:urban  0.000  0.0010  0.0000      0.0013      0.0008
## 17             educ:urban  0.000  0.0042  0.0127      0.0045      0.0051
## 18                  exper  0.000  0.0032  0.0087      0.0032      0.0031
## 19              exper:age  0.000  0.0001  0.0000      0.0001      0.0001
## 20           exper:tenure  0.006  0.0001  0.0000      0.0001      0.0001
## 21       exper:tenure:age  0.000  0.0000  0.0000      0.0000      0.0000
## 22                  hours -0.025 -0.0041 -0.0046     -0.0043     -0.0044
## 23              hours:age  0.001  0.0000  0.0000      0.0000      0.0000
## 24            hours:exper  0.000  0.0000  0.0000      0.0000      0.0000
## 25        hours:exper:age  0.000  0.0000  0.0000      0.0000      0.0000
## 26     hours:exper:tenure  0.000  0.0000  0.0000      0.0000      0.0000
## 27 hours:exper:tenure:age  0.000  0.0000  0.0000      0.0000      0.0000
## 28           hours:tenure  0.000  0.0000  0.0000      0.0000      0.0000
## 29       hours:tenure:age  0.000  0.0000  0.0000      0.0000      0.0000
## 30                     iq  0.000  0.0013  0.0000      0.0013      0.0010
## 31               iq:black  0.000 -0.0001  0.0000     -0.0001     -0.0001
## 32         iq:black:south  0.000 -0.0001  0.0000      0.0000     -0.0001
## 33         iq:black:urban  0.000 -0.0003 -0.0001     -0.0003     -0.0003
## 34                iq:educ  0.000  0.0001  0.0002      0.0001      0.0001
## 35          iq:educ:black  0.000  0.0000  0.0000      0.0000      0.0000
## 36        iq:educ:married  0.000  0.0000  0.0000      0.0000      0.0000
## 37          iq:educ:south  0.000  0.0000  0.0000      0.0000      0.0000
## 38          iq:educ:urban  0.000  0.0000  0.0000      0.0000      0.0000
## 39             iq:married  0.000  0.0002  0.0000      0.0002      0.0002
## 40       iq:married:black  0.000  0.0004  0.0000      0.0004      0.0003
## 41       iq:married:south  0.000  0.0000  0.0000      0.0000      0.0000
## 42       iq:married:urban  0.000  0.0000  0.0000      0.0000      0.0000
## 43               iq:south  0.000 -0.0002  0.0000     -0.0002     -0.0001
## 44         iq:south:urban  0.000 -0.0001  0.0000     -0.0001     -0.0001
## 45               iq:urban  0.000  0.0002  0.0000      0.0002      0.0002
## 46                    kww  0.000  0.0034  0.0039      0.0035      0.0031
## 47                married  0.000  0.0138  0.0000      0.0120      0.0162
## 48          married:black  0.000  0.0130  0.0000      0.0122      0.0131
## 49    married:black:south  0.000 -0.0370  0.0000     -0.0393     -0.0329
## 50    married:black:urban  0.000  0.0287  0.0000      0.0320      0.0249
## 51          married:south  0.000 -0.0094  0.0000     -0.0092     -0.0079
## 52    married:south:urban  0.000 -0.0353 -0.0367     -0.0374     -0.0320
## 53          married:urban  0.000  0.0081  0.0000      0.0074      0.0093
## 54                   sibs  0.000 -0.0015  0.0000     -0.0014     -0.0011
## 55                  south  0.000 -0.0243 -0.0256     -0.0256     -0.0217
## 56            south:urban  0.000 -0.0207 -0.0030     -0.0236     -0.0207
## 57                 tenure  0.000  0.0042  0.0094      0.0046      0.0043
## 58             tenure:age  0.000  0.0001  0.0000      0.0001      0.0001
## 59                  urban  0.000  0.0317  0.0000      0.0329      0.0296
##    Bayes.HORSESHOE
## 1           6.7790
## 2           0.0032
## 3          -0.0093
## 4          -0.0387
## 5          -0.0431
## 6          -0.0162
## 7           0.0116
## 8          -0.0009
## 9          -0.0037
## 10         -0.0016
## 11          0.0045
## 12          0.0006
## 13          0.0005
## 14          0.0024
## 15          0.0000
## 16          0.0002
## 17          0.0063
## 18          0.0028
## 19          0.0001
## 20          0.0001
## 21          0.0000
## 22         -0.0045
## 23          0.0000
## 24          0.0000
## 25          0.0000
## 26          0.0000
## 27          0.0000
## 28          0.0000
## 29          0.0000
## 30          0.0004
## 31         -0.0001
## 32         -0.0001
## 33         -0.0002
## 34          0.0002
## 35          0.0000
## 36          0.0000
## 37          0.0000
## 38          0.0000
## 39          0.0002
## 40          0.0002
## 41          0.0000
## 42          0.0000
## 43         -0.0001
## 44         -0.0001
## 45          0.0001
## 46          0.0024
## 47          0.0221
## 48          0.0096
## 49         -0.0269
## 50          0.0111
## 51         -0.0059
## 52         -0.0203
## 53          0.0088
## 54         -0.0006
## 55         -0.0160
## 56         -0.0178
## 57          0.0044
## 58          0.0001
## 59          0.0242

Finally, we compute the RMSEs of each model:

rmse.bayes.ridge<-sqrt(c(crossprod(bayes.ridge$residuals))/length(bayes.ridge$residuals))
rmse.bayes.lasso<-sqrt(c(crossprod(bayes.lasso$residuals))/length(bayes.lasso$residuals)) 
rmse.bayes.horse<-sqrt(c(crossprod(bayes.horseshoe$residuals))/length(bayes.horseshoe$residuals)) 

RMSE.summary <-as.data.frame(c(rmse.mle, rmse.ridge, rmse.lasso, rmse.bayes.ridge, rmse.bayes.lasso, rmse.bayes.horse)) 
rownames(RMSE.summary) <- c("MLE", "RIDGE", "LASSO", "Bayes.RIDGE", "Bayes.LASSO", "Bayes.HORSESHOE") 
colnames(RMSE.summary) <- c("RMSE")
    
    
RMSE.summary
##                      RMSE
## MLE             0.3584296
## RIDGE           0.3631870
## LASSO           0.3631448
## Bayes.RIDGE     1.0166361
## Bayes.LASSO     0.9819440
## Bayes.HORSESHOE 0.9016975

In this case, the MLE has the lowest RMSE. This is expected, because maximum likelihood picks the parameters that fit the estimation sample best, whereas the other methods deliberately introduce some bias in order to shrink the coefficients and reduce the effective number of parameters. However, as the tables above show, our chosen MLE model not only has the lowest RMSE but also the fewest non-zero coefficients.

Hence, we would choose the MLE model in this first part.

3) Training and Testing Samples Model Comparison

Now, we randomly split the sample into two equal parts.

a. MLE

To choose an MLE model, we use a different strategy from the first part. We draw 1,000 random training subsamples and run Forward Stepwise Selection on each of them, again setting the maximum number of predictors to 20.
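
The loop that produces nmodel (the count of how often each model size is selected), together with N and alpha, is not echoed above. A minimal sketch of what it could look like, assuming a training fraction alpha = 0.5, N = 1000 replications, and BIC as the per-sample criterion for the model size:

alpha = 0.5          # fraction of observations used for training (assumed)
N = 1000             # number of random training samples (assumed)
nmodel = rep(0, p)   # nmodel[k] counts how often a k-predictor model is selected
for (i in 1:N) {
  tr = sample(c(TRUE,FALSE), size=n, replace=TRUE, prob=c(alpha, 1-alpha))
  fit = regsubsets(y~., data=data[tr, ], nvmax=20, method="forward", really.big=TRUE)
  nbest = which.min(summary(fit)$bic)  # model size chosen by BIC in this replication
  nmodel[nbest] = nmodel[nbest] + 1
}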

plot(1:p,nmodel/N,type="h",xlab="Number of predictors",ylab="Relative frequency",
       axes=FALSE)
axis(2);box()
axis(1,at=1:p)
title(paste("Training = ",100*alpha,"%",sep=""))

Looking at the distribution of the selected number of predictors across these 1,000 random samples, the most frequent choice is around 8 predictors, so we settle on an 8-predictor model. Next, we run an exhaustive model selection with 8 as the maximum number of predictors, which should find the best combination of 8 variables in terms of Adjusted \(R^2\).

set.seed(8888)   
train = sample(c(TRUE,FALSE),size=n,rep=TRUE,prob=c(alpha,1-alpha)) 
test = !train
test.mat = model.matrix(y~.,data=data[test,])

md <- which.max(nmodel/N)

reg.train = regsubsets(y~. ,data = data[train, ],  nvmax=which.max(nmodel/N),
                           really.big = TRUE)
## Reordering variables and trying again:
f <- paste("y ~",names(coef(regs,md)[2]),"+", names(coef(regs,md)[3]),"+", names(coef(regs,md)[4]),"+", names(coef(regs,md)[5]),"+", 
           names(coef(regs,md)[6]),"+", names(coef(regs,md)[7]),"+", names(coef(regs,md)[8]),"+", names(coef(regs,md)[9]))

best.mle <-  lm(f, data=data[train,])

stargazer(best.mle, type = "text")
## 
## =================================================
##                           Dependent variable:    
##                       ---------------------------
##                                    y             
## -------------------------------------------------
## hours                          -0.028***         
##                                 (0.006)          
##                                                  
## `exper:tenure`                   0.004           
##                                 (0.003)          
##                                                  
## `hours:age`                    0.001***          
##                                (0.0002)          
##                                                  
## `iq:educ`                      0.0004***         
##                                (0.00005)         
##                                                  
## `exper:tenure:age`              -0.0001          
##                                (0.0001)          
##                                                  
## `educ:married:urban`           0.016***          
##                                 (0.003)          
##                                                  
## `married:south:urban`           -0.037           
##                                 (0.048)          
##                                                  
## `black:south:urban`            -0.245***         
##                                 (0.081)          
##                                                  
## Constant                       6.295***          
##                                 (0.125)          
##                                                  
## -------------------------------------------------
## Observations                      465            
## R2                               0.309           
## Adjusted R2                      0.297           
## Residual Std. Error        0.363 (df = 456)      
## F Statistic             25.537*** (df = 8; 456)  
## =================================================
## Note:                 *p<0.1; **p<0.05; ***p<0.01

In contrast to the first part, we now evaluate each model by out-of-sample prediction, computing the Root Mean Squared Forecast Error (RMSFE) on the test sample. For the MLE case with 8 predictors, we have the following RMSFE:
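
With \(\hat{\beta}\) estimated on the training half and the sum running over the \(n_{test}\) held-out observations, the RMSFE is

\[
RMSFE=\sqrt{\frac{1}{n_{test}}\sum_{i\in test}\left(y_i-x_i^{\top}\hat{\beta}\right)^2},
\]

which is exactly what the code below computes for each model.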

rmsfe.mle <- sqrt(mean((data$y[test]- test.mat[,names(best.mle$coefficients)]%*%best.mle$coefficients)^2))
    
rmsfe.mle  
## [1] 0.3600842

b. Ridge Regression

We go through the Ridge Regression the same way as before:

set.seed(8888)
ridge.model <- glmnet(X[train, ], y[train], family=c("gaussian"), alpha = 0)
  
par(mfrow=(c(1,1)))
plot(ridge.model,xvar="lambda")

crossvalid.ridge <- cv.glmnet(X, y, alpha= 0)
plot(crossvalid.ridge)

ridge.coeffs <- predict(ridge.model, type = "coefficients", 
                         s= crossvalid.ridge$lambda.min)

Now, the ridge regression 10-fold cross-validation gives an error-minimizing lambda whose log is

log(crossvalid.ridge$lambda.min)
## [1] -5.331539

And the corresponding forecast error on the test sample is

rmsfe.ridge <- sqrt(mean((data$y[test]- test.mat%*%ridge.coeffs)^2))

rmsfe.ridge    
## [1] 0.3671738

c. LASSO

Lasso is also estimated the same way as before:

set.seed(8888)

lasso.model <- glmnet(X[train, ], y[train], family=c("gaussian"), alpha = 1)
  
par(mfrow=(c(1,1)))
plot(lasso.model,xvar="lambda")

crossvalid.lasso <- cv.glmnet(X, y, alpha= 1)
plot(crossvalid.lasso)

lasso.coeffs <- predict(lasso.model, type = "coefficients", 
                           s= crossvalid.lasso$lambda.min)

The log of the MSE-minimizing lambda is

log(crossvalid.lasso$lambda.min)
## [1] -5.331539

And the out-of-sample prediction error is

rmsfe.lasso <- sqrt(mean((data$y[test]- test.mat%*%lasso.coeffs)^2))
  
rmsfe.lasso  
## [1] 0.3657714

(d),(e),(f) - Bayesian Ridge, Bayesian LASSO, and Bayesian Horseshoe

Using bayeslm again, we train on half of the sample and compute the RMSFE for the three different priors:

train = sample(c(TRUE,FALSE),size=n,rep=TRUE,prob=c(alpha,1-alpha)) 
test = !train
test.mat = model.matrix(y~.,data=data[test,])

# Bayesian Ridge 
set.seed(1111)
bayes.ridge <- bayeslm(y[train],X[train, ],prior = "ridge", icept = TRUE, 
                     N = 100000, burnin=100000)
  
bayes.ridge.mean <- as.matrix(apply(bayes.ridge$beta,2,mean)) 
  
# Bayesian Lasso
set.seed(2222)
bayes.lasso <- bayeslm(y[train],X[train, ],prior = "laplace", icept = TRUE, 
                     N = 100000, burnin=100000) 
  
bayes.lasso.mean <- as.matrix(apply(bayes.lasso$beta,2,mean)) 

  
# Bayesian Horseshoe
set.seed(3333)
bayes.horseshoe <- bayeslm(y[train],X[train, ],prior = "horseshoe", icept = TRUE, 
                     N = 100000, burnin=100000)
bayes.horseshoe.mean <- as.matrix(apply(bayes.horseshoe$beta,2,mean)) 


#Root mean squared fit errors

rmsfe.bayes.lasso <- sqrt(mean((data$y[test] - test.mat%*%bayes.lasso.mean)^2))
rmsfe.bayes.ridge <- sqrt(mean((data$y[test] - test.mat%*%bayes.ridge.mean)^2))
rmsfe.bayes.horseshoe <- sqrt(mean((data$y[test] - test.mat%*%bayes.horseshoe.mean)^2))  

In the end, we obtain the following RMSE and RMSFE for all six models:

RMSFE.2 <- rbind(rmsfe.mle, rmsfe.ridge, rmsfe.lasso, rmsfe.bayes.ridge, rmsfe.bayes.lasso, rmsfe.bayes.horseshoe)

colnames(RMSFE.2) <- c("RMSFE.2")

rmse.mle <- sqrt( c(crossprod(best.mle$residuals))  / length(best.mle$residuals) ) # RMSE
rmse.ridge <- sqrt(min(crossvalid.ridge$cvm))
rmse.lasso <- sqrt(min(crossvalid.lasso$cvm))
rmse.bayes.ridge<-sqrt(c(crossprod(bayes.ridge$residuals))/length(bayes.ridge$residuals))
rmse.bayes.lasso<-sqrt(c(crossprod(bayes.lasso$residuals))/length(bayes.lasso$residuals)) 
rmse.bayes.horse<-sqrt(c(crossprod(bayes.horseshoe$residuals))/length(bayes.horseshoe$residuals)) 

RMSE.2 <-as.data.frame(c(rmse.mle, rmse.ridge, rmse.lasso, rmse.bayes.ridge, rmse.bayes.lasso, rmse.bayes.horse)) 
rownames(RMSE.2) <- c("MLE", "RIDGE", "LASSO", "Bayes.RIDGE", "Bayes.LASSO", "Bayes.HORSESHOE") 
colnames(RMSE.2) <- c("RMSE.2")

errors.part2 <- cbind(RMSE.2, RMSFE.2) 
rownames(errors.part2) <- c("MLE", "RIDGE", "LASSO", "Bayes.RIDGE", "Bayes.LASSO", "Bayes.HORSESHOE")
errors.part2
##                    RMSE.2   RMSFE.2
## MLE             0.3593174 0.3600842
## RIDGE           0.3626888 0.3671738
## LASSO           0.3626888 0.3657714
## Bayes.RIDGE     0.8877046 0.9028560
## Bayes.LASSO     0.8282265 0.8400695
## Bayes.HORSESHOE 0.7059418 0.7250065

Putting the whole-sample and the training/testing comparisons together, the MLE seems to be the best choice in terms of both fit and prediction: it has the lowest RMSE and RMSFE and is also the most parsimonious model (8 predictors).

RMSE.1 <- RMSE.summary
colnames(RMSE.1) <- c("RMSE.1")

model.errors <- cbind(RMSE.1, RMSE.2, RMSFE.2) 
rownames(model.errors) <- c("MLE", "RIDGE", "LASSO", "Bayes.RIDGE", "Bayes.LASSO", "Bayes.HORSESHOE")
model.errors
##                    RMSE.1    RMSE.2   RMSFE.2
## MLE             0.3584296 0.3593174 0.3600842
## RIDGE           0.3631870 0.3626888 0.3671738
## LASSO           0.3631448 0.3626888 0.3657714
## Bayes.RIDGE     1.0166361 0.8877046 0.9028560
## Bayes.LASSO     0.9819440 0.8282265 0.8400695
## Bayes.HORSESHOE 0.9016975 0.7059418 0.7250065