#install.packages("flexmix")
#install.packages("mvtnorm")
library("flexmix")
library("mvtnorm")

# ---------------------------------------------------------------------------
# Bayesian model selection for a wage regression: enumerate all non-empty
# subsets of 6 predictors, rank them by adjusted R2, BIC and (log) marginal
# likelihood, then compare OLS vs Bayes predictions on random train/test
# splits for the pooled top models.
# ---------------------------------------------------------------------------

# Model space: all 2^6 = 64 inclusion patterns over the 6 predictors.
# expand.grid varies its FIRST factor fastest, so reversing the columns
# reproduces the order of nested loops i1 (outermost) .. i6 (innermost),
# each running over 1 then 0; the all-zero (null) model lands in row 64.
inclusion <- as.matrix(expand.grid(rep(list(1:0), 6)))[, 6:1]
dimnames(inclusion) <- NULL
inclusion <- inclusion[1:63, ]  # drop the null model (last row)

data <- read.table("http://hedibert.org/wp-content/uploads/2021/03/wage.txt")

# Dependent variable - standardized log wage
y <- data[, 1]
y <- (y - mean(y)) / sd(y)
n <- length(y)

# Predictors
X <- matrix(0, n, 6)
X[, 1] <- data[, 5]   # years of education
X[, 2] <- data[, 7]   # years with current employer
X[, 3] <- data[, 8]   # age in years
X[, 4] <- data[, 9]   # =1 if married
X[, 5] <- data[, 10]  # =1 if black
X[, 6] <- data[, 12]  # =1 if live in SMSA
p <- 6

# Complete model
summary(lm(y ~ X))

# Bayesian hyperparameters: beta ~ N(0, B0*I), sigma2 ~ IG(c0, d0),
# giving y | model ~ multivariate Student-t with df0 = 2*c0 degrees of freedom.
B0 <- 2
c0 <- 2
d0 <- 1
df0 <- 2 * c0
In <- diag(1, n)

# Search for top models over all 63 candidates
nmodels <- nrow(inclusion)
bic <- rep(0, nmodels)
R2 <- rep(0, nmodels)
py <- rep(0, nmodels)
for (i in seq_len(nmodels)) {
  # drop = FALSE keeps the design a matrix even for single-predictor models
  X1 <- cbind(1, X[, inclusion[i, ] == 1, drop = FALSE])
  # Classical OLS/MLE quantities for model comparison (BIC / adjusted R2)
  reg <- lm(y ~ X1 - 1)
  bic[i] <- BIC(reg)
  R2[i] <- summary(reg)$adj.r.squared
  # Bayesian marginal likelihood evaluation for model comparison.
  # NOTE: dmvt() returns the LOG density by default, so py holds
  # log p(y | model); this is fine because order() is monotone-invariant.
  scale <- (d0 / c0) * (In + B0 * X1 %*% t(X1))
  py[i] <- dmvt(y, sigma = scale, df = df0)
}
cbind(R2, bic, py)

# Rank models under each criterion and pool the top 5 of each
ord.R2 <- order(R2, decreasing = TRUE)
ord.bic <- order(bic)
ord.py <- order(py, decreasing = TRUE)
topmodels <- cbind(ord.R2, ord.bic, ord.py)[1:5, ]
topmodels[1:5, ]
top <- sort(unique(c(topmodels[1:5, 1], topmodels[1:5, 2], topmodels[1:5, 3])))
ntop <- length(top)
inclusiontop <- inclusion[top, , drop = FALSE]

# Splitting the sample into training and testing, R times
set.seed(54321)
n <- nrow(X)
n1 <- 468
n2 <- n - n1
R <- 10
rmse <- array(0, c(R, ntop, 2))
mae <- array(0, c(R, ntop, 2))
# Loop index renamed from 'rep' to 'r': 'rep' masks base::rep().
for (r in seq_len(R)) {
  ss <- sample(1:n, size = n)
  training <- ss[1:n1]
  testing <- ss[(n1 + 1):n]
  for (l in seq_len(ntop)) {
    # drop = FALSE: single-predictor models would otherwise collapse to a
    # vector and ncol(Xtrain) would be NULL, crashing diag(1/B0, p1).
    Xtrain <- X[training, inclusiontop[l, ] == 1, drop = FALSE]
    p1 <- ncol(Xtrain) + 1
    ytrain <- y[training]
    Xtest <- X[testing, inclusiontop[l, ] == 1, drop = FALSE]
    ytest <- y[testing]
    # OLS fit on the training split
    beta.ols <- lm(ytrain ~ Xtrain)$coef
    # Bayesian posterior mean under the N(0, B0*I) prior (ridge-type
    # shrinkage): V = (I/B0 + X'X)^{-1}, beta = V X'y
    Xtrain1 <- cbind(1, Xtrain)
    V <- solve(diag(1 / B0, p1) + t(Xtrain1) %*% Xtrain1)
    beta.bayes <- V %*% t(Xtrain1) %*% ytrain
    # Out-of-sample predictions and accuracy for both estimators
    ytest.ols <- cbind(1, Xtest) %*% beta.ols
    ytest.bayes <- cbind(1, Xtest) %*% beta.bayes
    rmse[r, l, 1] <- sqrt(mean((ytest.ols - ytest)^2))
    mae[r, l, 1] <- mean(abs(ytest.ols - ytest))
    rmse[r, l, 2] <- sqrt(mean((ytest.bayes - ytest)^2))
    mae[r, l, 2] <- mean(abs(ytest.bayes - ytest))
  }
}

# Panel layout sized to the actual number of top models (was hard-coded 2x7)
par(mfrow = c(2, ntop))
for (i in seq_len(ntop)) boxplot(rmse[, i, ])
for (i in seq_len(ntop)) boxplot(mae[, i, ])

# Relative (Bayes / OLS) predictive accuracy per top model
tab <- cbind(inclusiontop,
             round(apply(rmse[, , 2], 2, mean) / apply(rmse[, , 1], 2, mean), 3),
             round(apply(mae[, , 2], 2, mean) / apply(mae[, , 1], 2, mean), 3))
colnames(tab) <- c("X1", "X2", "X3", "X4", "X5", "X6", "Rel RMSE", "Rel MAE")
tab

# Best model under each criterion.
# BUG FIX: the orderings are over the ntop evaluated models, so they must
# index inclusiontop; the original indexed the full 63-row inclusion matrix,
# reporting the wrong rows.
inclusiontop[order(apply(rmse[, , 1], 2, mean)), ][1, ]
inclusiontop[order(apply(rmse[, , 2], 2, mean)), ][1, ]
inclusiontop[order(apply(mae[, , 1], 2, mean)), ][1, ]
inclusiontop[order(apply(mae[, , 2], 2, mean)), ][1, ]