#install.packages("flexmix")
#install.packages("mvtnorm")
library("flexmix")
library("mvtnorm")

# ---------------------------------------------------------------------------
# Bayesian model selection for a wage regression: enumerate all non-empty
# subsets of 6 predictors, rank them by adjusted R2, BIC and (log) marginal
# likelihood, then compare OLS vs Bayes predictions on random train/test
# splits for the pooled top models.
# ---------------------------------------------------------------------------

# Model space: all 2^6 = 64 inclusion patterns over the 6 predictors.
# expand.grid varies its FIRST factor fastest, so reversing the columns
# reproduces the order of nested loops i1 (outermost) .. i6 (innermost),
# each running over 1 then 0; the all-zero (null) model lands in row 64.
inclusion <- as.matrix(expand.grid(rep(list(1:0), 6)))[, 6:1]
dimnames(inclusion) <- NULL
inclusion <- inclusion[1:63, ]  # drop the null model (last row)

data <- read.table("http://hedibert.org/wp-content/uploads/2021/03/wage.txt")

# Dependent variable - standardized log wage
y <- data[, 1]
y <- (y - mean(y)) / sd(y)
n <- length(y)

# Predictors
X <- matrix(0, n, 6)
X[, 1] <- data[, 5]   # years of education
X[, 2] <- data[, 7]   # years with current employer
X[, 3] <- data[, 8]   # age in years
X[, 4] <- data[, 9]   # =1 if married
X[, 5] <- data[, 10]  # =1 if black
X[, 6] <- data[, 12]  # =1 if live in SMSA
p <- 6

# Complete model
summary(lm(y ~ X))

# Bayesian hyperparameters: beta ~ N(0, B0*I), sigma2 ~ IG(c0, d0),
# giving y | model ~ multivariate Student-t with df0 = 2*c0 degrees of freedom.
B0 <- 2
c0 <- 2
d0 <- 1
df0 <- 2 * c0
In <- diag(1, n)

# Search for top models over all 63 candidates
nmodels <- nrow(inclusion)
bic <- rep(0, nmodels)
R2 <- rep(0, nmodels)
py <- rep(0, nmodels)
for (i in seq_len(nmodels)) {
  # drop = FALSE keeps the design a matrix even for single-predictor models
  X1 <- cbind(1, X[, inclusion[i, ] == 1, drop = FALSE])
  # Classical OLS/MLE quantities for model comparison (BIC / adjusted R2)
  reg <- lm(y ~ X1 - 1)
  bic[i] <- BIC(reg)
  R2[i] <- summary(reg)$adj.r.squared
  # Bayesian marginal likelihood evaluation for model comparison.
  # NOTE: dmvt() returns the LOG density by default, so py holds
  # log p(y | model); this is fine because order() is monotone-invariant.
  scale <- (d0 / c0) * (In + B0 * X1 %*% t(X1))
  py[i] <- dmvt(y, sigma = scale, df = df0)
}
cbind(R2, bic, py)

# Rank models under each criterion and pool the top 5 of each
ord.R2 <- order(R2, decreasing = TRUE)
ord.bic <- order(bic)
ord.py <- order(py, decreasing = TRUE)
topmodels <- cbind(ord.R2, ord.bic, ord.py)[1:5, ]
topmodels[1:5, ]
top <- sort(unique(c(topmodels[1:5, 1], topmodels[1:5, 2], topmodels[1:5, 3])))
ntop <- length(top)
inclusiontop <- inclusion[top, , drop = FALSE]

# Splitting the sample into training and testing, R times
set.seed(54321)
n <- nrow(X)
n1 <- 468
n2 <- n - n1
R <- 10
rmse <- array(0, c(R, ntop, 2))
mae <- array(0, c(R, ntop, 2))
# Loop index renamed from 'rep' to 'r': 'rep' masks base::rep().
for (r in seq_len(R)) {
  ss <- sample(1:n, size = n)
  training <- ss[1:n1]
  testing <- ss[(n1 + 1):n]
  for (l in seq_len(ntop)) {
    # drop = FALSE: single-predictor models would otherwise collapse to a
    # vector and ncol(Xtrain) would be NULL, crashing diag(1/B0, p1).
    Xtrain <- X[training, inclusiontop[l, ] == 1, drop = FALSE]
    p1 <- ncol(Xtrain) + 1
    ytrain <- y[training]
    Xtest <- X[testing, inclusiontop[l, ] == 1, drop = FALSE]
    ytest <- y[testing]
    # OLS fit on the training split
    beta.ols <- lm(ytrain ~ Xtrain)$coef
    # Bayesian posterior mean under the N(0, B0*I) prior (ridge-type
    # shrinkage): V = (I/B0 + X'X)^{-1}, beta = V X'y
    Xtrain1 <- cbind(1, Xtrain)
    V <- solve(diag(1 / B0, p1) + t(Xtrain1) %*% Xtrain1)
    beta.bayes <- V %*% t(Xtrain1) %*% ytrain
    # Out-of-sample predictions and accuracy for both estimators
    ytest.ols <- cbind(1, Xtest) %*% beta.ols
    ytest.bayes <- cbind(1, Xtest) %*% beta.bayes
    rmse[r, l, 1] <- sqrt(mean((ytest.ols - ytest)^2))
    mae[r, l, 1] <- mean(abs(ytest.ols - ytest))
    rmse[r, l, 2] <- sqrt(mean((ytest.bayes - ytest)^2))
    mae[r, l, 2] <- mean(abs(ytest.bayes - ytest))
  }
}

# Panel layout sized to the actual number of top models (was hard-coded 2x7)
par(mfrow = c(2, ntop))
for (i in seq_len(ntop)) boxplot(rmse[, i, ])
for (i in seq_len(ntop)) boxplot(mae[, i, ])

# Relative (Bayes / OLS) predictive accuracy per top model
tab <- cbind(inclusiontop,
             round(apply(rmse[, , 2], 2, mean) / apply(rmse[, , 1], 2, mean), 3),
             round(apply(mae[, , 2], 2, mean) / apply(mae[, , 1], 2, mean), 3))
colnames(tab) <- c("X1", "X2", "X3", "X4", "X5", "X6", "Rel RMSE", "Rel MAE")
tab

# Best model under each criterion.
# BUG FIX: the orderings are over the ntop evaluated models, so they must
# index inclusiontop; the original indexed the full 63-row inclusion matrix,
# reporting the wrong rows.
inclusiontop[order(apply(rmse[, , 1], 2, mean)), ][1, ]
inclusiontop[order(apply(rmse[, , 2], 2, mean)), ][1, ]
inclusiontop[order(apply(mae[, , 1], 2, mean)), ][1, ]
inclusiontop[order(apply(mae[, , 2], 2, mean)), ][1, ]