##############################################################################################
#
# Bayesian variable selection in multiple logistic regression
#
# 1) GLM
# 2) Bayes GLM
# 3) GLMNET: ridge, elastic net and lasso penalties
# 4) CART
# 5) BART
# 6) Random forest
#
# Models are compared over repeated random train/test splits
# (Rep replications of K splits each)
#
# Measures of comparison:
# a) Out-of-sample classification error (probability cut-off is 0.5)
# b) Out-of-sample deviance (goodness of fit)
# c) Out-of-sample area under the ROC curve (AUC)
#
# By Hedibert F. Lopes
# Professor of Statistics and Econometrics
# Insper - Institute for Education and Research
# This version: March 20th 2021
#
##############################################################################################
#install.packages("aplore3")
#install.packages("glmnet")
#install.packages("arm")
#install.packages("tree")
#install.packages("BART")
#install.packages("randomForest")
library("aplore3")
library("glmnet")
library("arm")
library("tree")
library("BART")
library("randomForest")

# Bernoulli deviance: -2 times the log-likelihood at coefficient vector beta.
# (This masks stats::deviance within this script.)
deviance = function(y,X,beta){
  fit = X%*%beta
  return(-2*(sum(fit*y)-sum(log(1+exp(fit)))))
}

#?icu
attach(icu)
n = nrow(icu)

# Response variable: 1 if the patient lived, 0 otherwise
y = rep(0,n)
y[sta=="Lived"] = 1

# Predictors
X = matrix(0,n,21)
X[,1] = age
X[,2] = sys
X[,3] = hra
X[gender=="Male",4] = 1
X[race=="White",5] = 1
X[race=="Black",6] = 1
X[ser=="Medical",7] = 1
X[can=="Yes",8] = 1
X[crn=="Yes",9] = 1
X[inf=="Yes",10] = 1
X[cpr=="Yes",11] = 1
X[pre=="Yes",12] = 1
X[type=="Emergency",13] = 1
X[fra=="Yes",14] = 1
X[po2=="> 60",15] = 1
X[ph==">= 7.25",16] = 1
X[pco=="<= 45",17] = 1
X[bic==">= 18",18] = 1
X[cre=="<= 2.0",19] = 1
X[loc=="Stupor",20] = 1
X[loc=="Coma",21] = 1
p = ncol(X)

###############################################################
# Generalized linear model - Bernoulli response
###############################################################
glmfit = glm(y~X,family=binomial)
glm.coef = glmfit$coef
summary(glmfit)

###############################################################
# Bayesian generalized linear model - Bernoulli response
###############################################################
bayesfit = bayesglm.fit(cbind(1,X),y,family=binomial(link="logit"))
bayes.coef = bayesfit$coef

plot(glm.coef,ylim=range(glm.coef,bayes.coef),ylab="Coefficient",xlab="",pch=16)
abline(h=0,lty=2)
points(bayes.coef,col=2,pch=16)
legend("topright",legend=c("GLM","Bayes GLM"),col=1:2,pch=16,bty="n")

###############################################################
# Sparse generalized linear model - Bernoulli response
###############################################################
grid = exp(seq(-10,-2,by=0.1))
glm.ridge = glmnet(X,y,family="binomial",alpha=0,lambda=grid)
glm.lasso = glmnet(X,y,family="binomial",alpha=1,lambda=grid)
glm.enet  = glmnet(X,y,family="binomial",alpha=0.5,lambda=grid)

par(mfrow=c(1,3))
plot(glm.ridge,main="Ridge regression\n")
plot(glm.enet,main="Elastic net regression\n")
plot(glm.lasso,main="LASSO regression\n")

# Coefficient paths as functions of lambda
par(mfrow=c(1,2))
plot(glm.ridge$lambda,glm.ridge$beta[1,],ylab="Estimated coefficient",
     xlab="Lambda",ylim=range(glm.ridge$beta),type="l",col=grey(0.7))
for (i in 1:p) lines(glm.ridge$lambda,glm.ridge$beta[i,],col=grey(0.7))
for (i in 1:p) lines(glm.lasso$lambda,glm.lasso$beta[i,],col=2)
title("Ridge vs LASSO")
plot(glm.enet$lambda,glm.enet$beta[1,],ylab="Estimated coefficient",
     xlab="Lambda",ylim=range(glm.enet$beta),type="l",col=grey(0.7))
for (i in 1:p) lines(glm.enet$lambda,glm.enet$beta[i,],col=grey(0.7))
for (i in 1:p) lines(glm.lasso$lambda,glm.lasso$beta[i,],col=2)
title("Elastic net vs LASSO")
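# Optional sanity check (an added illustration, not part of the original
# analysis): the deviance() helper defined above should reproduce the residual
# deviance reported by glm() on the full data, since the saturated Bernoulli
# log-likelihood is zero and hence residual deviance = -2*loglik.
c(helper = deviance(y,cbind(1,X),glm.coef), glm.reported = glmfit$deviance)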
###############################################################
# Cross-validated choice of lambda for each penalty
###############################################################
par(mfrow=c(1,3))
ridge.cvout = cv.glmnet(X,y,family="binomial",alpha=0)
plot(ridge.cvout)
ridge.bestlambda = ridge.cvout$lambda.min
ridge.bestmodel  = glmnet(X,y,family="binomial",alpha=0,lambda=ridge.bestlambda)
ridge.coef = coef(ridge.bestmodel)[,1]

enet.cvout = cv.glmnet(X,y,family="binomial",alpha=1/2)
plot(enet.cvout)
enet.bestlambda = enet.cvout$lambda.min
enet.bestmodel  = glmnet(X,y,family="binomial",alpha=1/2,lambda=enet.bestlambda)
enet.coef = coef(enet.bestmodel)[,1]

lasso.cvout = cv.glmnet(X,y,family="binomial",alpha=1)
plot(lasso.cvout)
lasso.bestlambda = lasso.cvout$lambda.min
lasso.bestmodel  = glmnet(X,y,family="binomial",alpha=1,lambda=lasso.bestlambda)
lasso.coef = coef(lasso.bestmodel)[,1]

# Comparing estimated coefficients across the five linear fits
range1 = range(glm.coef,bayes.coef,ridge.coef,enet.coef,lasso.coef)
par(mfrow=c(1,1))
plot(0:p,glm.coef,ylab="Coefficient",ylim=range1,pch=16,xlab="Coefficient index",col=0)
for (i in 0:p){
  segments(i-0.25,0,i-0.25,glm.coef[i+1],col=1,lwd=3)
  segments(i-0.15,0,i-0.15,bayes.coef[i+1],col=2,lwd=3)
  segments(i,0,i,ridge.coef[i+1],col=3,lwd=3)
  segments(i+0.15,0,i+0.15,enet.coef[i+1],col=4,lwd=3)
  segments(i+0.25,0,i+0.25,lasso.coef[i+1],col=5,lwd=3)
}
legend("topright",legend=c("GLM","BGLM","Ridge","Elastic net","LASSO"),col=1:5,lty=1,lwd=3)

# Same comparison, excluding the unpenalized GLM estimates
range2 = range(bayes.coef,ridge.coef,enet.coef,lasso.coef)
par(mfrow=c(1,1))
plot(0:p,bayes.coef,ylab="Coefficient",ylim=range2,pch=16,xlab="Coefficient index",col=0)
for (i in 0:p){
  segments(i-0.15,0,i-0.15,bayes.coef[i+1],col=2,lwd=3)
  segments(i-0.05,0,i-0.05,ridge.coef[i+1],col=3,lwd=3)
  segments(i+0.05,0,i+0.05,enet.coef[i+1],col=4,lwd=3)
  segments(i+0.15,0,i+0.15,lasso.coef[i+1],col=5,lwd=3)
}
legend("topright",legend=c("BGLM","Ridge","Elastic net","LASSO"),col=2:5,lty=1,lwd=3)

#################################################################
# Classification and regression tree (CART) - Bernoulli response
#################################################################
tree.fit = tree(sta~.,data=icu[,-1])
summary(tree.fit)
tree.fit
plot(tree.fit,type="uniform")
text(tree.fit,pretty=0)
# In-sample fitted probabilities; column 1 corresponds to the first level of sta ("Lived")
pred.cart = predict(tree.fit)[,1]

# In-sample fitted probabilities: GLM vs CART
par(mfrow=c(2,2))
plot(glmfit$fit[y==1],ylim=c(0,1),xlab="Patients who lived",ylab="Probability patient lived")
title("GLM")
abline(h=0.5,lty=2)
legend("bottomleft",legend=paste("<0.5 = ",round(mean(glmfit$fit[y==1]<0.5),3)))
plot(glmfit$fit[y==0],ylim=c(0,1),xlab="Patients who died",ylab="Probability patient lived")
title("GLM")
abline(h=0.5,lty=2)
legend("topright",legend=paste(">0.5 = ",round(mean(glmfit$fit[y==0]>0.5),3)))
plot(pred.cart[y==1],ylim=c(0,1),xlab="Patients who lived",ylab="Probability patient lived")
title("CART")
abline(h=0.5,lty=2)
legend("bottomleft",legend=paste("<0.5 = ",round(mean(pred.cart[y==1]<0.5),3)))
plot(pred.cart[y==0],ylim=c(0,1),xlab="Patients who died",ylab="Probability patient lived")
title("CART")
abline(h=0.5,lty=2)
legend("topright",legend=paste(">0.5 = ",round(mean(pred.cart[y==0]>0.5),3)))

# ROC curves: GLM vs CART
cutoff = seq(0.01,0.99,by=0.01)
nc = length(cutoff)
errors = matrix(0,nc,4)
for (i in 1:nc){
  errors[i,] = c(mean(glmfit$fit[y==1]<=cutoff[i]),
                 mean(glmfit$fit[y==0]>cutoff[i]),
                 mean(pred.cart[y==1]<=cutoff[i]),
                 mean(pred.cart[y==0]>cutoff[i]))
}
par(mfrow=c(1,1))
plot(errors[,2],1-errors[,1],xlab="False positive rate",ylab="True positive rate",pch=16)
points(errors[,4],1-errors[,3],col=2,pch=16)
legend("bottomright",legend=c("GLM","CART"),col=1:2,pch=16,bty="n")
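# Note (an addition to the original script): the original code approximated
# AUC by a Riemann sum of the TPR over the cutoff grid, which estimates the
# mean predicted probability among positives rather than the area under the
# ROC curve. The helper below (auc_trap, introduced here) instead applies the
# trapezoidal rule to the observed (FPR,TPR) pairs and is used in the model
# comparisons that follow.
auc_trap = function(fpr,tpr){
  # sort by FPR, pad with the ROC end points (0,0) and (1,1), integrate by trapezoids
  o = order(fpr,tpr)
  x = c(0,fpr[o],1)
  y = c(0,tpr[o],1)
  sum(diff(x)*(head(y,-1)+tail(y,-1))/2)
}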
# BART for classification (logit BART); posterior median of the
# in-sample success probabilities
bart = lbart(X,y)
pred.bart = apply(1/(1+exp(-bart$yhat.train)),2,median)

# Random forest (out-of-bag vote fractions for "Lived")
rf = randomForest(sta~.,data=icu[,-1],mtry=10,importance=TRUE)
pred.rf = rf$votes[,1]

# Now comparing all 8 models on the full data
cuts = c(seq(0.001,0.999,by=0.01),0.9999)
ncuts = length(cuts)
tpr = matrix(0,ncuts,8)
fpr = matrix(0,ncuts,8)
eta = matrix(0,n,5)
eta[,1] = cbind(1,X)%*%glm.coef
eta[,2] = cbind(1,X)%*%bayes.coef
eta[,3] = cbind(1,X)%*%ridge.coef
eta[,4] = cbind(1,X)%*%enet.coef
eta[,5] = cbind(1,X)%*%lasso.coef
pred = exp(eta)/(1+exp(eta))
pred = cbind(pred,pred.cart,pred.bart,pred.rf)
for (i in 1:ncuts){
  cut = cuts[i]
  for (j in 1:8){
    class = rep(0,n)
    class[pred[,j]>=cut] = 1
    tpr[i,j] = sum((class==1)&(y==1))/sum(y==1)
    fpr[i,j] = sum((class==1)&(y==0))/sum(y==0)
  }
}
auc = rep(0,8)
for (j in 1:8) auc[j] = round(auc_trap(fpr[,j],tpr[,j]),3)

par(mfrow=c(1,1))
plot(fpr[,1],tpr[,1],xlim=c(0,1),ylim=c(0,1),type="l",lwd=2,
     xlab="False positive rate (1-Specificity)",ylab="True positive rate (Sensitivity)")
abline(0,1,lty=2)
for (i in 2:8) lines(fpr[,i],tpr[,i],col=i,lwd=2)
legend("bottomright",legend=c(
  paste("GLM (AUC=",auc[1],")",sep=""),
  paste("BGLM (AUC=",auc[2],")",sep=""),
  paste("RIDGE (AUC=",auc[3],")",sep=""),
  paste("ENET (AUC=",auc[4],")",sep=""),
  paste("LASSO (AUC=",auc[5],")",sep=""),
  paste("CART (AUC=",auc[6],")",sep=""),
  paste("BART (AUC=",auc[7],")",sep=""),
  paste("RF (AUC=",auc[8],")",sep="")),col=1:8,lty=1,lwd=2)
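# Aside (an added illustration, not in the original script): the forest above
# was fit with importance=TRUE, so randomForest's variable importance measures
# are available; they offer a tree-based counterpart to the coefficient
# comparisons of the linear models.
round(importance(rf),2)
varImpPlot(rf,main="Random forest variable importance")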
############################################################################
# Repeated random train/test splits: 120 patients for training, 80 for testing
############################################################################
set.seed(1243253)
Rep = 100
gsize = 80
K = 2
error = array(0,c(Rep,K,8))
auc = array(0,c(Rep,K,8))
dev = array(0,c(Rep,K,5))
for (rep in 1:Rep){
  # Shuffle patients once per replication; keep y, X and the icu data frame aligned
  ind = sample(1:n)
  y1 = y[ind]
  X1 = X[ind,]
  icu1 = icu[ind,]
  for (k in 1:K){
    # Training/testing split
    # ----------------------
    test = sort(sample(1:n,size=gsize,replace=FALSE))
    train = setdiff(1:n,test)
    ytrain = y1[train]
    Xtrain = X1[train,]
    ytest = y1[test]
    Xtest = X1[test,]

    # Classical GLM
    # -------------
    glmfit = glm(ytrain~Xtrain,family=binomial)
    glm.coef = glmfit$coef
    eta = cbind(1,Xtest)%*%glm.coef
    glm.pred = exp(eta)/(1+exp(eta))
    glm.class = rep(0,length(eta))
    glm.class[glm.pred>0.5] = 1
    error[rep,k,1] = mean(ytest!=glm.class)
    dev[rep,k,1] = deviance(ytest,cbind(1,Xtest),glm.coef)

    # Bayesian GLM
    # ------------
    bayesfit = bayesglm.fit(cbind(1,Xtrain),ytrain,family=binomial(link="logit"))
    bayes.coef = bayesfit$coef
    eta = cbind(1,Xtest)%*%bayes.coef
    bayes.pred = exp(eta)/(1+exp(eta))
    bayes.class = rep(0,length(ytest))
    bayes.class[bayes.pred>0.5] = 1
    error[rep,k,2] = mean(ytest!=bayes.class)
    dev[rep,k,2] = deviance(ytest,cbind(1,Xtest),bayes.coef)

    # Ridge, elastic net and LASSO logistic regressions
    # -------------------------------------------------
    ridge.cvout = cv.glmnet(Xtrain,ytrain,family="binomial",alpha=0)
    ridge.bestlambda = ridge.cvout$lambda.min
    ridge.bestmodel = glmnet(Xtrain,ytrain,family="binomial",alpha=0,lambda=ridge.bestlambda)
    ridge.coef = coef(ridge.bestmodel)[,1]
    ridge.class = predict(ridge.bestmodel,newx=Xtest,s=ridge.bestlambda,type="class")
    error[rep,k,3] = mean(ytest!=ridge.class)
    dev[rep,k,3] = deviance(ytest,cbind(1,Xtest),ridge.coef)

    enet.cvout = cv.glmnet(Xtrain,ytrain,family="binomial",alpha=1/2)
    enet.bestlambda = enet.cvout$lambda.min
    enet.bestmodel = glmnet(Xtrain,ytrain,family="binomial",alpha=1/2,lambda=enet.bestlambda)
    enet.coef = coef(enet.bestmodel)[,1]
    enet.class = predict(enet.bestmodel,newx=Xtest,s=enet.bestlambda,type="class")
    error[rep,k,4] = mean(ytest!=enet.class)
    dev[rep,k,4] = deviance(ytest,cbind(1,Xtest),enet.coef)

    lasso.cvout = cv.glmnet(Xtrain,ytrain,family="binomial",alpha=1)
    lasso.bestlambda = lasso.cvout$lambda.min
    lasso.bestmodel = glmnet(Xtrain,ytrain,family="binomial",alpha=1,lambda=lasso.bestlambda)
    lasso.coef = coef(lasso.bestmodel)[,1]
    lasso.class = predict(lasso.bestmodel,newx=Xtest,s=lasso.bestlambda,type="class")
    error[rep,k,5] = mean(ytest!=lasso.class)
    dev[rep,k,5] = deviance(ytest,cbind(1,Xtest),lasso.coef)

    # Classification tree (CART), fit on the shuffled data so rows match ytest
    tree.fit = tree(sta~.,data=icu1[train,-1])
    pred.cart = predict(tree.fit,newdata=icu1[test,-1])[,1]
    class.cart = rep(0,length(pred.cart))
    class.cart[pred.cart>0.5] = 1
    error[rep,k,6] = mean(ytest!=class.cart)

    # Bayesian additive regression trees (BART) for binary response
    bart = lbart(x.train=Xtrain,y.train=ytrain,x.test=Xtest)
    pred.bart = apply(1/(1+exp(-bart$yhat.test)),2,median)
    class.bart = rep(0,length(pred.bart))
    class.bart[pred.bart>0.5] = 1
    error[rep,k,7] = mean(ytest!=class.bart)

    # Random forest, also on the shuffled data
    rf = randomForest(sta~.,data=icu1[,-1],subset=train,mtry=10,importance=TRUE)
    pred.rf = predict(rf,newdata=icu1[test,],"prob")[,1]
    class.rf = rep(0,length(pred.rf))
    class.rf[pred.rf>0.5] = 1
    error[rep,k,8] = mean(ytest!=class.rf)

    # Out-of-sample true/false positive rates and trapezoidal AUC
    eta = matrix(0,length(ytest),5)
    eta[,1] = cbind(1,Xtest)%*%glm.coef
    eta[,2] = cbind(1,Xtest)%*%bayes.coef
    eta[,3] = cbind(1,Xtest)%*%ridge.coef
    eta[,4] = cbind(1,Xtest)%*%enet.coef
    eta[,5] = cbind(1,Xtest)%*%lasso.coef
    pred = exp(eta)/(1+exp(eta))
    pred = cbind(pred,pred.cart,pred.bart,pred.rf)
    tpr = matrix(0,ncuts,8)
    fpr = matrix(0,ncuts,8)
    for (i in 1:ncuts){
      for (j in 1:8){
        class = rep(0,length(ytest))
        class[pred[,j]>=cuts[i]] = 1
        tpr[i,j] = sum((class==1)&(ytest==1))/sum(ytest==1)
        fpr[i,j] = sum((class==1)&(ytest==0))/sum(ytest==0)
      }
    }
    for (j in 1:8) auc[rep,k,j] = round(auc_trap(fpr[,j],tpr[,j]),3)
  }
}

# Stacking the Rep*K splits into matrices: one column per model
tab = cbind(
  matrix(error[,,1],Rep*K,1),
  matrix(error[,,2],Rep*K,1),
  matrix(error[,,3],Rep*K,1),
  matrix(error[,,4],Rep*K,1),
  matrix(error[,,5],Rep*K,1),
  matrix(error[,,6],Rep*K,1),
  matrix(error[,,7],Rep*K,1),
  matrix(error[,,8],Rep*K,1))

tab1 = cbind(
  matrix(dev[,,1],Rep*K,1),
  matrix(dev[,,2],Rep*K,1),
  matrix(dev[,,3],Rep*K,1),
  matrix(dev[,,4],Rep*K,1),
  matrix(dev[,,5],Rep*K,1))
# Drop splits where the GLM produced NA coefficients (and hence NA deviance)
tab1 = tab1[complete.cases(tab1),]

tab2 = cbind(
  matrix(auc[,,1],Rep*K,1),
  matrix(auc[,,2],Rep*K,1),
  matrix(auc[,,3],Rep*K,1),
  matrix(auc[,,4],Rep*K,1),
  matrix(auc[,,5],Rep*K,1),
  matrix(auc[,,6],Rep*K,1),
  matrix(auc[,,7],Rep*K,1),
  matrix(auc[,,8],Rep*K,1))
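# Aside (an added illustration, not in the original script): numeric summaries
# of the out-of-sample classification errors, to accompany the boxplots below.
models = c("GLM","BGLM","RIDGE","ENET","LASSO","CART","BART","RF")
err.summary = rbind(mean=colMeans(tab),sd=apply(tab,2,sd))
colnames(err.summary) = models
round(err.summary,3)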
pdf(file="icu-glm-bayes-ridge-enet-lasso-cart-bart-rf.pdf",width=12,height=8)

# Out-of-sample classification error; the number above each box is the
# proportion of splits in which that model attains the smallest error
par(mfrow=c(1,1))
boxplot(tab,names=c("GLM","BGLM","RIDGE","ENET","LASSO","CART","BART","RF"),
        ylab="Classification error",ylim=c(min(tab),0.45),outline=FALSE)
for (i in 1:8) text(i,0.45,round(mean(tab[,i]==apply(tab,1,min)),3))
title(paste(Rep," replications of ",K," random train/test splits",sep=""))

# Out-of-sample AUC; the number above each box is the proportion of splits
# in which that model attains the largest AUC
par(mfrow=c(1,1))
boxplot(tab2,names=c("GLM","BGLM","RIDGE","ENET","LASSO","CART","BART","RF"),
        ylab="Area under the curve (AUC)",outline=FALSE,ylim=c(0.65,1.0))
for (i in 1:8){
  best = tab2[,i]==apply(tab2,1,max)
  text(i,1.0,round(mean(best),3))
}
title(paste(Rep," replications of ",K," random train/test splits",sep=""))

# Out-of-sample deviance (GLM-type models only); the number above each box is
# the proportion of splits in which that model attains the smallest deviance
par(mfrow=c(1,1))
boxplot(log(tab1),names=c("GLM","BGLM","RIDGE","ENET","LASSO"),
        ylab="Log(deviance)",outline=FALSE)
for (i in 1:5){
  best = tab1[,i]==apply(tab1,1,min)
  text(i,par("usr")[4],round(mean(best),3),pos=1)
}
title(paste(Rep," replications of ",K," random train/test splits",sep=""))
dev.off()
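# Aside (an added illustration, not in the original script): proportion of
# splits in which each model attains the largest out-of-sample AUC (ties
# credit every tied model), as a numeric companion to the boxplot annotations.
auc.win = colMeans(tab2==apply(tab2,1,max))
names(auc.win) = c("GLM","BGLM","RIDGE","ENET","LASSO","CART","BART","RF")
round(auc.win,3)

# Finally, detach icu since it was attached at the top of the script.
detach(icu)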