##############################################################################################
#
# Bayesian variable selection in multiple logistic regression
#
# 1) GLM
# 2) Bayes GLM
# 3) GLMNET: ridge, elastic net and lasso penalties
# 4) CART
# 5) BART
# 6) Random forest
#
# Models are compared over repeated random train/test splits
# (Rep replications of K splits each)
#
# Measures of comparison:
# a) Out-of-sample classification error (probability cut-off is 0.5)
# b) Out-of-sample deviance (goodness of fit)
# c) Out-of-sample area under the ROC curve (AUC)
#
# By Hedibert F. Lopes
# Professor of Statistics and Econometrics
# Insper - Institute for Education and Research
# This version: March 20th 2021
#
##############################################################################################
#install.packages("aplore3")
#install.packages("glmnet")
#install.packages("arm")
#install.packages("tree")
#install.packages("BART")
#install.packages("randomForest")
library("aplore3")
library("glmnet")
library("arm")
library("tree")
library("BART")
library("randomForest")

# Bernoulli deviance: -2 times the log-likelihood at coefficient vector beta.
# (This masks stats::deviance within this script.)
deviance = function(y,X,beta){
  fit = X%*%beta
  return(-2*(sum(fit*y)-sum(log(1+exp(fit)))))
}

#?icu
attach(icu)
n = nrow(icu)

# Response variable: 1 if the patient lived, 0 otherwise
y = rep(0,n)
y[sta=="Lived"] = 1

# Predictors
X = matrix(0,n,21)
X[,1] = age
X[,2] = sys
X[,3] = hra
X[gender=="Male",4] = 1
X[race=="White",5] = 1
X[race=="Black",6] = 1
X[ser=="Medical",7] = 1
X[can=="Yes",8] = 1
X[crn=="Yes",9] = 1
X[inf=="Yes",10] = 1
X[cpr=="Yes",11] = 1
X[pre=="Yes",12] = 1
X[type=="Emergency",13] = 1
X[fra=="Yes",14] = 1
X[po2=="> 60",15] = 1
X[ph==">= 7.25",16] = 1
X[pco=="<= 45",17] = 1
X[bic==">= 18",18] = 1
X[cre=="<= 2.0",19] = 1
X[loc=="Stupor",20] = 1
X[loc=="Coma",21] = 1
p = ncol(X)

###############################################################
# Generalized linear model - Bernoulli response
###############################################################
glmfit = glm(y~X,family=binomial)
glm.coef = glmfit$coef
summary(glmfit)

###############################################################
# Bayesian generalized linear model - Bernoulli response
###############################################################
bayesfit = bayesglm.fit(cbind(1,X),y,family=binomial(link="logit"))
bayes.coef = bayesfit$coef

plot(glm.coef,ylim=range(glm.coef,bayes.coef),ylab="Coefficient",xlab="",pch=16)
abline(h=0,lty=2)
points(bayes.coef,col=2,pch=16)
legend("topright",legend=c("GLM","Bayes GLM"),col=1:2,pch=16,bty="n")

###############################################################
# Sparse generalized linear model - Bernoulli response
###############################################################
grid = exp(seq(-10,-2,by=0.1))
glm.ridge = glmnet(X,y,family="binomial",alpha=0,lambda=grid)
glm.lasso = glmnet(X,y,family="binomial",alpha=1,lambda=grid)
glm.enet  = glmnet(X,y,family="binomial",alpha=0.5,lambda=grid)

par(mfrow=c(1,3))
plot(glm.ridge,main="Ridge regression\n")
plot(glm.enet,main="Elastic net regression\n")
plot(glm.lasso,main="LASSO regression\n")

# Coefficient paths as functions of lambda
par(mfrow=c(1,2))
plot(glm.ridge$lambda,glm.ridge$beta[1,],ylab="Estimated coefficient",
     xlab="Lambda",ylim=range(glm.ridge$beta),type="l",col=grey(0.7))
for (i in 1:p) lines(glm.ridge$lambda,glm.ridge$beta[i,],col=grey(0.7))
for (i in 1:p) lines(glm.lasso$lambda,glm.lasso$beta[i,],col=2)
title("Ridge vs LASSO")
plot(glm.enet$lambda,glm.enet$beta[1,],ylab="Estimated coefficient",
     xlab="Lambda",ylim=range(glm.enet$beta),type="l",col=grey(0.7))
for (i in 1:p) lines(glm.enet$lambda,glm.enet$beta[i,],col=grey(0.7))
for (i in 1:p) lines(glm.lasso$lambda,glm.lasso$beta[i,],col=2)
title("Elastic net vs LASSO")
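# Optional sanity check (an added illustration, not part of the original
# analysis): the deviance() helper defined above should reproduce the residual
# deviance reported by glm() on the full data, since the saturated Bernoulli
# log-likelihood is zero and hence residual deviance = -2*loglik.
c(helper = deviance(y,cbind(1,X),glm.coef), glm.reported = glmfit$deviance)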
###############################################################
# Cross-validated choice of lambda for each penalty
###############################################################
par(mfrow=c(1,3))
ridge.cvout = cv.glmnet(X,y,family="binomial",alpha=0)
plot(ridge.cvout)
ridge.bestlambda = ridge.cvout$lambda.min
ridge.bestmodel  = glmnet(X,y,family="binomial",alpha=0,lambda=ridge.bestlambda)
ridge.coef = coef(ridge.bestmodel)[,1]

enet.cvout = cv.glmnet(X,y,family="binomial",alpha=1/2)
plot(enet.cvout)
enet.bestlambda = enet.cvout$lambda.min
enet.bestmodel  = glmnet(X,y,family="binomial",alpha=1/2,lambda=enet.bestlambda)
enet.coef = coef(enet.bestmodel)[,1]

lasso.cvout = cv.glmnet(X,y,family="binomial",alpha=1)
plot(lasso.cvout)
lasso.bestlambda = lasso.cvout$lambda.min
lasso.bestmodel  = glmnet(X,y,family="binomial",alpha=1,lambda=lasso.bestlambda)
lasso.coef = coef(lasso.bestmodel)[,1]

# Comparing estimated coefficients across the five linear fits
range1 = range(glm.coef,bayes.coef,ridge.coef,enet.coef,lasso.coef)
par(mfrow=c(1,1))
plot(0:p,glm.coef,ylab="Coefficient",ylim=range1,pch=16,xlab="Coefficient index",col=0)
for (i in 0:p){
  segments(i-0.25,0,i-0.25,glm.coef[i+1],col=1,lwd=3)
  segments(i-0.15,0,i-0.15,bayes.coef[i+1],col=2,lwd=3)
  segments(i,0,i,ridge.coef[i+1],col=3,lwd=3)
  segments(i+0.15,0,i+0.15,enet.coef[i+1],col=4,lwd=3)
  segments(i+0.25,0,i+0.25,lasso.coef[i+1],col=5,lwd=3)
}
legend("topright",legend=c("GLM","BGLM","Ridge","Elastic net","LASSO"),col=1:5,lty=1,lwd=3)

# Same comparison, excluding the unpenalized GLM estimates
range2 = range(bayes.coef,ridge.coef,enet.coef,lasso.coef)
par(mfrow=c(1,1))
plot(0:p,bayes.coef,ylab="Coefficient",ylim=range2,pch=16,xlab="Coefficient index",col=0)
for (i in 0:p){
  segments(i-0.15,0,i-0.15,bayes.coef[i+1],col=2,lwd=3)
  segments(i-0.05,0,i-0.05,ridge.coef[i+1],col=3,lwd=3)
  segments(i+0.05,0,i+0.05,enet.coef[i+1],col=4,lwd=3)
  segments(i+0.15,0,i+0.15,lasso.coef[i+1],col=5,lwd=3)
}
legend("topright",legend=c("BGLM","Ridge","Elastic net","LASSO"),col=2:5,lty=1,lwd=3)

#################################################################
# Classification and regression tree (CART) - Bernoulli response
#################################################################
tree.fit = tree(sta~.,data=icu[,-1])
summary(tree.fit)
tree.fit
plot(tree.fit,type="uniform")
text(tree.fit,pretty=0)
# In-sample fitted probabilities; column 1 corresponds to the first level of sta ("Lived")
pred.cart = predict(tree.fit)[,1]

# In-sample fitted probabilities: GLM vs CART
par(mfrow=c(2,2))
plot(glmfit$fit[y==1],ylim=c(0,1),xlab="Patients who lived",ylab="Probability patient lived")
title("GLM")
abline(h=0.5,lty=2)
legend("bottomleft",legend=paste("<0.5 = ",round(mean(glmfit$fit[y==1]<0.5),3)))
plot(glmfit$fit[y==0],ylim=c(0,1),xlab="Patients who died",ylab="Probability patient lived")
title("GLM")
abline(h=0.5,lty=2)
legend("topright",legend=paste(">0.5 = ",round(mean(glmfit$fit[y==0]>0.5),3)))
plot(pred.cart[y==1],ylim=c(0,1),xlab="Patients who lived",ylab="Probability patient lived")
title("CART")
abline(h=0.5,lty=2)
legend("bottomleft",legend=paste("<0.5 = ",round(mean(pred.cart[y==1]<0.5),3)))
plot(pred.cart[y==0],ylim=c(0,1),xlab="Patients who died",ylab="Probability patient lived")
title("CART")
abline(h=0.5,lty=2)
legend("topright",legend=paste(">0.5 = ",round(mean(pred.cart[y==0]>0.5),3)))

# ROC curves: GLM vs CART
cutoff = seq(0.01,0.99,by=0.01)
nc = length(cutoff)
errors = matrix(0,nc,4)
for (i in 1:nc){
  errors[i,] = c(mean(glmfit$fit[y==1]<=cutoff[i]),
                 mean(glmfit$fit[y==0]>cutoff[i]),
                 mean(pred.cart[y==1]<=cutoff[i]),
                 mean(pred.cart[y==0]>cutoff[i]))
}
par(mfrow=c(1,1))
plot(errors[,2],1-errors[,1],xlab="False positive rate",ylab="True positive rate",pch=16)
points(errors[,4],1-errors[,3],col=2,pch=16)
legend("bottomright",legend=c("GLM","CART"),col=1:2,pch=16,bty="n")
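# Note (an addition to the original script): the original code approximated
# AUC by a Riemann sum of the TPR over the cutoff grid, which estimates the
# mean predicted probability among positives rather than the area under the
# ROC curve. The helper below (auc_trap, introduced here) instead applies the
# trapezoidal rule to the observed (FPR,TPR) pairs and is used in the model
# comparisons that follow.
auc_trap = function(fpr,tpr){
  # sort by FPR, pad with the ROC end points (0,0) and (1,1), integrate by trapezoids
  o = order(fpr,tpr)
  x = c(0,fpr[o],1)
  y = c(0,tpr[o],1)
  sum(diff(x)*(head(y,-1)+tail(y,-1))/2)
}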
# BART for classification (logit BART); posterior median of the
# in-sample success probabilities
bart = lbart(X,y)
pred.bart = apply(1/(1+exp(-bart$yhat.train)),2,median)

# Random forest (out-of-bag vote fractions for "Lived")
rf = randomForest(sta~.,data=icu[,-1],mtry=10,importance=TRUE)
pred.rf = rf$votes[,1]

# Now comparing all 8 models on the full data
cuts = c(seq(0.001,0.999,by=0.01),0.9999)
ncuts = length(cuts)
tpr = matrix(0,ncuts,8)
fpr = matrix(0,ncuts,8)
eta = matrix(0,n,5)
eta[,1] = cbind(1,X)%*%glm.coef
eta[,2] = cbind(1,X)%*%bayes.coef
eta[,3] = cbind(1,X)%*%ridge.coef
eta[,4] = cbind(1,X)%*%enet.coef
eta[,5] = cbind(1,X)%*%lasso.coef
pred = exp(eta)/(1+exp(eta))
pred = cbind(pred,pred.cart,pred.bart,pred.rf)
for (i in 1:ncuts){
  cut = cuts[i]
  for (j in 1:8){
    class = rep(0,n)
    class[pred[,j]>=cut] = 1
    tpr[i,j] = sum((class==1)&(y==1))/sum(y==1)
    fpr[i,j] = sum((class==1)&(y==0))/sum(y==0)
  }
}
auc = rep(0,8)
for (j in 1:8) auc[j] = round(auc_trap(fpr[,j],tpr[,j]),3)

par(mfrow=c(1,1))
plot(fpr[,1],tpr[,1],xlim=c(0,1),ylim=c(0,1),type="l",lwd=2,
     xlab="False positive rate (1-Specificity)",ylab="True positive rate (Sensitivity)")
abline(0,1,lty=2)
for (i in 2:8) lines(fpr[,i],tpr[,i],col=i,lwd=2)
legend("bottomright",legend=c(
  paste("GLM (AUC=",auc[1],")",sep=""),
  paste("BGLM (AUC=",auc[2],")",sep=""),
  paste("RIDGE (AUC=",auc[3],")",sep=""),
  paste("ENET (AUC=",auc[4],")",sep=""),
  paste("LASSO (AUC=",auc[5],")",sep=""),
  paste("CART (AUC=",auc[6],")",sep=""),
  paste("BART (AUC=",auc[7],")",sep=""),
  paste("RF (AUC=",auc[8],")",sep="")),col=1:8,lty=1,lwd=2)
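# Aside (an added illustration, not in the original script): the forest above
# was fit with importance=TRUE, so randomForest's variable importance measures
# are available; they offer a tree-based counterpart to the coefficient
# comparisons of the linear models.
round(importance(rf),2)
varImpPlot(rf,main="Random forest variable importance")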
############################################################################
# Repeated random train/test splits: 120 patients for training, 80 for testing
############################################################################
set.seed(1243253)
Rep = 100
gsize = 80
K = 2
error = array(0,c(Rep,K,8))
auc = array(0,c(Rep,K,8))
dev = array(0,c(Rep,K,5))
for (rep in 1:Rep){
  # Shuffle patients once per replication; keep y, X and the icu data frame aligned
  ind = sample(1:n)
  y1 = y[ind]
  X1 = X[ind,]
  icu1 = icu[ind,]
  for (k in 1:K){
    # Training/testing split
    # ----------------------
    test = sort(sample(1:n,size=gsize,replace=FALSE))
    train = setdiff(1:n,test)
    ytrain = y1[train]
    Xtrain = X1[train,]
    ytest = y1[test]
    Xtest = X1[test,]

    # Classical GLM
    # -------------
    glmfit = glm(ytrain~Xtrain,family=binomial)
    glm.coef = glmfit$coef
    eta = cbind(1,Xtest)%*%glm.coef
    glm.pred = exp(eta)/(1+exp(eta))
    glm.class = rep(0,length(eta))
    glm.class[glm.pred>0.5] = 1
    error[rep,k,1] = mean(ytest!=glm.class)
    dev[rep,k,1] = deviance(ytest,cbind(1,Xtest),glm.coef)

    # Bayesian GLM
    # ------------
    bayesfit = bayesglm.fit(cbind(1,Xtrain),ytrain,family=binomial(link="logit"))
    bayes.coef = bayesfit$coef
    eta = cbind(1,Xtest)%*%bayes.coef
    bayes.pred = exp(eta)/(1+exp(eta))
    bayes.class = rep(0,length(ytest))
    bayes.class[bayes.pred>0.5] = 1
    error[rep,k,2] = mean(ytest!=bayes.class)
    dev[rep,k,2] = deviance(ytest,cbind(1,Xtest),bayes.coef)

    # Ridge, elastic net and LASSO logistic regressions
    # -------------------------------------------------
    ridge.cvout = cv.glmnet(Xtrain,ytrain,family="binomial",alpha=0)
    ridge.bestlambda = ridge.cvout$lambda.min
    ridge.bestmodel = glmnet(Xtrain,ytrain,family="binomial",alpha=0,lambda=ridge.bestlambda)
    ridge.coef = coef(ridge.bestmodel)[,1]
    ridge.class = predict(ridge.bestmodel,newx=Xtest,s=ridge.bestlambda,type="class")
    error[rep,k,3] = mean(ytest!=ridge.class)
    dev[rep,k,3] = deviance(ytest,cbind(1,Xtest),ridge.coef)

    enet.cvout = cv.glmnet(Xtrain,ytrain,family="binomial",alpha=1/2)
    enet.bestlambda = enet.cvout$lambda.min
    enet.bestmodel = glmnet(Xtrain,ytrain,family="binomial",alpha=1/2,lambda=enet.bestlambda)
    enet.coef = coef(enet.bestmodel)[,1]
    enet.class = predict(enet.bestmodel,newx=Xtest,s=enet.bestlambda,type="class")
    error[rep,k,4] = mean(ytest!=enet.class)
    dev[rep,k,4] = deviance(ytest,cbind(1,Xtest),enet.coef)

    lasso.cvout = cv.glmnet(Xtrain,ytrain,family="binomial",alpha=1)
    lasso.bestlambda = lasso.cvout$lambda.min
    lasso.bestmodel = glmnet(Xtrain,ytrain,family="binomial",alpha=1,lambda=lasso.bestlambda)
    lasso.coef = coef(lasso.bestmodel)[,1]
    lasso.class = predict(lasso.bestmodel,newx=Xtest,s=lasso.bestlambda,type="class")
    error[rep,k,5] = mean(ytest!=lasso.class)
    dev[rep,k,5] = deviance(ytest,cbind(1,Xtest),lasso.coef)

    # Classification tree (CART), fit on the shuffled data so rows match ytest
    tree.fit = tree(sta~.,data=icu1[train,-1])
    pred.cart = predict(tree.fit,newdata=icu1[test,-1])[,1]
    class.cart = rep(0,length(pred.cart))
    class.cart[pred.cart>0.5] = 1
    error[rep,k,6] = mean(ytest!=class.cart)

    # Bayesian additive regression trees (BART) for binary response
    bart = lbart(x.train=Xtrain,y.train=ytrain,x.test=Xtest)
    pred.bart = apply(1/(1+exp(-bart$yhat.test)),2,median)
    class.bart = rep(0,length(pred.bart))
    class.bart[pred.bart>0.5] = 1
    error[rep,k,7] = mean(ytest!=class.bart)

    # Random forest, also on the shuffled data
    rf = randomForest(sta~.,data=icu1[,-1],subset=train,mtry=10,importance=TRUE)
    pred.rf = predict(rf,newdata=icu1[test,],"prob")[,1]
    class.rf = rep(0,length(pred.rf))
    class.rf[pred.rf>0.5] = 1
    error[rep,k,8] = mean(ytest!=class.rf)

    # Out-of-sample true/false positive rates and trapezoidal AUC
    eta = matrix(0,length(ytest),5)
    eta[,1] = cbind(1,Xtest)%*%glm.coef
    eta[,2] = cbind(1,Xtest)%*%bayes.coef
    eta[,3] = cbind(1,Xtest)%*%ridge.coef
    eta[,4] = cbind(1,Xtest)%*%enet.coef
    eta[,5] = cbind(1,Xtest)%*%lasso.coef
    pred = exp(eta)/(1+exp(eta))
    pred = cbind(pred,pred.cart,pred.bart,pred.rf)
    tpr = matrix(0,ncuts,8)
    fpr = matrix(0,ncuts,8)
    for (i in 1:ncuts){
      for (j in 1:8){
        class = rep(0,length(ytest))
        class[pred[,j]>=cuts[i]] = 1
        tpr[i,j] = sum((class==1)&(ytest==1))/sum(ytest==1)
        fpr[i,j] = sum((class==1)&(ytest==0))/sum(ytest==0)
      }
    }
    for (j in 1:8) auc[rep,k,j] = round(auc_trap(fpr[,j],tpr[,j]),3)
  }
}

# Stacking the Rep*K splits into matrices: one column per model
tab = cbind(
  matrix(error[,,1],Rep*K,1),
  matrix(error[,,2],Rep*K,1),
  matrix(error[,,3],Rep*K,1),
  matrix(error[,,4],Rep*K,1),
  matrix(error[,,5],Rep*K,1),
  matrix(error[,,6],Rep*K,1),
  matrix(error[,,7],Rep*K,1),
  matrix(error[,,8],Rep*K,1))

tab1 = cbind(
  matrix(dev[,,1],Rep*K,1),
  matrix(dev[,,2],Rep*K,1),
  matrix(dev[,,3],Rep*K,1),
  matrix(dev[,,4],Rep*K,1),
  matrix(dev[,,5],Rep*K,1))
# Drop splits where the GLM produced NA coefficients (and hence NA deviance)
tab1 = tab1[complete.cases(tab1),]

tab2 = cbind(
  matrix(auc[,,1],Rep*K,1),
  matrix(auc[,,2],Rep*K,1),
  matrix(auc[,,3],Rep*K,1),
  matrix(auc[,,4],Rep*K,1),
  matrix(auc[,,5],Rep*K,1),
  matrix(auc[,,6],Rep*K,1),
  matrix(auc[,,7],Rep*K,1),
  matrix(auc[,,8],Rep*K,1))
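# Aside (an added illustration, not in the original script): numeric summaries
# of the out-of-sample classification errors, to accompany the boxplots below.
models = c("GLM","BGLM","RIDGE","ENET","LASSO","CART","BART","RF")
err.summary = rbind(mean=colMeans(tab),sd=apply(tab,2,sd))
colnames(err.summary) = models
round(err.summary,3)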
pdf(file="icu-glm-bayes-ridge-enet-lasso-cart-bart-rf.pdf",width=12,height=8)

# Out-of-sample classification error; the number above each box is the
# proportion of splits in which that model attains the smallest error
par(mfrow=c(1,1))
boxplot(tab,names=c("GLM","BGLM","RIDGE","ENET","LASSO","CART","BART","RF"),
        ylab="Classification error",ylim=c(min(tab),0.45),outline=FALSE)
for (i in 1:8) text(i,0.45,round(mean(tab[,i]==apply(tab,1,min)),3))
title(paste(Rep," replications of ",K," random train/test splits",sep=""))

# Out-of-sample AUC; the number above each box is the proportion of splits
# in which that model attains the largest AUC
par(mfrow=c(1,1))
boxplot(tab2,names=c("GLM","BGLM","RIDGE","ENET","LASSO","CART","BART","RF"),
        ylab="Area under the curve (AUC)",outline=FALSE,ylim=c(0.65,1.0))
for (i in 1:8){
  best = tab2[,i]==apply(tab2,1,max)
  text(i,1.0,round(mean(best),3))
}
title(paste(Rep," replications of ",K," random train/test splits",sep=""))

# Out-of-sample deviance (GLM-type models only); the number above each box is
# the proportion of splits in which that model attains the smallest deviance
par(mfrow=c(1,1))
boxplot(log(tab1),names=c("GLM","BGLM","RIDGE","ENET","LASSO"),
        ylab="Log(deviance)",outline=FALSE)
for (i in 1:5){
  best = tab1[,i]==apply(tab1,1,min)
  text(i,par("usr")[4],round(mean(best),3),pos=1)
}
title(paste(Rep," replications of ",K," random train/test splits",sep=""))
dev.off()
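# Aside (an added illustration, not in the original script): proportion of
# splits in which each model attains the largest out-of-sample AUC (ties
# credit every tied model), as a numeric companion to the boxplot annotations.
auc.win = colMeans(tab2==apply(tab2,1,max))
names(auc.win) = c("GLM","BGLM","RIDGE","ENET","LASSO","CART","BART","RF")
round(auc.win,3)

# Finally, detach icu since it was attached at the top of the script.
detach(icu)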