Models are compared using bootstrap-style replications of K-fold cross-validation: before each replication the observations are randomly re-shuffled and then split into K folds (a minimal index-splitting sketch is given below). The measures of comparison are:
Out-of-sample classification error (probability cut-off of 0.5)
Out-of-sample deviance (goodness of fit)
Out-of-sample area under the ROC curve (AUC)
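For concreteness, here is a minimal sketch of one replication of the fold-splitting scheme (kfold_indices is a hypothetical helper written only for illustration; the analysis below builds the folds inline):
kfold_indices = function(n,K){
ind = sample(1:n)                 # random permutation of the row indices
split(ind,rep(1:K,length.out=n))  # deal the shuffled indices into K test folds
}
str(kfold_indices(20,2))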
#install.packages("aplore3")
#install.packages("glmnet")
#install.packages("arm")
library("aplore3")
library("glmnet")
## Loading required package: Matrix
## Loaded glmnet 4.1-1
library("arm")
## Loading required package: MASS
## Loading required package: lme4
##
## arm (Version 1.11-2, built: 2020-7-27)
## Working directory is /Users/hedibert/Dropbox/Cursos/Cursos2021/BayesianLearning/Classes/Class8
# Binomial (logistic) deviance: -2 times the log-likelihood, where
# loglik = sum_i [ y_i*eta_i - log(1+exp(eta_i)) ] with eta = X%*%beta
deviance = function(y,X,beta){
fit = X%*%beta
return(-2*(sum(fit*y)-sum(log(1+exp(fit)))))
}
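A quick sanity check of the deviance function against dbinom (an illustrative sketch; xs, ys and b are made-up toy inputs, not part of the ICU analysis):
xs = cbind(1,rnorm(10)); b = c(0.2,-0.5); ys = rbinom(10,1,0.5)
prob = 1/(1+exp(-xs%*%b))   # inverse logit
all.equal(deviance(ys,xs,b),-2*sum(dbinom(ys,1,prob,log=TRUE)))   # should be TRUE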
attach(icu)   # ICU study data (Hosmer, Lemeshow and Sturdivant) from aplore3
n = nrow(icu)
# Response variable: 1 = patient lived, 0 = patient died
y = rep(0,n)
y[sta=="Lived"] = 1
# Predictors
X = matrix(0,n,21)
X[,1] = age
X[,2] = sys
X[,3] = hra
X[gender=="Male",4] = 1
X[race=="White",5] = 1
X[race=="Black",6] = 1
X[ser=="Medical",7] = 1
X[can=="Yes",8] = 1
X[crn=="Yes",9] = 1
X[inf=="Yes",10] = 1
X[cpr=="Yes",11] = 1
X[pre=="Yes",12] = 1
X[type=="Emergency",13] = 1
X[fra=="Yes",14] = 1
X[po2=="> 60",15] = 1
X[ph==">= 7.25",16] = 1
X[pco=="<= 45",17] = 1
X[bic==">= 18",18] = 1
X[cre=="<= 2.0",19] = 1
X[loc=="Stupor",20] = 1
X[loc=="Coma",21] = 1
p = ncol(X)
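The design matrix above hand-codes the dummy variables; an equivalent construction could use model.matrix (a sketch, noting that the dummy coding and column order may differ from the hand-coded X):
X.alt = model.matrix(~age+sys+hra+gender+race+ser+can+crn+inf+cpr+pre+
type+fra+po2+ph+pco+bic+cre+loc,data=icu)[,-1]
dim(X.alt)   # should be 200 x 21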
glmfit = glm(y~X,family=binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
glm.coef = glmfit$coef
summary(glmfit)
##
## Call:
## glm(formula = y ~ X, family = binomial)
##
## Deviance Residuals:
##      Min        1Q    Median        3Q       Max
## -3.01708   0.00019   0.17867   0.53717   1.50525
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 7.512e+00 3.066e+00 2.450 0.01429 *
## X1 -5.645e-02 1.848e-02 -3.055 0.00225 **
## X2 2.084e-02 9.443e-03 2.207 0.02732 *
## X3 2.915e-03 1.032e-02 0.282 0.77761
## X4 -7.215e-01 5.460e-01 -1.321 0.18639
## X5 5.829e-01 1.313e+00 0.444 0.65696
## X6 1.676e+01 1.314e+03 0.013 0.98982
## X7 -6.739e-01 6.289e-01 -1.071 0.28398
## X8 -3.483e+00 1.121e+00 -3.106 0.00189 **
## X9 -1.191e-01 8.449e-01 -0.141 0.88786
## X10 1.081e-01 5.557e-01 0.195 0.84573
## X11 -1.032e+00 9.901e-01 -1.043 0.29714
## X12 -1.279e+00 7.022e-01 -1.822 0.06842 .
## X13 -3.748e+00 1.342e+00 -2.792 0.00523 **
## X14 -1.649e+00 1.093e+00 -1.509 0.13139
## X15 -6.765e-01 9.402e-01 -0.720 0.47179
## X16 1.771e+00 1.212e+00 1.461 0.14410
## X17 -2.084e+00 1.165e+00 -1.789 0.07361 .
## X18 -2.623e-01 8.967e-01 -0.293 0.76985
## X19 1.004e-01 1.131e+00 0.089 0.92925
## X20 -3.771e+01 2.487e+03 -0.015 0.98790
## X21 -3.458e+00 1.342e+00 -2.578 0.00994 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 200.16 on 199 degrees of freedom
## Residual deviance: 112.17 on 178 degrees of freedom
## AIC: 156.17
##
## Number of Fisher Scoring iterations: 17
bayesfit = bayesglm.fit(cbind(1,X),y,family=binomial(link = "logit"))
bayes.coef = bayesfit$coef
plot(glm.coef,ylim=range(glm.coef,bayes.coef),ylab="Coefficient",xlab="",pch=16)
abline(h=0,lty=2)
points(bayes.coef,col=2,pch=16)
legend("topright",legend=c("GLM","Bayes GLM"),col=1:2,pch=16,bty="n")
grid = exp(seq(-10,-2,by=0.1))   # lambda grid, equally spaced on the log scale
glm.ridge = glmnet(X,y,family="binomial",alpha=0,lambda=grid)
glm.lasso = glmnet(X,y,family="binomial",alpha=1,lambda=grid)
glm.enet = glmnet(X,y,family="binomial",alpha=0.5,lambda=grid)
par(mfrow=c(1,3))
plot(glm.ridge,main="Ridge regression\n")
plot(glm.enet,main="Elastic net regression\n")
plot(glm.lasso,main="LASSO regression\n")
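plot.glmnet draws the coefficient paths against the L1 norm of the fit by default; the same paths can also be drawn against log(lambda) via the xvar argument (a sketch):
par(mfrow=c(1,3))
plot(glm.ridge,xvar="lambda",main="Ridge regression\n")
plot(glm.enet,xvar="lambda",main="Elastic net regression\n")
plot(glm.lasso,xvar="lambda",main="LASSO regression\n")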
par(mfrow=c(1,2))
plot(glm.ridge$lambda,glm.ridge$beta[1,],ylab="Estimated coefficient",
xlab="Lambda",ylim=range(glm.ridge$beta),type="l",col=grey(0.7))
for (i in 1:p)
lines(glm.ridge$lambda,glm.ridge$beta[i,],col=grey(0.7))
for (i in 1:p)
lines(glm.lasso$lambda,glm.lasso$beta[i,],col=2)
title("Ridge vs Lasso")
plot(glm.enet$lambda,glm.enet$beta[1,],ylab="Estimated coefficient",
xlab="Lambda",ylim=range(glm.enet$beta),type="l",col=grey(0.7))
for (i in 1:p)
lines(glm.enet$lambda,glm.enet$beta[i,],col=grey(0.7))
for (i in 1:p)
lines(glm.lasso$lambda,glm.lasso$beta[i,],col=2)
title("Elastic net vs Lasso")
par(mfrow=c(1,3))
ridge.cvout = cv.glmnet(X,y,family="binomial",alpha=0)
plot(ridge.cvout)
ridge.bestlambda = ridge.cvout$lambda.min
ridge.bestmodel= glmnet(X,y,family="binomial",alpha=0,lambda=ridge.bestlambda)
ridge.coef = coef(ridge.bestmodel)[,1]
enet.cvout = cv.glmnet(X,y,family="binomial",alpha=1/2)
plot(enet.cvout)
enet.bestlambda = enet.cvout$lambda.min
enet.bestmodel= glmnet(X,y,family="binomial",alpha=1/2,lambda=enet.bestlambda)
enet.coef = coef(enet.bestmodel)[,1]
lasso.cvout = cv.glmnet(X,y,family="binomial",alpha=1)
plot(lasso.cvout)
lasso.bestlambda = lasso.cvout$lambda.min
lasso.bestmodel= glmnet(X,y,family="binomial",alpha=1,lambda=lasso.bestlambda)
lasso.coef = coef(lasso.bestmodel)[,1]
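Besides lambda.min, cv.glmnet also reports lambda.1se, the largest lambda whose CV error is within one standard error of the minimum; it yields sparser, more conservative fits. A sketch for the LASSO:
lasso.bestlambda.1se = lasso.cvout$lambda.1se
coef(glmnet(X,y,family="binomial",alpha=1,lambda=lasso.bestlambda.1se))[,1]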
rng = range(glm.coef,bayes.coef,ridge.coef,enet.coef,lasso.coef)
par(mfrow=c(1,1))
plot(0:p,glm.coef,ylab="Coefficient",ylim=rng,pch=16,xlab="Coefficient index",col=0)
for (i in 0:p){
segments(i-0.25,0,i-0.25,glm.coef[i+1],col=1,lwd=3)
segments(i-0.15,0,i-0.15,bayes.coef[i+1],col=2,lwd=3)
segments(i,0,i,ridge.coef[i+1],col=3,lwd=3)
segments(i+0.15,0,i+0.15,enet.coef[i+1],col=4,lwd=3)
segments(i+0.25,0,i+0.25,lasso.coef[i+1],col=5,lwd=3)
}
legend("topright",legend=c("GLM","BGLM","Ridge","Elasticnet","Lasso"),col=1:5,lty=1,lwd=3)
rng = range(bayes.coef,ridge.coef,enet.coef,lasso.coef)
par(mfrow=c(1,1))
plot(0:p,bayes.coef,ylab="Coefficient",ylim=rng,pch=16,xlab="Coefficient index",col=0)
for (i in 0:p){
segments(i-0.15,0,i-0.15,bayes.coef[i+1],col=2,lwd=3)
segments(i-0.05,0,i-0.05,ridge.coef[i+1],col=3,lwd=3)
segments(i+0.05,0,i+0.05,enet.coef[i+1],col=4,lwd=3)
segments(i+0.15,0,i+0.15,lasso.coef[i+1],col=5,lwd=3)
}
legend("topright",legend=c("BGLM","Ridge","Elasticnet","Lasso"),col=2:5,lty=1,lwd=3)
# ROC curves: sweep a grid of probability cut-offs
cuts = c(seq(0.001,0.999,by=0.01),0.9999)
ncuts = length(cuts)
tpr = matrix(0,ncuts,5)   # true positive rates, one column per model
fpr = matrix(0,ncuts,5)   # false positive rates, one column per model
eta = matrix(0,n,5)       # in-sample linear predictors, one column per model
eta[,1] = cbind(1,X)%*%glm.coef
eta[,2] = cbind(1,X)%*%bayes.coef
eta[,3] = cbind(1,X)%*%ridge.coef
eta[,4] = cbind(1,X)%*%enet.coef
eta[,5] = cbind(1,X)%*%lasso.coef
pred = exp(eta)/(1+exp(eta))   # inverse logit: fitted probabilities
for (i in 1:ncuts){
cut = cuts[i]
for (j in 1:5){
class = rep(0,n)
class[pred[,j]>=cut] = 1
tpr[i,j] = sum((class==1)&(y==1))/sum(y==1)
fpr[i,j] = sum((class==1)&(y==0))/sum(y==0)
}
}
# Area under the ROC curve via the trapezoidal rule over (fpr,tpr);
# fpr decreases from ~1 to ~0 as the cut-off increases
auc = rep(0,5)
for (j in 1:5)
auc[j] = round(sum(-diff(fpr[,j])*(tpr[-1,j]+tpr[-ncuts,j])/2),3)
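As a cross-check, the AUC equals the Mann-Whitney probability that a randomly chosen survivor receives a higher fitted probability than a randomly chosen non-survivor; auc.rank below is an ad-hoc helper written only for this check, and its output should closely match the trapezoidal values:
auc.rank = function(y,prob){
r = rank(prob)   # average ranks handle ties correctly
n1 = sum(y==1); n0 = sum(y==0)
(sum(r[y==1])-n1*(n1+1)/2)/(n1*n0)
}
auc.rank(y,pred[,1])   # rank-based AUC for the classical GLM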
par(mfrow=c(1,1))
plot(fpr[,1],tpr[,1],xlim=c(0,1),ylim=c(0,1),type="l",lwd=2,
xlab="False positive rate (1-Specificity)",ylab="True positive rate (Sensitivity)")
abline(0,1,lty=2)
for (i in 2:5)
lines(fpr[,i],tpr[,i],col=i,lwd=2)
legend("bottomright",legend=c(
paste("GLM (AUC=",auc[1],")",sep=""),
paste("BGLM (AUC=",auc[2],")",sep=""),
paste("RIDGE (AUC=",auc[3],")",sep=""),
paste("ENET (AUC=",auc[4],")",sep=""),
paste("LASSO (AUC=",auc[5],")",sep="")),col=1:5,lty=1,lwd=2)
set.seed(1243253)
Rep = 100    # number of CV replications
K = 2        # number of folds
gsize = n/K  # fold size (assumes n is divisible by K)
n2 = n-gsize # training-set size
error = array(0,c(Rep,K,5))  # out-of-sample classification errors
dev = array(0,c(Rep,K,5))    # out-of-sample deviances
auc = array(0,c(Rep,K,5))    # out-of-sample AUCs
for (rep in 1:Rep){
ind = sample(1:n)   # fresh random shuffle of the observations
y1 = y[ind]
X1 = X[ind,]
for (k in 1:K){
test = ((k-1)*gsize+1):(k*gsize)   # k-th block of the shuffled data is the test fold
train = setdiff(1:n,test)
ytrain = y1[train]
Xtrain = X1[train,]
ytest = y1[test]
Xtest = X1[test,]
# Classical GLM
# -------------
glmfit = glm(ytrain~Xtrain,family=binomial)
glm.coef = glmfit$coef
eta = cbind(1,Xtest)%*%glm.coef
glm.pred = exp(eta)/(1+exp(eta))
glm.class = rep(0,length(eta))
glm.class[glm.pred>0.5] = 1
error[rep,k,1] = mean(ytest!=glm.class)
dev[rep,k,1] = deviance(ytest,cbind(1,Xtest),glm.coef)
# Bayesian GLM
# ------------
bayesfit = bayesglm.fit(cbind(1,Xtrain),ytrain,family=binomial(link = "logit"))
bayes.coef = bayesfit$coef
eta = cbind(1,Xtest)%*%bayes.coef
bayes.pred = exp(eta)/(1+exp(eta))
bayes.class = rep(0,length(ytest))
bayes.class[bayes.pred>0.5]=1
error[rep,k,2] = mean(ytest!=bayes.class)
dev[rep,k,2] = deviance(ytest,cbind(1,Xtest),bayes.coef)
# Ridge, elastic net and LASSO
# ----------------------------
ridge.cvout = cv.glmnet(Xtrain,ytrain,family="binomial",alpha=0)
ridge.bestlambda = ridge.cvout$lambda.min
ridge.bestmodel = glmnet(Xtrain,ytrain,family="binomial",alpha=0,lambda=ridge.bestlambda)
ridge.coef = coef(ridge.bestmodel)[,1]
ridge.class = predict(ridge.bestmodel,newx=Xtest,s=ridge.bestlambda,type="class")
error[rep,k,3] = mean(ytest!=ridge.class)
dev[rep,k,3] = deviance(ytest,cbind(1,Xtest),ridge.coef)
enet.cvout = cv.glmnet(Xtrain,ytrain,family="binomial",alpha=1/2)
enet.bestlambda = enet.cvout$lambda.min
enet.bestmodel = glmnet(Xtrain,ytrain,family="binomial",alpha=1/2,lambda=enet.bestlambda)
enet.coef = coef(enet.bestmodel)[,1]
enet.class = predict(enet.bestmodel,newx=Xtest,s=enet.bestlambda,type="class")
error[rep,k,4] = mean(ytest!=enet.class)
dev[rep,k,4] = deviance(ytest,cbind(1,Xtest),enet.coef)
lasso.cvout = cv.glmnet(Xtrain,ytrain,family="binomial",alpha=1)
lasso.bestlambda = lasso.cvout$lambda.min
lasso.bestmodel = glmnet(Xtrain,ytrain,family="binomial",alpha=1,lambda=lasso.bestlambda)
lasso.coef = coef(lasso.bestmodel)[,1]
lasso.class = predict(lasso.bestmodel,newx=Xtest,s=lasso.bestlambda,type="class")
error[rep,k,5] = mean(ytest!=lasso.class)
dev[rep,k,5] = deviance(ytest,cbind(1,Xtest),lasso.coef)
eta = matrix(0,length(ytest),5)
eta[,1] = cbind(1,Xtest)%*%glm.coef
eta[,2] = cbind(1,Xtest)%*%bayes.coef
eta[,3] = cbind(1,Xtest)%*%ridge.coef
eta[,4] = cbind(1,Xtest)%*%enet.coef
eta[,5] = cbind(1,Xtest)%*%lasso.coef
pred = exp(eta)/(1+exp(eta))
tpr = matrix(0,ncuts,5)
fpr = matrix(0,ncuts,5)
for (i in 1:ncuts){
for (j in 1:5){
class = rep(0,length(ytest))
class[pred[,j]>=cuts[i]]=1
tpr[i,j] = sum((class==1)&(ytest==1))/sum(ytest==1)
fpr[i,j] = sum((class==1)&(ytest==0))/sum(ytest==0)
}
}
# trapezoidal rule over (fpr,tpr), as in the in-sample ROC computation above
for (j in 1:5)
auc[rep,k,j] = round(sum(-diff(fpr[,j])*(tpr[-1,j]+tpr[-ncuts,j])/2),3)
}
}
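Before the graphical comparison, the Monte Carlo results can be summarized numerically by averaging each criterion over all Rep*K train/test splits (a quick sketch):
apply(error,3,mean)            # mean classification error per model
apply(dev,3,mean,na.rm=TRUE)   # mean out-of-sample deviance per model
apply(auc,3,mean,na.rm=TRUE)   # mean out-of-sample AUC per model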
tab = cbind(matrix(error[,,1],Rep*K,1),
matrix(error[,,2],Rep*K,1),
matrix(error[,,3],Rep*K,1),
matrix(error[,,4],Rep*K,1),
matrix(error[,,5],Rep*K,1))
tab1 = cbind(matrix(dev[,,1],Rep*K,1),
matrix(dev[,,2],Rep*K,1),
matrix(dev[,,3],Rep*K,1),
matrix(dev[,,4],Rep*K,1),
matrix(dev[,,5],Rep*K,1))
# drop CV splits with an NA deviance (e.g. a classical GLM whose
# coefficients could not all be estimated on the training fold)
cond = tab1==apply(tab1,1,min)
ind = 1:(Rep*K)
tab1 = tab1[setdiff(ind,ind[is.na(cond[,1])]),]
tab2 = cbind(matrix(auc[,,1],Rep*K,1),
matrix(auc[,,2],Rep*K,1),
matrix(auc[,,3],Rep*K,1),
matrix(auc[,,4],Rep*K,1),
matrix(auc[,,5],Rep*K,1))
par(mfrow=c(1,3))
boxplot(tab,names=c("GLM","BGLM","RIDGE","ENET","LASSO"),ylab="Classification error",
ylim=c(min(tab),0.35),outline=FALSE)
# fraction of splits in which each model attains the lowest classification error
for (i in 1:5)
text(i,0.35,sum(tab[,i]==apply(tab,1,min))/nrow(tab))
title(paste(Rep," replications of ",K,"-fold CV",sep=""))
boxplot(tab1,names=c("GLM","BGLM","RIDGE","ENET","LASSO"),ylab="Deviance",outline=FALSE,
ylim=range(tab1))
# fraction of splits in which each model attains the smallest out-of-sample deviance
for (i in 1:5){
xxx = tab1[,i]==apply(tab1,1,min)
text(i,max(tab1),round(mean(xxx),3))
}
title(paste(Rep," replications of ",K,"-fold CV",sep=""))
boxplot(tab2,names=c("GLM","BGLM","RIDGE","ENET","LASSO"),ylab="Area under the curve (AUC)",
outline=FALSE,ylim=range(tab2))
# fraction of splits in which each model attains the highest out-of-sample AUC
for (i in 1:5){
xxx = tab2[,i]==apply(tab2,1,max)
text(i,max(tab2),round(mean(xxx),3))
}
title(paste(Rep," replications of ",K,"-fold CV",sep=""))