################################################################################################### # # Sparse logistic regression - the spam/ham dataset # # https://archive.ics.uci.edu/ml/datasets/spambase # For more information, see file 'spambase.DOCUMENTATION' at the # UCI Machine Learning Repository: http://www.ics.uci.edu/~mlearn/MLRepository.html # ################################################################################################### #install.packages("glmnet") #install.packages("BayesLogit") rm(list=ls()) library(glmnet) library(BayesLogit) names = c("make","address","all","3d","our","over","remove","internet","order","mail", "receive","will","people","report","addresses","free","business","email","you","credit", "your","font","000","money","hp","hpl","george","650","lab","labs","telnet","857","data", "415","85","technology","1999","parts","pm","direct","cs","meeting","original","project", "re","edu","table","conference","char;","char(","char[","char!","char$","char#","cap.ave", "cap.long","cap.tot") ####################################################### # Reading and transforming the data ####################################################### data = read.table("spamdata.txt",header=FALSE) p = ncol(data)-1 n = nrow(data) y = data[,58] x = as.matrix(data[,1:57]) x = x - matrix(apply(x,2,mean),n,p,byrow=TRUE) x = x %*% diag(sqrt(1/apply(x,2,var))) colnames(x) = names ####################################################### # Exploratory data anlaysis: means, variances, coefficient of variation ####################################################### my1 = apply(x[y==1,],2,mean) my0 = apply(x[y==0,],2,mean) sy1 = sqrt(apply(x[y==1,],2,var)) sy0 = sqrt(apply(x[y==0,],2,var)) diff = abs(my1-my0) ord = order(diff,decreasing=TRUE) pdf(file="spam-graph1.pdf",width=12,height=6) par(mfrow=c(1,1)) plot(my1[ord],ylim=c(-0.4,0.5),ylab="Predictor's sample mean",xlab="",axes=FALSE,pch=16) axis(2);box();axis(1,at=1:p,lab=names[ord],las=2) for (i in 1:p) segments(i,my1[ord[i]],i,my0[ord[i]],col=grey(0.8)) points(my1[ord],pch=16) points(my0[ord],col=2,pch=16) abline(h=0,lty=3) legend("topright",legend=c("SPAM","HAM"),col=1:2,pch=16) dev.off() pdf(file="spam-graph2.pdf",width=12,height=6) plot(sy1[ord]/abs(my1[ord]),ylim=c(0,30),ylab="Coefficient of variation",xlab="",axes=FALSE,pch=16) axis(2);box();axis(1,at=1:p,lab=names[ord],las=2) points(sy0[ord]/abs(my0[ord]),col=2,pch=16) abline(h=1,lty=3) dev.off() ####################################################### # Setting up training and sample subgroups ####################################################### set.seed(1) train = sample(1:nrow(x),nrow(x)/2) test = (-train) y.test = y[test] ####################################################### # GLS and Bayes fits ####################################################### glmfit = glm(y[train]~x[train,],family=binomial) glm.coef = glmfit$coef bayesfit = logit(y[train],cbind(1,x[train,])) bayes.coef = apply(bayesfit$beta,2,mean) ####################################################### # Ridge, Elasticnet & LASSO fits ####################################################### grid = exp(seq(-10,-2,by=0.1)) ridge.model = glmnet(x[train,],y[train],family="binomial",alpha=0,lambda=grid) enet.model = glmnet(x[train,],y[train],family="binomial",alpha=0.5,lambda=grid) lasso.model = glmnet(x[train,],y[train],family="binomial",alpha=1,lambda=grid) pdf(file="spam-graph3.pdf",width=12,height=6) par(mfrow=c(1,3)) plot(ridge.model,ylim=c(-17,2)) plot(enet.model,ylim=c(-17,2)) plot(lasso.model,ylim=c(-17,2)) dev.off() pdf(file="spam-graph4.pdf",width=12,height=6) par(mfrow=c(1,3)) ridge.cvout = cv.glmnet(x[train,],y[train],family="binomial",alpha=0) plot(ridge.cvout) ridge.bestlambda = ridge.cvout$lambda.min ridge.bestmodel= glmnet(x,y,family="binomial",alpha=0,lambda=ridge.bestlambda) ridge.coef = coef(ridge.bestmodel)[,1] enet.cvout = cv.glmnet(x[train,],y[train],family="binomial",alpha=0.5) plot(enet.cvout) enet.bestlambda = enet.cvout$lambda.min enet.bestmodel= glmnet(x,y,family="binomial",alpha=0.5,lambda=enet.bestlambda) enet.coef = coef(enet.bestmodel)[,1] lasso.cvout = cv.glmnet(x[train,],y[train],family="binomial",alpha=1) plot(lasso.cvout) lasso.bestlambda = lasso.cvout$lambda.min lasso.bestmodel= glmnet(x,y,family="binomial",alpha=0,lambda=lasso.bestlambda) lasso.coef = coef(lasso.bestmodel)[,1] dev.off() pdf(file="spam-graph5.pdf",width=9,height=6) par(mfrow=c(1,1)) plot(glm.coef,ylab="Coefficient",ylim=c(-3,3),pch=16,xlab="",axes=FALSE) axis(2);box();axis(1,at=1:p,lab=names,las=2) points(ridge.coef,col=2,pch=16) points(enet.coef,col=3,pch=16) points(lasso.coef,col=4,pch=16) points(bayes.coef,col=5,pch=16) abline(h=0,lty=2) legend("bottomright",legend=c("OLS","Ridge","Elasticnet","Lasso","Bayes"),col=1:5,pch=16) dev.off() pdf(file="spam-graph6.pdf",width=12,height=6) par(mfrow=c(1,4)) plot(glm.coef,ridge.coef,xlab="GLM",ylab="RIDGE",xlim=c(-3,3),ylim=c(-3,3));abline(0,1,lty=3);abline(h=0,lty=3) plot(glm.coef,enet.coef,xlab="GLM",ylab="ELASTICNET",xlim=c(-3,3),ylim=c(-3,3));abline(0,1,lty=3);abline(h=0,lty=3) plot(glm.coef,lasso.coef,xlab="GLM",ylab="LASSO",xlim=c(-3,3),ylim=c(-3,3));abline(0,1,lty=3);abline(h=0,lty=3) plot(glm.coef,bayes.coef,xlab="GLM",ylab="BAYES",xlim=c(-3,3),ylim=c(-3,3));abline(0,1,lty=3);abline(h=0,lty=3) dev.off() ####################################################### # Misclassification rates ####################################################### eta = cbind(1,x[test,])%*%glm.coef glm.pred = exp(eta)/(1+exp(eta)) glm.class = rep(0,length(eta)) glm.class[glm.pred>0.5]=1 eta = cbind(1,x[test,])%*%bayes.coef bayes.pred = exp(eta)/(1+exp(eta)) bayes.class = rep(0,length(eta)) bayes.class[bayes.pred>0.5]=1 ridge.class = predict(ridge.bestmodel,alpha=0,s=ridge.bestlambda,x[test,],type="class") enet.class = predict(enet.bestmodel,alpha=0.5,s=enet.bestlambda,x[test,],type="class") lasso.class = predict(lasso.bestmodel,alpha=1,s=lasso.bestlambda,x[test,],type="class") tab = rbind(matrix(table(y[test],glm.class),1,4)[,c(1,3,2,4)], matrix(table(y[test],ridge.class),1,4)[,c(1,3,2,4)], matrix(table(y[test],enet.class),1,4)[,c(1,3,2,4)], matrix(table(y[test],lasso.class),1,4)[,c(1,3,2,4)], matrix(table(y[test],bayes.class),1,4)[,c(1,3,2,4)]) rates = apply(tab[,c(1,4)],1,sum)/apply(tab,1,sum) tab = cbind(tab,round(100*(1-rates),2)) colnames(tab) = c("y=0,yhat=0","y=0,yhat=1","y=1,yhat=0","y=1,yhat=1","error") rownames(tab) = c("glm","ridge","enet","lasso","bayes") tab # y=0,yhat=0 y=0,yhat=1 y=1,yhat=0 y=1,yhat=1 error # glm 1325 80 93 803 7.52 # ridge 1345 60 128 768 8.17 # enet 1349 56 94 802 6.52 # lasso 1347 58 95 801 6.65 # bayes 1326 79 91 805 7.39 falses = t(rbind(tab[,3]/apply(tab[,3:4],1,sum),tab[,2]/apply(tab[,1:2],1,sum),tab[,2]/apply(tab[,c(2:4)],1,sum))) colnames(falses) = c("false.negative","false.positive","false.discovery") rownames(falses) = c("glm","ridge","enet","lasso","bayes") round(100*falses,2) # false.negative false.positive false.discovery # glm 10.38 5.69 8.20 # ridge 14.29 4.27 6.28 # enet 10.49 3.99 5.88 # lasso 10.60 4.13 6.08 # bayes 10.16 5.62 8.10 ridge.pred = predict(ridge.bestmodel,alpha=0,s=ridge.bestlambda,x[test,],type="response") enet.pred = predict(enet.bestmodel,alpha=0.5,s=enet.bestlambda,x[test,],type="response") lasso.pred = predict(lasso.bestmodel,alpha=1,s=lasso.bestlambda,x[test,],type="response") pdf(file="spam-graph7.pdf",width=12,height=6) par(mfrow=c(1,5)) boxplot(glm.pred[y[test]==1],glm.pred[y[test]==0],names=c("Spam","Ham"),ylab="Probability of spam") abline(h=0.5,lty=3) title("GLM") boxplot(ridge.pred[y[test]==1],ridge.pred[y[test]==0],names=c("Spam","Ham"),ylab="Probability of spam") abline(h=0.5,lty=3) title("Ridge") boxplot(enet.pred[y[test]==1],enet.pred[y[test]==0],names=c("Spam","Ham"),ylab="Probability of spam") abline(h=0.5,lty=3) title("Elasticnet") boxplot(lasso.pred[y[test]==1],lasso.pred[y[test]==0],names=c("Spam","Ham"),ylab="Probability of spam") abline(h=0.5,lty=3) title("LASSO") boxplot(bayes.pred[y[test]==1],bayes.pred[y[test]==0],names=c("Spam","Ham"),ylab="Probability of spam") abline(h=0.5,lty=3) title("Bayes") dev.off() pdf(file="spam-graph8.pdf",width=12,height=6) par(mfrow=c(1,2)) boxplot(glm.pred[y[test]==1], ridge.pred[y[test]==1], enet.pred[y[test]==1], lasso.pred[y[test]==1], bayes.pred[y[test]==1],ylim=c(0,1.1), names=c("glm","ridge","enet","lasso","Bayes"),main="SPAM",ylab="Probability of spam") abline(h=0.5,lty=3) text(1:5,1.1,round(100*falses[,1],2)) boxplot(glm.pred[y[test]==0], ridge.pred[y[test]==0], enet.pred[y[test]==0], lasso.pred[y[test]==0], bayes.pred[y[test]==0],ylim=c(0,1.1), names=c("glm","ridge","enet","lasso","Bayes"),main="HAM",ylab="Probability of spam") abline(h=0.5,lty=3) text(1:5,1.1,round(100*falses[,2],2)) dev.off()