###################################################################################################
#
#  Sparse logistic regression - the spam/ham dataset
#
# https://archive.ics.uci.edu/ml/datasets/spambase
# For more information, see file 'spambase.DOCUMENTATION' at the
# UCI Machine Learning Repository: http://www.ics.uci.edu/~mlearn/MLRepository.html
#
###################################################################################################

#install.packages("glmnet")
#install.packages("BayesLogit")

rm(list=ls())
library(glmnet)
library(BayesLogit)

names = c("make","address","all","3d","our","over","remove","internet","order","mail",
"receive","will","people","report","addresses","free","business","email","you","credit",
"your","font","000","money","hp","hpl","george","650","lab","labs","telnet","857","data",
"415","85","technology","1999","parts","pm","direct","cs","meeting","original","project",
"re","edu","table","conference","char;","char(","char[","char!","char$","char#","cap.ave",
"cap.long","cap.tot")


#######################################################
# Reading and transforming the data
#######################################################
data = read.table("spamdata.txt",header=FALSE)
p = ncol(data)-1
n = nrow(data)
y = data[,58]
x = as.matrix(data[,1:57])
x = x - matrix(apply(x,2,mean),n,p,byrow=TRUE)
x = x %*% diag(sqrt(1/apply(x,2,var)))
colnames(x) = names

#######################################################
# Exploratory data anlaysis: means, variances, coefficient of variation
#######################################################
my1 = apply(x[y==1,],2,mean)
my0 = apply(x[y==0,],2,mean)
sy1  =  sqrt(apply(x[y==1,],2,var))
sy0  =  sqrt(apply(x[y==0,],2,var))
diff = abs(my1-my0)
ord = order(diff,decreasing=TRUE)

pdf(file="spam-graph1.pdf",width=12,height=6)
par(mfrow=c(1,1))
plot(my1[ord],ylim=c(-0.4,0.5),ylab="Predictor's sample mean",xlab="",axes=FALSE,pch=16)
axis(2);box();axis(1,at=1:p,lab=names[ord],las=2)
for (i in 1:p)
  segments(i,my1[ord[i]],i,my0[ord[i]],col=grey(0.8))
points(my1[ord],pch=16)
points(my0[ord],col=2,pch=16)
abline(h=0,lty=3)
legend("topright",legend=c("SPAM","HAM"),col=1:2,pch=16)
dev.off()

pdf(file="spam-graph2.pdf",width=12,height=6)
plot(sy1[ord]/abs(my1[ord]),ylim=c(0,30),ylab="Coefficient of variation",xlab="",axes=FALSE,pch=16)
axis(2);box();axis(1,at=1:p,lab=names[ord],las=2)
points(sy0[ord]/abs(my0[ord]),col=2,pch=16)
abline(h=1,lty=3)
dev.off()

#######################################################
# Setting up training and sample subgroups
#######################################################
set.seed(1)
train = sample(1:nrow(x),nrow(x)/2)
test = (-train)
y.test = y[test]

#######################################################
# GLS and Bayes fits
#######################################################
glmfit = glm(y[train]~x[train,],family=binomial)
glm.coef = glmfit$coef

bayesfit = logit(y[train],cbind(1,x[train,]))
bayes.coef = apply(bayesfit$beta,2,mean)

#######################################################
# Ridge, Elasticnet & LASSO fits
#######################################################
grid = exp(seq(-10,-2,by=0.1))
ridge.model = glmnet(x[train,],y[train],family="binomial",alpha=0,lambda=grid)
enet.model  = glmnet(x[train,],y[train],family="binomial",alpha=0.5,lambda=grid)
lasso.model = glmnet(x[train,],y[train],family="binomial",alpha=1,lambda=grid)

pdf(file="spam-graph3.pdf",width=12,height=6)
par(mfrow=c(1,3))
plot(ridge.model,ylim=c(-17,2))
plot(enet.model,ylim=c(-17,2))
plot(lasso.model,ylim=c(-17,2))
dev.off()

pdf(file="spam-graph4.pdf",width=12,height=6)
par(mfrow=c(1,3))
ridge.cvout = cv.glmnet(x[train,],y[train],family="binomial",alpha=0)
plot(ridge.cvout)
ridge.bestlambda = ridge.cvout$lambda.min
ridge.bestmodel= glmnet(x,y,family="binomial",alpha=0,lambda=ridge.bestlambda)
ridge.coef = coef(ridge.bestmodel)[,1]

enet.cvout = cv.glmnet(x[train,],y[train],family="binomial",alpha=0.5)
plot(enet.cvout)
enet.bestlambda = enet.cvout$lambda.min
enet.bestmodel= glmnet(x,y,family="binomial",alpha=0.5,lambda=enet.bestlambda)
enet.coef = coef(enet.bestmodel)[,1]

lasso.cvout = cv.glmnet(x[train,],y[train],family="binomial",alpha=1)
plot(lasso.cvout)
lasso.bestlambda = lasso.cvout$lambda.min
lasso.bestmodel= glmnet(x,y,family="binomial",alpha=0,lambda=lasso.bestlambda)
lasso.coef = coef(lasso.bestmodel)[,1]
dev.off()

pdf(file="spam-graph5.pdf",width=9,height=6)
par(mfrow=c(1,1))
plot(glm.coef,ylab="Coefficient",ylim=c(-3,3),pch=16,xlab="",axes=FALSE)
axis(2);box();axis(1,at=1:p,lab=names,las=2)
points(ridge.coef,col=2,pch=16)
points(enet.coef,col=3,pch=16)
points(lasso.coef,col=4,pch=16)
points(bayes.coef,col=5,pch=16)
abline(h=0,lty=2)
legend("bottomright",legend=c("OLS","Ridge","Elasticnet","Lasso","Bayes"),col=1:5,pch=16)
dev.off()

pdf(file="spam-graph6.pdf",width=12,height=6)
par(mfrow=c(1,4))
plot(glm.coef,ridge.coef,xlab="GLM",ylab="RIDGE",xlim=c(-3,3),ylim=c(-3,3));abline(0,1,lty=3);abline(h=0,lty=3)
plot(glm.coef,enet.coef,xlab="GLM",ylab="ELASTICNET",xlim=c(-3,3),ylim=c(-3,3));abline(0,1,lty=3);abline(h=0,lty=3)
plot(glm.coef,lasso.coef,xlab="GLM",ylab="LASSO",xlim=c(-3,3),ylim=c(-3,3));abline(0,1,lty=3);abline(h=0,lty=3)
plot(glm.coef,bayes.coef,xlab="GLM",ylab="BAYES",xlim=c(-3,3),ylim=c(-3,3));abline(0,1,lty=3);abline(h=0,lty=3)
dev.off()

#######################################################
# Misclassification rates
#######################################################
eta = cbind(1,x[test,])%*%glm.coef
glm.pred = exp(eta)/(1+exp(eta))
glm.class = rep(0,length(eta))
glm.class[glm.pred>0.5]=1

eta = cbind(1,x[test,])%*%bayes.coef
bayes.pred = exp(eta)/(1+exp(eta))
bayes.class = rep(0,length(eta))
bayes.class[bayes.pred>0.5]=1

ridge.class = predict(ridge.bestmodel,alpha=0,s=ridge.bestlambda,x[test,],type="class")
enet.class = predict(enet.bestmodel,alpha=0.5,s=enet.bestlambda,x[test,],type="class")
lasso.class = predict(lasso.bestmodel,alpha=1,s=lasso.bestlambda,x[test,],type="class")

tab = rbind(matrix(table(y[test],glm.class),1,4)[,c(1,3,2,4)],
matrix(table(y[test],ridge.class),1,4)[,c(1,3,2,4)],
matrix(table(y[test],enet.class),1,4)[,c(1,3,2,4)],
matrix(table(y[test],lasso.class),1,4)[,c(1,3,2,4)],
matrix(table(y[test],bayes.class),1,4)[,c(1,3,2,4)])
rates = apply(tab[,c(1,4)],1,sum)/apply(tab,1,sum)
tab = cbind(tab,round(100*(1-rates),2))
colnames(tab) = c("y=0,yhat=0","y=0,yhat=1","y=1,yhat=0","y=1,yhat=1","error")
rownames(tab) = c("glm","ridge","enet","lasso","bayes")
tab
      # y=0,yhat=0 y=0,yhat=1 y=1,yhat=0 y=1,yhat=1 error
# glm         1325         80         93        803  7.52
# ridge       1345         60        128        768  8.17
# enet        1349         56         94        802  6.52
# lasso       1347         58         95        801  6.65
# bayes       1326         79         91        805  7.39

falses = t(rbind(tab[,3]/apply(tab[,3:4],1,sum),tab[,2]/apply(tab[,1:2],1,sum),tab[,2]/apply(tab[,c(2:4)],1,sum)))
colnames(falses) = c("false.negative","false.positive","false.discovery")
rownames(falses) = c("glm","ridge","enet","lasso","bayes")
round(100*falses,2)
      # false.negative false.positive false.discovery
# glm            10.38           5.69            8.20
# ridge          14.29           4.27            6.28
# enet           10.49           3.99            5.88
# lasso          10.60           4.13            6.08
# bayes          10.16           5.62            8.10


ridge.pred  = predict(ridge.bestmodel,alpha=0,s=ridge.bestlambda,x[test,],type="response")
enet.pred   = predict(enet.bestmodel,alpha=0.5,s=enet.bestlambda,x[test,],type="response")
lasso.pred = predict(lasso.bestmodel,alpha=1,s=lasso.bestlambda,x[test,],type="response")

pdf(file="spam-graph7.pdf",width=12,height=6)
par(mfrow=c(1,5))
boxplot(glm.pred[y[test]==1],glm.pred[y[test]==0],names=c("Spam","Ham"),ylab="Probability of spam")
abline(h=0.5,lty=3)
title("GLM")
boxplot(ridge.pred[y[test]==1],ridge.pred[y[test]==0],names=c("Spam","Ham"),ylab="Probability of spam")
abline(h=0.5,lty=3)
title("Ridge")
boxplot(enet.pred[y[test]==1],enet.pred[y[test]==0],names=c("Spam","Ham"),ylab="Probability of spam")
abline(h=0.5,lty=3)
title("Elasticnet")
boxplot(lasso.pred[y[test]==1],lasso.pred[y[test]==0],names=c("Spam","Ham"),ylab="Probability of spam")
abline(h=0.5,lty=3)
title("LASSO")
boxplot(bayes.pred[y[test]==1],bayes.pred[y[test]==0],names=c("Spam","Ham"),ylab="Probability of spam")
abline(h=0.5,lty=3)
title("Bayes")
dev.off()


pdf(file="spam-graph8.pdf",width=12,height=6)
par(mfrow=c(1,2))
boxplot(glm.pred[y[test]==1],
ridge.pred[y[test]==1],
enet.pred[y[test]==1],
lasso.pred[y[test]==1],
bayes.pred[y[test]==1],ylim=c(0,1.1),
names=c("glm","ridge","enet","lasso","Bayes"),main="SPAM",ylab="Probability of spam")
abline(h=0.5,lty=3)
text(1:5,1.1,round(100*falses[,1],2))

boxplot(glm.pred[y[test]==0],
ridge.pred[y[test]==0],
enet.pred[y[test]==0],
lasso.pred[y[test]==0],
bayes.pred[y[test]==0],ylim=c(0,1.1),
names=c("glm","ridge","enet","lasso","Bayes"),main="HAM",ylab="Probability of spam")
abline(h=0.5,lty=3)
text(1:5,1.1,round(100*falses[,2],2))
dev.off()