############################################################################################################
#
#                          Receiver Operating Characteristics (ROC)
#
#                                              and
#
#                          The Area Under the ROC Curve (AUC)
#
############################################################################################################
#
# The following paragraphs are taken from the (2013) book "Data Science for Business: What You
# Need to Know About Data Mining and Data-Analytic Thinking" by Foster Provost and Tom Fawcett.
# Specifically we quote parts of page 214 to page 219.
#
# Additional references are:
# Swets (1988) Measuring the accuracy of diagnostic systems. Science, 240, 1285-1293.
# Swets, Dawes and Monahan (2000) Better decisions through science. Scientific American, 283, 82-87.
# Fawcett (2006) An introduction to ROC analysis. Pattern Recognition Letters, 27(8), 861-874.
#
# A ROC graph is a two-dimensional plot of a classifier with false positive rate on the x axis
# against true positive rate on the y axis. As such, a ROC graph depicts relative trade-offs
# that a classifier makes between benefits (true positives) and costs (false positives).
#
# Although the confusion matrix contains four numbers, we really only need two of the rates:
# either the true positive rate or the false negative rate, and either the false positive rate
# or the true negative rate. Given one from either pair, the other can be derived since they
# sum to one. It is conventional to use the true positive rate (tp rate) and the false
# positive rate (fp rate), and we will keep to that convention so the ROC graph will make sense.
#
# Each discrete classifier produces an (fp rate, tp rate) pair corresponding to a single
# point in ROC space. Importantly for what follows, the tp rate is computed using only
# the actual positive examples, and the fp rate is computed using only the actual negative examples.
#
# Remembering exactly what statistics the tp rate and fp rate refer to can be confusing for
# someone who does not deal with such things on a daily basis. It can be easier to remember
# by using less formal but more intuitive names for the statistics: the tp rate is sometimes
# referred to as the hit rate: what percent of the actual positives does the classifier get
# right. The fp rate is sometimes referred to as the false alarm rate: what percent of the
# actual negative examples does the classifier get wrong (i.e., predict to be positive).
#
# The lower left point (0,0) represents the strategy of never issuing a positive classification;
# such a classifier commits no false positive errors but also gains no true positives. The opposite
# strategy, of unconditionally issuing positive classifications, is represented by the upper
# right point (1,1). The point (0,1) represents perfect classification and is conventionally
# drawn as a star. The diagonal line connecting (0,0) to (1,1) represents the policy of guessing
# a class. A random classifier will produce a ROC point that moves back and forth on the diagonal
# based on the frequency with which it guesses the positive class.
#
# One point in ROC space is superior to another if it is to the northwest of the other (tp rate
# is higher and fp rate is no worse; fp rate is lower and tp rate is no worse; or both are better).
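#
# (Not from the book: a quick worked example with made-up counts.) Suppose a classifier
# produces TP = 30, FN = 10, FP = 20, TN = 40 on a test set. Then
#   tp rate = TP / (TP + FN) = 30 / 40 = 0.75    (hit rate)
#   fp rate = FP / (FP + TN) = 20 / 60 = 0.333   (false alarm rate)
# so this classifier corresponds to the single point (0.333, 0.75) in ROC space.
#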
# Classifiers appearing on the lefthand side of a ROC graph, near the x axis, may
# be thought of as "conservative": they raise alarms (make positive classifications) only
# with strong evidence so they make few false positive errors, but they often have low true
# positive rates as well. Classifiers on the upper righthand side of a ROC graph may be
# thought of as "permissive": they make positive classifications with weak evidence so they
# classify nearly all positives correctly, but they often have high false positive rates.
#
# The Area Under the ROC Curve (AUC):
# An important summary statistic is the area under the ROC curve (AUC). As the name
# implies, this is simply the area under a classifier's curve expressed as a fraction of the
# unit square. Its value ranges from zero to one. Though a ROC curve provides more
# information than its area, the AUC is useful when a single number is needed to summarize
# performance, or when nothing is known about the operating conditions. Later,
# in "Example: Performance Analytics for Churn Modeling" on page 223, we will show a
# use of the AUC statistic. For now it is enough to realize that it's a good general summary
# statistic of the predictiveness of a classifier.
#
############################################################################################################

library(ISLR)

data(Default)
dim(Default)
Default[1:10,]

n = nrow(Default)
attach(Default)

# Recode the response as 0/1: default.binary = 1 when default = "Yes"
default.binary = rep(0,n)
default.binary[default=="Yes"]=1

# Step-by-step explanation
fit = glm(default.binary~balance,family=binomial)

# Boxplots of the fitted probabilities by true class, with two candidate cutoffs (0.5 and 0.75)
par(mfrow=c(1,2))
boxplot(fit$fitted.values[default.binary==0],fit$fitted.values[default.binary==1],
        names=c("Default=No","Default=Yes"))
abline(h=0.5,col=2)
abline(h=0.75,col=4)
boxplot(fit$fitted.values[default.binary==0],fit$fitted.values[default.binary==1],
        names=c("Default=No","Default=Yes"),outline=FALSE)
abline(h=0.5,col=2)
abline(h=0.75,col=4)

# Confusion tables for three cutoffs on the fitted probabilities
cut = 0.5
class = rep(0,n)
class[fit$fitted.values>=cut] = 1
table(class,default.binary)

cut = 0.75
class = rep(0,n)
class[fit$fitted.values>=cut] = 1
table(class,default.binary)

# A much lower cutoff, close to the overall default rate in the data
cut = 0.032
class = rep(0,n)
class[fit$fitted.values>=cut] = 1
table(class,default.binary)

# sensitivity
truepositiverate = sum((class==1)&(default.binary==1))/sum(default.binary==1)
# 1-specificity
falsepositiverate = sum((class==1)&(default.binary==0))/sum(default.binary==0)
c(truepositiverate,falsepositiverate)

# Now comparing 4 models
fit1 = glm(default.binary~balance,family=binomial)
fit2 = glm(default.binary~student,family=binomial)
fit3 = glm(default.binary~income,family=binomial)
fit4 = glm(default.binary~balance+student+income,family=binomial)

# Sweep a grid of cutoffs and record the (fp rate, tp rate) pair for each model at each cutoff
cuts = c(seq(0.001,0.999,by=0.01),0.9999)
ncuts = length(cuts)
tpr = matrix(0,ncuts,4)
fpr = matrix(0,ncuts,4)
for (i in 1:ncuts){
  cut = cuts[i]

  class = rep(0,n)
  class[fit1$fitted.values>=cut]=1
  tpr[i,1] = sum((class==1)&(default.binary==1))/sum(default.binary==1)
  fpr[i,1] = sum((class==1)&(default.binary==0))/sum(default.binary==0)

  class = rep(0,n)
  class[fit2$fitted.values>=cut]=1
  tpr[i,2] = sum((class==1)&(default.binary==1))/sum(default.binary==1)
  fpr[i,2] = sum((class==1)&(default.binary==0))/sum(default.binary==0)

  class = rep(0,n)
  class[fit3$fitted.values>=cut]=1
  tpr[i,3] = sum((class==1)&(default.binary==1))/sum(default.binary==1)
  fpr[i,3] = sum((class==1)&(default.binary==0))/sum(default.binary==0)

  class = rep(0,n)
  class[fit4$fitted.values>=cut]=1
  tpr[i,4] = sum((class==1)&(default.binary==1))/sum(default.binary==1)
  fpr[i,4] = sum((class==1)&(default.binary==0))/sum(default.binary==0)
}

# Hand-built ROC curves for the four models
par(mfrow=c(1,1))
plot(fpr[,1],tpr[,1],xlim=c(0,1),ylim=c(0,1),type="l",lwd=2,
     xlab="False positive rate (1-Specificity)",ylab="True positive rate (Sensitivity)")
abline(0,1,lty=2)
lines(fpr[,2],tpr[,2],col=2,lwd=2)
lines(fpr[,3],tpr[,3],col=3,lwd=3)
lines(fpr[,4],tpr[,4],col=4,lwd=3)
legend("bottomright",legend=c("balance","student","income","full"),col=1:4,lty=1,lwd=2)

# ROC curves and AUC via the pROC package, one panel per model
library(pROC)

par(mfrow = c(2,2))

prob = predict(fit1,newdata=Default,type="response")
auc1 = auc(default,predict(fit1,newdata=Default,type="response"))
plot.roc(default, prob, col = "blue", grid = TRUE,
         xlab = "False Positive Rate (1 - Specificity)",
         ylab = "True Positive Rate (Sensitivity)",
         main = paste("AUC=",round(auc1,3),sep=""),
         legacy.axes = TRUE, asp = FALSE, las = 1,
         print.thres.pattern = " %.3f",
         # print.thres = threshold)
         print.thres = "best")

prob = predict(fit2,newdata=Default,type="response")
auc2 = auc(default,predict(fit2,newdata=Default,type="response"))
plot.roc(default, prob, col = "blue", grid = TRUE,
         xlab = "False Positive Rate (1 - Specificity)",
         ylab = "True Positive Rate (Sensitivity)",
         main = paste("AUC=",round(auc2,3),sep=""),
         legacy.axes = TRUE, asp = FALSE, las = 1,
         print.thres.pattern = " %.3f",
         # print.thres = threshold)
         print.thres = "best")

prob = predict(fit3,newdata=Default,type="response")
auc3 = auc(default,predict(fit3,newdata=Default,type="response"))
plot.roc(default, prob, col = "blue", grid = TRUE,
         xlab = "False Positive Rate (1 - Specificity)",
         ylab = "True Positive Rate (Sensitivity)",
         main = paste("AUC=",round(auc3,3),sep=""),
         legacy.axes = TRUE, asp = FALSE, las = 1,
         print.thres.pattern = " %.3f",
         # print.thres = threshold)
         print.thres = "best")

prob = predict(fit4,newdata=Default,type="response")
auc4 = auc(default,predict(fit4,newdata=Default,type="response"))
plot.roc(default, prob, col = "blue", grid = TRUE,
         xlab = "False Positive Rate (1 - Specificity)",
         ylab = "True Positive Rate (Sensitivity)",
         main = paste("AUC=",round(auc4,3),sep=""),
         legacy.axes = TRUE, asp = FALSE, las = 1,
         print.thres.pattern = " %.3f",
         # print.thres = threshold)
         print.thres = "best")
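
# Not part of the original script: as a rough cross-check of the pROC auc() values above, a
# minimal sketch that approximates each AUC with the trapezoidal rule applied to the (fpr, tpr)
# grid computed in the loop earlier (the objects fpr and tpr are assumed to still be in the
# workspace). Because the cutoff grid is coarse, these numbers will only approximately match auc().
trapezoid.auc = function(fp, tp){
  o  = order(fp, tp)                           # sort the ROC points by increasing fp rate
  fp = c(0, fp[o], 1)                          # anchor the curve at (0,0) ...
  tp = c(0, tp[o], 1)                          # ... and at (1,1)
  sum(diff(fp) * (head(tp,-1) + tail(tp,-1)) / 2)
}
round(sapply(1:4, function(j) trapezoid.auc(fpr[,j], tpr[,j])), 3)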
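
# Also not part of the original script: print.thres = "best" above marks the threshold that
# maximizes the Youden index (sensitivity + specificity - 1), pROC's default "best" criterion.
# The sketch below recomputes an approximate "best" cutoff for the full model fit4 from the
# cutoff grid used earlier (cuts, tpr, fpr assumed in the workspace); pROC searches the observed
# probabilities instead, so its reported threshold may differ slightly.
youden = tpr[,4] - fpr[,4]                     # = sensitivity + specificity - 1
best = which.max(youden)
c(cutoff = cuts[best], sensitivity = tpr[best,4],
  specificity = 1 - fpr[best,4], J = youden[best])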