############################################################################################################
#
#                          Receiver Operating Characteristics (ROC)
#
#                                              and
#
#                          The Area Under the ROC Curve (AUC)
#
############################################################################################################
#
# The following paragraphs are taken from the (2013) book "Data Science for Business: What You
# Need to Know About Data Mining and Data-Analytic Thinking" by Foster Provost and Tom Fawcett.
# Specifically we quote parts of page 214 to page 219.
#
# Additional references are:
# Swets (1988) Measuring the accuracy of diagnostic systems. Science, 240, 1285-1293.
# Swets, Dawes and Monahan (2000) Better decisions through science. Scientific American, 283, 82-87.
# Fawcett (2006) An introduction to ROC analysis. Pattern Recognition Letters, 27(8), 861-874.
#
# A ROC graph is a two-dimensional plot of a classifier with false positive rate on the x axis
# against true positive rate on the y axis. As such, a ROC graph depicts relative trade-offs
# that a classifier makes between benefits (true positives) and costs (false positives).
#
# Although the confusion matrix contains four numbers, we really only need two of the rates:
# either the true positive rate or the false negative rate, and either the false positive rate
# or the true negative rate. Given one from either pair, the other can be derived since they
# sum to one. It is conventional to use the true positive rate (tp rate) and the false
# positive rate (fp rate), and we will keep to that convention so the ROC graph will make sense.
#
# Each discrete classifier produces an (fp rate, tp rate) pair corresponding to a single
# point in ROC space. Importantly for what follows, the tp rate is computed using only
# the actual positive examples, and the fp rate is computed using only the actual negative examples.
#
# Remembering exactly what statistics the tp rate and fp rate refer to can be confusing for
# someone who does not deal with such things on a daily basis. It can be easier to remember
# by using less formal but more intuitive names for the statistics: the tp rate is sometimes
# referred to as the hit rate: what percent of the actual positives does the classifier get
# right. The fp rate is sometimes referred to as the false alarm rate: what percent of the
# actual negative examples does the classifier get wrong (i.e., predict to be positive).
#
# The lower left point (0,0) represents the strategy of never issuing a positive classification;
# such a classifier commits no false positive errors but also gains no true positives. The opposite
# strategy, of unconditionally issuing positive classifications, is represented by the upper
# right point (1,1). The point (0,1) represents perfect classification and is conventionally
# drawn as a star. The diagonal line connecting (0,0) to (1,1) represents the policy of guessing
# a class. A random classifier will produce a ROC point that moves back and forth on the diagonal
# based on the frequency with which it guesses the positive class.
#
# One point in ROC space is superior to another if it is to the northwest of the other (tp rate
# is higher and fp rate is no worse; fp rate is lower and tp rate is no worse; or both are better).
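#
# (Not from the book: a quick worked example with made-up counts.) Suppose a classifier
# produces TP = 30, FN = 10, FP = 20, TN = 40 on a test set. Then
#   tp rate = TP / (TP + FN) = 30 / 40 = 0.75    (hit rate)
#   fp rate = FP / (FP + TN) = 20 / 60 = 0.333   (false alarm rate)
# so this classifier corresponds to the single point (0.333, 0.75) in ROC space.
#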
# Classifiers appearing on the lefthand side of a ROC graph, near the x axis, may
# be thought of as "conservative": they raise alarms (make positive classifications) only
# with strong evidence so they make few false positive errors, but they often have low true
# positive rates as well. Classifiers on the upper righthand side of a ROC graph may be
# thought of as "permissive": they make positive classifications with weak evidence so they
# classify nearly all positives correctly, but they often have high false positive rates.
#
# The Area Under the ROC Curve (AUC):
# An important summary statistic is the area under the ROC curve (AUC). As the name
# implies, this is simply the area under a classifier's curve expressed as a fraction of the
# unit square. Its value ranges from zero to one. Though a ROC curve provides more
# information than its area, the AUC is useful when a single number is needed to summarize
# performance, or when nothing is known about the operating conditions. Later,
# in "Example: Performance Analytics for Churn Modeling" on page 223, we will show a
# use of the AUC statistic. For now it is enough to realize that it's a good general summary
# statistic of the predictiveness of a classifier.
#
############################################################################################################

library(ISLR)

data(Default)
dim(Default)
Default[1:10,]

n = nrow(Default)
attach(Default)

# Recode the response as 0/1: default.binary = 1 when default = "Yes"
default.binary = rep(0,n)
default.binary[default=="Yes"]=1

# Step-by-step explanation
fit = glm(default.binary~balance,family=binomial)

# Boxplots of the fitted probabilities by true class, with two candidate cutoffs (0.5 and 0.75)
par(mfrow=c(1,2))
boxplot(fit$fitted.values[default.binary==0],fit$fitted.values[default.binary==1],
        names=c("Default=No","Default=Yes"))
abline(h=0.5,col=2)
abline(h=0.75,col=4)
boxplot(fit$fitted.values[default.binary==0],fit$fitted.values[default.binary==1],
        names=c("Default=No","Default=Yes"),outline=FALSE)
abline(h=0.5,col=2)
abline(h=0.75,col=4)

# Confusion tables for three cutoffs on the fitted probabilities
cut = 0.5
class = rep(0,n)
class[fit$fitted.values>=cut] = 1
table(class,default.binary)

cut = 0.75
class = rep(0,n)
class[fit$fitted.values>=cut] = 1
table(class,default.binary)

# A much lower cutoff, close to the overall default rate in the data
cut = 0.032
class = rep(0,n)
class[fit$fitted.values>=cut] = 1
table(class,default.binary)

# sensitivity
truepositiverate = sum((class==1)&(default.binary==1))/sum(default.binary==1)
# 1-specificity
falsepositiverate = sum((class==1)&(default.binary==0))/sum(default.binary==0)
c(truepositiverate,falsepositiverate)

# Now comparing 4 models
fit1 = glm(default.binary~balance,family=binomial)
fit2 = glm(default.binary~student,family=binomial)
fit3 = glm(default.binary~income,family=binomial)
fit4 = glm(default.binary~balance+student+income,family=binomial)

# Sweep a grid of cutoffs and record the (fp rate, tp rate) pair for each model at each cutoff
cuts = c(seq(0.001,0.999,by=0.01),0.9999)
ncuts = length(cuts)
tpr = matrix(0,ncuts,4)
fpr = matrix(0,ncuts,4)
for (i in 1:ncuts){
  cut = cuts[i]

  class = rep(0,n)
  class[fit1$fitted.values>=cut]=1
  tpr[i,1] = sum((class==1)&(default.binary==1))/sum(default.binary==1)
  fpr[i,1] = sum((class==1)&(default.binary==0))/sum(default.binary==0)

  class = rep(0,n)
  class[fit2$fitted.values>=cut]=1
  tpr[i,2] = sum((class==1)&(default.binary==1))/sum(default.binary==1)
  fpr[i,2] = sum((class==1)&(default.binary==0))/sum(default.binary==0)

  class = rep(0,n)
  class[fit3$fitted.values>=cut]=1
  tpr[i,3] = sum((class==1)&(default.binary==1))/sum(default.binary==1)
  fpr[i,3] = sum((class==1)&(default.binary==0))/sum(default.binary==0)

  class = rep(0,n)
  class[fit4$fitted.values>=cut]=1
  tpr[i,4] = sum((class==1)&(default.binary==1))/sum(default.binary==1)
  fpr[i,4] = sum((class==1)&(default.binary==0))/sum(default.binary==0)
}

# Hand-built ROC curves for the four models
par(mfrow=c(1,1))
plot(fpr[,1],tpr[,1],xlim=c(0,1),ylim=c(0,1),type="l",lwd=2,
     xlab="False positive rate (1-Specificity)",ylab="True positive rate (Sensitivity)")
abline(0,1,lty=2)
lines(fpr[,2],tpr[,2],col=2,lwd=2)
lines(fpr[,3],tpr[,3],col=3,lwd=3)
lines(fpr[,4],tpr[,4],col=4,lwd=3)
legend("bottomright",legend=c("balance","student","income","full"),col=1:4,lty=1,lwd=2)

# ROC curves and AUC via the pROC package, one panel per model
library(pROC)

par(mfrow = c(2,2))

prob = predict(fit1,newdata=Default,type="response")
auc1 = auc(default,predict(fit1,newdata=Default,type="response"))
plot.roc(default, prob, col = "blue", grid = TRUE,
         xlab = "False Positive Rate (1 - Specificity)",
         ylab = "True Positive Rate (Sensitivity)",
         main = paste("AUC=",round(auc1,3),sep=""),
         legacy.axes = TRUE, asp = FALSE, las = 1,
         print.thres.pattern = " %.3f",
         # print.thres = threshold)
         print.thres = "best")

prob = predict(fit2,newdata=Default,type="response")
auc2 = auc(default,predict(fit2,newdata=Default,type="response"))
plot.roc(default, prob, col = "blue", grid = TRUE,
         xlab = "False Positive Rate (1 - Specificity)",
         ylab = "True Positive Rate (Sensitivity)",
         main = paste("AUC=",round(auc2,3),sep=""),
         legacy.axes = TRUE, asp = FALSE, las = 1,
         print.thres.pattern = " %.3f",
         # print.thres = threshold)
         print.thres = "best")

prob = predict(fit3,newdata=Default,type="response")
auc3 = auc(default,predict(fit3,newdata=Default,type="response"))
plot.roc(default, prob, col = "blue", grid = TRUE,
         xlab = "False Positive Rate (1 - Specificity)",
         ylab = "True Positive Rate (Sensitivity)",
         main = paste("AUC=",round(auc3,3),sep=""),
         legacy.axes = TRUE, asp = FALSE, las = 1,
         print.thres.pattern = " %.3f",
         # print.thres = threshold)
         print.thres = "best")

prob = predict(fit4,newdata=Default,type="response")
auc4 = auc(default,predict(fit4,newdata=Default,type="response"))
plot.roc(default, prob, col = "blue", grid = TRUE,
         xlab = "False Positive Rate (1 - Specificity)",
         ylab = "True Positive Rate (Sensitivity)",
         main = paste("AUC=",round(auc4,3),sep=""),
         legacy.axes = TRUE, asp = FALSE, las = 1,
         print.thres.pattern = " %.3f",
         # print.thres = threshold)
         print.thres = "best")
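
# Not part of the original script: as a rough cross-check of the pROC auc() values above, a
# minimal sketch that approximates each AUC with the trapezoidal rule applied to the (fpr, tpr)
# grid computed in the loop earlier (the objects fpr and tpr are assumed to still be in the
# workspace). Because the cutoff grid is coarse, these numbers will only approximately match auc().
trapezoid.auc = function(fp, tp){
  o  = order(fp, tp)                           # sort the ROC points by increasing fp rate
  fp = c(0, fp[o], 1)                          # anchor the curve at (0,0) ...
  tp = c(0, tp[o], 1)                          # ... and at (1,1)
  sum(diff(fp) * (head(tp,-1) + tail(tp,-1)) / 2)
}
round(sapply(1:4, function(j) trapezoid.auc(fpr[,j], tpr[,j])), 3)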
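
# Also not part of the original script: print.thres = "best" above marks the threshold that
# maximizes the Youden index (sensitivity + specificity - 1), pROC's default "best" criterion.
# The sketch below recomputes an approximate "best" cutoff for the full model fit4 from the
# cutoff grid used earlier (cuts, tpr, fpr assumed in the workspace); pROC searches the observed
# probabilities instead, so its reported threshold may differ slightly.
youden = tpr[,4] - fpr[,4]                     # = sensitivity + specificity - 1
best = which.max(youden)
c(cutoff = cuts[best], sensitivity = tpr[best,4],
  specificity = 1 - fpr[best,4], J = youden[best])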