######################################################################################
# covid-19.R - 2020-04-09
#
# Paulo C. Marques F.       <PauloCMF1@insper.edu.br>
# Tiago Mendonca dos Santos <TiagoMS1@insper.edu.br>
# Hedibert F. Lopes         <HedibertFL@insper.edu.br>
#
# Original data: https://www.kaggle.com/einsteindata4u/covid19/version/4
#
#
# Abstract:
# The following is a classifier built from the dataset that Hospital Israelita 
# Albert Einstein made publicly available through Kaggle. The original dataset 
# has 126 predictors (with some duplication). After a long exploratory data 
# analysis, we managed to build a model with only 6 predictors. After that, 
# we built a Random Forest classifier, and evaluated its performance through 
# a simplified cross-validation scheme. Despite the fact that the processed
# dataset has a small size (n = 501 patients), and it's unbalanced (only 13.8% 
# of the patients tested positive for the virus), the classifier performs well, 
# displaying a sensitivity of 86.1% and a specificity of 81.8%.
#
######################################################################################
# Fix the RNG seed so the random train/test splits and the forests fitted
# below are reproducible from run to run.
# NOTE: the former `rm(list = ls())` was dropped on purpose: it wiped the
# caller's whole workspace whenever this file was source()d into a live
# session, and every variable used in this script is assigned before it is read.
set.seed(1234)

library(tidyverse)
library(rsample)
library(pROC)
library(randomForest)

# Load the pre-processed (feature-engineered) dataset.
# stringsAsFactors = TRUE is needed on R >= 4.0.0, where read.table() no longer
# converts strings to factors by default: randomForest() only fits a classifier
# (and predict(type = "prob") only works) when the response `result` is a
# factor. On older R versions this is the default, so behavior is unchanged.
db <- read.table(
    "http://hedibert.org/wp-content/uploads/2020/04/covid-19.txt",
    header = TRUE,
    stringsAsFactors = TRUE
)

# Number of random train/test splits to average over.
N <- 10^3

# One row per split; the columns hold that split's sensitivity, specificity
# and AUC. Rows are filled in by the resampling loop.
metrics <- matrix(
    0,
    nrow = N,
    ncol = 3,
    dimnames = list(NULL, c("Sensitivity", "Specificity", "AUC"))
)

# Console progress bar, advanced once per completed split.
# min = 0 (rather than 1) keeps the default initial value of 0 inside
# [min, max], so the bar is drawn right away at 0% and reads i / N after the
# i-th split instead of staying blank until the first forest has been fitted.
pb <- txtProgressBar(min = 0, max = N, style = 3, width = 30)

# Repeated stratified hold-out: every iteration draws a fresh 70/30 split
# (stratified on `result`), fits a 1000-tree random forest on the training
# part, scores the held-out part and stores that split's metrics in row i.
for (i in seq_len(N)) {
    db_split <- initial_split(db, strata = result, prop = 0.7)
    # Named train_set (not `training`) so the rsample function is not shadowed.
    train_set <- training(db_split)
    test <- testing(db_split)

    rf <- randomForest(result ~ ., data = train_set, ntree = 1000)

    # Predicted probability of the positive class, selected by name so it
    # cannot silently pick the wrong column if the factor levels are reordered.
    pr <- predict(rf, newdata = test, type = "prob")[, "POS"]

    # direction = "<": controls ("NEG") are expected to score lower than
    # cases ("POS"). Named roc_curve so pROC::roc() is not shadowed.
    roc_curve <- roc(test$result, pr,
                     levels = c("NEG", "POS"), direction = "<")

    # Operating point maximizing sensitivity + specificity (Youden's J); ties
    # resolve to the first such threshold.
    # NOTE: the threshold is chosen on the held-out data itself, so the stored
    # sensitivity/specificity are optimistic compared with a threshold fixed
    # before looking at the test split.
    best <- which.max(roc_curve$sensitivities + roc_curve$specificities)

    metrics[i, "Sensitivity"] <- roc_curve$sensitivities[best]
    metrics[i, "Specificity"] <- roc_curve$specificities[best]
    # roc_curve$auc is a classed object carrying attributes; store the number.
    metrics[i, "AUC"] <- as.numeric(roc_curve$auc)

    setTxtProgressBar(pb, i)
}

close(pb)

# Average every metric column over all splits.
split_means <- vapply(
    colnames(metrics),
    function(metric) mean(metrics[, metric]),
    numeric(1)
)
sensitivity <- split_means[["Sensitivity"]]
specificity <- split_means[["Specificity"]]
auc <- split_means[["AUC"]]

# Print the averaged metrics as a one-row table, three decimals each.
table_header <- "|     Model     | Sensitivity | Specificity |   AUC   |\n"
table_rule <- "|---------------+-------------+-------------+---------|\n"
table_row <- sprintf(
    "| %-13s |    %.3f    |    %.3f    |  %.3f  |\n",
    "Random Forest", sensitivity, specificity, auc
)
cat(table_header, table_rule, table_row, sep = "")