######################################################################################
# covid-19.R - 2020-04-09
#
# Paulo C. Marques F.
# Tiago Mendonca dos Santos
# Hedibert F. Lopes
#
# Original data: https://www.kaggle.com/einsteindata4u/covid19/version/4
#
#
# Abstract:
# The following is a classifier built from the dataset that Hospital Israelita
# Albert Einstein made publicly available through Kaggle. The original dataset
# has 126 predictors (with some duplication). After a long exploratory data
# analysis, we managed to build a model with only 6 predictors. After that,
# we built a Random Forest classifier, and evaluated its performance through
# a simplified cross validation scheme. Despite the fact that the processed
# dataset has a small size (n = 501 patients), and it's unbalanced (only 13.8%
# of the patients tested positive for the virus), the classifier performs well,
# displaying a sensitivity of 86.1% and a specificity of 81.8%.
#
######################################################################################

# NOTE: the workspace is deliberately NOT cleared with rm(list = ls()); doing so
# destroys the caller's objects when this file is source()d and does not reset
# loaded packages or options anyway. Run the script in a fresh R session instead.

set.seed(1234)

library(tidyverse)
library(rsample)
library(pROC)
library(randomForest)

# Data ----

# The data after some feature engineering.
# stringsAsFactors = TRUE is spelled out because R >= 4.0 reads text columns as
# character by default; randomForest() only fits a classifier (and only returns
# class probabilities) when the response `result` is a factor.
data_url <- "http://hedibert.org/wp-content/uploads/2020/04/covid-19.txt"
db <- read.table(data_url, header = TRUE, stringsAsFactors = TRUE)

# Outcome labels as passed to pROC: controls first, cases second.
class_levels <- c("NEG", "POS")

# Repeated stratified hold-out evaluation ----

# Number of random 70/30 train/test splits to average over.
n_splits <- 1000L

# One row per split; preallocated so the loop never grows an object.
metrics <- matrix(0, nrow = n_splits, ncol = 3)
colnames(metrics) <- c("Sensitivity", "Specificity", "AUC")

# min = 0 so the bar shows the fraction of *completed* splits (and so that
# n_splits == 1 does not violate txtProgressBar's max > min requirement).
pb <- txtProgressBar(min = 0, max = n_splits, style = 3, width = 30)

for (i in seq_len(n_splits)) {
  # Stratify on the outcome so every split keeps the class imbalance.
  db_split <- initial_split(db, strata = result, prop = 0.7)
  train_set <- training(db_split)
  test_set <- testing(db_split)

  rf_fit <- randomForest(result ~ ., data = train_set, ntree = 1000)

  # Probability of the positive class, selected by name rather than by column
  # position so it does not depend on the ordering of the factor levels.
  pos_prob <- predict(rf_fit, newdata = test_set, type = "prob")[, "POS"]

  roc_fit <- roc(
    response = test_set$result,
    predictor = pos_prob,
    levels = class_levels,
    direction = "<"
  )

  # Operating point maximising sensitivity + specificity (Youden's J).
  # NOTE(review): the threshold is picked on the test-set ROC itself, so the
  # sensitivity/specificity recorded here are optimistic estimates.
  best <- which.max(roc_fit$sensitivities + roc_fit$specificities)

  metrics[i, "Sensitivity"] <- roc_fit$sensitivities[best]
  metrics[i, "Specificity"] <- roc_fit$specificities[best]
  # roc_fit$auc is a classed object carrying attributes; store the bare number.
  metrics[i, "AUC"] <- as.numeric(roc_fit$auc)

  setTxtProgressBar(pb, i)
}

close(pb)

# Summary ----

sensitivity <- mean(metrics[, "Sensitivity"])
specificity <- mean(metrics[, "Specificity"])
auc <- mean(metrics[, "AUC"])

# Column widths of the header and value rows match the separator row
# (15 / 13 / 13 / 9 characters) so the printed table lines up.
cat(
  "| Model         | Sensitivity | Specificity |   AUC   |\n",
  "|---------------+-------------+-------------+---------|\n",
  sprintf(
    "| %-13s |    %.3f    |    %.3f    |  %.3f  |\n",
    "Random Forest", sensitivity, specificity, auc
  ),
  sep = ""
)