############################################################################################ # # Response variable # y: measure of disease progression one year after baseline # # Explanatory variables (10 columns of x) # age: age of patient # sex: sex of patient # bmi: body mass index # map: average blood pressure # tc: total count of white blood cells per cubic mm of blood # ldl: low-density lipoprotein test (bad cholesterol) # hdl: high-density lipoprotein test (good cholesterol) # tch: thyroid stimulating hormone test # ltg: low-tension glaucoma test # glu: glucose test # # Source code: # https://jrnold.github.io/bayesian_notes/shrinkage-and-regularized-regression.html # ############################################################################################ install.packages("lars") install.packages("tidyverse") install.packages("glmnet") install.packages("broom") install.packages("recipes") library("lars") library("tidyverse") library("broom") library("glmnet") library("recipes") data("diabetes", package = "lars") dim(diabetes) ?diabetes # 1. Flat priors # 2. Ridge regression/Normal priors/L2 penalty # 3. Lasso regresssion/Laplace (double exponential) priors/L1 penalty diabetes_map_lm <- lm(y ~ x, data = diabetes) diabetes_map_ridge <- glmnet(diabetes$x, diabetes$y, alpha = 1) diabetes_map_lasso <- glmnet(diabetes$x, diabetes$y, alpha = 0) diabetes_coefpaths <- bind_rows( mutate(tidy(diabetes_map_lasso), model = "Lasso (L1)"), mutate(tidy(diabetes_map_ridge), model = "Ridge (L2)")) %>% filter(term != "(Intercept)") diabetes_coefpaths %>% ggplot(aes(x = -log(lambda), y = estimate, group = term)) + geom_line() + facet_wrap(~ model, ncol = 1, scales = "free_x") + labs(x = expression(-log(lambda)), y = expression(beta)) diabetes_coefpaths %>% ggplot(aes(x = dev.ratio, y = estimate, group = term)) + geom_line() + facet_wrap(~ model, ncol = 1, scales = "free_x") + labs(x = expression("% Deviance"), y = expression(beta)) # K-fold cross-validation diabetes_ridge_cv <- cv.glmnet(diabetes$x, diabetes$y, alpha = 0, nfolds = 10, parallel = TRUE) plot(diabetes_ridge_cv) diabetes_lasso_cv <- cv.glmnet(diabetes$x, diabetes$y, alpha = 1, nfolds = 10, parallel = TRUE) plot(diabetes_lasso_cv) lasso_nonzeros <- function(x, se = TRUE) { lambdahat <- if (se) "lambda.1se" else "lambda.min" unname(x$nzero[which(x$lambda == x[[lambdahat]])]) } lasso_nonzeros(diabetes_lasso_cv) get_glmnet_coef <- function(x) { lambdahat <- "lambda.1se" coefpaths <- tidy(x[["glmnet.fit"]]) filter(coefpaths, lambda == x[[lambdahat]]) } bind_rows( mutate(tidy(diabetes_map_lm), model = "OLS"), mutate(get_glmnet_coef(diabetes_ridge_cv), model = "ridge"), mutate(get_glmnet_coef(diabetes_lasso_cv), model = "lasso")) %>% mutate(term = if_else(model %in% c("OLS"), str_replace(term, "^x", ""), term)) %>% select(model, term, estimate) %>% complete(model, term, fill = list(estimate = 0)) %>% filter(!term %in% "(Intercept)") %>% mutate(term = fct_reorder(term, if_else(model == "OLS", abs(estimate), NA_real_), mean, na.rm = TRUE)) %>% ggplot(aes(x = term, y = estimate, colour = model)) + geom_hline(yintercept = 0, colour = "white", size = 2) + geom_point() + coord_flip() + labs(y = expression(hat(beta)), x = "")