diff --git a/workflows/comp-research/README.md b/workflows/comp-research/README.md index c661c00..669acd5 100644 --- a/workflows/comp-research/README.md +++ b/workflows/comp-research/README.md @@ -2,9 +2,9 @@ This folder contains examples of Popper workflows of intermediate complexity in computational science. Both attempt to reflect best practices for developing Popper workflows in computational fiels -- [python](https://github.com/getpopper/popper-examples/tree/master/workflows/comp-research-python) +- [python](https://github.com/getpopper/popper-examples/tree/master/workflows/comp-research/python) is a machine learning workflow in Python using the Jupyter Lab IDE. -- [rstudio](https://github.com/getpopper/popper-examples/tree/master/workflows/comp-research-rstudio) +- [rstudio](https://github.com/getpopper/popper-examples/tree/master/workflows/comp-research/rstudio) is a machine learning workflow in R using the RStudio Server IDE. To adapt the approach demonstrated here to your own work, see the guide for [computational research with Popper](). diff --git a/workflows/comp-research/python/containers/Dockerfile b/workflows/comp-research/python/containers/Dockerfile index 3ca6f39..88ac733 100644 --- a/workflows/comp-research/python/containers/Dockerfile +++ b/workflows/comp-research/python/containers/Dockerfile @@ -13,4 +13,7 @@ COPY environment.yml . RUN conda env update -f environment.yml \ && conda clean -afy \ && find /opt/conda/ -follow -type f -name '*.pyc' -delete -CMD [ "/bin/bash" ] \ No newline at end of file + +EXPOSE 8888 + +CMD [ "/bin/sh" ] \ No newline at end of file diff --git a/workflows/comp-research/python/paper/paper.tex b/workflows/comp-research/python/paper/paper.tex index 6879991..3723c63 100644 --- a/workflows/comp-research/python/paper/paper.tex +++ b/workflows/comp-research/python/paper/paper.tex @@ -64,7 +64,7 @@ \section{Future Work} Our model's validation curve shows few signs of over-fitting. 
As such, it is likely that higher scores can be achieved by using a more flexible model (e.g. gradient boosted trees). Furthermore, model stacking will generally improve results \cite{elements}, in particular -in this type of machine learning competitions where test sets are guaranteed to be sourced +in this type of machine learning competition, where test sets are guaranteed to be sourced from the same distribution as training data. \bibliography{references} diff --git a/workflows/comp-research/rstudio/.gitignore b/workflows/comp-research/rstudio/.gitignore new file mode 100644 index 0000000..95d0419 --- /dev/null +++ b/workflows/comp-research/rstudio/.gitignore @@ -0,0 +1,61 @@ +# History files +.Rhistory +.Rapp.history + +# Session Data files +.RData + +# User-specific files +.Ruserdata + +# Example code in package build process +*-Ex.R + +# Output files from R CMD build +/*.tar.gz + +# Output files from R CMD check +/*.Rcheck/ + +# RStudio files +.Rproj.user/ + +# produced vignettes +vignettes/*.html +vignettes/*.pdf + +# OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 +.httr-oauth + +# knitr and R markdown default cache directories +*_cache/ +/cache/ + +# Temporary files created by R markdown +*.utf8.md +*.knit.md + +# R Environment Variables +.Renviron + +# translation temp files +po/*~ + +# R projects artifacts +*.Rproj +.Rproj.user + +# knit output +*.html + +# data +*.csv + +# latex output +*.aux +*.bbl +*.blg +*.fdb_latexmk +*.fls +*.log +*.out \ No newline at end of file diff --git a/workflows/comp-research/rstudio/Dockerfile b/workflows/comp-research/rstudio/Dockerfile new file mode 100644 index 0000000..67439e5 --- /dev/null +++ b/workflows/comp-research/rstudio/Dockerfile @@ -0,0 +1,5 @@ +FROM getpopper/verse:3.6.2 +LABEL maintainer="apoirel@ucsc.edu" +RUN ["install2.r", "dplyr", "forcats", "ggplot2", "purrr", "readr", "stringr", "tibble", "tidyr", "tidyverse", "rsample", "parsnip", "recipes", "workflows", "tune", 
"yardstick", "broom", "dials", "tidymodels", "glmnet"] +EXPOSE 8787 +CMD ["R"] diff --git a/workflows/comp-research/rstudio/README.md b/workflows/comp-research/rstudio/README.md new file mode 100644 index 0000000..deb798e --- /dev/null +++ b/workflows/comp-research/rstudio/README.md @@ -0,0 +1,43 @@ +# RStudio workflow example + +Anders Poirel, 12/10/2020 + +This is a workflow developed for the [Flu Shot Learning](https://www.drivendata.org/competitions/66/flu-shot-learning/) +machine learning competition on Driven Data. The competition's goal is to predict how + likely individuals are to receive their H1N1 and seasonal flu vaccines. Specifically, participants are asked + to predict a probability for each vaccine. Participants are ranked by the + [ROC AUC score](https://en.wikipedia.org/wiki/Receiver_operating_characteristic) of their + predictions on a hold-out test set. + +This workflow shows examples of Popper automating common tasks in computational research +- downloading data +- using a computational notebook +- fitting/simulating a model +- building a paper with up-to-date results + +## Requirements +- Popper +- Docker +- Python > 3.6 + +## Reproducing the results + +```sh +popper run -f wf.yml +``` + +## Project structure + +``` +├── Dockerfile <- Dockerfile generated by containerit +├── LICENSE +├── README.md <- The top-level README. +├── wf.yml <- Popper workflow definition. +├── data <- Data used in workflow. +├── paper <- Generated paper as PDF, LaTeX. +├── output +| ├── models <- Model predictions, serialized models, etc. +| └── figures <- Graphics created during workflow. +└── src <- Source code for this project. + └── notebooks <- RMarkdown notebooks. 
+``` \ No newline at end of file diff --git a/workflows/comp-research/rstudio/.gitkeep b/workflows/comp-research/rstudio/data/.gitkeep similarity index 100% rename from workflows/comp-research/rstudio/.gitkeep rename to workflows/comp-research/rstudio/data/.gitkeep diff --git a/workflows/comp-research/rstudio/output/figures/.gitkeep b/workflows/comp-research/rstudio/output/figures/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/workflows/comp-research/rstudio/output/models/.gitkeep b/workflows/comp-research/rstudio/output/models/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/workflows/comp-research/rstudio/paper/paper.pdf b/workflows/comp-research/rstudio/paper/paper.pdf new file mode 100644 index 0000000..1837b93 Binary files /dev/null and b/workflows/comp-research/rstudio/paper/paper.pdf differ diff --git a/workflows/comp-research/rstudio/paper/paper.tex b/workflows/comp-research/rstudio/paper/paper.tex new file mode 100644 index 0000000..f2c69fb --- /dev/null +++ b/workflows/comp-research/rstudio/paper/paper.tex @@ -0,0 +1,83 @@ +\documentclass{article}[12pt] + +\usepackage{hyperref} +\hypersetup{ + colorlinks=true, + linkcolor=blue, + filecolor=magenta, + urlcolor=cyan, +} +\usepackage{multicol} +\usepackage{graphicx} +\graphicspath{ {./../output/figures/} } + +\title{Flu Shot Learning: Predicting H1N1 and Seasonal Flu Vaccines} +\author{Anders Poirel \\ University of California, Santa Cruz} +\date{\today} + +\begin{document} + +\maketitle + +\section{Introduction} + +This paper describes our approach in +the \href{https://www.drivendata.org/competitions/66/flu-shot-learning/}{Flu Shot Learning} +competition on Driven Data \cite{driven-data}. +The goal of the competition is to predict how likely individuals are to receive their +H1N1 and seasonal flu vaccines. Specifically, participants are asked to predict a +probability for each vaccine. Competition ranking is based on the ROC AUC of predictions +on a hold-out test set. 
+ +\section{Model} + +Our approach uses a logistic regression model. Our pipeline was developed using +\texttt{tidyverse}\cite{tidyverse} and \texttt{tidymodels}\cite{tidymodels}. +Numerical features were standardized and imputed using the median. Categorical features were one-hot encoded and imputed with ``missing'' +flags. \\ +We tuned a single hyper-parameter for this pipeline, logistic regression's regularization +parameter $C$, using cross-validation on 5 folds. + +\section{Results} + +\subsection{Cross-validation} + +Cross-validation AUC scores suggest that the model does not over-fit for any choice +of $C$. Indeed, performance degrades for low values of $C$ (stronger regularization). +The best tested value is $C = 0.010$ for both the \texttt{h1n1\_vaccine} and +\texttt{seasonal\_vaccine} models. Selecting higher values of $C$ resulted in inconsistent +model performance. + +\begin{center} + +\begin{figure} +\includegraphics[width=0.6\textwidth]{cv_seasonal_vaccine.png} +\caption{Cross validation mean AUC for \texttt{seasonal\_vaccine}} +\end{figure} + +\begin{figure} +\includegraphics[width=0.6\textwidth]{cv_h1n1_vaccine.png} +\caption{Cross validation mean AUC for \texttt{h1n1\_vaccine}} +\end{figure} + +\end{center} + + +\subsection{Hidden test set} + +After submission to the competition website, the model's predictions scored 0.8342 ROC AUC +on the hidden test set, enough to beat the organizer's benchmark (.8185). +At time of writing, this score places 181st out of 948 on the competition leaderboard. + +\section{Future Work} + +Our model's validation curve shows no sign of over-fitting. As such, it is likely that +higher scores can be achieved by using a more flexible model (e.g. gradient boosted trees). +Furthermore, model stacking will generally improve results \cite{elements}, in particular +in this kind of machine learning competition, where test sets are guaranteed to be sourced +from the same distribution as training data. 
+ +\bibliography{references} +\bibliographystyle{plain} + +\end{document} \ No newline at end of file diff --git a/workflows/comp-research/rstudio/paper/references.bib b/workflows/comp-research/rstudio/paper/references.bib new file mode 100644 index 0000000..779b239 --- /dev/null +++ b/workflows/comp-research/rstudio/paper/references.bib @@ -0,0 +1,35 @@ +@misc{driven-data, + title={Harnessing the Power of the Crowd to Increase Capacity for Data Science in the Social Sector}, + author={Peter Bull and Isaac Slavitt and Greg Lipstein}, + year={2016}, + eprint={1606.07781}, + archivePrefix={arXiv}, + primaryClass={cs.HC} +} + +@Article{tidyverse, + title={Welcome to the {tidyverse}}, + author={Hadley Wickham and Mara Averick and Jennifer Bryan and Winston Chang and Lucy D'Agostino McGowan and Romain François and Garrett Grolemund and Alex Hayes and Lionel Henry and Jim Hester and Max Kuhn and Thomas Lin Pedersen and Evan Miller and Stephan Milton Bache and Kirill Müller and Jeroen Ooms and David Robinson and Dana Paige Seidel and Vitalie Spinu and Kohske Takahashi and Davis Vaughan and Claus Wilke and Kara Woo and Hiroaki Yutani}, + year={2019}, + journal={Journal of Open Source Software}, + volume={4}, + number={43}, + pages={1686}, + doi={10.21105/joss.01686}, +} + +@Manual{tidymodels, + title={Tidymodels: a collection of packages for modeling and + machine learning using tidyverse principles.}, + author={Max Kuhn and Hadley Wickham}, + url={https://www.tidymodels.org}, + year={2020}, + } + +@book{elements, + title={The Elements of Statistical Learning: Data Mining, Inference, and Prediction}, + author={Hastie, Trevor and Tibshirani, Robert and Friedman, Jerome}, + year={2009}, + publisher={Springer Science \& Business Media}, + chapter={Model Inference and Averaging} +} \ No newline at end of file diff --git a/workflows/comp-research/rstudio/src/evaluate_model.R b/workflows/comp-research/rstudio/src/evaluate_model.R new file mode 100644 index 0000000..a7d567b --- 
/dev/null +++ b/workflows/comp-research/rstudio/src/evaluate_model.R @@ -0,0 +1,62 @@ +library(tidyverse) +library(tidymodels) + +DATA_PATH = "data" +OUTPUT_PATH = "output" + +source("src/models.R") + +df_train <- read_csv(paste(DATA_PATH, "training_set_features.csv", sep = "/")) +y_train <- read_csv(paste(DATA_PATH, "training_set_labels.csv", sep = "/")) + +df_train <- + left_join(df_train, y_train, by = "respondent_id", keep = FALSE) %>% + select(!"respondent_id") + +get_cv_results <- function(df_train, target, ignored) { + + # define model + lr_model <- + logistic_reg(penalty = tune(), mixture = 1) %>% + set_engine("glmnet") + + wf <- + workflow() %>% + add_recipe(get_preprocessor(df_train, target, ignored)) %>% + add_model(lr_model) + + # cv parameters + folds <- df_train %>% vfold_cv(v = 5) + lr_grid <- + grid_regular( + penalty(range = c(-2,1), trans = log10_trans()), + levels = 10 + ) + + # collect cv results + cv_res <- + wf %>% + tune_grid( + resamples = folds, + grid = lr_grid, + metrics = metric_set(roc_auc) + ) %>% + collect_metrics() + + # plot_results + cv_res %>% + ggplot(aes(penalty, mean)) + + geom_line(size = 1.2, color = "red", alpha = 0.5) + + geom_point(color = "red") + + scale_x_log10(labels = scales::label_number()) + + scale_color_manual(values = c("#CC6666")) + + ggtitle(expression(paste("AUC for different ", L[1], " penalties"))) + + ggsave( + paste("cv_", target, ".png", sep = ""), + path = paste(OUTPUT_PATH, "figures", sep = "/") + ) +} + +get_cv_results(df_train, "h1n1_vaccine", "seasonal_vaccine") +get_cv_results(df_train, "seasonal_vaccine", "h1n1_vaccine") \ No newline at end of file diff --git a/workflows/comp-research/rstudio/src/get_data.sh b/workflows/comp-research/rstudio/src/get_data.sh new file mode 100644 index 0000000..7d8f7df --- /dev/null +++ b/workflows/comp-research/rstudio/src/get_data.sh @@ -0,0 +1,9 @@ +#!/bin/sh +cd "$1" + +wget 
"https://drivendata-prod.s3.amazonaws.com/data/66/public/test_set_features.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIARVBOBDCY3EFSLNZR%2F20200928%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20200928T210959Z&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=3c0c7858eb58999ea46fe16dedd4debd96b46e5561145da7012ec677dd4aa5d3" +wget "https://drivendata-prod.s3.amazonaws.com/data/66/public/training_set_features.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIARVBOBDCY3EFSLNZR%2F20200928%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20200928T210959Z&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=b4d3611239c93e2a354779b9e006c2440400af0b4df44707a45dc9babf81447a" +wget "https://drivendata-prod.s3.amazonaws.com/data/66/public/training_set_labels.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIARVBOBDCY3EFSLNZR%2F20200928%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20200928T210959Z&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=636630b31fbd10273dc406d1c5415c8696329bc21695f9deed7ac4756629f2e5" + +echo "Files downloaded: $(ls)" + \ No newline at end of file diff --git a/workflows/comp-research/rstudio/src/models.R b/workflows/comp-research/rstudio/src/models.R new file mode 100644 index 0000000..8664f9e --- /dev/null +++ b/workflows/comp-research/rstudio/src/models.R @@ -0,0 +1,20 @@ +library(tidyverse) +library(tidymodels) + +get_preprocessor <- function(df_train, target, ignored) { + df_train <- df_train %>% select(!ignored) + rec <- + recipe(as.formula(paste(target, "~ .")), data = df_train) %>% + step_medianimpute(all_numeric()) %>% + step_normalize(all_numeric(), -all_outcomes()) %>% + step_unknown(all_nominal()) %>% + step_dummy(all_nominal()) %>% + step_num2factor( + target, + transform = function(x) as.integer(x + 1), + levels = c("0", "1"), + skip=TRUE + ) + return(rec) +} + diff --git a/workflows/comp-research/rstudio/src/predict.R b/workflows/comp-research/rstudio/src/predict.R new file mode 100644 
index 0000000..11ce4e3 --- /dev/null +++ b/workflows/comp-research/rstudio/src/predict.R @@ -0,0 +1,46 @@ +library(tidyverse) +library(tidymodels) + +DATA_PATH = "data" +OUTPUT_PATH = "output" + +source("src/models.R") + +df_train <- read_csv(paste(DATA_PATH, "training_set_features.csv", sep = "/")) +y_train <- read_csv(paste(DATA_PATH, "training_set_labels.csv", sep = "/")) +df_test <- read_csv(paste(DATA_PATH, "test_set_features.csv", sep = "/")) +df_submission <- read_csv(paste(DATA_PATH, "submission_format.csv", sep = "/")) + +df_train <- + left_join(df_train, y_train, by = "respondent_id", keep = FALSE) %>% + select(!"respondent_id") + +get_predictions <- function(target, ignored, df_train, df_test) { + lr_model <- + logistic_reg(penalty = 0.01, mixture = 1) %>% + set_engine("glmnet") + + predictions <- + workflow() %>% + add_recipe(get_preprocessor(df_train, target, ignored)) %>% + add_model(lr_model) %>% + fit(data = df_train) %>% + predict(df_test, type = "prob") %>% # targets are probabilities + pull(".pred_1") # we want the probability of *being* vaccinated + + return(predictions) +} + +preds_seasonal <- + get_predictions("seasonal_vaccine", "h1n1_vaccine", df_train, df_test) + +preds_h1n1 <- + get_predictions("h1n1_vaccine", "seasonal_vaccine", df_train, df_test) + +# save predictions to submission file +df_submission %>% + mutate(h1n1_vaccine = preds_h1n1) %>% + mutate(seasonal_vaccine = preds_seasonal) %>% + write_csv(paste(OUTPUT_PATH, "submission.csv", sep = "/")) + + diff --git a/workflows/comp-research/rstudio/wf.yml b/workflows/comp-research/rstudio/wf.yml new file mode 100644 index 0000000..5fe348e --- /dev/null +++ b/workflows/comp-research/rstudio/wf.yml @@ -0,0 +1,25 @@ +steps: +- id: "dataset" + uses: "docker://jacobcarlborg/docker-alpine-wget" + args: ["sh", "src/get_data.sh", "data"] + +- id: "rstudio" + uses: "./" + args: ["rstudio-server", "start"] + options: + ports: + 8787: 8787 + +- id: "figures" + uses: "./" + args: ["Rscript", 
"src/evaluate_model.R"] + + +- id: "predict" + uses: "./" + args: ["Rscript", "src/predict.R"] + +- id: "paper" + uses: "./" + args: ["latexmk", "-pdf", "paper.tex"] + dir: "/workspace/paper" \ No newline at end of file