diff --git a/model.rds b/model.rds deleted file mode 100644 index 4a53e28..0000000 Binary files a/model.rds and /dev/null differ diff --git a/packages.R b/packages.R deleted file mode 100644 index a256803..0000000 --- a/packages.R +++ /dev/null @@ -1,3 +0,0 @@ -#!/usr/bin/env Rscript - -install.packages(c("dplyr","data.table","tidyr"), repos="https://cran.r-project.org", dependencies=TRUE) diff --git a/r.Dockerfile b/r.Dockerfile deleted file mode 100644 index 5a90f7f..0000000 --- a/r.Dockerfile +++ /dev/null @@ -1,12 +0,0 @@ -FROM rocker/r2u:22.04 - - -RUN mkdir /app -WORKDIR /app - -COPY *.R /app -COPY *.rds /app - -RUN Rscript packages.R - -ENTRYPOINT ["Rscript", "run.R"] \ No newline at end of file diff --git a/run.R b/run.R deleted file mode 100644 index 82274fe..0000000 --- a/run.R +++ /dev/null @@ -1,83 +0,0 @@ -#!/usr/bin/env Rscript - -# This script calls submission.R. -# Add your method there. - -# To test your submission use the following command: -# Rscript run.R PreFer_fake_data.csv PreFer_fake_background_data.csv - -# Install required packages with Rscript packages.R - -library(dplyr) -library(tidyr) - - -source("submission.R") - -print_usage <- function() { - cat("Usage:\n") - cat(" Rscript script.R DATA_FILE BACKGROUND_DATA_FILE [--output OUTPUT_FILE]\n") -} - -parse_arguments <- function() { - args <- list() - command_args <- commandArgs(trailingOnly = TRUE) - if (length(command_args) < 2) { - return(args) - } - - args$data <- commandArgs(trailingOnly = TRUE)[1] - args$background_data <- commandArgs(trailingOnly = TRUE)[2] - args$output <- get_argument("--output") - return(args) -} - -get_argument <- function(arg_name) { - if (arg_name %in% commandArgs(trailingOnly = TRUE)) { - arg_index <- which(commandArgs(trailingOnly = TRUE) == arg_name) - if (arg_index < length(commandArgs(trailingOnly = TRUE))) { - return(commandArgs(trailingOnly = TRUE)[arg_index + 1]) - } - } - return(NULL) -} - -parse_and_run_predict <- function(args) { - if (is.null(args$data)||is.null(args$background_data)) { - stop("Error: Please provide data and background_data argument for prediction.") - } - - cat("Processing input data for prediction from:", args$data, " ", args$background_data, "\n") - if (!is.null(args$output)) { - cat("Output will be saved to:", args$output, "\n") - } - run_predict(args$data, args$background_data, args$output) -} - -run_predict <- function(data_path, background_data_path, output=NULL) { - if (is.null(output)) { - output <- stdout() - } - df <- read.csv(data_path, encoding="latin1") - background_df <- read.csv(background_data_path, encoding="latin1") - - predictions <- predict_outcomes(df, background_df) - - # Check if predictions have the required format - stopifnot(ncol(predictions) == 2, - all(c("nomem_encr", "prediction") %in% colnames(predictions))) - - # Write predictions to output file - write.csv(predictions, output, row.names = FALSE) -} - - -# Main function -main <- function() { - args <- parse_arguments() - - parse_and_run_predict(args) -} - -# Call main function -main() diff --git a/submission.R b/submission.R deleted file mode 100644 index 56b1b00..0000000 --- a/submission.R +++ /dev/null @@ -1,94 +0,0 @@ -# This is an example script to generate the outcome variable given the input dataset. -# -# This script should be modified to prepare your own submission that predicts -# the outcome for the benchmark challenge by changing the clean_df and predict_outcomes function. -# -# The predict_outcomes function takes a data frame. The return value must -# be a data frame with two columns: nomem_encr and outcome. The nomem_encr column -# should contain the nomem_encr column from the input data frame. The outcome -# column should contain the predicted outcome for each nomem_encr. The outcome -# should be 0 (no child) or 1 (having a child). -# -# clean_df should be used to clean (preprocess) the data. -# -# run.R can be used to test your submission. - -# List your packages here. Don't forget to update packages.R! -library(dplyr) # as an example, not used here - -clean_df <- function(df, background_df = NULL){ - # Preprocess the input dataframe to feed the model. - ### If no cleaning is done (e.g. if all the cleaning is done in a pipeline) leave only the "return df" command - - # Parameters: - # df (dataframe): The input dataframe containing the raw data (e.g., from PreFer_train_data.csv or PreFer_fake_data.csv). - # background (dataframe): Optional input dataframe containing background data (e.g., from PreFer_train_background_data.csv or PreFer_fake_background_data.csv). - - # Returns: - # data frame: The cleaned dataframe with only the necessary columns and processed variables. - - ## This script contains a bare minimum working example - # Create new age variable - df$age <- 2024 - df$birthyear_bg - - # Selecting variables for modelling - - keepcols = c('nomem_encr', # ID variable required for predictions, - 'age') # newly created variable - - ## Keeping data with variables selected - df <- df[ , keepcols ] - - return(df) -} - -predict_outcomes <- function(df, background_df = NULL, model_path = "./model.rds"){ - # Generate predictions using the saved model and the input dataframe. - - # The predict_outcomes function accepts a dataframe as an argument - # and returns a new dataframe with two columns: nomem_encr and - # prediction. The nomem_encr column in the new dataframe replicates the - # corresponding column from the input dataframe The prediction - # column contains predictions for each corresponding nomem_encr. Each - # prediction is represented as a binary value: '0' indicates that the - # individual did not have a child during 2021-2023, while '1' implies that - # they did. - - # Parameters: - # df (dataframe): The data dataframe for which predictions are to be made. - # background_df (dataframe): The background data dataframe for which predictions are to be made. - # model_path (str): The path to the saved model file (which is the output of training.R). - - # Returns: - # dataframe: A dataframe containing the identifiers and their corresponding predictions. - - ## This script contains a bare minimum working example - if( !("nomem_encr" %in% colnames(df)) ) { - warning("The identifier variable 'nomem_encr' should be in the dataset") - } - - # Load the model - model <- readRDS(model_path) - - # Preprocess the fake / holdout data - df <- clean_df(df, background_df) - - # Exclude the variable nomem_encr if this variable is NOT in your model - vars_without_id <- colnames(df)[colnames(df) != "nomem_encr"] - - # Generate predictions from model - predictions <- predict(model, - subset(df, select = vars_without_id), - type = "response") - - # Create predictions that should be 0s and 1s rather than, e.g., probabilities - predictions <- ifelse(predictions > 0.5, 1, 0) - - # Output file should be data.frame with two columns, nomem_encr and predictions - df_predict <- data.frame("nomem_encr" = df[ , "nomem_encr" ], "prediction" = predictions) - # Force columnnames (overrides names that may be given by `predict`) - names(df_predict) <- c("nomem_encr", "prediction") - - # Return only dataset with predictions and identifier - return( df_predict ) -} diff --git a/training.R b/training.R deleted file mode 100644 index 6e79a3e..0000000 --- a/training.R +++ /dev/null @@ -1,27 +0,0 @@ -# This is an example script to train your model given the (cleaned) input dataset. -# -# This script will not be run on the holdout data, -# but the resulting model model.joblib will be applied to the holdout data. -# -# It is important to document your training steps here, including seed, -# number of folds, model, et cetera - -train_save_model <- function(cleaned_df, outcome_df) { - # Trains a model using the cleaned dataframe and saves the model to a file. - - # Parameters: - # cleaned_df (dataframe): The cleaned data from clean_df function to be used for training the model. - # outcome_df (dataframe): The data with the outcome variable (e.g., from PreFer_train_outcome.csv or PreFer_fake_outcome.csv). - - ## This script contains a bare minimum working example - set.seed(1) # not useful here because logistic regression deterministic - - # Combine cleaned_df and outcome_df - model_df <- merge(cleaned_df, outcome_df, by = "nomem_encr") - - # Logistic regression model - model <- glm(new_child ~ age, family = "binomial", data = model_df) - - # Save the model - saveRDS(model, "model.rds") -}