diff --git a/R/arrange.R b/R/arrange.R index 5348059..d344bd8 100644 --- a/R/arrange.R +++ b/R/arrange.R @@ -1,95 +1,95 @@ #' Arrange rows by variables within a nested dataframe -#' +#' #' Arrange rows by variables within a nested dataframe -#' @param df A nested dataframe +#' @param df A nested dataframe #' @param ... Comma separated list of unquoted variable names -#' +#' #' @importFrom dplyr progress_estimated #' @importFrom emo ji #' @importFrom purrr map #' @export -arrange_nested <- function(df, ...){ +arrange_nested <- function(df, ...) { var_expr <- enquos(...) - colname_nested_data <- names(df[ , grepl("^data$", names(df))]) - - arrange_with_progress <- function(data){ + colname_nested_data <- names(df[, grepl("^data$", names(df))]) + + arrange_with_progress <- function(data) { pb$tick()$print() - data %>% + data %>% arrange(!!!var_expr) } - - #create the progress bar + + # create the progress bar pb <- dplyr::progress_estimated(nrow(df)) - + start.time <- Sys.time() message(paste(emo::ji("hammer_and_wrench"), "Start sorting...")) output <- df %>% - dplyr::mutate({{colname_nested_data}} := purrr::map(df[[colname_nested_data]], ~arrange_with_progress(.))) + dplyr::mutate({{ colname_nested_data }} := purrr::map(df[[colname_nested_data]], ~ arrange_with_progress(.))) end.time <- Sys.time() - time.taken <- difftime(end.time, start.time, units = "secs") %>% round(., 3) + time.taken <- difftime(end.time, start.time, units = "secs") %>% round(., 3) message("\n") message(paste(emo::ji("white_check_mark"), "Finish sorting!")) - - if(time.taken > 60){ - time.taken <- round(time.taken/60, 2) + + if (time.taken > 60) { + time.taken <- round(time.taken / 60, 2) message(paste(emo::ji("hourglass"), "Sorting time:", time.taken, "mins")) - }else{ + } else { message(paste(emo::ji("hourglass"), "Sorting time:", time.taken, "secs")) } message("\n") - + return(output) } #' Arrange rows by variables within a double-nested dataframe -#' +#' #' Arrange rows by variables within a double-nested dataframe -#' @param df A nested dataframe +#' @param df A nested dataframe #' @param nest_cols Name of columns to nest in existing list-column #' @param ... Comma separated list of unquoted variable names -#' +#' #' @importFrom emo ji #' @importFrom purrr map -#' +#' #' @export -arrange_double_nested <- function(df, nest_cols, ...){ - if(nrow(df) == 0){ +arrange_double_nested <- function(df, nest_cols, ...) { + if (nrow(df) == 0) { stop(paste(emo::ji("bomb"), "No user left, tune your threshold and try again.")) } - + stopifnot( - is.list(df[ , grepl("^data$", names(df))]) + is.list(df[, grepl("^data$", names(df))]) ) - + var_expr <- enquos(..., .named = TRUE) - colname_nested_data <- names(df[ , grepl("^data$", names(df))]) - + colname_nested_data <- names(df[, grepl("^data$", names(df))]) + arrange_column <- . %>% arrange(!!!var_expr) - - arrange_columns <- . %>% - dplyr::mutate(data = purrr::map(data, arrange_column)) - - # double nest - df[[colname_nested_data]] <- purrr::map(df[[colname_nested_data]], ~.x %>% nest(data = nest_cols)) - + + arrange_columns <- . %>% + dplyr::mutate(data = purrr::map(data, arrange_column)) + + # double nest + df[[colname_nested_data]] <- purrr::map(df[[colname_nested_data]], ~ .x %>% nest(data = nest_cols)) + start.time <- Sys.time() message(paste(emo::ji("hammer_and_wrench"), "Start sorting...")) - output <- df %>% - dplyr::mutate({{colname_nested_data}} := purrr::map(df[[colname_nested_data]], arrange_columns)) + output <- df %>% + dplyr::mutate({{ colname_nested_data }} := purrr::map(df[[colname_nested_data]], arrange_columns)) end.time <- Sys.time() - time.taken <- difftime(end.time, start.time, units = "secs") %>% round(., 3) + time.taken <- difftime(end.time, start.time, units = "secs") %>% round(., 3) message(paste(emo::ji("white_check_mark"), "Finish sorting!")) - - if(time.taken > 60){ - time.taken <- round(time.taken/60, 2) + + if (time.taken > 60) { + time.taken <- round(time.taken / 60, 2) message(paste(emo::ji("hourglass"), "Sorting time:", time.taken, "mins")) - } else{ + } else { message(paste(emo::ji("hourglass"), "Sorting time:", time.taken, "secs")) } - + message("\n") return(output) -} \ No newline at end of file +} diff --git a/R/data.R b/R/data.R index 4497646..ab7cbd8 100644 --- a/R/data.R +++ b/R/data.R @@ -1,8 +1,8 @@ #' Tweets sent by 100 random users -#' -#' De-identified test data includes 100 random users, +#' +#' De-identified test data includes 100 random users, #' and it can be used as an example to get started with homelocator package -#' +#' #' @format A dataframe of 16,300 rows and 3 variables: #' \describe{ #' \item{u_id}{unique identifier for each user} @@ -13,11 +13,11 @@ #' Neighbours for locations -#' -#' Spatial neighbours for locations, where a neighbour has +#' +#' Spatial neighbours for locations, where a neighbour has #' at least one line in common, but its interior does not intersect with the location -#' -#' +#' +#' #' @format A nested dataframe of 1,942 rows and 2 variables: #' \describe{ #' \item{grid_id}{unique identifier for each location} diff --git a/R/enrich_timestamp.R b/R/enrich_timestamp.R index 04db0c0..fa45d36 100644 --- a/R/enrich_timestamp.R +++ b/R/enrich_timestamp.R @@ -1,76 +1,66 @@ #' Create new variables dervied from timestamp -#' -#' Create new variables from existing timestamp, which are often used/needed as intermediate variables in home location algorithms. -#' @param df A nested dataframe +#' +#' Create new variables from existing timestamp, which are often used/needed as intermediate variables in home location algorithms. +#' @param df A nested dataframe #' @param timestamp Name of column that holds specific timestamp for each data point and it should be POSIXct -#' +#' #' @importFrom emo ji #' @importFrom rlang sym -#' @importFrom lubridate year -#' @importFrom lubridate month -#' @importFrom lubridate day -#' @importFrom lubridate wday -#' @importFrom lubridate hour +#' @importFrom lubridate year +#' @importFrom lubridate month +#' @importFrom lubridate day +#' @importFrom lubridate wday +#' @importFrom lubridate hour #' @importFrom dplyr progress_estimated -#' @importFrom purrr map -#' +#' @importFrom purrr map +#' #' @export -enrich_timestamp <- function(df, timestamp = "created_at"){ - if(!is.list(df[ , grepl("^data$", names(df)), with=F])){ +enrich_timestamp <- function(df, timestamp = "created_at") { + if (!is.list(df[, grepl("^data$", names(df)), with = F])) { stop(paste(emo::ji("bomb"), "Input dataset is not nested!")) } - - colname_nested_data <- names(df[ , grepl("^data$", names(df)), with=F]) + + colname_nested_data <- names(df[, grepl("^data$", names(df)), with = F]) timestamp <- rlang::sym(timestamp) - - if(!is(df[[colname_nested_data]][[1]] %>% pull({{timestamp}}), "POSIXct")){ + + if (!is(df[[colname_nested_data]][[1]] %>% pull({{ timestamp }}), "POSIXct")) { stop(paste(emo::ji("bomb"), "Timestamp is not POSIXct!")) } - - #define reading function which includes the progress bar - enrich_with_progress <- function(data){ + + # define reading function which includes the progress bar + enrich_with_progress <- function(data) { pb$tick()$print() data %>% - dplyr::mutate(year = lubridate::year({{timestamp}}), - month = lubridate::month({{timestamp}}), - day = lubridate::day({{timestamp}}), - wday = lubridate::wday({{timestamp}}), # day of the week - hour = lubridate::hour({{timestamp}}), # hour of the day - ymd = format({{timestamp}}, "%Y-%m-%d")) + dplyr::mutate( + year = lubridate::year({{ timestamp }}), + month = lubridate::month({{ timestamp }}), + day = lubridate::day({{ timestamp }}), + wday = lubridate::wday({{ timestamp }}), # day of the week + hour = lubridate::hour({{ timestamp }}), # hour of the day + ymd = format({{ timestamp }}, "%Y-%m-%d") + ) } - #create the progress bar + # create the progress bar pb <- dplyr::progress_estimated(nrow(df)) - + start.time <- Sys.time() message(paste(emo::ji("hammer_and_wrench"), "Enriching variables from timestamp...")) output <- df %>% - dplyr::mutate({{colname_nested_data}} := purrr::map(df[[colname_nested_data]], ~enrich_with_progress(.))) + dplyr::mutate({{ colname_nested_data }} := purrr::map(df[[colname_nested_data]], ~ enrich_with_progress(.))) end.time <- Sys.time() - time.taken <- difftime(end.time, start.time, units = "secs") %>% round(., 3) - + time.taken <- difftime(end.time, start.time, units = "secs") %>% round(., 3) + message("\n") message(paste(emo::ji("white_check_mark"), "Finish enriching! New added variables: year, month, day, wday, hour, ymd.")) - - if(time.taken > 60){ - time.taken <- round(time.taken/60, 2) + + if (time.taken > 60) { + time.taken <- round(time.taken / 60, 2) message(paste(emo::ji("hourglass"), "Enriching time:", time.taken, "mins")) - }else{ + } else { message(paste(emo::ji("hourglass"), "Enriching time:", time.taken, "secs")) } message("\n") - + return(output) } - - - - - - - - - - - - diff --git a/R/extract_location.R b/R/extract_location.R index 412ae11..e85e517 100644 --- a/R/extract_location.R +++ b/R/extract_location.R @@ -1,72 +1,74 @@ -#' Extract identified home locations for users -#' -#' Extract the most likely home location for each user based on choosed column value -#' @param df A nested dataframe by user -#' @param show_n_loc Number of homes to be shown -#' @param keep_score Choice to keep score or not -#' @param score_var Name of column that holds weighted score for each user +#' Extract identified home locations for users +#' +#' Extract the most likely home location for each user based on choosed column value +#' @param df A nested dataframe by user +#' @param show_n_loc Number of homes to be shown +#' @param keep_score Choice to keep score or not +#' @param score_var Name of column that holds weighted score for each user #' @param user Name of column that holds unique identifier for each user #' @param location Name of column that holds unique identifier for each location -#' +#' #' @importFrom rlang has_name #' @importFrom emo ji -#' @importFrom dplyr progress_estimated +#' @importFrom dplyr progress_estimated #' @importFrom dplyr arrange #' @importFrom dplyr select #' @importFrom purrr map_chr -#' -#' +#' +#' #' @export -extract_location <- function(df, user = "u_id", location = "loc_id", show_n_loc = 1, keep_score = F, ...){ +extract_location <- function(df, user = "u_id", location = "loc_id", show_n_loc = 1, keep_score = F, ...) { if (!rlang::has_name(df, user)) { stop(paste(emo::ji("bomb"), "User column does not exist!")) } - - user <- rlang::sym(user) + + user <- rlang::sym(user) location <- rlang::sym(location) var_expr <- enquos(..., .named = TRUE) - colname_nested_data <- names(df[ , grepl("^data$", names(df))]) - - get_loc_with_progress <- function(data){ + colname_nested_data <- names(df[, grepl("^data$", names(df))]) + + get_loc_with_progress <- function(data) { pb$tick()$print() get_loc <- data %>% - dplyr::arrange(!!!var_expr) %>% + dplyr::arrange(!!!var_expr) %>% slice(1:show_n_loc) %>% - pull({{location}}) - if(show_n_loc == 1){ + pull({{ location }}) + if (show_n_loc == 1) { get_loc - } else{ + } else { paste(get_loc, collapse = "; ") } } # create the progress bar pb <- dplyr::progress_estimated(nrow(df)) - + start.time <- Sys.time() message(paste(emo::ji("hammer_and_wrench"), "Start extracting homes for users...")) - - if(keep_score){ + + if (keep_score) { output <- df %>% - dplyr::mutate(home = purrr::map_chr(df[[colname_nested_data]], ~get_loc_with_progress(.))) - } else{ + dplyr::mutate(home = purrr::map_chr(df[[colname_nested_data]], ~ get_loc_with_progress(.))) + } else { output <- df %>% - dplyr::mutate(home = purrr::map_chr(df[[colname_nested_data]], ~get_loc_with_progress(.))) %>% + dplyr::mutate(home = purrr::map_chr(df[[colname_nested_data]], ~ get_loc_with_progress(.))) %>% dplyr::select(-colname_nested_data) %>% - dplyr::select({{user}}, home) + dplyr::select({{ user }}, home) } end.time <- Sys.time() - time.taken <- difftime(end.time, start.time, units = "secs") %>% round(., 3) - - n_user <- output %>% pull(!!user) %>% n_distinct() + time.taken <- difftime(end.time, start.time, units = "secs") %>% round(., 3) + + n_user <- output %>% + pull(!!user) %>% + n_distinct() message("\n") message(paste(emo::ji("tada"), "Congratulations!! Your have found", n_user, "users' potential home(s).")) - - if(time.taken > 60){ - time.taken <- round(time.taken/60, 2) + + if (time.taken > 60) { + time.taken <- round(time.taken / 60, 2) message(paste(emo::ji("hourglass"), "Extracting time:", time.taken, "mins")) - }else{ + } else { message(paste(emo::ji("hourglass"), "Extracting time:", time.taken, "secs")) } message("\n") return(output) -} \ No newline at end of file +} diff --git a/R/filter.R b/R/filter.R index 9183396..2cd2b48 100644 --- a/R/filter.R +++ b/R/filter.R @@ -1,58 +1,61 @@ -#' Return rows with matching conditions -#' -#' Filter finds rows where conditions are true. -#' @param df A dataframe +#' Return rows with matching conditions +#' +#' Filter finds rows where conditions are true. +#' @param df A dataframe #' @param user Name of column that holds unique identifier for each user #' @param ... Logical predicates defined in terms of the variables in df. Only rows match conditions are kept. -#' +#' #' @importFrom emo ji #' @importFrom rlang sym #' @importFrom rlang has_name -#' +#' #' @export -filter_verbose <- function(df, user = "u_id", ...){ - +filter_verbose <- function(df, user = "u_id", ...) { if (!rlang::has_name(df, user)) { stop(paste(emo::ji("bomb"), "User column does not exist!")) } - - user <- rlang::sym(user) + + user <- rlang::sym(user) var_expr <- enquos(...) - - n_original_users <- df %>% pull({{user}}) %>% dplyr::n_distinct() - + + n_original_users <- df %>% + pull({{ user }}) %>% + dplyr::n_distinct() + message(paste(emo::ji("bust_in_silhouette"), "There are", n_original_users, "users at this moment.")) message(paste(emo::ji("hammer_and_wrench"), "Start filtering users...")) start.time <- Sys.time() output <- df %>% filter(!!!var_expr) end.time <- Sys.time() - time.taken <- difftime(end.time, start.time, units = "secs") %>% round(., 3) - - n_new_users <- output %>% pull({{user}}) %>% n_distinct() + time.taken <- difftime(end.time, start.time, units = "secs") %>% round(., 3) + + n_new_users <- output %>% + pull({{ user }}) %>% + n_distinct() n_removed_users <- n_original_users - n_new_users message(paste(emo::ji("white_check_mark"), "Finish filtering! Filterred", n_removed_users, "users!")) message(paste(emo::ji("bust_in_silhouette"), "There are", n_new_users, "users left.")) - - if(time.taken > 60){ - time.taken <- round(time.taken/60, 2) + + if (time.taken > 60) { + time.taken <- round(time.taken / 60, 2) message(paste(emo::ji("hourglass"), "Filtering time:", time.taken, "mins")) - }else{ + } else { message(paste(emo::ji("hourglass"), "Filtering time:", time.taken, "secs")) } - + message("\n") return(output) } -#' Return rows with matching condition within nested dataframe -#' -#' -#' Filter finds rows where conditions are true within nested dataframe -#' @param df A nested dataframe +#' Return rows with matching condition within nested dataframe +#' +#' +#' Filter finds rows where conditions are true within nested dataframe +#' @param df A nested dataframe #' @param user Name of column that holds unique identifier for each user #' @param ... Logical predicates defined in terms of the variables in df. Only rows match conditions are kept. -#' +#' #' @importFrom emo ji #' @importFrom rlang sym #' @importFrom rlang has_name @@ -61,69 +64,63 @@ filter_verbose <- function(df, user = "u_id", ...){ #' @importFrom plyr empty #' @importFrom purrr map #' @importFrom purrr map_lgl -#' +#' #' @export -filter_nested <- function(df, user = "u_id", ...){ - - if(!is.list(df[ , grepl("^data$", names(df))])){ +filter_nested <- function(df, user = "u_id", ...) { + if (!is.list(df[, grepl("^data$", names(df))])) { stop(paste(emo::ji("bomb"), "Dataset is not nested!")) } - + if (!rlang::has_name(df, user)) { stop(paste(emo::ji("bomb"), "User column does not exist!")) } - + var_expr <- enquos(...) - user <- rlang::sym(user) - colname_nested_data <- names(df[ , grepl("^data$", names(df))]) - - #filter - filter_with_progress <- function(data){ + user <- rlang::sym(user) + colname_nested_data <- names(df[, grepl("^data$", names(df))]) + + # filter + filter_with_progress <- function(data) { pb$tick()$print() - data %>% + data %>% filter(!!!var_expr) } - + start.time <- Sys.time() - n_original_users <- df %>% pull({{user}}) %>% dplyr::n_distinct() + n_original_users <- df %>% + pull({{ user }}) %>% + dplyr::n_distinct() message(paste(emo::ji("bust_in_silhouette"), "There are", n_original_users, "users at this moment.")) - message(paste(emo::ji("hammer_and_wrench"), "Start filtering user...")) - + message(paste(emo::ji("hammer_and_wrench"), "Start filtering user...")) + # create the progress bar pb <- dplyr::progress_estimated(nrow(df)) - + start.time <- Sys.time() - output <- df %>% - dplyr::mutate({{colname_nested_data}} := purrr::map(df[[colname_nested_data]], ~filter_with_progress(.))) + output <- df %>% + dplyr::mutate({{ colname_nested_data }} := purrr::map(df[[colname_nested_data]], ~ filter_with_progress(.))) output_data <- output[[colname_nested_data]] - #check empty tibble - output <- output %>% + # check empty tibble + output <- output %>% filter(!(purrr::map_lgl(output_data, plyr::empty))) end.time <- Sys.time() - time.taken <- difftime(end.time, start.time, units = "secs") %>% round(., 3) - - n_new_users <- output %>% pull({{user}}) %>% dplyr::n_distinct() + time.taken <- difftime(end.time, start.time, units = "secs") %>% round(., 3) + + n_new_users <- output %>% + pull({{ user }}) %>% + dplyr::n_distinct() n_removed_users <- n_original_users - n_new_users - + message("\n") message(paste(emo::ji("white_check_mark"), "Finish Filtering! Filterred", n_removed_users, "users!")) message(paste(emo::ji("bust_in_silhouette"), "There are", n_new_users, "users left!")) - - if(time.taken > 60){ - time.taken <- round(time.taken/60, 2) + + if (time.taken > 60) { + time.taken <- round(time.taken / 60, 2) message(paste(emo::ji("hourglass"), "Filtering time:", time.taken, "mins")) - }else{ + } else { message(paste(emo::ji("hourglass"), "Filtering time:", time.taken, "secs")) } message("\n") return(output) } - - - - - - - - - diff --git a/R/homelocator.R b/R/homelocator.R index a355b1b..b342d68 100644 --- a/R/homelocator.R +++ b/R/homelocator.R @@ -1,8 +1,8 @@ -.onAttach <- function(libname, pkgname){ +.onAttach <- function(libname, pkgname) { packageStartupMessage("Welcome to homelocator package!") } -.onLoad <- function(libname, pkgname){ +.onLoad <- function(libname, pkgname) { op <- options() op.devtools <- list( devtools.path = "~/R-dev", @@ -12,20 +12,6 @@ devtools.desc = list() ) toset <- !(names(op.devtools) %in% names(op)) - if(any(toset)) options(op.devtools[toset]) + if (any(toset)) options(op.devtools[toset]) invisible() } - - - - - - - - - - - - - - diff --git a/R/identify_location.R b/R/identify_location.R index 06410f6..a82a9b8 100644 --- a/R/identify_location.R +++ b/R/identify_location.R @@ -1,66 +1,76 @@ #' Identify home locations for users with built-in recipes -#' -#' This function infers the most possible home locations for users with built-in recipes. -#' The "recipe" can be either embeded "recipe" - HMLC, OSNA, APDM, FREQ, or self created "recipe". -#' +#' +#' This function infers the most possible home locations for users with built-in recipes. +#' The "recipe" can be either embeded "recipe" - HMLC, OSNA, APDM, FREQ, or self created "recipe". +#' #' @param df A dataframe with columns for the user id, location, timestamp #' @param user Name of column that holds unique identifier for each user #' @param timestamp Name of timestamp column. Should be POSIXct #' @param location Name of column that holds unique identifier for each location -#' @param recipe Embeded algorithms to identify the most possible home locations for users +#' @param recipe Embeded algorithms to identify the most possible home locations for users #' @param show_n_loc Number of potential homes to extract #' @param keep_score Option to keep or remove calculated result/score per user per location #' @param use_default_threshold Option to use default threshold or customized threshold #' @param rm_topNpct_user Option to remove or keep the top N percent active users -#' +#' #' @importFrom rlang sym #' @importFrom rlang has_name #' @importFrom emo ji #' @importFrom tictoc tic #' @importFrom tictoc toc -#' +#' #' @export -identify_location <- function(df, user = "u_id", timestamp = "created_at", location = "loc_id", recipe, - show_n_loc = 1, keep_score = F, use_default_threshold = T, rm_topNpct_user = F){ +identify_location <- function(df, user = "u_id", timestamp = "created_at", location = "loc_id", recipe, + show_n_loc = 1, keep_score = F, use_default_threshold = T, rm_topNpct_user = F) { user_expr <- rlang::sym(user) timestamp_expr <- rlang::sym(timestamp) location_expr <- rlang::sym(location) - + tictoc::tic("Location Identification") - ## Validate the input dataset + ## Validate the input dataset df_valided <- validate_dataset(df, user = user, timestamp = timestamp, location = location) - + # Nest the dataset to each user - df_nested <- nest_verbose(df_valided, c({{location_expr}}, {{timestamp_expr}})) + df_nested <- nest_verbose(df_valided, c({{ location_expr }}, {{ timestamp_expr }})) ## Derive new variables from timestamp df_enriched <- enrich_timestamp(df_nested, timestamp = timestamp) - + ## recipe: HMLC - if(recipe == "HMLC"){ - output <- recipe_HMLC(df_enriched, user = user, timestamp = timestamp, location = location, - show_n_loc, keep_original_vars = F, keep_score = keep_score, - use_default_threshold = use_default_threshold, rm_topNpct_user = rm_topNpct_user) - } - + if (recipe == "HMLC") { + output <- recipe_HMLC(df_enriched, + user = user, timestamp = timestamp, location = location, + show_n_loc, keep_original_vars = F, keep_score = keep_score, + use_default_threshold = use_default_threshold, rm_topNpct_user = rm_topNpct_user + ) + } + ## recipe: FREQ - if(recipe == "FREQ"){ - output <- recipe_FREQ(df_enriched, user = user, timestamp = timestamp, location = location, show_n_loc, - keep_score = keep_score, use_default_threshold = use_default_threshold, rm_topNpct_user = rm_topNpct_user) + if (recipe == "FREQ") { + output <- recipe_FREQ(df_enriched, + user = user, timestamp = timestamp, location = location, show_n_loc, + keep_score = keep_score, use_default_threshold = use_default_threshold, rm_topNpct_user = rm_topNpct_user + ) } - + ## recipe: OSNA - if(recipe == "OSNA"){ - output <- recipe_OSNA(df_enriched, user = user, timestamp = timestamp, location = location, show_n_loc = show_n_loc, - keep_score = keep_score, use_default_threshold = use_default_threshold, rm_topNpct_user = rm_topNpct_user) + if (recipe == "OSNA") { + output <- recipe_OSNA(df_enriched, + user = user, timestamp = timestamp, location = location, show_n_loc = show_n_loc, + keep_score = keep_score, use_default_threshold = use_default_threshold, rm_topNpct_user = rm_topNpct_user + ) } - + ## recipe: APDM - if(recipe == "APDM"){ - message(paste(emo::ji("exclamation"), "Please make sure you have loaded the neighbors table before you use APDM recipe.\nThe table should have one column named", location, - "and another column named neighbor.\n The neighbor column should be a list-column contains the neighboring locations for", location, "per row.")) - output <- recipe_APDM(df_enriched, df_neighbors, user = user, timestamp = timestamp, location = location, - keep_score = keep_score, use_default_threshold = use_default_threshold) + if (recipe == "APDM") { + message(paste( + emo::ji("exclamation"), "Please make sure you have loaded the neighbors table before you use APDM recipe.\nThe table should have one column named", location, + "and another column named neighbor.\n The neighbor column should be a list-column contains the neighboring locations for", location, "per row." + )) + output <- recipe_APDM(df_enriched, df_neighbors, + user = user, timestamp = timestamp, location = location, + keep_score = keep_score, use_default_threshold = use_default_threshold + ) } tictoc::toc() return(output) @@ -76,20 +86,20 @@ identify_location <- function(df, user = "u_id", timestamp = "created_at", locat #' @param keep_original_vars Option to keep or remove columns other than 'user, timestamp, and location' #' @param use_default_threshold Option to use default threshold or customized threshold #' @param rm_topNpct_user Option to remove or keep the top N percent active users -#' +#' #' @importFrom rlang sym #' @importFrom lubridate hour #' @importFrom lubridate minute #' @importFrom lubridate second -#' -recipe_HMLC <- function (df, user = "u_id", timestamp = "created_at", location = "loc_id", show_n_loc, - keep_original_vars = F, keep_score = F, use_default_threshold = T, rm_topNpct_user = F) { +#' +recipe_HMLC <- function(df, user = "u_id", timestamp = "created_at", location = "loc_id", show_n_loc, + keep_original_vars = F, keep_score = F, use_default_threshold = T, rm_topNpct_user = F) { user_expr <- rlang::sym(user) timestamp_expr <- rlang::sym(timestamp) location_expr <- rlang::sym(location) - + # use_default_threshold <- readline(prompt = "Do you want to use the default thresholds? (Yes/No): ") - if(use_default_threshold){ + if (use_default_threshold) { topNpct <- 1 threshold_n_points <- 10 threshold_n_locs <- 10 @@ -106,15 +116,15 @@ recipe_HMLC <- function (df, user = "u_id", timestamp = "created_at", location = w_weekend <- 0.1 w_rest <- 0.2 w_weekend_am <- 0.1 - } else{ - topNpct <- readline(prompt="How many percentage of top active users to be removed (default = 1)? Your answer: ") %>% as.integer() + } else { + topNpct <- readline(prompt = "How many percentage of top active users to be removed (default = 1)? Your answer: ") %>% as.integer() threshold_n_points <- readline(prompt = "What is the minimum number of data points should a user sent in total(default = 10)? Your answer: ") %>% as.integer() threshold_n_locs <- readline(prompt = "What is the minimum number of unique places should data points be collected from (default = 10)? Your answer: ") %>% as.integer() threshold_n_points_loc <- readline(prompt = "What is the minimum number of data points should a user sent at each place (default = 10)? Your answer: ") %>% as.integer() threshold_n_hours_loc <- readline(prompt = "What is the minimum number of unique hours should data points be collected from (default = 10)? Your answer: ") %>% as.integer() threshold_n_days_loc <- readline(prompt = "What is the minimum number of days should data points be collected from (default = 10)? Your answer: ") %>% as.integer() threshold_period_loc <- readline(prompt = "What is the minimum period should a user being active on the digital platform (default = 10)? Your answer: ") %>% as.integer() - + w_n_points_loc <- readline(prompt = "Give a weight to the number of data points at a place (default = 0.1): ") %>% as.integer() w_n_hours_loc <- readline(prompt = "Give a weight to the number of unique hours data points were sent a place (default = 0.1): ") %>% as.integer() w_n_days_loc <- readline(prompt = "Give a weight to the number of days data points were sent at a place (default = 0.1): ") %>% as.integer() @@ -125,64 +135,75 @@ recipe_HMLC <- function (df, user = "u_id", timestamp = "created_at", location = w_rest <- readline(prompt = "Give a weight the proportion of data points sent during rest time at a place (default = 0.2): ") %>% as.integer() w_weekend_am <- readline(prompt = "Give a weight the proportion of data points sent during weekend morning at a place (default = 0.1): ") %>% as.integer() } - - # users level - ## pre-condition set on users + + # users level + ## pre-condition set on users df_match_user_condition <- df %>% - summarise_nested(., n_points = n(), - n_locs = n_distinct({{location_expr}})) %>% + summarise_nested(., + n_points = n(), + n_locs = n_distinct({{ location_expr }}) + ) %>% remove_top_users(., user = user, counts = "n_points", topNpct_user = topNpct, rm_topNpct_user = rm_topNpct_user) %>% # remove top N percent active users based on frequency filter_verbose(., user = user, n_points > threshold_n_points & n_locs > threshold_n_locs) # each use at least has more than threshold_n_points data points sent at threshold_n_locs different locations - - # location level - # pre-condition set on locations + + # location level + # pre-condition set on locations colnames_nested_data <- df_match_user_condition$data[[1]] %>% names() colnmaes_to_nest <- colnames_nested_data[-which(colnames_nested_data == location)] - + df_match_loc_condition <- df_match_user_condition %>% - summarise_double_nested(., nest_cols = colnmaes_to_nest, # summarise on location level - n_points_loc = n(), - n_hours_loc = n_distinct(hour), - n_days_loc = n_distinct(ymd), - period_loc = as.numeric(max({{timestamp_expr}}) - min({{timestamp_expr}}), "days")) %>% + summarise_double_nested(., + nest_cols = colnmaes_to_nest, # summarise on location level + n_points_loc = n(), + n_hours_loc = n_distinct(hour), + n_days_loc = n_distinct(ymd), + period_loc = as.numeric(max({{ timestamp_expr }}) - min({{ timestamp_expr }}), "days") + ) %>% unnest_verbose(data) %>% # one row shows data point of per user per location - filter_verbose(., user = user, - n_points_loc > threshold_n_points_loc & n_hours_loc > threshold_n_hours_loc & n_days_loc > threshold_n_days_loc & period_loc > threshold_period_loc) %>% # each use at least has threshold_n_points_loc data points sent at threshold_n_hours_loc different hours for at least threshold_n_days_loc different days at each location, the whole time period is at least more than threshold_period_loc days - summarise_nested(n_wdays_loc = n_distinct(wday), # add new variable after matching the conditions - n_months_loc = n_distinct(month)) - - # add new variables - df_expanded <- df_match_loc_condition %>% - mutate_nested(., - wd_or_wk = if_else(wday %in% c(1,7), "weekend", "weekday"), # 1 means Sunday and 7 means Saturday - time_numeric = lubridate::hour({{timestamp_expr}}) + lubridate::minute({{timestamp_expr}})/60 + lubridate::second({{timestamp_expr}})/3600, - rest_or_work = if_else(time_numeric >= 9 & time_numeric <= 18, "work", "rest"), - wk.am_or_wk.pm = if_else(time_numeric >= 6 & time_numeric <= 12 & wd_or_wk == "weekend", "weekend_am", "weekend_pm")) %>% + filter_verbose(., + user = user, + n_points_loc > threshold_n_points_loc & n_hours_loc > threshold_n_hours_loc & n_days_loc > threshold_n_days_loc & period_loc > threshold_period_loc + ) %>% # each use at least has threshold_n_points_loc data points sent at threshold_n_hours_loc different hours for at least threshold_n_days_loc different days at each location, the whole time period is at least more than threshold_period_loc days + summarise_nested( + n_wdays_loc = n_distinct(wday), # add new variable after matching the conditions + n_months_loc = n_distinct(month) + ) + + # add new variables + df_expanded <- df_match_loc_condition %>% + mutate_nested(., + wd_or_wk = if_else(wday %in% c(1, 7), "weekend", "weekday"), # 1 means Sunday and 7 means Saturday + time_numeric = lubridate::hour({{ timestamp_expr }}) + lubridate::minute({{ timestamp_expr }}) / 60 + lubridate::second({{ timestamp_expr }}) / 3600, + rest_or_work = if_else(time_numeric >= 9 & time_numeric <= 18, "work", "rest"), + wk.am_or_wk.pm = if_else(time_numeric >= 6 & time_numeric <= 12 & wd_or_wk == "weekend", "weekend_am", "weekend_pm") + ) %>% prop_factor_nested(wd_or_wk, rest_or_work, wk.am_or_wk.pm) - + ## assign weight to variables and sum variables - df_scored <- df_expanded %>% - score_nested(., user = user, location = location, # score each variables with weight - keep_original_vars = keep_original_vars, - s_n_points_loc = w_n_points_loc * (n_points_loc/max(n_points_loc)), - s_n_hours_loc = w_n_hours_loc * (n_hours_loc/24), - s_n_days_loc = w_n_days_loc * (n_days_loc/max(n_days_loc)), - s_n_wdays_loc = w_n_wdays_loc * (n_wdays_loc/7), - s_n_months_loc = w_n_months_loc * (n_months_loc/12), - s_period_loc = w_period_loc * (period_loc/max(period_loc)), - s_weekend = w_weekend * (weekend), - s_rest = w_rest * (rest), - s_weekend_am = w_weekend_am * (weekend_am)) %>% + df_scored <- df_expanded %>% + score_nested(., + user = user, location = location, # score each variables with weight + keep_original_vars = keep_original_vars, + s_n_points_loc = w_n_points_loc * (n_points_loc / max(n_points_loc)), + s_n_hours_loc = w_n_hours_loc * (n_hours_loc / 24), + s_n_days_loc = w_n_days_loc * (n_days_loc / max(n_days_loc)), + s_n_wdays_loc = w_n_wdays_loc * (n_wdays_loc / 7), + s_n_months_loc = w_n_months_loc * (n_months_loc / 12), + s_period_loc = w_period_loc * (period_loc / max(period_loc)), + s_weekend = w_weekend * (weekend), + s_rest = w_rest * (rest), + s_weekend_am = w_weekend_am * (weekend_am) + ) %>% score_summary(., user = user, location = location, starts_with("s_")) # sum all scores for each location - - # extract locations based on score value - df_scored %>% + + # extract locations based on score value + df_scored %>% extract_location(., user = user, location = location, show_n_loc = show_n_loc, keep_score = keep_score, desc(score)) } #' recipe: frequency - FREQ -#' +#' #' @param df An enriched dataframe #' @param user Name of column that holds unique identifier for each user #' @param timestamp Name of timestamp column. Should be POSIXct @@ -191,55 +212,58 @@ recipe_HMLC <- function (df, user = "u_id", timestamp = "created_at", location = #' @param keep_score Option to keep or remove calculated result/score per user per location #' @param use_default_threshold Option to use default threshold or customized threshold #' @param rm_topNpct_user Option to remove or keep the top N percent active users -#' +#' #' @importFrom rlang sym -recipe_FREQ <- function(df, user = "u_id", timestamp = "created_at", location = "loc_id", show_n_loc, - keep_score = F, use_default_threshold = T, rm_topNpct_user = F){ +recipe_FREQ <- function(df, user = "u_id", timestamp = "created_at", location = "loc_id", show_n_loc, + keep_score = F, use_default_threshold = T, rm_topNpct_user = F) { user_expr <- rlang::sym(user) timestamp_expr <- rlang::sym(timestamp) location_expr <- rlang::sym(location) - + # use_default_threshold <- readline(prompt = "Do you want to use the default thresholds? (Yes/No): ") - if(use_default_threshold){ + if (use_default_threshold) { topNpct <- 1 threshold_n_points <- 10 threshold_n_locs <- 10 threshold_n_points_loc <- 10 - } else{ - topNpct <- readline(prompt="How many percentage of top active users to be removed (default = 1)? Your answer: ") %>% as.integer() - threshold_n_points <- readline(prompt="What is the minumn number of data points should a user sent in total(default = 10)? Your answer: ") %>% as.integer() - threshold_n_locs <- readline(prompt="What is the minumn number of unique places should data points be collected from (default = 10)? Your answer: ") %>% as.integer() - threshold_n_points_loc <- readline(prompt="What is the minumn number of data points should a user sent at each place (default = 10)? Your answer: ") %>% as.integer() + } else { + topNpct <- readline(prompt = "How many percentage of top active users to be removed (default = 1)? Your answer: ") %>% as.integer() + threshold_n_points <- readline(prompt = "What is the minumn number of data points should a user sent in total(default = 10)? Your answer: ") %>% as.integer() + threshold_n_locs <- readline(prompt = "What is the minumn number of unique places should data points be collected from (default = 10)? Your answer: ") %>% as.integer() + threshold_n_points_loc <- readline(prompt = "What is the minumn number of data points should a user sent at each place (default = 10)? Your answer: ") %>% as.integer() } - - # users level - ## pre-condition set on users + + # users level + ## pre-condition set on users df_match_user_condition <- df %>% - summarise_nested(., - n_points = n(), - n_locs = n_distinct({{location_expr}})) %>% + summarise_nested(., + n_points = n(), + n_locs = n_distinct({{ location_expr }}) + ) %>% remove_top_users(., user = user, counts = "n_points", topNpct_user = topNpct, rm_topNpct_user = rm_topNpct_user) %>% # remove top N percent active users based on frequency filter_verbose(., user = user, n_points > threshold_n_points & n_locs > threshold_n_locs) # each use at least has more than threshold_n_points data points sent at threshold_n_loc different locations - - # location level - # pre-condition set on locations + + # location level + # pre-condition set on locations colnames_nested_data <- df_match_user_condition$data[[1]] %>% names() colnmaes_to_nest <- colnames_nested_data[-which(colnames_nested_data == location)] - + df_match_loc_condition <- df_match_user_condition %>% - summarise_double_nested(., nest_cols = colnmaes_to_nest, # summarise on location level - n_points_loc = n()) %>% - filter_nested(., user = user, n_points_loc > threshold_n_points_loc) - + summarise_double_nested(., + nest_cols = colnmaes_to_nest, # summarise on location level + n_points_loc = n() + ) %>% + filter_nested(., user = user, n_points_loc > threshold_n_points_loc) + # extract locations based on frequency of data points sent on locations - df_match_loc_condition %>% + df_match_loc_condition %>% extract_location(., user = user, location = location, show_n_loc = show_n_loc, keep_score = keep_score, desc(n_points_loc)) } #' recipe: Online Social Networks Activity - OSNA -#' -#' +#' +#' #' @param df An enriched dataframe #' @param user Name of column that holds unique identifier for each user #' @param timestamp Name of timestamp column. Should be POSIXct @@ -248,71 +272,76 @@ recipe_FREQ <- function(df, user = "u_id", timestamp = "created_at", location = #' @param keep_score Option to keep or remove calculated result/score per user per location #' @param use_default_threshold Option to use default threshold or customized threshold #' @param rm_topNpct_user Option to remove or keep the top N percent active users -#' +#' #' @importFrom rlang sym -recipe_OSNA <- function(df, user = "u_id", timestamp = "created_at", location = "loc_id", show_n_loc, - keep_score = F, use_default_threshold = T, rm_topNpct_user = F){ +recipe_OSNA <- function(df, user = "u_id", timestamp = "created_at", location = "loc_id", show_n_loc, + keep_score = F, use_default_threshold = T, rm_topNpct_user = F) { user_expr <- rlang::sym(user) timestamp_expr <- rlang::sym(timestamp) location_expr <- rlang::sym(location) - + # use_default_threshold <- readline(prompt = "Do you want to use the default thresholds? (Yes/No): ") - if(use_default_threshold){ + if (use_default_threshold) { topNpct <- 1 threshold_n_locs <- 3 - } else{ - topNpct <- readline(prompt="How many percentage of top active users to be removed (default = 1)? Your answer: ") %>% as.integer() - threshold_n_locs <- readline(prompt="What is the minumn number of unique places should data points be collected from (default = 3)? Your answer: ") %>% as.integer() + } else { + topNpct <- readline(prompt = "How many percentage of top active users to be removed (default = 1)? Your answer: ") %>% as.integer() + threshold_n_locs <- readline(prompt = "What is the minumn number of unique places should data points be collected from (default = 3)? Your answer: ") %>% as.integer() } - # users level - ## pre-condition set on users + # users level + ## pre-condition set on users df_match_user_condition <- df %>% - summarise_nested(., - n_points = n(), - n_locs = n_distinct({{location_expr}})) %>% + summarise_nested(., + n_points = n(), + n_locs = n_distinct({{ location_expr }}) + ) %>% remove_top_users(., user = user, counts = "n_points", topNpct_user = topNpct, rm_topNpct_user = rm_topNpct_user) %>% # remove top N percent active users based on frequency - filter_verbose(., user = user, n_locs > threshold_n_locs) %>% # remove users with data at less than N places - filter_nested(., user = user, !wday %in% c(1, 7)) %>% # remove data sent on weekend, 1 for Sunday and 7 for Saturday - mutate_nested(timeframe = if_else(hour >= 2 & hour < 8, "Rest", if_else(hour >= 8 & hour < 19, "Active", "Leisure"))) %>% # add time frame column + filter_verbose(., user = user, n_locs > threshold_n_locs) %>% # remove users with data at less than N places + filter_nested(., user = user, !wday %in% c(1, 7)) %>% # remove data sent on weekend, 1 for Sunday and 7 for Saturday + mutate_nested(timeframe = if_else(hour >= 2 & hour < 8, "Rest", if_else(hour >= 8 & hour < 19, "Active", "Leisure"))) %>% # add time frame column filter_nested(., user = user, timeframe != "Active") # home location is focused on Rest and Leisure time frame - - ## calculate weighted score of data points sent during Leisure and Rest time in different day at different places + + ## calculate weighted score of data points sent during Leisure and Rest time in different day at different places colnames_nested_data <- df_match_user_condition$data[[1]] %>% names() - colnmaes_to_nest <- colnames_nested_data[-which(colnames_nested_data %in% c(location, "ymd", "timeframe"))] # per location per day per timeframe - + colnmaes_to_nest <- colnames_nested_data[-which(colnames_nested_data %in% c(location, "ymd", "timeframe"))] # per location per day per timeframe + df_timeframe_nested <- df_match_user_condition %>% - summarise_double_nested(., nest_cols = colnmaes_to_nest, - n_points_timeframe = n()) %>% # number of data points at the timeframe - spread_nested(., key_var = "timeframe", value_var = "n_points_timeframe") %>% # spread the timeframe to columns + summarise_double_nested(., + nest_cols = colnmaes_to_nest, + n_points_timeframe = n() + ) %>% # number of data points at the timeframe + spread_nested(., key_var = "timeframe", value_var = "n_points_timeframe") %>% # spread the timeframe to columns unnest_verbose() %>% # unnest the result, missed column will be automatically added with NA value - replace(., is.na(.), 0) # replace NA with 0 - - # give the weigh to Rest and Leisure time + replace(., is.na(.), 0) # replace NA with 0 + + # give the weigh to Rest and Leisure time weight_rest <- mean(0.744, 0.735, 0.737) weight_leisure <- mean(0.362, 0.357, 0.354) - - df_weighted <- df_timeframe_nested %>% - mutate_verbose(score_ymd_loc = weight_rest * Rest + weight_leisure * Leisure) - - # nest by user - df_user_nested <- df_weighted %>% nest_verbose(-u_id) - - # calculate the score per location + + df_weighted <- df_timeframe_nested %>% + mutate_verbose(score_ymd_loc = weight_rest * Rest + weight_leisure * Leisure) + + # nest by user + df_user_nested <- df_weighted %>% nest_verbose(-u_id) + + # calculate the score per location colnames_nested_data <- df_user_nested$data[[1]] %>% names() - colnmaes_to_nest <- colnames_nested_data[-which(colnames_nested_data == location)] # nest by location - - df_scored <- df_user_nested %>% - summarise_double_nested(., nest_cols = colnmaes_to_nest, - score = sum(score_ymd_loc)) - - # extract location based on score value + colnmaes_to_nest <- colnames_nested_data[-which(colnames_nested_data == location)] # nest by location + + df_scored <- df_user_nested %>% + summarise_double_nested(., + nest_cols = colnmaes_to_nest, + score = sum(score_ymd_loc) + ) + + # extract location based on score value extract_location(df_scored, user = user, location = location, show_n_loc = show_n_loc, keep_score = keep_score, desc(score)) } #' recipe: Anchor Point Determining Model - APDM -#' +#' #' @param df An enriched dataframe #' @param user Name of column that holds unique identifier for each user #' @param timestamp Name of timestamp column. Should be POSIXct @@ -320,12 +349,12 @@ recipe_OSNA <- function(df, user = "u_id", timestamp = "created_at", location = #' @param show_n_loc Number of potential homes to extract #' @param keep_score Option to keep or remove calculated result/score per user per location #' @param use_default_threshold Option to use default threshold or customized threshold -#' +#' #' @importFrom rlang sym #' @importFrom rlang has_name #' @importFrom emo ji #' @importFrom chron times -recipe_APDM <- function(df, df_neighbors, user = "u_id", timestamp = "created_at", location = "loc_id", keep_score = F, use_default_threshold = T){ +recipe_APDM <- function(df, df_neighbors, user = "u_id", timestamp = "created_at", location = "loc_id", keep_score = F, use_default_threshold = T) { user_expr <- rlang::sym(user) timestamp_expr <- rlang::sym(timestamp) location_expr <- rlang::sym(location) @@ -333,17 +362,17 @@ recipe_APDM <- function(df, df_neighbors, user = "u_id", timestamp = "created_at if (!rlang::has_name(df_neighbors, location)) { stop(paste(emo::ji("bomb"), "Location column does not exist!")) } - + if (!rlang::has_name(df_neighbors, "neighbor")) { stop(paste(emo::ji("bomb"), "Neighbor column does not exist!")) } # use_default_threshold <- readline(prompt = "Do you want to use the default thresholds? (Yes/No): ") - if(use_default_threshold){ + if (use_default_threshold) { threshold_n_days_per_month_loc <- 7 threshold_n_points_per_month_loc <- 500 - } else{ - threshold_n_days_per_month_loc <- readline(prompt="What is the minium active days per month (default = 7)? Your answer: ") %>% as.integer() - threshold_n_points_per_month_loc <- readline(prompt="What is the maximum data points sent per month (default = 500)? Your answer: ") %>% as.integer() + } else { + threshold_n_days_per_month_loc <- readline(prompt = "What is the minium active days per month (default = 7)? Your answer: ") %>% as.integer() + threshold_n_points_per_month_loc <- readline(prompt = "What is the maximum data points sent per month (default = 500)? Your answer: ") %>% as.integer() } ## Removal of places with too high or too low a number of data points made ## respondents who have data points at their top n frequently visited places fewer than 7 days a month will be removed from the database @@ -351,93 +380,107 @@ recipe_APDM <- function(df, df_neighbors, user = "u_id", timestamp = "created_at colnames_nested_data <- df$data[[1]] %>% names() colnmaes_to_nest <- colnames_nested_data[-which(colnames_nested_data %in% c(location, "year", "month"))] df_cleaned <- df %>% - summarise_double_nested(., nest_cols = colnmaes_to_nest, - n_days_per_month_loc = n_distinct(day), - n_points_per_month_loc = n()) %>% - filter_nested(., user = user, - n_days_per_month_loc >= threshold_n_days_per_month_loc & n_points_per_month_loc <= threshold_n_points_per_month_loc) - + summarise_double_nested(., + nest_cols = colnmaes_to_nest, + n_days_per_month_loc = n_distinct(day), + n_points_per_month_loc = n() + ) %>% + filter_nested(., + user = user, + n_days_per_month_loc >= threshold_n_days_per_month_loc & n_points_per_month_loc <= threshold_n_points_per_month_loc + ) + # get the top 5 most frequently visited location df_user_nested <- df_cleaned %>% unnest_nested(., data) colnames_nested_data <- df_user_nested$data[[1]] %>% names() colnmaes_to_nest <- colnames_nested_data[-which(colnames_nested_data == location)] df_top5_locs <- df_user_nested %>% - summarise_double_nested(., nest_cols = colnmaes_to_nest, - n_days_loc = n_distinct(ymd), - n_points_loc = n()) %>% + summarise_double_nested(., + nest_cols = colnmaes_to_nest, + n_days_loc = n_distinct(ymd), + n_points_loc = n() + ) %>% arrange_nested(., n_days_loc, n_points_loc) %>% - top_n_nested(., n = 5, wt = "n_points_loc") %>% + top_n_nested(., n = 5, wt = "n_points_loc") %>% unnest_verbose() - - # extract the start data point per day for each location + + # extract the start data point per day for each location colnames_nested_data <- df_top5_locs$data[[1]] %>% names() colnmaes_to_nest <- colnames_nested_data[-which(colnames_nested_data %in% c("ymd"))] # nest to per user per location per day df_with_start_point <- df_top5_locs %>% - summarise_double_nested(., nest_cols = colnmaes_to_nest, - start_point = min({{timestamp_expr}})) # get the start data point of a day - + summarise_double_nested(., + nest_cols = colnmaes_to_nest, + start_point = min({{ timestamp_expr }}) + ) # get the start data point of a day + # add home or work label to top 5 most frequently visit places: 1 means home and 0 means work - time_line <- chron::times("17:00:00") %>% as.numeric() - df_with_home_type <- df_with_start_point %>% - mutate_nested(start_time = format(start_point, format = "%H:%M:%S") %>% chron::times() %>% as.numeric()) %>% - summarise_nested(mean_start_time = mean(start_time), - sd_start_time = sd(start_time)) %>% + time_line <- chron::times("17:00:00") %>% as.numeric() + df_with_home_type <- df_with_start_point %>% + mutate_nested(start_time = format(start_point, format = "%H:%M:%S") %>% chron::times() %>% as.numeric()) %>% + summarise_nested( + mean_start_time = mean(start_time), + sd_start_time = sd(start_time) + ) %>% mutate_verbose(location_type = if_else(mean_start_time < time_line & sd_start_time <= 0.175, 0, 1)) %>% # 1 means home and 0 means work filter(location_type != 0) %>% # remove work locations - nest_verbose(-{{user_expr}}) %>% + nest_verbose(-{{ user_expr }}) %>% summarise_nested(n_home = n()) # get number of returned home locations - - ## for one home users, directly extract the location as their homes + + ## for one home users, directly extract the location as their homes df_one_home <- df_with_home_type %>% filter(n_home == 1) - - output <- extract_location(df_one_home %>% dplyr::select(-n_home), user = user, - location = location, show_n_loc = 1, keep_score = keep_score, desc(n_days_loc), desc(n_points_loc)) - + + output <- extract_location(df_one_home %>% dplyr::select(-n_home), + user = user, + location = location, show_n_loc = 1, keep_score = keep_score, desc(n_days_loc), desc(n_points_loc) + ) + ## for multiple home users, take the top 2 most frequently visit places and do the comparision - df_multiple_homes <- df_with_home_type %>% - filter(n_home > 1) %>% + df_multiple_homes <- df_with_home_type %>% + filter(n_home > 1) %>% arrange_nested(., desc(n_days_loc), desc(n_points_loc)) # order the location by number of days and number of data points in descending order - - - decide_home <- function(data){ - homes <- data %>% pull({{location_expr}}) + + + decide_home <- function(data) { + homes <- data %>% pull({{ location_expr }}) home1 <- homes[1] - home1_neighbors <- df_neighbors %>% filter({{location_expr}} == home1) %>% pull(neighbor) %>% unlist() # get the neighboring locations of first home + home1_neighbors <- df_neighbors %>% + filter({{ location_expr }} == home1) %>% + pull(neighbor) %>% + unlist() # get the neighboring locations of first home home2 <- homes[2] # if home 1 and home 2 are neighboring location, extract home1 (one with the higher number of days, and the higher number of data points) - if(home2 %in% home1_neighbors){ + if (home2 %in% home1_neighbors) { return(home1) - } else{ + } else { n_days_homes <- data %>% pull(n_days_loc) n_days_home1 <- n_days_homes[1] n_days_home2 <- n_days_homes[2] # if the most frequently visited location covers more than 75 percent of the days a user stayed at the two most frequently visited locations, return the place as home - if((n_days_home1/(n_days_home1 + n_days_home2)) > 0.75){ + if ((n_days_home1 / (n_days_home1 + n_days_home2)) > 0.75) { return(home1) - } else{ + } else { sd_homes <- data %>% pull(sd_start_time) sd_home1 <- sd_homes[1] sd_home2 <- sd_homes[2] - # the location with the larger standard deviation is classified as the home - if(sd_home1 > sd_home2){ + # the location with the larger standard deviation is classified as the home + if (sd_home1 > sd_home2) { return(home1) - } else{ + } else { return(home2) } } } } - - if(keep_score){ - df_multiple_homes %>% - dplyr::select(-n_home) %>% - mutate_verbose(., home = map_chr(data, function(x) decide_home(x))) %>% + + if (keep_score) { + df_multiple_homes %>% + dplyr::select(-n_home) %>% + mutate_verbose(., home = map_chr(data, function(x) decide_home(x))) %>% bind_rows(., output) - } else{ - df_multiple_homes %>% - mutate_verbose(., home = map_chr(data, function(x) decide_home(x))) %>% - dplyr::select(u_id, home) %>% + } else { + df_multiple_homes %>% + mutate_verbose(., home = map_chr(data, function(x) decide_home(x))) %>% + dplyr::select(u_id, home) %>% bind_rows(., output) } - } diff --git a/R/mutate.R b/R/mutate.R index bc8fbde..66f4509 100644 --- a/R/mutate.R +++ b/R/mutate.R @@ -1,107 +1,106 @@ -#' Add new variable -#' -#' Add new variable and perserves existing to dataframe -#' -#' @param df A dataframe +#' Add new variable +#' +#' Add new variable and perserves existing to dataframe +#' +#' @param df A dataframe #' @param ... Name-value pairs of expressions -#' +#' #' @importFrom emo ji #' @importFrom dplyr setdiff -#' -#' +#' +#' #' @export -mutate_verbose <- function(df, ...){ +mutate_verbose <- function(df, ...) { if (!is.data.frame(df)) { stop(paste(emo::ji("bomb"), "Dataset is not a dataframe!")) } - + var_expr <- enquos(..., .named = TRUE) - + message(paste(emo::ji("hammer_and_wrench"), "Start adding...")) start.time <- Sys.time() output <- df %>% mutate(!!!var_expr) end.time <- Sys.time() - time.taken <- difftime(end.time, start.time, units = "secs") %>% round(., 3) - + time.taken <- difftime(end.time, start.time, units = "secs") %>% round(., 3) + colnames_original <- names(df) colnames_new <- names(output) - colnames_added <- dplyr::setdiff(colnames_new, colnames_original) - + colnames_added <- dplyr::setdiff(colnames_new, colnames_original) + message(paste(emo::ji("white_check_mark"), "Finish adding! There are", length(colnames_added), "new added variables:", paste(colnames_added, collapse = ", "))) - - if(time.taken > 60){ - time.taken <- round(time.taken/60, 2) + + if (time.taken > 60) { + time.taken <- round(time.taken / 60, 2) message(paste(emo::ji("hourglass"), "Adding time:", time.taken, "mins")) - }else{ + } else { message(paste(emo::ji("hourglass"), "Adding time:", time.taken, "secs")) } message("\n") - + return(output) } #' Add new variables within a nested dataframe -#' +#' #' Add new variables and preserves existing within a nested dataframe -#' @param df A nested dataframe +#' @param df A nested dataframe #' @param ... Name-value pairs of expressions -#' +#' #' @importFrom emo ji #' @importFrom purrr map #' @importFrom dplyr setdiff #' @export -mutate_nested <- function(df, ...){ - - if(!is.list(df[ , grepl("^data$", names(df))])){ +mutate_nested <- function(df, ...) { + if (!is.list(df[, grepl("^data$", names(df))])) { stop(paste(emo::ji("bomb"), "Dataset is not nested!")) } - + var_expr <- enquos(...) - colname_nested_data <- names(df[ , grepl("^data$", names(df))]) + colname_nested_data <- names(df[, grepl("^data$", names(df))]) - add_with_progress <- function(data){ + add_with_progress <- function(data) { pb$tick()$print() - data %>% + data %>% mutate(!!!var_expr) } - + # create the progress bar pb <- dplyr::progress_estimated(nrow(df)) - - + + message(paste(emo::ji("hammer_and_wrench"), "Start adding variable(s)...")) start.time <- Sys.time() output <- df %>% - mutate({{colname_nested_data}} := purrr::map(df[[colname_nested_data]], ~add_with_progress(.))) + mutate({{ colname_nested_data }} := purrr::map(df[[colname_nested_data]], ~ add_with_progress(.))) end.time <- Sys.time() - time.taken <- difftime(end.time, start.time, units = "secs") %>% round(., 3) - + time.taken <- difftime(end.time, start.time, units = "secs") %>% round(., 3) + colnames_original <- df[[colname_nested_data]][[1]] %>% names() colnames_new <- output[[colname_nested_data]][[1]] %>% names() - colnames_added <- dplyr::setdiff(colnames_new, colnames_original) - + colnames_added <- dplyr::setdiff(colnames_new, colnames_original) + message("\n") message(paste(emo::ji("white_check_mark"), "Finish adding! There are", length(colnames_added), "new added variables:", paste(colnames_added, collapse = ", "))) - - if(time.taken > 60){ - time.taken <- round(time.taken/60, 2) + + if (time.taken > 60) { + time.taken <- round(time.taken / 60, 2) message(paste(emo::ji("hourglass"), "Adding time:", time.taken, "mins")) - }else{ + } else { message(paste(emo::ji("hourglass"), "Adding time:", time.taken, "secs")) } message("\n") - + return(output) } #' Calculate the proportion of categories for a variable within a nested dataframe -#' +#' #' Calculate the proportion of categories for a variable within a nested dataframe and convert each categories to a new variable adding to the dataframe -#' @param df A nested dataframe -#' @param var Name of column to calculate -#' -#' +#' @param df A nested dataframe +#' @param var Name of column to calculate +#' +#' #' @importFrom emo ji #' @importFrom dplyr select #' @importFrom dplyr summarise @@ -110,130 +109,116 @@ mutate_nested <- function(df, ...){ #' @importFrom purrr map #' @importFrom dplyr setdiff -#' @export -prop_factor_nested <- function(df, ...){ - - if(!is.list(df[ , grepl("^data$", names(df))])){ +#' @export +prop_factor_nested <- function(df, ...) { + if (!is.list(df[, grepl("^data$", names(df))])) { stop(paste(emo::ji("bomb"), "Dataset is not nested!")) } - + var_expr <- enquos(...) - colname_nested_data <- names(df[ , grepl("^data$", names(df))]) + colname_nested_data <- names(df[, grepl("^data$", names(df))]) - add_with_progress <- function(data){ + add_with_progress <- function(data) { pb$tick()$print() - data %>% - dplyr::select(!!!var_expr) %>% - rownames_to_column(var = "id") %>% - gather(key = "key", value = "value", -id) %>% - group_by(key, value) %>% - dplyr::summarise(n = n()) %>% - group_by(key) %>% - mutate(total = sum(n), - prop = n/total) %>% - ungroup() %>% - dplyr::select(value, prop) %>% + data %>% + dplyr::select(!!!var_expr) %>% + rownames_to_column(var = "id") %>% + gather(key = "key", value = "value", -id) %>% + group_by(key, value) %>% + dplyr::summarise(n = n()) %>% + group_by(key) %>% + mutate( + total = sum(n), + prop = n / total + ) %>% + ungroup() %>% + dplyr::select(value, prop) %>% spread(value, prop) } - #create the progress bar + # create the progress bar pb <- dplyr::progress_estimated(nrow(df)) - + message(paste(emo::ji("hammer_and_wrench"), "Start calculating proportion...")) start.time <- Sys.time() - output <- df %>% - dplyr::bind_cols(do.call(dplyr::bind_rows, purrr::map(df[[colname_nested_data]], ~add_with_progress(.)))) %>% + output <- df %>% + dplyr::bind_cols(do.call(dplyr::bind_rows, purrr::map(df[[colname_nested_data]], ~ add_with_progress(.)))) %>% replace(., is.na(.), 0) end.time <- Sys.time() - time.taken <- difftime(end.time, start.time, units = "secs") %>% round(., 3) - + time.taken <- difftime(end.time, start.time, units = "secs") %>% round(., 3) + colnames_original <- names(df) colnames_new <- names(output) colnames_added <- dplyr::setdiff(colnames_new, colnames_original) - - + + message("\n") message(paste(emo::ji("white_check_mark"), "Finish calculating! There are", length(colnames_added), "new calculated variables:", paste(colnames_added, collapse = ", "))) - - if(time.taken > 60){ - time.taken <- round(time.taken/60, 2) + + if (time.taken > 60) { + time.taken <- round(time.taken / 60, 2) message(paste(emo::ji("hourglass"), "Calculating time:", time.taken, "mins")) - }else{ + } else { message(paste(emo::ji("hourglass"), "Calculating time:", time.taken, "secs")) } message("\n") - + return(output) } #' Add new variable within a double-nested dataframe -#' +#' #' Add new variable within a double-nested dataframe -#' @param df A dataframe +#' @param df A dataframe #' @param nest_cols A selection of columns to nest in existing list-column #' @param ... Name-value pairs of functions -#' +#' #' @importFrom emo ji #' @importFrom purrr map -#' +#' #' @export -mutate_double_nested <- function(df, nest_cols, ...){ - - if(nrow(df) == 0){ +mutate_double_nested <- function(df, nest_cols, ...) { + if (nrow(df) == 0) { stop(paste(emo::ji("bomb"), "No user left, tune your threshold and try again.")) } - + stopifnot( - is.list(df[ , grepl("^data$", names(df))]) + is.list(df[, grepl("^data$", names(df))]) ) - + var_expr <- enquos(..., .named = TRUE) - colname_nested_data <- names(df[ , grepl("^data$", names(df))]) + colname_nested_data <- names(df[, grepl("^data$", names(df))]) add_variable <- . %>% mutate(!!!var_expr) - add_column <- . %>% - mutate(data = purrr::map(data, add_variable)) - - - # double nest - df[[colname_nested_data]] <- purrr::map(df[[colname_nested_data]], ~.x %>% nest(data = nest_cols)) - + add_column <- . %>% + mutate(data = purrr::map(data, add_variable)) + + + # double nest + df[[colname_nested_data]] <- purrr::map(df[[colname_nested_data]], ~ .x %>% nest(data = nest_cols)) + message(paste(emo::ji("hammer_and_wrench"), "Start adding values...")) start.time <- Sys.time() - output <- df %>% - mutate({{colname_nested_data}} := purrr::map(df[[colname_nested_data]], add_column)) + output <- df %>% + mutate({{ colname_nested_data }} := purrr::map(df[[colname_nested_data]], add_column)) end.time <- Sys.time() - time.taken <- difftime(end.time, start.time, units = "secs") %>% round(., 3) - + time.taken <- difftime(end.time, start.time, units = "secs") %>% round(., 3) + colnames_original <- df[[colname_nested_data]][[1]] %>% names() colnames_new <- output[[colname_nested_data]][[1]] %>% names() colnames_new <- colnames_new[-which(colnames_new == "data")] colnames_added <- dplyr::setdiff(colnames_new, colnames_original) - + message(paste(emo::ji("white_check_mark"), "Finish adding! There are", length(colnames_added), "new added variables:", paste(colnames_added, collapse = ", "))) - - if(time.taken > 60){ - time.taken <- round(time.taken/60, 2) + + if (time.taken > 60) { + time.taken <- round(time.taken / 60, 2) message(paste(emo::ji("hourglass"), "Adding time:", time.taken, "mins")) - }else{ + } else { message(paste(emo::ji("hourglass"), "Adding time:", time.taken, "secs")) } message("\n") - + return(output) } - - - - - - - - - - - - - - diff --git a/R/nest.R b/R/nest.R index 69741b3..993e208 100644 --- a/R/nest.R +++ b/R/nest.R @@ -1,245 +1,230 @@ -#' Nest dataframe -#' +#' Nest dataframe +#' #' Nesting creates a list-column of dataframe -#' @param df A dataframe -#' @param ... A selection of columns. -#' -#' +#' @param df A dataframe +#' @param ... A selection of columns. +#' +#' #' @importFrom emo ji #' @importFrom dplyr progress_estimated -#' @importFrom purrr map -#' +#' @importFrom purrr map +#' #' @export -nest_verbose <- function(df, ...){ - +nest_verbose <- function(df, ...) { if (!is.data.frame(df)) { stop(paste(emo::ji("bomb"), "Dataset is not a dataframe!")) } - + var_expr <- enquos(..., .named = TRUE) - + message(paste(emo::ji("hammer_and_wrench"), "Start nesting...")) start.time <- Sys.time() output <- df %>% nest_legacy(!!!var_expr) end.time <- Sys.time() - time.taken <- difftime(end.time, start.time, units = "secs") %>% round(., 3) - + time.taken <- difftime(end.time, start.time, units = "secs") %>% round(., 3) + message(paste(emo::ji("white_check_mark"), "Finish nesting!")) - - if(time.taken > 60){ - time.taken <- round(time.taken/60, 2) + + if (time.taken > 60) { + time.taken <- round(time.taken / 60, 2) message(paste(emo::ji("hourglass"), "Nesting time:", time.taken, "mins")) - }else{ + } else { message(paste(emo::ji("hourglass"), "Nesting time:", time.taken, "secs")) } message("\n") - + return(output) } #' Unnest dataframe -#' +#' #' Unnesting makes each element of the list its own row. #' @param df A dataframe -#' @param ... Specification of columns to unnest. -#' +#' @param ... Specification of columns to unnest. +#' #' @importFrom emo ji #' @export -unnest_verbose <- function(df, ...){ - +unnest_verbose <- function(df, ...) { if (!is.data.frame(df)) { stop(paste(emo::ji("bomb"), "Dataset is not a dataframe!")) } - + var_expr <- enquos(..., .named = TRUE) - + message(paste(emo::ji("hammer_and_wrench"), "Start unnesting...")) start.time <- Sys.time() output <- suppressWarnings( - df %>% unnest_legacy(!!!var_expr)) + df %>% unnest_legacy(!!!var_expr) + ) end.time <- Sys.time() - time.taken <- difftime(end.time, start.time, units = "secs") %>% round(., 3) - + time.taken <- difftime(end.time, start.time, units = "secs") %>% round(., 3) + message(paste(emo::ji("white_check_mark"), "Finish unnesting!")) - - if(time.taken > 60){ - time.taken <- round(time.taken/60, 2) + + if (time.taken > 60) { + time.taken <- round(time.taken / 60, 2) message(paste(emo::ji("hourglass"), "Unnesting time:", time.taken, "mins")) - }else{ + } else { message(paste(emo::ji("hourglass"), "Unnesting time:", time.taken, "secs")) } message("\n") - + return(output) } -#' Unnest within a nested dataframe -#' +#' Unnest within a nested dataframe +#' #' Flatten list-column to regular columns inside a nested dataframe -#' +#' #' @param df A dataframe -#' @param ... Specification of columns to unnest -#' +#' @param ... Specification of columns to unnest +#' #' @importFrom emo ji #' @importFrom dplyr progress_estimated #' @importFrom purrr map -#' +#' #' @export -unnest_nested <- function(df, ...){ - if(!is.list(df[ , grepl("^data$", names(df))])){ +unnest_nested <- function(df, ...) { + if (!is.list(df[, grepl("^data$", names(df))])) { stop(paste(emo::ji("bomb"), "Error: Dataset is not nested!")) } - + var_expr <- enquos(..., .named = TRUE) - colname_nested_data <- names(df[ , grepl("^data$", names(df))]) - - - unnest_with_progress <- function(data){ + colname_nested_data <- names(df[, grepl("^data$", names(df))]) + + + unnest_with_progress <- function(data) { pb$tick()$print() suppressWarnings( data %>% unnest_legacy(!!!var_expr) ) } - #create the progress bar + # create the progress bar pb <- dplyr::progress_estimated(nrow(df)) - + message(paste(emo::ji("hammer_and_wrench"), "Start unnesting...")) start.time <- Sys.time() output <- df %>% - dplyr::mutate({{colname_nested_data}} := purrr::map(df[[colname_nested_data]], ~unnest_with_progress(.))) + dplyr::mutate({{ colname_nested_data }} := purrr::map(df[[colname_nested_data]], ~ unnest_with_progress(.))) end.time <- Sys.time() - time.taken <- difftime(end.time, start.time, units = "secs") %>% round(., 3) - + time.taken <- difftime(end.time, start.time, units = "secs") %>% round(., 3) + message("\n") message(paste(emo::ji("white_check_mark"), "Finish unnesting!")) - - if(time.taken > 60){ - time.taken <- round(time.taken/60, 2) + + if (time.taken > 60) { + time.taken <- round(time.taken / 60, 2) message(paste(emo::ji("hourglass"), "Unnesting time:", time.taken, "mins")) - }else{ + } else { message(paste(emo::ji("hourglass"), "Unnesting time:", time.taken, "secs")) } message("\n") - + return(output) } -#' Nest within a nested dataframe -#' +#' Nest within a nested dataframe +#' #' Double nesting creates a list-column of nested dataframe -#' @param df A nested dataframe -#' @param ... A selection of columns. -#' +#' @param df A nested dataframe +#' @param ... A selection of columns. +#' #' @importFrom emo ji #' @importFrom dplyr progress_estimated #' @importFrom purrr map #' @export -nest_nested <- function(df, ...){ - - if(!is.list(df[ , grepl("^data$", names(df))])){ +nest_nested <- function(df, ...) { + if (!is.list(df[, grepl("^data$", names(df))])) { stop(paste(emo::ji("bomb"), "Error: Dataset is not nested!")) } - + var_expr <- enquos(..., .named = TRUE) - colname_nested_data <- names(df[ , grepl("^data$", names(df))]) + colname_nested_data <- names(df[, grepl("^data$", names(df))]) - nest_with_progress <- function(data){ + nest_with_progress <- function(data) { pb$tick()$print() suppressWarnings( data %>% nest_legacy(!!!var_expr) - ) + ) } - - #create the progress bar + + # create the progress bar pb <- dplyr::progress_estimated(nrow(df)) - - + + message(paste(emo::ji("hammer_and_wrench"), "Start nesting...")) start.time <- Sys.time() output <- df %>% - dplyr::mutate({{colname_nested_data}} := purrr::map(df[[colname_nested_data]], ~nest_with_progress(.))) + dplyr::mutate({{ colname_nested_data }} := purrr::map(df[[colname_nested_data]], ~ nest_with_progress(.))) end.time <- Sys.time() - time.taken <- difftime(end.time, start.time, units = "secs") %>% round(., 3) - + time.taken <- difftime(end.time, start.time, units = "secs") %>% round(., 3) + message("\n") message(paste(emo::ji("white_check_mark"), "Finish nesting!")) - - if(time.taken > 60){ - time.taken <- round(time.taken/60, 2) + + if (time.taken > 60) { + time.taken <- round(time.taken / 60, 2) message(paste(emo::ji("hourglass"), "Nesting time:", time.taken, "mins")) - }else{ + } else { message(paste(emo::ji("hourglass"), "Nesting time:", time.taken, "secs")) } message("\n") - + return(output) } #' Unnest within a double-nested dataframe -#' +#' #' Unnesting makes each element of the list its own row. #' @param df A nested dataframe -#' @param ... Specification of columns to unnest. -#' +#' @param ... Specification of columns to unnest. +#' #' @importFrom emo ji #' @importFrom dplyr progress_estimated #' @importFrom purrr map -#' -#' @export -unnest_double_nested <- function(df, ...){ - - if(!is.list(df[ , grepl("^data$", names(df))])){ +#' +#' @export +unnest_double_nested <- function(df, ...) { + if (!is.list(df[, grepl("^data$", names(df))])) { stop(paste(emo::ji("bomb"), "Error: Dataset is not nested!")) } - + var_expr <- enquos(..., .named = TRUE) - colname_nested_data <- names(df[ , grepl("^data$", names(df))]) - - unnest_with_progress <- function(data){ + colname_nested_data <- names(df[, grepl("^data$", names(df))]) + + unnest_with_progress <- function(data) { pb$tick()$print() suppressWarnings( data %>% unnest_legacy(!!!var_expr) ) } - #create the progress bar + # create the progress bar pb <- dplyr::progress_estimated(nrow(df)) - + message(paste(emo::ji("hammer_and_wrench"), "Start unnesting...")) start.time <- Sys.time() output <- df %>% - dplyr::mutate({{colname_nested_data}} := purrr::map(df[[colname_nested_data]], ~unnest_with_progress(.))) %>% + dplyr::mutate({{ colname_nested_data }} := purrr::map(df[[colname_nested_data]], ~ unnest_with_progress(.))) %>% unnest_legacy() end.time <- Sys.time() - time.taken <- difftime(end.time, start.time, units = "secs") %>% round(., 3) - + time.taken <- difftime(end.time, start.time, units = "secs") %>% round(., 3) + message("\n") message(paste(emo::ji("white_check_mark"), "Finish unnesting!")) - - if(time.taken > 60){ - time.taken <- round(time.taken/60, 2) + + if (time.taken > 60) { + time.taken <- round(time.taken / 60, 2) message(paste(emo::ji("hourglass"), "Unnesting time:", time.taken, "mins")) - }else{ + } else { message(paste(emo::ji("hourglass"), "Unnesting time:", time.taken, "secs")) } message("\n") - + return(output) } - - - - - - - - - - - - diff --git a/R/remove_top_users.R b/R/remove_top_users.R index b4f7a9d..8e8bcd6 100644 --- a/R/remove_top_users.R +++ b/R/remove_top_users.R @@ -1,63 +1,64 @@ #' Remove top N percent of active users based on the frequency of data points per user -#' -#' Remove top N percent of active users based on the frequency of data points per user. -#' Although the majority of users are real people, some accounts are run by algorithms or 'bots', whereas others can be considered as spam accounts. -#' Removing a certain top N percent of active users is an oft-used approach to remove such accounts and reduce the number of such users in the final dataset. -#' -#' @param df A dataframe with columns for the user id, counts point per user +#' +#' Remove top N percent of active users based on the frequency of data points per user. +#' Although the majority of users are real people, some accounts are run by algorithms or 'bots', whereas others can be considered as spam accounts. +#' Removing a certain top N percent of active users is an oft-used approach to remove such accounts and reduce the number of such users in the final dataset. +#' +#' @param df A dataframe with columns for the user id, counts point per user #' @param user Name of column that holds unique identifier for each user -#' @param counts Name of column that holds the data points frequency for each user +#' @param counts Name of column that holds the data points frequency for each user #' @param topNpct_user A decimal number that represent the certain percentage of users to remove -#' @param rm_topNpct_user Option to remove or keep the top N percent active users -#' +#' @param rm_topNpct_user Option to remove or keep the top N percent active users +#' #' @importFrom rlang has_name #' @importFrom rlang sym #' @importFrom emo ji #' @importFrom dplyr n_distinct #' @importFrom dplyr slice -#' +#' #' @export -remove_top_users <- function(df, user = "u_id", counts = "n_points", topNpct_user = 1, rm_topNpct_user = F){ - +remove_top_users <- function(df, user = "u_id", counts = "n_points", topNpct_user = 1, rm_topNpct_user = F) { if (!rlang::has_name(df, user)) { stop(paste(emo::ji("bomb"), "User column does not exist!")) } if (!rlang::has_name(df, counts)) { stop(paste(emo::ji("bomb"), "The column of counts of data points for each user does not exist!")) } - - user <- rlang::sym(user) + + user <- rlang::sym(user) counts <- rlang::sym(counts) - - n_original_users <- df %>% pull({{user}}) %>% dplyr::n_distinct() + + n_original_users <- df %>% + pull({{ user }}) %>% + dplyr::n_distinct() message(paste(emo::ji("bust_in_silhouette"), "There are", n_original_users, "users at this moment.")) - - if(rm_topNpct_user == F){ + + if (rm_topNpct_user == F) { message(paste(emo::ji("bust_in_silhouette"), "Skip removing active users")) - output <- df %>% arrange(desc({{counts}})) - }else{ - message(paste0(emo::ji("hammer_and_wrench"), " Start removing top ", topNpct_user, "% top active users...")) + output <- df %>% arrange(desc({{ counts }})) + } else { + message(paste0(emo::ji("hammer_and_wrench"), " Start removing top ", topNpct_user, "% top active users...")) start.time <- Sys.time() - output <- df %>% - arrange(desc({{counts}})) %>% - dplyr::slice(round(n_original_users*(topNpct_user/100)):n()) + output <- df %>% + arrange(desc({{ counts }})) %>% + dplyr::slice(round(n_original_users * (topNpct_user / 100)):n()) end.time <- Sys.time() - time.taken <- difftime(end.time, start.time, units = "secs") %>% round(., 3) - - n_new_users <- output %>% pull({{user}}) %>% n_distinct() + time.taken <- difftime(end.time, start.time, units = "secs") %>% round(., 3) + + n_new_users <- output %>% + pull({{ user }}) %>% + n_distinct() n_removed_users <- n_original_users - n_new_users message(paste(emo::ji("white_check_mark"), "Finish removing! Removed", n_removed_users, "active users!")) message(paste(emo::ji("bust_in_silhouette"), "There are", n_new_users, "users left.")) - - if(time.taken > 60){ - time.taken <- round(time.taken/60, 2) + + if (time.taken > 60) { + time.taken <- round(time.taken / 60, 2) message(paste(emo::ji("hourglass"), "Removing time:", time.taken, "mins")) - } else{ + } else { message(paste(emo::ji("hourglass"), "Removing time:", time.taken, "secs")) } message("\n") } return(output) } - - diff --git a/R/score.R b/R/score.R index ddf3a27..e5bbd06 100644 --- a/R/score.R +++ b/R/score.R @@ -1,152 +1,150 @@ #' Give a weighted value for one or more variables in a nested dataframe -#' +#' #' Give a weighted value for one or more variables in a nested dataframe #' @param df A nested dataframe by user #' @param user Name of column that holds unique identifier for each user #' @param location Name of column that holds unique identifier for each location -#' @param keep_ori_vars Option to keep or drop original varialbes +#' @param keep_ori_vars Option to keep or drop original varialbes #' @param ... Name-value pairs of expression -#' +#' #' @importFrom rlang has_name -#' @importFrom rlang sym +#' @importFrom rlang sym #' @importFrom emo ji -#' @importFrom dplyr select +#' @importFrom dplyr select #' @importFrom dplyr setdiff #' @importFrom dplyr progress_estimated #' @importFrom purrr map -#' +#' #' @export -score_nested <- function(df, user = "u_id", location = "loc_id", keep_original_vars = F, ...){ +score_nested <- function(df, user = "u_id", location = "loc_id", keep_original_vars = F, ...) { if (!rlang::has_name(df, user)) { stop(paste(emo::ji("bomb"), "User column does not exist!")) } - + if (!rlang::has_name(df, location)) { stop(paste(emo::ji("bomb"), "Location column does not exist!")) } - + var_expr <- enquos(..., .named = TRUE) - user <- rlang::sym(user) + user <- rlang::sym(user) location <- rlang::sym(location) - + df_nest <- df %>% - nest_legacy(-({{user}})) - - colname_nested_data <- names(df_nest[ , grepl("^data$", names(df_nest))]) - - transmute_with_progress <- function(data){ + nest_legacy(-({{ user }})) + + colname_nested_data <- names(df_nest[, grepl("^data$", names(df_nest))]) + + transmute_with_progress <- function(data) { pb$tick()$print() - transmute_column <- data %>% + transmute_column <- data %>% transmute(!!!var_expr) - - data %>% - dplyr::select({{location}}) %>% + + data %>% + dplyr::select({{ location }}) %>% bind_cols(transmute_column) } - - add_with_progress <- function(data){ + + add_with_progress <- function(data) { pb$tick()$print() - data %>% + data %>% dplyr::mutate(!!!var_expr) } - - #create the progress bar + + # create the progress bar pb <- dplyr::progress_estimated(nrow(df)) - - + + message(paste(emo::ji("hammer_and_wrench"), "Start scoring ...")) start.time <- Sys.time() - if(keep_original_vars){ - output <- df_nest %>% - dplyr::mutate({{colname_nested_data}} := purrr::map(df_nest[[colname_nested_data]], ~add_with_progress(.))) - }else{ - output <- df_nest %>% - dplyr::mutate({{colname_nested_data}} := purrr::map(df_nest[[colname_nested_data]], ~transmute_with_progress(.))) + if (keep_original_vars) { + output <- df_nest %>% + dplyr::mutate({{ colname_nested_data }} := purrr::map(df_nest[[colname_nested_data]], ~ add_with_progress(.))) + } else { + output <- df_nest %>% + dplyr::mutate({{ colname_nested_data }} := purrr::map(df_nest[[colname_nested_data]], ~ transmute_with_progress(.))) } end.time <- Sys.time() - time.taken <- difftime(end.time, start.time, units = "secs") %>% round(., 3) - + time.taken <- difftime(end.time, start.time, units = "secs") %>% round(., 3) + colnames_original <- df_nest[[colname_nested_data]][[1]] %>% names() colnames_new <- output[[colname_nested_data]][[1]] %>% names() colnames_added <- dplyr::setdiff(colnames_new, colnames_original) message("\n") message(paste(emo::ji("white_check_mark"), "Finish scoring! There are", length(colnames_added), "new added variables:", paste(colnames_added, collapse = ", "))) - - if(time.taken > 60){ - time.taken <- round(time.taken/60, 2) + + if (time.taken > 60) { + time.taken <- round(time.taken / 60, 2) message(paste(emo::ji("hourglass"), "Scoring time:", time.taken, "mins")) - } else{ + } else { message(paste(emo::ji("hourglass"), "Scoring time:", time.taken, "secs")) } message("\n") - + return(output) } #' Summarises all scored columns and return one single summary score per row -#' +#' #' summarises all scored columns and return one single summary score per row -#' @param df A dataframe +#' @param df A dataframe #' @param user Name of column that holds unique identifier for each user #' @param location Name of column that holds unique identifier for each location #' @param ... A selection of columns to sum -#' -#' +#' +#' #' @importFrom rlang has_name -#' @importFrom rlang sym +#' @importFrom rlang sym #' @importFrom emo ji -#' @importFrom dplyr select +#' @importFrom dplyr select #' @importFrom dplyr setdiff #' @importFrom dplyr progress_estimated #' @importFrom purrr map -#' @export -score_summary <- function(df, user = "u_id", location = "loc_id", ...){ +#' @export +score_summary <- function(df, user = "u_id", location = "loc_id", ...) { if (!rlang::has_name(df, user)) { stop(paste(emo::ji("bomb"), "User column does not exist!")) } - - user <- rlang::sym(user) + + user <- rlang::sym(user) location <- rlang::sym(location) var_expr <- enquos(...) - - colname_nested_data <- names(df[,grepl("^data$", names(df))]) - - sum_score_with_progress <- function(data){ + + colname_nested_data <- names(df[, grepl("^data$", names(df))]) + + sum_score_with_progress <- function(data) { pb$tick()$print() - data_sub <- data %>% dplyr::select({{location}}, !!!var_expr) - + data_sub <- data %>% dplyr::select({{ location }}, !!!var_expr) + location_index <- which(colnames(data_sub) == location) - data_sub %>% - dplyr::mutate(score = rowSums(.[ , -c(location_index)])) + data_sub %>% + dplyr::mutate(score = rowSums(.[, -c(location_index)])) } - + # create the progress bar pb <- dplyr::progress_estimated(nrow(df)) - - + + message(paste(emo::ji("hammer_and_wrench"), "Start summing scores...")) start.time <- Sys.time() - output <- df %>% - dplyr::mutate({{colname_nested_data}} := purrr::map(df[[colname_nested_data]], ~sum_score_with_progress(.))) + output <- df %>% + dplyr::mutate({{ colname_nested_data }} := purrr::map(df[[colname_nested_data]], ~ sum_score_with_progress(.))) end.time <- Sys.time() - time.taken <- difftime(end.time, start.time, units = "secs") %>% round(., 3) - + time.taken <- difftime(end.time, start.time, units = "secs") %>% round(., 3) + colnames_original <- df[[colname_nested_data]][[1]] %>% names() colnames_new <- output[[colname_nested_data]][[1]] %>% names() colnames_added <- dplyr::setdiff(colnames_new, colnames_original) message("\n") message(paste(emo::ji("white_check_mark"), "Finish summing! There are", length(colnames_added), "new added variables:", paste(colnames_added, collapse = ", "))) - - if(time.taken > 60){ - time.taken <- round(time.taken/60, 2) + + if (time.taken > 60) { + time.taken <- round(time.taken / 60, 2) message(paste(emo::ji("hourglass"), "Summing time:", time.taken, "mins")) - } else{ + } else { message(paste(emo::ji("hourglass"), "Summing time:", time.taken, "secs")) } message("\n") - + return(output) } - - diff --git a/R/spread.R b/R/spread.R index c942818..6217862 100644 --- a/R/spread.R +++ b/R/spread.R @@ -1,64 +1,53 @@ #' Spread a key-value pair across multiple columns -#' -#' Spread a key-value pair across multiple columns in nested dataframe -#' @param df A nested dataframe +#' +#' Spread a key-value pair across multiple columns in nested dataframe +#' @param df A nested dataframe #' @param var_key Column name or position #' @param var_value Column name or position -#' -#' +#' +#' #' @importFrom rlang sym #' @importFrom dplyr progress_estimated #' @importFrom dplyr setdiff #' @importFrom purrr map #' @importFrom emo ji #' @export -spread_nested <- function(df, key_var, value_var){ - - key_var_expr <- rlang::sym(key_var) - value_var_expr <- rlang::sym(value_var) - colname_nested_data <- names(df[ , grepl("^data$", names(df))]) +spread_nested <- function(df, key_var, value_var) { + key_var_expr <- rlang::sym(key_var) + value_var_expr <- rlang::sym(value_var) + colname_nested_data <- names(df[, grepl("^data$", names(df))]) - spread_with_progress <- function(data){ + spread_with_progress <- function(data) { pb$tick()$print() data %>% - spread(key = {{key_var_expr}}, value = {{value_var_expr}}) %>% + spread(key = {{ key_var_expr }}, value = {{ value_var_expr }}) %>% replace(., is.na(.), 0) } - #create the progress bar + # create the progress bar pb <- dplyr::progress_estimated(nrow(df)) - + message(paste(emo::ji("hammer_and_wrench"), "Start spreading", key_var, "variable...")) start.time <- Sys.time() output <- df %>% - dplyr::mutate({{colname_nested_data}} := purrr::map(df[[colname_nested_data]], ~spread_with_progress(.))) + dplyr::mutate({{ colname_nested_data }} := purrr::map(df[[colname_nested_data]], ~ spread_with_progress(.))) end.time <- Sys.time() - time.taken <- difftime(end.time, start.time, units = "secs") %>% round(., 3) - + time.taken <- difftime(end.time, start.time, units = "secs") %>% round(., 3) + colnames_original <- names(df[[colname_nested_data]][[1]]) colnames_new <- names(output[[colname_nested_data]][[1]]) - colnames_added <- dplyr::setdiff(colnames_new, colnames_original) - + colnames_added <- dplyr::setdiff(colnames_new, colnames_original) + message("\n") message(paste(emo::ji("white_check_mark"), "Finish spreading! There are", length(colnames_added), "new added variables:", paste(colnames_added, collapse = ", "))) - - if(time.taken > 60){ - time.taken <- round(time.taken/60, 2) + + if (time.taken > 60) { + time.taken <- round(time.taken / 60, 2) message(paste(emo::ji("hourglass"), "Spreading time:", time.taken, "mins")) - } else{ + } else { message(paste(emo::ji("hourglass"), "Spreading time:", time.taken, "secs")) } message("\n") - + return(output) } - - - - - - - - - - diff --git a/R/summarise.R b/R/summarise.R index 647b249..2f93d45 100644 --- a/R/summarise.R +++ b/R/summarise.R @@ -1,24 +1,24 @@ #' Aggregate multiple values to a single value within a nested tibble -#' +#' #' Aggregate multiple values to a single value of existing table by name-value paired summary function within a nested tibble -#' @param df A nested dataframe +#' @param df A nested dataframe #' @param ... Name-value pairs of summary functions. -#' +#' #' @importFrom emo ji #' @importFrom dplyr progress_estimated #' @importFrom dplyr setdiff #' @importFrom purrr map -#' +#' #' @export -summarise_nested <- function(df, ...){ - if(!is.list(df[ , grepl("^data$", names(df))])){ +summarise_nested <- function(df, ...) { + if (!is.list(df[, grepl("^data$", names(df))])) { stop(paste(emo::ji("bomb"), "Dataset is not nested!")) } var_expr <- enquos(..., .named = TRUE) - colname_nested_data <- names(df[ , grepl("^data$", names(df))]) - - summarise_with_progress <- function(data){ + colname_nested_data <- names(df[, grepl("^data$", names(df))]) + + summarise_with_progress <- function(data) { pb$tick()$print() data %>% dplyr::summarise(!!!var_expr) } @@ -28,88 +28,84 @@ summarise_nested <- function(df, ...){ message(paste(emo::ji("hammer_and_wrench"), "Start summarising values...")) start.time <- Sys.time() output <- df %>% - dplyr::mutate(adds = purrr::map(df[[colname_nested_data]], ~summarise_with_progress(.))) %>% + dplyr::mutate(adds = purrr::map(df[[colname_nested_data]], ~ summarise_with_progress(.))) %>% unnest_legacy(adds) end.time <- Sys.time() - time.taken <- difftime(end.time, start.time, units = "secs") %>% round(., 3) - + time.taken <- difftime(end.time, start.time, units = "secs") %>% round(., 3) + colnames_original <- names(df) colnames_new <- names(output) - colnames_added <- dplyr::setdiff(colnames_new, colnames_original) - + colnames_added <- dplyr::setdiff(colnames_new, colnames_original) + message("\n") message(paste(emo::ji("white_check_mark"), "Finish summarising! There are", length(colnames_added), "new added variables:", paste(colnames_added, collapse = ", "))) - - if(time.taken > 60){ - time.taken <- round(time.taken/60, 2) + + if (time.taken > 60) { + time.taken <- round(time.taken / 60, 2) message(paste(emo::ji("hourglass"), "Summarising time:", time.taken, "mins")) - } else{ + } else { message(paste(emo::ji("hourglass"), "Summarising time:", time.taken, "secs")) } message("\n") - + return(output) } #' Aggregate multiple values to a single value in a double-nested tibble -#' +#' #' Create a list-column in existing list-column and aggregate multiple values in created list-column to a single value by name-value paired summary function -#' @param df A nested dataframe +#' @param df A nested dataframe #' @param nest_cols A selection of columns to nest in existing list-column #' @param ... Name-value pairs of summary functions -#' +#' #' @importFrom emo ji #' @importFrom dplyr setdiff #' @importFrom purrr map -#' -#' @export -summarise_double_nested <- function(df, nest_cols, ...){ - - if(nrow(df) == 0){ +#' +#' @export +summarise_double_nested <- function(df, nest_cols, ...) { + if (nrow(df) == 0) { stop(paste(emo::ji("bomb"), "No user left, tune your threshold and try again.")) } - + stopifnot( - is.list(df[ , grepl("^data$", names(df))]) + is.list(df[, grepl("^data$", names(df))]) ) - + var_expr <- enquos(..., .named = TRUE) - colname_nested_data <- names(df[ , grepl("^data$", names(df))]) - + colname_nested_data <- names(df[, grepl("^data$", names(df))]) + cal_column <- . %>% dplyr::summarise(!!!var_expr) - - add_column <- . %>% - dplyr::mutate(adds = purrr::map(data, cal_column)) %>% - unnest_legacy(adds) - - # double nest - df[[colname_nested_data]] <- purrr::map(df[[colname_nested_data]], ~.x %>% nest(data = nest_cols)) - + + add_column <- . %>% + dplyr::mutate(adds = purrr::map(data, cal_column)) %>% + unnest_legacy(adds) + + # double nest + df[[colname_nested_data]] <- purrr::map(df[[colname_nested_data]], ~ .x %>% nest(data = nest_cols)) + message(paste(emo::ji("hammer_and_wrench"), "Start summarising values...")) start.time <- Sys.time() - output <- df %>% - dplyr::mutate({{colname_nested_data}} := purrr::map(df[[colname_nested_data]], add_column)) + output <- df %>% + dplyr::mutate({{ colname_nested_data }} := purrr::map(df[[colname_nested_data]], add_column)) end.time <- Sys.time() - time.taken <- difftime(end.time, start.time, units = "secs") %>% round(., 3) - + time.taken <- difftime(end.time, start.time, units = "secs") %>% round(., 3) + colnames_original <- df[[colname_nested_data]][[1]] %>% names() colnames_new <- output[[colname_nested_data]][[1]] %>% names() colnames_new <- colnames_new[-which(colnames_new == "data")] colnames_added <- dplyr::setdiff(colnames_new, colnames_original) - + message(paste(emo::ji("white_check_mark"), "Finish summarising! There are", length(colnames_added), "new added variables:", paste(colnames_added, collapse = ", "))) - - if(time.taken > 60){ - time.taken <- round(time.taken/60, 2) + + if (time.taken > 60) { + time.taken <- round(time.taken / 60, 2) message(paste(emo::ji("hourglass"), "Summarising time:", time.taken, "mins")) - } else{ + } else { message(paste(emo::ji("hourglass"), "Summarising time:", time.taken, "secs")) } message("\n") - + return(output) } - - - diff --git a/R/topN.R b/R/topN.R index 28c26f3..f066582 100644 --- a/R/topN.R +++ b/R/topN.R @@ -1,51 +1,44 @@ -#' Select top n rows by certain value -#' +#' Select top n rows by certain value +#' #' Select top n rows in each group, ordered by wt within a nested dataframe -#' @param df A nested dataframe -#' @param n Number of rows to return -#' @param wt The variable to use for ordering -#' +#' @param df A nested dataframe +#' @param n Number of rows to return +#' @param wt The variable to use for ordering +#' #' @importFrom dplyr progress_estimated #' @importFrom rlang sym #' @importFrom emo ji #' @export -top_n_nested <- function(df, n = 2, wt){ +top_n_nested <- function(df, n = 2, wt) { wt_expr <- rlang::sym(wt) - colname_nested_data <- names(df[ , grepl("^data$", names(df))]) - - top_n_with_progress <- function(data){ + colname_nested_data <- names(df[, grepl("^data$", names(df))]) + + top_n_with_progress <- function(data) { pb$tick()$print() - data %>% - top_n(n = n, wt = {{wt_expr}}) + data %>% + top_n(n = n, wt = {{ wt_expr }}) } - - #create the progress bar + + # create the progress bar pb <- dplyr::progress_estimated(nrow(df)) - - message(paste(emo::ji("hammer_and_wrench"), "Start selecting top", n, "row(s)...")) + + message(paste(emo::ji("hammer_and_wrench"), "Start selecting top", n, "row(s)...")) start.time <- Sys.time() output <- df %>% - dplyr::mutate({{colname_nested_data}} := purrr::map(df[[colname_nested_data]], ~top_n_with_progress(.))) + dplyr::mutate({{ colname_nested_data }} := purrr::map(df[[colname_nested_data]], ~ top_n_with_progress(.))) end.time <- Sys.time() - time.taken <- difftime(end.time, start.time, units = "secs") %>% round(., 3) - + time.taken <- difftime(end.time, start.time, units = "secs") %>% round(., 3) + message("\n") message(paste(emo::ji("white_check_mark"), "Finish selecting top", n, "row(s)!")) - - if(time.taken > 60){ - time.taken <- round(time.taken/60, 2) + + if (time.taken > 60) { + time.taken <- round(time.taken / 60, 2) message(paste(emo::ji("hourglass"), "Selecting time:", time.taken, "mins")) - } else{ + } else { message(paste(emo::ji("hourglass"), "Selecting time:", time.taken, "secs")) } message("\n") - + return(output) } - - - - - - - diff --git a/R/validate_dataset.R b/R/validate_dataset.R index 7865aaa..577174a 100644 --- a/R/validate_dataset.R +++ b/R/validate_dataset.R @@ -1,23 +1,23 @@ -#' Validate input dataset -#' -#' To make sure the input dataset contains all three necessary variables: +#' Validate input dataset +#' +#' To make sure the input dataset contains all three necessary variables: #' a unique identifier for the person or user -#' a unique identifier for the spatial locaiton for the data point -#' and a timestamp that relects the time the data point was created -#' +#' a unique identifier for the spatial locaiton for the data point +#' and a timestamp that relects the time the data point was created +#' #' @param df A dataframe with columns for the user id, location, timestamp #' @param user Name of column that holds unique identifier for each user #' @param timestamp Name of column that holds specific timestamp for each data point and it should be POSIXct #' @param location Name of column that holds unique identifier for each location #' @param keep_other_vars Option to keep or remove other variables of the input dataset -#' +#' #' @importFrom emo ji #' @importFrom rlang sym #' @importFrom rlang has_name #' @importFrom dplyr select -#' +#' #' @export -validate_dataset <- function(df, user = "u_id", timestamp = "created_at", location = "loc_id", keep_other_vars = F){ +validate_dataset <- function(df, user = "u_id", timestamp = "created_at", location = "loc_id", keep_other_vars = F) { if (!rlang::has_name(df, user)) { stop(paste(emo::ji("bomb"), "User column does not exist!")) } @@ -36,26 +36,22 @@ validate_dataset <- function(df, user = "u_id", timestamp = "created_at", locati stop(paste(emo::ji("bomb"), "Dataset is not a dataframe!")) } - if (!is(df %>% pull({{timestamp}}), "POSIXct")) { + if (!is(df %>% pull({{ timestamp }}), "POSIXct")) { stop("Timestamp is not of class POSIXct") } - unique_users <- df %>% pull({{user}}) %>% n_distinct() + unique_users <- df %>% + pull({{ user }}) %>% + n_distinct() message(paste(emo::ji("tada"), "Congratulations!! Your dataset has passed validation.")) message(paste(emo::ji("bust_in_silhouette"), "There are", unique_users, "unique users in your dataset.")) message(paste(emo::ji("earth_asia"), "Now start your journey identifying their meaningful location(s)!")) message(paste(emo::ji("clap"), "Good luck!")) message("\n") - if(keep_other_vars) { + if (keep_other_vars) { df } else { - df %>% dplyr::select(c({{user}}, {{location}}, {{timestamp}})) + df %>% dplyr::select(c({{ user }}, {{ location }}, {{ timestamp }})) } } - - - - - - diff --git a/README.Rmd b/README.Rmd index 872e9f6..57822a9 100644 --- a/README.Rmd +++ b/README.Rmd @@ -52,7 +52,7 @@ library(here) ```{r validate} -# load test sample dataset +# load test sample dataset data("test_sample", package = "homelocator") df_validated <- validate_dataset(test_sample, user = "u_id", timestamp = "created_at", location = "grid_id") head(df_validated) @@ -103,15 +103,19 @@ identify_location(test_sample, user = "u_id", timestamp = "created_at", location #### FREQ ```{r eval=F} # recipe: Frequency -- FREQ -identify_location(test_sample, user = "u_id", timestamp = "created_at", location = "grid_id", - show_n_loc = 1, recipe = "FREQ") +identify_location(test_sample, + user = "u_id", timestamp = "created_at", location = "grid_id", + show_n_loc = 1, recipe = "FREQ" +) ``` #### OSNA ```{r eval=F} # recipe: Online Social Network Activity -- OSNA -identify_location(test_sample, user = "u_id", timestamp = "created_at", location = "grid_id", - show_n_loc = 1, recipe = "OSNA") +identify_location(test_sample, + user = "u_id", timestamp = "created_at", location = "grid_id", + show_n_loc = 1, recipe = "OSNA" +) ``` #### APDM @@ -120,10 +124,12 @@ identify_location(test_sample, user = "u_id", timestamp = "created_at", location ## APDM recipe strictly returns the most likely home location ## It is important to create your location neighbors table before you use the recipe!! ## example: st_queen <- function(a, b = a) st_relate(a, b, pattern = "F***T****") -## neighbors <- st_queen(df_sf) ===> convert result to dataframe +## neighbors <- st_queen(df_sf) ===> convert result to dataframe data("df_neighbors", package = "homelocator") -identify_location(test_sample, user = "u_id", timestamp = "created_at", location = "grid_id", - show_n_loc = 1, recipe = "APDM") +identify_location(test_sample, + user = "u_id", timestamp = "created_at", location = "grid_id", + show_n_loc = 1, recipe = "APDM" +) ``` diff --git a/data-raw/df_neighbors.R b/data-raw/df_neighbors.R index a4f29cb..47b6332 100644 --- a/data-raw/df_neighbors.R +++ b/data-raw/df_neighbors.R @@ -2,20 +2,20 @@ library(tidyverse) library(sf) library(here) -#generate grid neighbors +# generate grid neighbors grids <- st_read(here("data-raw/MP14_SUBZONE_NO_SEA_PL.shp"), quiet = T) %>% - st_transform(crs = 3414) %>% - st_make_valid() %>% - st_make_grid(., cellsize = 750, square = F) %>% - st_sf() %>% + st_transform(crs = 3414) %>% + st_make_valid() %>% + st_make_grid(., cellsize = 750, square = F) %>% + st_sf() %>% rowid_to_column("grid_id") st_queen <- function(a, b = a) st_relate(a, b, pattern = "F***T****") neighbors <- st_queen(grids) -#convert list to tibble -list_to_tibble <- function(index, neighbors){ - tibble(grid_id = as.character(index)) %>% +# convert list to tibble +list_to_tibble <- function(index, neighbors) { + tibble(grid_id = as.character(index)) %>% mutate(neighbor = list(neighbors[[index]])) } df_neighbors <- do.call(rbind, map(1:length(neighbors), function(x) list_to_tibble(x, neighbors))) @@ -24,29 +24,21 @@ df_neighbors <- do.call(rbind, map(1:length(neighbors), function(x) list_to_tibb save(df_neighbors, file = "data/df_neighbors.rda") -# generate sample file +# generate sample file df <- readRDS("~/GitProjects/identifyingmeaningfullocations/analysis/data/derived_data/sg_tweets_anonymized_20200929.rds") -df_nested <- df %>% - dplyr::select(u_id_anonymized, created_at_sg_anonymized, grid_id) %>% - dplyr::rename(u_id = u_id_anonymized, created_at = created_at_sg_anonymized) %>% +df_nested <- df %>% + dplyr::select(u_id_anonymized, created_at_sg_anonymized, grid_id) %>% + dplyr::rename(u_id = u_id_anonymized, created_at = created_at_sg_anonymized) %>% nest(data = c(created_at, grid_id)) set.seed(1234) -test_sample <- df_nested %>% - sample_n(size = 100) %>% - unnest(cols = data) %>% +test_sample <- df_nested %>% + sample_n(size = 100) %>% + unnest(cols = data) %>% sample_n(size = nrow(.)) save(test_sample, file = here("data/test_sample.rda")) load("data/test_sample.rda") - - - - - - - - diff --git a/vignettes/homelocator.Rmd b/vignettes/homelocator.Rmd index 34add21..1e087a9 100644 --- a/vignettes/homelocator.Rmd +++ b/vignettes/homelocator.Rmd @@ -45,7 +45,7 @@ To explore the basic data manipulation functions of `homelocator`, we'll use a ` # load data data("test_sample", package = "homelocator") -# show test sample +# show test sample test_sample %>% head(3) ``` @@ -68,7 +68,7 @@ The `validate_dataset()` function makes sure your input dataset contains all thr ```{r} df_validated <- validate_dataset(test_sample, user = "u_id", timestamp = "created_at", location = "grid_id", keep_other_vars = FALSE) -# show result +# show result df_validated %>% head(3) ``` @@ -85,16 +85,16 @@ The `nest_verbose()` and `unnest_verbose()` functions work in the same way as [ # nest data df_nested <- nest_verbose(df_validated, c("created_at", "grid_id")) -# show result +# show result df_nested %>% head(3) -# show result +# show result df_nested$data[[1]] %>% head(3) # unnest data df_unnested <- unnest_verbose(df_nested, data) -# show result +# show result df_unnested %>% head(3) ``` @@ -111,20 +111,20 @@ The `nest_double_nest()` and `unnest_double_nested()` functions work in a simila # double nest data (e.g., nesting column: created_at) df_double_nested <- nest_nested(df_nested %>% head(100), c("created_at")) -# show result +# show result df_double_nested %>% head(3) -# show result +# show result df_double_nested$data[[1]] %>% head(3) df_double_nested$data[[1]]$data[[1]] %>% head(3) -# unnest nested data -df_double_unnested <- df_double_nested %>% - head(100) %>% ## take first 100 rows for example - unnest_double_nested(., data) +# unnest nested data +df_double_unnested <- df_double_nested %>% + head(100) %>% ## take first 100 rows for example + unnest_double_nested(., data) -# show result +# show result df_double_unnested %>% head(3) ``` @@ -138,14 +138,18 @@ The `enrich_timestamp()` function creates additional variables that are derived ```{r} -#original variables -df_nested[1, ] %>% unnest(cols = c(data)) %>% head(3) +# original variables +df_nested[1, ] %>% + unnest(cols = c(data)) %>% + head(3) -# create new variables from "created_at" timestamp column +# create new variables from "created_at" timestamp column df_enriched <- enrich_timestamp(df_nested, timestamp = "created_at") -# show result -df_enriched[1, ] %>% unnest(cols = c(data)) %>% head(3) +# show result +df_enriched[1, ] %>% + unnest(cols = c(data)) %>% + head(3) ``` ### Summarize in nested and double nested dataframe @@ -157,23 +161,25 @@ The `summarise_nested()` function works similar to dplyr's regular `summarise` - `...`: name-value pairs of summary functions ```{r} -# summarize in nested dataframe +# summarize in nested dataframe # e.g., summarise total number of tweets and total number of places per user -df_summarize_nested <- summarise_nested(df_enriched, - n_tweets = n(), - n_locs = n_distinct(grid_id)) +df_summarize_nested <- summarise_nested(df_enriched, + n_tweets = n(), + n_locs = n_distinct(grid_id) +) -# show result +# show result df_summarize_nested %>% head(3) -# summarize in double nested dataframe +# summarize in double nested dataframe # take first 100 users for example -# e.g summarise total number of tweets and totla number of distinct days -df_summarize_double_nested <- summarise_double_nested(df_enriched %>% head(100), - nest_cols = c("created_at", "ymd", "year", "month", "day", "wday", "hour"), - n_tweets = n(), n_days = n_distinct(ymd)) +# e.g summarise total number of tweets and totla number of distinct days +df_summarize_double_nested <- summarise_double_nested(df_enriched %>% head(100), + nest_cols = c("created_at", "ymd", "year", "month", "day", "wday", "hour"), + n_tweets = n(), n_days = n_distinct(ymd) +) -# show result +# show result df_summarize_double_nested[1, ] df_summarize_double_nested[1, ]$data[[1]] %>% head(3) ``` @@ -189,10 +195,12 @@ The `remove_top_users()` function allows to remove top `N` percent of active use ```{r} # remove top 1% active users (e.g based on the frequency of tweets sent by users) -df_removed_active_users <- remove_top_users(df_summarize_nested, user = "u_id", - counts = "n_tweets", topNpct_user = 1) +df_removed_active_users <- remove_top_users(df_summarize_nested, + user = "u_id", + counts = "n_tweets", topNpct_user = 1 +) -# show result +# show result df_removed_active_users %>% head(3) ``` @@ -206,19 +214,23 @@ The `filter_verbose()` function works in the same way as [filter](https://www.rd - `...`: Logical predicates defined in terms of the variables in df. Only rows match conditions are kept. ```{r} -# filter users with less than 10 tweets sent at less than 10 places -df_filtered <- filter_verbose(df_removed_active_users, user = "u_id", - n_tweets > 10 & n_locs > 10) +# filter users with less than 10 tweets sent at less than 10 places +df_filtered <- filter_verbose(df_removed_active_users, + user = "u_id", + n_tweets > 10 & n_locs > 10 +) -# show result +# show result df_filtered %>% head(3) # filter tweets that sent on weekends and during daytime (8am to 6pm) -df_filter_nested <- filter_nested(df_filtered, user = "u_id", - !wday %in% c(1, 7), # 1 means Sunday and 7 means Saturday - !hour %in% seq(8, 18, 1)) +df_filter_nested <- filter_nested(df_filtered, + user = "u_id", + !wday %in% c(1, 7), # 1 means Sunday and 7 means Saturday + !hour %in% seq(8, 18, 1) +) -# show result +# show result df_filter_nested %>% head(3) df_filter_nested$data[[1]] %>% head(3) @@ -238,31 +250,37 @@ Function `mutate_nested()` works in the similar way as `mutate_verbose()` but it ```{r} -## let's use pre-discussed functions to filter some users first +## let's use pre-discussed functions to filter some users first colnmaes_data <- df_filtered$data[[1]] %>% names() -colnmaes_to_nest <- colnmaes_data[-which(colnmaes_data == "grid_id")] - -df_cleaned <- df_filtered %>% - summarise_double_nested(., nest_cols = colnmaes_to_nest, - n_tweets_loc = n(), # number of tweets sent at each location - n_hrs_loc = n_distinct(hour), # number of unique hours of sent tweets - n_days_loc = n_distinct(ymd), # number of unique days of sent tweets - period_loc = as.numeric(max(created_at) - min(created_at), "days")) %>% # period of tweeting - unnest_verbose(data) %>% - filter_verbose(., user = "u_id", - n_tweets_loc > 10 & n_hrs_loc > 10 & n_days_loc > 10 & period_loc > 10) - -# show cleaned dataset +colnmaes_to_nest <- colnmaes_data[-which(colnmaes_data == "grid_id")] + +df_cleaned <- df_filtered %>% + summarise_double_nested(., + nest_cols = colnmaes_to_nest, + n_tweets_loc = n(), # number of tweets sent at each location + n_hrs_loc = n_distinct(hour), # number of unique hours of sent tweets + n_days_loc = n_distinct(ymd), # number of unique days of sent tweets + period_loc = as.numeric(max(created_at) - min(created_at), "days") + ) %>% # period of tweeting + unnest_verbose(data) %>% + filter_verbose(., + user = "u_id", + n_tweets_loc > 10 & n_hrs_loc > 10 & n_days_loc > 10 & period_loc > 10 + ) + +# show cleaned dataset df_cleaned %>% head(3) # ok, then let's apply the mutate_nested function to add four new variables: wd_or_wk, time_numeric, rest_or_work, wk.am_or_wk.pm -df_expanded <- df_cleaned %>% - mutate_nested(wd_or_wk = if_else(wday %in% c(1,7), "weekend", "weekday"), - time_numeric = lubridate::hour(created_at) + lubridate::minute(created_at) / 60 + lubridate::second(created_at) / 3600, - rest_or_work = if_else(time_numeric >= 9 & time_numeric <= 18, "work", "rest"), - wk.am_or_wk.pm = if_else(time_numeric >= 6 & time_numeric <= 12 & wd_or_wk == "weekend", "weekend_am", "weekend_pm")) - -# show result +df_expanded <- df_cleaned %>% + mutate_nested( + wd_or_wk = if_else(wday %in% c(1, 7), "weekend", "weekday"), + time_numeric = lubridate::hour(created_at) + lubridate::minute(created_at) / 60 + lubridate::second(created_at) / 3600, + rest_or_work = if_else(time_numeric >= 9 & time_numeric <= 18, "work", "rest"), + wk.am_or_wk.pm = if_else(time_numeric >= 6 & time_numeric <= 12 & wd_or_wk == "weekend", "weekend_am", "weekend_pm") + ) + +# show result df_expanded %>% head(3) ``` @@ -277,10 +295,10 @@ The function `prop_factor_nested()` allows you to calculate the proportion of ca df_expanded$data[[1]] %>% head(3) # calculate proportion of categories for four variables: wd_or_wk, rest_or_work, wk.am_or_wk.pm -df_expanded <- df_expanded %>% - prop_factor_nested(wd_or_wk, rest_or_work, wk.am_or_wk.pm) +df_expanded <- df_expanded %>% + prop_factor_nested(wd_or_wk, rest_or_work, wk.am_or_wk.pm) -# show result +# show result df_expanded %>% head(3) ``` @@ -303,41 +321,47 @@ The `score_summary()` function summarises all scored columns and return one sing ```{r} -## let's add two more variables before we do the scoring -df_expanded <- df_expanded %>% - summarise_nested(n_wdays_loc = n_distinct(wday), - n_months_loc = n_distinct(month)) +## let's add two more variables before we do the scoring +df_expanded <- df_expanded %>% + summarise_nested( + n_wdays_loc = n_distinct(wday), + n_months_loc = n_distinct(month) + ) df_expanded %>% head(3) # when calculating scores, you can give weight to different variables, but the total weight should add up to 1 -df_scored <- df_expanded %>% - score_nested(., user = "u_id", location = "grid_id", keep_original_vars = F, - s_n_tweets_loc = 0.1 * (n_tweets_loc/max(n_tweets_loc)), - s_n_hrs_loc = 0.1 * (n_hrs_loc/24), - s_n_days_loc = 0.1 * (n_days_loc/max(n_days_loc)), - s_period_loc = 0.1 * (period_loc/max(period_loc)), - s_n_wdays_loc = 0.1 * (n_wdays_loc/7), - s_n_months_loc = 0.1 * (n_months_loc/12), - s_weekend = 0.1 * (weekend), - s_rest = 0.2 * (rest), - s_wk_am = 0.1 * (weekend_am)) +df_scored <- df_expanded %>% + score_nested(., + user = "u_id", location = "grid_id", keep_original_vars = F, + s_n_tweets_loc = 0.1 * (n_tweets_loc / max(n_tweets_loc)), + s_n_hrs_loc = 0.1 * (n_hrs_loc / 24), + s_n_days_loc = 0.1 * (n_days_loc / max(n_days_loc)), + s_period_loc = 0.1 * (period_loc / max(period_loc)), + s_n_wdays_loc = 0.1 * (n_wdays_loc / 7), + s_n_months_loc = 0.1 * (n_months_loc / 12), + s_weekend = 0.1 * (weekend), + s_rest = 0.2 * (rest), + s_wk_am = 0.1 * (weekend_am) + ) df_scored %>% head(3) df_scored$data[[1]] -### we can replace the score function by mutate function -df_scored_2 <- df_expanded %>% - nest_verbose(-u_id) %>% - mutate_nested(s_n_tweets_loc = 0.1 * (n_tweets_loc/max(n_tweets_loc)), - s_n_hrs_loc = 0.1 * (n_hrs_loc/24), - s_n_days_loc = 0.1 * (n_days_loc/max(n_days_loc)), - s_period_loc = 0.1 * (period_loc/max(period_loc)), - s_n_wdays_loc = 0.1 * (n_wdays_loc/7), - s_n_months_loc = 0.1 * (n_months_loc/12), - s_weekend = 0.1 * (weekend), - s_rest = 0.2 * (rest), - s_wk_am = 0.1 * (weekend_am)) +### we can replace the score function by mutate function +df_scored_2 <- df_expanded %>% + nest_verbose(-u_id) %>% + mutate_nested( + s_n_tweets_loc = 0.1 * (n_tweets_loc / max(n_tweets_loc)), + s_n_hrs_loc = 0.1 * (n_hrs_loc / 24), + s_n_days_loc = 0.1 * (n_days_loc / max(n_days_loc)), + s_period_loc = 0.1 * (period_loc / max(period_loc)), + s_n_wdays_loc = 0.1 * (n_wdays_loc / 7), + s_n_months_loc = 0.1 * (n_months_loc / 12), + s_weekend = 0.1 * (weekend), + s_rest = 0.2 * (rest), + s_wk_am = 0.1 * (weekend_am) + ) df_scored_2 %>% head(3) df_scored_2$data[[1]] @@ -345,8 +369,8 @@ df_scored_2$data[[1]] ```{r} -# sum varialbes score for each location -df_score_summed <- df_scored %>% +# sum varialbes score for each location +df_score_summed <- df_scored %>% score_summary(., user = "u_id", location = "grid_id", starts_with("s_")) df_score_summed %>% head(3) @@ -367,13 +391,13 @@ The `extract_location()` function allows you to sort the locations of each each ```{r} # extract homes for users based on score value (each user return 1 most possible home) -df_home <- df_score_summed %>% +df_home <- df_score_summed %>% extract_location(., user = "u_id", location = "grid_id", show_n_loc = 1, keep_score = F, score) df_home %>% head(3) # extract homes for users and keep scores of locations -df_home <- df_score_summed %>% +df_home <- df_score_summed %>% extract_location(., user = "u_id", location = "grid_id", show_n_loc = 1, keep_score = T, score) df_home %>% head(3) @@ -391,7 +415,7 @@ The `spread_nested()` function works in the same way as [spread](https://www.rdo ```{r} # let's add one timeframe variable first and calculate the number of data points at each timeframe -df_timeframe <- df_enriched %>% +df_timeframe <- df_enriched %>% mutate_nested(timeframe = if_else(hour >= 2 & hour < 8, "Rest", if_else(hour >= 8 & hour < 19, "Active", "Leisure"))) colnames_nested_data <- df_timeframe$data[[1]] %>% names() @@ -399,13 +423,15 @@ colnames_to_nest <- colnames_nested_data[-which(colnames_nested_data %in% c("gri df_timeframe <- df_timeframe %>% head(20) %>% # take first 20 users as example - summarise_double_nested(., nest_cols = colnames_to_nest, - n_points_timeframe = n()) + summarise_double_nested(., + nest_cols = colnames_to_nest, + n_points_timeframe = n() + ) df_timeframe$data[[2]] # spread timeframe in nested dataframe with key is timeframe and value is n_points_timeframe -df_timeframe_spreaded <- df_timeframe %>% - spread_nested(., key_var = "timeframe", value_var = "n_points_timeframe") +df_timeframe_spreaded <- df_timeframe %>% + spread_nested(., key_var = "timeframe", value_var = "n_points_timeframe") df_timeframe_spreaded$data[[2]] ``` @@ -419,9 +445,9 @@ The `arrange_nested()` function works in the same way as [arrange](https://www.r - `...`: comma separated list of unquoted variable names ```{r} -df_enriched$data[[3]] +df_enriched$data[[3]] -df_arranged <- df_enriched %>% +df_arranged <- df_enriched %>% arrange_nested(desc(hour)) # arrange the hour in descending order df_arranged$data[[3]] @@ -435,23 +461,25 @@ The `arrange_double_nested()` function works in the similar way as `arrange_nest ```{r} -# get the name of columns to nest +# get the name of columns to nest colnames_nested_data <- df_enriched$data[[1]] %>% names() colnmaes_to_nest <- colnames_nested_data[-which(colnames_nested_data %in% c("grid_id"))] -df_double_arranged <- df_enriched %>% +df_double_arranged <- df_enriched %>% head(20) %>% # take the first 20 users for example - arrange_double_nested(., nest_cols = colnmaes_to_nest, - desc(created_at)) # sort by time in descending order + arrange_double_nested(., + nest_cols = colnmaes_to_nest, + desc(created_at) + ) # sort by time in descending order # original third user df_enriched[3, ] -# third user data points +# third user data points df_enriched$data[[3]] # arranged third user df_double_arranged[3, ] -# arranged time +# arranged time df_double_arranged$data[[3]]$data[[2]] ``` @@ -463,8 +491,8 @@ The `top_n_nested()` function works in the same way as [top_n](https://dplyr.tid ```{r} df_enriched$data[[2]] -## get the top 1 row based on hour -df_top_1 <- df_enriched %>% +## get the top 1 row based on hour +df_top_1 <- df_enriched %>% top_n_nested(., n = 1, wt = "hour") df_top_1$data[[2]] @@ -494,23 +522,31 @@ Current available recipes: ```{r eval = FALSE} # recipe: homelocator -- HMLC -identify_location(test_sample, user = "u_id", timestamp = "created_at", - location = "grid_id", show_n_loc = 1, recipe = "HMLC") +identify_location(test_sample, + user = "u_id", timestamp = "created_at", + location = "grid_id", show_n_loc = 1, recipe = "HMLC" +) # recipe: Frequency -- FREQ -identify_location(test_sample, user = "u_id", timestamp = "created_at", - location = "grid_id", show_n_loc = 1, recipe = "FREQ") +identify_location(test_sample, + user = "u_id", timestamp = "created_at", + location = "grid_id", show_n_loc = 1, recipe = "FREQ" +) # recipe: Online Social Network Activity -- OSNA -identify_location(test_sample, user = "u_id", timestamp = "created_at", - location = "grid_id", show_n_loc = 1, recipe = "OSNA") +identify_location(test_sample, + user = "u_id", timestamp = "created_at", + location = "grid_id", show_n_loc = 1, recipe = "OSNA" +) # recipe: Online Social Network Activity -- APDM ## APDM recipe strictly returns the most likely home location ## It is important to load the neighbors table before you use the recipe!! ## example: st_queen <- function(a, b = a) st_relate(a, b, pattern = "F***T****") -## neighbors <- st_queen(df_sf) ===> convert result to dataframe +## neighbors <- st_queen(df_sf) ===> convert result to dataframe data("df_neighbors", package = "homelocator") -identify_location(test_sample, user = "u_id", timestamp = "created_at", - location = "grid_id", recipe = "APDM", keep_score = F) +identify_location(test_sample, + user = "u_id", timestamp = "created_at", + location = "grid_id", recipe = "APDM", keep_score = F +) ``` diff --git a/vignettes/quick-start.Rmd b/vignettes/quick-start.Rmd index ccf9455..3f31c3e 100644 --- a/vignettes/quick-start.Rmd +++ b/vignettes/quick-start.Rmd @@ -27,7 +27,7 @@ install_github("spatialnetworkslab/homelocator") ## Load library ```{r setup} -#load homelocator package +# load homelocator package library(homelocator) ``` @@ -98,7 +98,7 @@ Calculates the average and standard deviation of start time data points by a sin ## APDM recipe strictly returns the most likely home location ## It is important to load the neighbors table before you use the recipe!! ## example: st_queen <- function(a, b = a) st_relate(a, b, pattern = "F***T****") -## neighbors <- st_queen(df_sf) ===> convert result to dataframe +## neighbors <- st_queen(df_sf) ===> convert result to dataframe data("df_neighbors", package = "homelocator") identify_location(test_sample, user = "u_id", timestamp = "created_at", location = "grid_id", recipe = "APDM", keep_score = F) ```