Add 'split.data.by.bins.vector' and fix miscellaneous bugs in splitting

Modify 'split.data.time.based' to be able to split by activity-based bins. Rename the function to 'split.data.by.time.or.bins'. Introduce wrapper functions 'split.data.by.bins.vector' and 'split.data.time.based' to call 'split.data.by.time.or.bins'. Add the 'include.duplicate.ids' parameter to 'split.get.bins.activity.based' to obtain bins covering all data elements from 'df' by which the split is being performed, regardless of whether the elements' ids are unique. In 'split.data.activity.based', after calculating the bins to place data elements into, replace the time-based splitting by 'split.data.by.bins.vector'. Time-based splitting is incorrect for the case that the date of the last element in a bin is the same as the date of the first element of the next bin. Adjust calculation of 'offset.end' in 'split.data.activity.based' to fix a bug where, because of a short last window, the end offset would cross the border of the last window, overlapping into the second-to-last. Because of this overlap, the last sliding windows would not be calculated as expected. This works towards #239. Signed-off-by: Maximilian Löffler <[email protected]>
se-sic · Sep 5, 2023 · ece569c · ece569c
1 parent 26d7b7e
commit ece569c
Showing 1 changed file with 107 additions and 27 deletions.
diff --git a/util-split.R b/util-split.R
@@ -22,6 +22,7 @@
 ## Copyright 2021 by Niklas Schneider <[email protected]>
 ## Copyright 2021 by Johannes Hostert <[email protected]>
 ## Copyright 2022 by Jonathan Baumann <[email protected]>
+## Copyright 2023 by Maximilian Löffler <[email protected]>
 ## All Rights Reserved.
 
 
@@ -63,6 +64,52 @@ requireNamespace("lubridate") # for date conversion
 split.data.time.based = function(project.data, time.period = "3 months", bins = NULL,
                                  number.windows = NULL, split.basis = c("commits", "mails", "issues"),
                                  sliding.window = FALSE, project.conf.new = NULL) {
+    split = split.data.by.time.or.bins(project.data, splitting.length = time.period, bins, split.by.time = TRUE,
+                                        number.windows, split.basis, sliding.window, project.conf.new)
+    return(split)
+}
+
+#' Split project data in activity-bin-based ranges as specified
+#'
+#' @param project.data the *Data object from which the data is retrieved
+#' @param activity.amount the amount of data elements with unique ids to be considered in a bin, an integer.
+#' @param bins the date objects defining the start of ranges (the last date defines the end of the last range, in an
+#'             *exclusive* manner), augmented with a bin vector mapping unique ids to bins.
+#'             [default: NULL]
+#' @param split.basis the data name to use as the basis for split bins, either 'commits', 'mails', or 'issues'
+#'                    [default: "commits"]
+#' @param sliding.window logical indicating whether a sliding-window approach was used when obtaining the \code{bins}.
+#'
+#' @return the list of RangeData objects, each referring to one bin
+split.data.by.bins.vector = function(project.data, activity.amount, bins, split.basis = c("commits", "mails", "issues"),
+                                     sliding.window) {
+    split = split.data.by.time.or.bins(project.data, activity.amount, bins, split.by.time = FALSE,
+                                        sliding.window = sliding.window, split.basis = split.basis)
+    return(split)
+}
+
+#' Split project data in time-based or activity-bin-based ranges as specified
+#'
+#' @param project.data the *Data object from which the data is retrieved
+#' @param splitting.length either \code{time.period} from \code{split.data.time.based}
+#'                         or \code{activity.amount} from \code{split.data.by.bins.vector}
+#' @param bins either \code{bins} from \code{split.data.time.based}
+#'             or \code{bins} from \code{split.data.by.bins.vector}
+#' @param split.by.time logical indicating whether splitting is done time-based (TRUE) or activity-bin-based (FALSE)
+#' @param number.windows see \code{number.windows} from \code{split.data.time.based}
+#'                       [default: NULL]
+#' @param split.basis the data name to use as the basis for split bins, either 'commits', 'mails', or 'issues'
+#'                    [default: "commits"]
+#' @param sliding.window logical indicating whether the splitting should be performed using a sliding-window approach
+#'                       [default: FALSE]
+#' @param project.conf.new the new project config to construct the \code{RangeData} objects.
+#'                         If \code{NULL}, a clone of \code{project.data$get.project.conf()} will be used.
+#'                         [default: NULL]
+#'
+#' @return the list of RangeData objects, each referring to one time period or activity bin
+split.data.by.time.or.bins = function(project.data, splitting.length, bins, split.by.time,
+                                      number.windows = NULL, split.basis = c("commits", "mails", "issues"),
+                                      sliding.window = FALSE, project.conf.new = NULL) {
 
     ## get basis for splitting process
     split.basis = match.arg(split.basis)
@@ -99,26 +146,32 @@ split.data.time.based = function(project.data, time.period = "3 months", bins =
         ## remove sliding windows
         sliding.window = FALSE
     }
+
+    ## initiate variable
+    split.by.bins = FALSE
+
     ## if bins are NOT given explicitly
     if (is.null(bins)) {
         ## get bins based on split.basis
-        bins = split.get.bins.time.based(data[[split.basis]][["date"]], time.period, number.windows)$bins
+        bins = split.get.bins.time.based(data[[split.basis]][["date"]], splitting.length, number.windows)$bins
         bins.labels = head(bins, -1)
-        split.by.bins = FALSE
         ## logging
         logging::loginfo("Splitting data '%s' into time ranges of %s based on '%s' data.",
-                         project.data$get.class.name(), time.period, split.basis)
+                         project.data$get.class.name(), splitting.length, split.basis)
     }
-    ## when bins are given explicitly
+    ## when bins are given explicitly, get bins based on parameter
     else {
-        ## remove sliding windows
-        sliding.window = FALSE
-        ## get bins based on parameter
-        split.basis = NULL
-        bins = get.date.from.string(bins)
-        bins = get.date.string(bins)
+        if (split.by.time) {
+            split.basis = NULL
+            split.by.bins = TRUE
+            sliding.window = FALSE
+            bins = get.date.from.string(bins)
+            bins = get.date.string(bins)
+        } else {
+            bins.vector = bins[["vector"]]
+            bins = bins[["bins"]]
+        }
         bins.labels = head(bins, -1)
-        split.by.bins = TRUE
         ## logging
         logging::loginfo("Splitting data '%s' into time ranges [%s].",
                          project.data$get.class.name(), paste(bins, collapse = ", "))
@@ -129,7 +182,7 @@ split.data.time.based = function(project.data, time.period = "3 months", bins =
     bins.ranges = construct.ranges(bins)
     names(bins.ranges) = bins.ranges
 
-    if ((length(bins.ranges) <= 1) && sliding.window) {
+    if (split.by.time && (length(bins.ranges) <= 1) && sliding.window) {
         logging::logwarn("Sliding-window approach does not apply for one range or less.")
         sliding.window = FALSE
     }
@@ -140,13 +193,16 @@ split.data.time.based = function(project.data, time.period = "3 months", bins =
         project.conf.new = project.data$get.project.conf()$clone()
     }
 
-    if (!sliding.window) {
+    if (!sliding.window || !split.by.time) {
         ## split data
         data.split = parallel::mclapply(data.to.split, function(df.name) {
             logging::logdebug("Splitting %s.", df.name)
             ## identify bins for data
             df = data[[df.name]]
-            df.bins = findInterval(df[["date"]], bins.date, all.inside = FALSE)
+            df.bins = if (!split.by.time && (df.name == split.basis))
+                        bins.vector
+                      else
+                        findInterval(df[["date"]], bins.date, all.inside = FALSE)
             ## split data according to df.bins
             df.split = split(df, df.bins)
             ## add proper labels/names
@@ -192,10 +248,10 @@ split.data.time.based = function(project.data, time.period = "3 months", bins =
         ## perform different steps for sliding-window approach
 
         ranges = construct.overlapping.ranges(start = min(bins.date), end = max(bins.date),
-                                              time.period = time.period, overlap = 0.5, raw = FALSE,
+                                              time.period = splitting.length, overlap = 0.5, raw = FALSE,
                                               include.end.date = FALSE) # bins have already been prepared correctly
         bins.info = construct.overlapping.ranges(start = min(bins.date), end = max(bins.date),
-                                                 time.period = time.period, overlap = 0.5, raw = TRUE,
+                                                 time.period = splitting.length, overlap = 0.5, raw = TRUE,
                                                  include.end.date = FALSE) # bins have already been prepared correctly
         bins.date = sort(unname(unique(get.date.from.unix.timestamp(unlist(bins.info)))))
         bins = get.date.string(bins.date)
@@ -214,7 +270,7 @@ split.data.time.based = function(project.data, time.period = "3 months", bins =
 
     ## add splitting information to project configuration
     project.conf.new$set.splitting.info(
-        type = "time-based",
+        type = if (split.by.time) "time-based" else "activity-based",
         length = if (split.by.bins) {
                     bins
                  }
@@ -228,8 +284,8 @@ split.data.time.based = function(project.data, time.period = "3 months", bins =
                             )
                         ))
                     }
-                    else time.period
-                 },
+                    else splitting.length
+                },
         basis = split.basis,
         sliding.window = sliding.window,
         revisions = bins,
@@ -363,14 +419,14 @@ split.data.activity.based = function(project.data, activity.type = c("commits",
     ## get bins based on split.basis
     logging::logdebug("Getting activity-based bins.")
     bins.data = split.get.bins.activity.based(data[[activity.type]], id.column[[activity.type]],
-                                              activity.amount, remove.duplicate.bins = TRUE)
+                                              activity.amount, remove.duplicate.bins = TRUE, include.duplicate.ids = TRUE)
     bins = bins.data[["bins"]]
     bins.date = get.date.from.string(bins)
 
     ## split the data based on the extracted timestamps
     logging::logdebug("Splitting data based on time windows arising from activity bins.")
-    cf.data = split.data.time.based(project.data, bins = bins.date, split.basis = activity.type,
-                                    project.conf.new = project.conf.new)
+    cf.data = split.data.by.bins.vector(project.data, bins = bins.data, activity.amount = activity.amount, 
+                                     sliding.window = sliding.window, split.basis = activity.type)
 
     ## perform additional steps for sliding-window approach:
     ## for activity-based sliding-window bins to work, we need to crop the data appropriately and,
@@ -387,6 +443,13 @@ split.data.activity.based = function(project.data, activity.type = c("commits",
         ## offsets used for cropping (half the first/last bin)
         offset.start = floor(activity.amount / 2)
         offset.end = (items.unique.count - offset.start) %% activity.amount
+
+        ## make sure that the end offset does not extend beyond the last window
+        last.window = cf.data[[length(cf.data)]][[DATASOURCE.TO.UNFILTERED.ARTIFACT.FUNCTION[[activity.type]]]]()
+        length.of.last.window = length(unique(last.window[[ id.column[[activity.type]] ]]))
+
+        offset.end = max(c(length.of.last.window - offset.start, 0))
+
         ## cut the data appropriately
         if (offset.end > 0) {
             items.cut = c(
@@ -435,7 +498,7 @@ split.data.activity.based = function(project.data, activity.type = c("commits",
         ## and the data of the last regular range is contained in the last sliding-window range, then:
         ## remove the last regular range as it is not complete and we don't loose data when removing it
         last.regular.range = cf.data[[length(cf.data)]]
-        last.sliding.range = cf.data[[length(cf.data) - 1]]
+        last.sliding.range = cf.data.sliding[[length(cf.data.sliding) - 1]]
         get.activity.data = DATASOURCE.TO.UNFILTERED.ARTIFACT.FUNCTION[[activity.type]]
 
         last.regular.range.ids = (last.regular.range[[get.activity.data]]())[[ id.column[[activity.type]] ]]
@@ -1102,13 +1165,18 @@ split.get.bins.time.based = function(dates, time.period, number.windows = NULL)
 #' @param activity.amount the amount of activity denoting the number of unique items
 #'                        in each split bin [default: 5000]
 #' @param remove.duplicate.bins remove duplicate bin borders? [default: FALSE]
+#' @param include.duplicate.ids include entries of the \code{df} with non-unique ids
+#'                              in the creation of the bins. This should not change bin borders,
+#'                              as entries with the same id are expected to share the same \code{date} attribute.
+#'                              [default: FALSE]
 #'
 #' @return a list,
-#'         the item 'vector': the bins each row in 'df' belongs to (increasing integers),
+#'         the item 'vector': the bins each row in 'df' belongs to (increasing integers),
 #'         the item 'bins': the bin labels,  described by dates, each bin containing
-#'         'acitivity.amount' many unique items; each item in the vector indicates
+#'         'activity.amount' many unique items; each item in the vector indicates
 #'         the start of a bin, although the last item indicates the end of the last bin
-split.get.bins.activity.based = function(df, id, activity.amount, remove.duplicate.bins = FALSE) {
+split.get.bins.activity.based = function(df, id, activity.amount, remove.duplicate.bins = FALSE,
+                                         include.duplicate.ids = FALSE) {
     logging::logdebug("split.get.bins.activity.based: starting")
     ## get the unique integer IDs for each item in 'id' column
     ids = df[[id]]
@@ -1120,11 +1188,23 @@ split.get.bins.activity.based = function(df, id, activity.amount, remove.duplica
         if (bins.number.complete != 0) rep(seq_len(bins.number.complete), each = activity.amount),
         rep(bins.number.complete + 1, bins.number.incomplete)
     )
+
+    ## pad bins with entries for all duplicate ids
+    if (include.duplicate.ids) {
+        bins.activity.padded = c()
+        for (i in seq_along(ids)) {
+            ## create an extra entry for every duplicate id in the same bin as
+            ## the first occurrence of the id
+            current.bin = bins.activity[ which(ids.unique == ids[i]) ]
+            bins.activity.padded = c(bins.activity.padded, current.bin)
+        }
+        bins.activity = bins.activity.padded
+    }
     bins.number = max(bins.activity)
 
     ## join ids and bin numbers
     bins.mapping = data.frame(
-        id = ids.unique,
+        id = if (include.duplicate.ids) ids else ids.unique,
         bin = bins.activity
     )