diff --git a/tests/codeface-data/results/testing/test_feature/feature/issues-github.list b/tests/codeface-data/results/testing/test_feature/feature/issues-github.list index 706b8c40..b962deb5 100644 --- a/tests/codeface-data/results/testing/test_feature/feature/issues-github.list +++ b/tests/codeface-data/results/testing/test_feature/feature/issues-github.list @@ -15,3 +15,15 @@ 6;"Distinguish directedness of networks and edge-construction algorithm";"[""issue"", ""bug"", ""enhancement""]";"open";"[]";"2016-07-12 14:30:13";"";"[]";"subscribed";"Björn";"bjoern@example.org";"2016-12-07 15:30:02";"udo";"""udo@example.org""" 6;"Distinguish directedness of networks and edge-construction algorithm";"[""issue"", ""bug"", ""enhancement""]";"open";"[]";"2016-07-12 14:30:13";"";"[]";"labeled";"Olaf";"olaf@example.org";"2017-05-23 12:31:34";"decided";"""""" 6;"Distinguish directedness of networks and edge-construction algorithm";"[""issue"", ""bug"", ""enhancement""]";"open";"[]";"2016-07-12 14:30:13";"";"[]";"commented";"Björn";"bjoern@example.org";"2017-05-23 12:32:39";"open";"[]" +"1";"Example pull request 1";"[""pull request""]";"closed";"[]";"2013-05-13 02:46:44";"2013-09-05 14:48:10";"[]";"created";"Thomas";"thomas@example.org";"2016-07-12 15:59:25";"open";"[]" +"1";"Example pull request 1";"[""pull request""]";"closed";"[]";"2013-05-13 02:46:44";"2013-09-05 14:48:10";"[]";"commented";"Thomas";"thomas@example.org";"2016-07-12 15:59:25";"open";"[]" +"1";"Example pull request 1";"[""pull request""]";"closed";"[]";"2013-05-13 02:46:44";"2013-09-05 14:48:10";"[]";"commented";"Olaf";"olaf@example.org";"2016-07-12 16:01:01";"open";"[]" +"1";"Example pull request 1";"[""pull request""]";"closed";"[]";"2013-05-13 02:46:44";"2013-09-05 14:48:10";"[]";"commented";"Björn";"bjoern@example.org";"2016-07-12 16:06:01";"open";"[]" +"1";"Example pull request 1";"[""pull request""]";"closed";"[]";"2013-05-13 02:46:44";"2013-09-05 
14:48:10";"[]";"state_updated";"Thomas";"thomas@example.org";"2016-07-12 15:59:59";"closed";"""open""" +"1";"Example pull request 1";"[""pull request""]";"closed";"[]";"2013-05-13 02:46:44";"2013-09-05 14:48:10";"[]";"state_updated";"Olaf";"olaf@example.org";"2016-07-14 13:37:00";"closed";"""open""" +"2";"Example pull request 2";"[""pull request""]";"closed";"[]";"2013-05-13 02:46:44";"2013-09-05 14:48:10";"[]";"created";"Björn";"bjoern@example.org";"2016-07-12 14:59:25";"open";"[]" +"2";"Example pull request 2";"[""pull request""]";"closed";"[]";"2013-05-13 02:46:44";"2013-09-05 14:48:10";"[]";"commented";"Björn";"bjoern@example.org";"2016-07-12 14:59:25";"open";"[]" +"2";"Example pull request 2";"[""pull request""]";"closed";"[]";"2013-05-13 02:46:44";"2013-09-05 14:48:10";"[]";"state_updated";"Björn";"bjoern@example.org";"2016-07-12 15:00:01";"closed";"""open""" +"2";"Example pull request 2";"[""pull request""]";"closed";"[]";"2013-05-13 02:46:44";"2013-09-05 14:48:10";"[]";"state_updated";"Olaf";"olaf@example.org";"2016-07-12 16:04:59";"closed";"""open""" +"4";"Example pull request 4";"[""pull request""]";"closed";"[]";"2013-05-13 02:46:44";"2013-09-05 14:48:10";"[]";"created";"Olaf";"olaf@example.org";"2016-07-12 16:02:02";"open";"[]" +"4";"Example pull request 4";"[""pull request""]";"closed";"[]";"2013-05-13 02:46:44";"2013-09-05 14:48:10";"[]";"commented";"Olaf";"olaf@example.org";"2016-07-12 16:02:02";"open";"[]" \ No newline at end of file diff --git a/tests/test-networks-covariates.R b/tests/test-networks-covariates.R index dd6eca59..ab611fc0 100644 --- a/tests/test-networks-covariates.R +++ b/tests/test-networks-covariates.R @@ -47,35 +47,10 @@ myranges = construct.ranges(mybins, sliding.window = FALSE) #' Load test data and generate test networks #' -#' @return Tuple containing project data and list of networks -get.network.covariates.test.networks = function(network.type = c("author", "artifact")) { - - network.type.function = paste("get", 
match.arg(network.type), "network", sep = ".") - - ## configuration and data objects - proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT) - proj.conf$update.value("commits.filter.base.artifact", FALSE) - proj.conf$update.value("commits.filter.untracked.files", TRUE) - proj.conf$update.value("issues.only.comments", FALSE) - net.conf = NetworkConf$new() - net.conf$update.values(list(author.relation = "cochange", simplify = FALSE)) - - ## retrieve project data and network builder - project.data = ProjectData$new(proj.conf) - project.data$set.issues(NULL) - - ## split data - input.data = split.data.time.based(project.data, bins = mybins) - input.data.networks = lapply(input.data, function(d) NetworkBuilder$new(d, net.conf)[[network.type.function]]()) - - return(list("networks" = input.data.networks, "project.data" = project.data)) -} - - -#' Load test data and generate test networks, but including issues +#' @param issues Whether to retain issue data. If \code{FALSE}, issue data is deleted. 
[default: FALSE] #' #' @return Tuple containing project data and list of networks -get.network.covariates.test.networks.with.issues = function(network.type = c("author", "artifact")) { +get.network.covariates.test.networks = function(network.type = c("author", "artifact"), issues = FALSE) { network.type.function = paste("get", match.arg(network.type), "network", sep = ".") @@ -87,8 +62,11 @@ get.network.covariates.test.networks.with.issues = function(network.type = c("au net.conf = NetworkConf$new() net.conf$update.values(list(author.relation = "cochange", simplify = FALSE)) - ## retrieve project data and network builder + ## retrieve project data project.data = ProjectData$new(proj.conf) + if (!issues) { + project.data$set.issues(NULL) + } ## split data input.data = split.data.time.based(project.data, bins = mybins) @@ -570,9 +548,9 @@ test_that("Test add.vertex.attribute.mail.thread.count", { #' Test the add.vertex.attribute.mail.count method test_that("Test add.vertex.attribute.issue.count", { ## Test setup - networks.and.data = get.network.covariates.test.networks.with.issues() + networks.and.data = get.network.covariates.test.networks(issues=TRUE) - expected.attributes = list( + expected.attributes.issues.only = list( range = network.covariates.test.build.expected(c(0L), c(0L), c(1L, 1L, 0L)), cumulative = network.covariates.test.build.expected(c(0L), c(1L), c(1L, 1L, 1L)), all.ranges = network.covariates.test.build.expected(c(2L), c(1L), c(1L, 1L, 1L)), @@ -581,16 +559,37 @@ test_that("Test add.vertex.attribute.issue.count", { complete = network.covariates.test.build.expected(c(3L), c(3L), c(3L, 1L, 3L)) ) - ## Test + expected.attributes.prs.only = list( + range = network.covariates.test.build.expected(c(1L), c(3L), c(1L, 0L, 0L)), + cumulative = network.covariates.test.build.expected(c(1L), c(3L), c(3L, 0L, 1L)), + all.ranges = network.covariates.test.build.expected(c(2L), c(3L), c(3L, 0L, 1L)), + project.cumulative = 
network.covariates.test.build.expected(c(1L), c(3L), c(3L, 0L, 1L)), + project.all.ranges = network.covariates.test.build.expected(c(2L), c(3L), c(3L, 0L, 1L)), + complete = network.covariates.test.build.expected(c(2L), c(3L), c(3L, 0L, 1L)) + ) + + ## Test issues only lapply(AGGREGATION.LEVELS, function(level) { networks.with.attr = add.vertex.attribute.issue.count( - networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level + networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level, issue.type = "issues" ) - actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "issues.count") + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "issue.count") - expect_identical(expected.attributes[[level]], actual.attributes) + expect_identical(expected.attributes.issues.only[[level]], actual.attributes) + }) + + # Test PRs only + + lapply(AGGREGATION.LEVELS, function(level) { + networks.with.attr = add.vertex.attribute.issue.count( + networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level, issue.type = "pull.requests", name = "pull.request.count" + ) + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "pull.request.count") + + expect_identical(expected.attributes.prs.only[[level]], actual.attributes) }) }) @@ -598,9 +597,9 @@ test_that("Test add.vertex.attribute.issue.count", { #' Test the add.vertex.attribute.mail.count method test_that("Test add.vertex.attribute.issue.count.by.commenting", { ## Test setup - networks.and.data = get.network.covariates.test.networks.with.issues() + networks.and.data = get.network.covariates.test.networks(issues = TRUE) - expected.attributes = list( + expected.attributes.issues.only = list( range = network.covariates.test.build.expected(c(0L), c(0L), c(0L, 0L, 0L)), cumulative = network.covariates.test.build.expected(c(0L), c(0L), 
c(0L, 1L, 1L)), all.ranges = network.covariates.test.build.expected(c(1L), c(0L), c(0L, 1L, 1L)), @@ -609,25 +608,94 @@ test_that("Test add.vertex.attribute.issue.count.by.commenting", { complete = network.covariates.test.build.expected(c(3L), c(1L), c(1L, 1L, 2L)) ) - ## Test + expected.attributes.prs.only = list( + range = network.covariates.test.build.expected(c(0L), c(2L), c(0L, 0L, 0L)), + cumulative = network.covariates.test.build.expected(c(0L), c(2L), c(2L, 0L, 1L)), + all.ranges = network.covariates.test.build.expected(c(1L), c(2L), c(2L, 0L, 1L)), + project.cumulative = network.covariates.test.build.expected(c(1L), c(2L), c(2L, 0L, 1L)), + project.all.ranges = network.covariates.test.build.expected(c(2L), c(2L), c(2L, 0L, 1L)), + complete = network.covariates.test.build.expected(c(2L), c(2L), c(2L, 0L, 1L)) + ) + + ## Test issues only lapply(AGGREGATION.LEVELS, function(level) { networks.with.attr = add.vertex.attribute.issue.count.by.commenting( - networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level + networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level, issue.type = "issues" ) - actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "issues.count.by.commenting") + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "issue.count.by.commenting") - expect_identical(expected.attributes[[level]], actual.attributes) + expect_identical(expected.attributes.issues.only[[level]], actual.attributes) + }) + + # Test PRs only + + lapply(AGGREGATION.LEVELS, function(level) { + networks.with.attr = add.vertex.attribute.issue.count.by.commenting( + networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level, issue.type = "pull.requests", name = "pull.request.count.by.commenting" + ) + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = 
"pull.request.count.by.commenting") + + expect_identical(expected.attributes.prs.only[[level]], actual.attributes) + }) +}) + +#' Test the add.vertex.attribute.mail.count method +test_that("Test add.vertex.attribute.issue.creation.count", { + ## Test setup + networks.and.data = get.network.covariates.test.networks(issues = TRUE) + + expected.attributes.issues.only = list( + range = network.covariates.test.build.expected(c(0L), c(0L), c(0L, 0L, 0L)), + cumulative = network.covariates.test.build.expected(c(0L), c(0L), c(0L, 1L, 0L)), + all.ranges = network.covariates.test.build.expected(c(1L), c(0L), c(0L, 1L, 0L)), + project.cumulative = network.covariates.test.build.expected(c(0L), c(0L), c(0L, 1L, 1L)), + project.all.ranges = network.covariates.test.build.expected(c(1L), c(0L), c(0L, 1L, 1L)), + complete = network.covariates.test.build.expected(c(1L), c(0L), c(0L, 1L, 1L)) + ) + + expected.attributes.prs.only = list( + range = network.covariates.test.build.expected(c(0L), c(1L), c(0L, 0L, 0L)), + cumulative = network.covariates.test.build.expected(c(0L), c(1L), c(1L, 0L, 1L)), + all.ranges = network.covariates.test.build.expected(c(0L), c(1L), c(1L, 0L, 1L)), + project.cumulative = network.covariates.test.build.expected(c(1L), c(1L), c(1L, 0L, 1L)), + project.all.ranges = network.covariates.test.build.expected(c(1L), c(1L), c(1L, 0L, 1L)), + complete = network.covariates.test.build.expected(c(1L), c(1L), c(1L, 0L, 1L)) + ) + + ## Test issues only + + lapply(AGGREGATION.LEVELS, function(level) { + networks.with.attr = add.vertex.attribute.issue.creation.count( + networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level, issue.type = "issues" + ) + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "issue.creation.count") + + expect_identical(expected.attributes.issues.only[[level]], actual.attributes) + }) + + # Test PRs only + + lapply(AGGREGATION.LEVELS, function(level) { + 
networks.with.attr = add.vertex.attribute.issue.creation.count( + networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level, issue.type = "pull.requests", name = "pull.request.creation.count" + ) + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "pull.request.creation.count") + + expect_identical(expected.attributes.prs.only[[level]], actual.attributes) }) }) #' Test the add.vertex.attribute.mail.count method test_that("Test add.vertex.attribute.issue.comment.count", { ## Test setup - networks.and.data = get.network.covariates.test.networks.with.issues() + networks.and.data = get.network.covariates.test.networks(issues = TRUE) - expected.attributes = list( + expected.attributes.issues.only = list( range = network.covariates.test.build.expected(c(0L), c(0L), c(0L, 0L, 0L)), cumulative = network.covariates.test.build.expected(c(0L), c(0L), c(0L, 1L, 1L)), all.ranges = network.covariates.test.build.expected(c(2L), c(0L), c(0L, 1L, 1L)), @@ -636,16 +704,37 @@ test_that("Test add.vertex.attribute.issue.comment.count", { complete = network.covariates.test.build.expected(c(9L), c(4L), c(4L, 1L, 2L)) ) - ## Test + expected.attributes.prs.only = list( + range = network.covariates.test.build.expected(c(0L), c(2L), c(0L, 0L, 0L)), + cumulative = network.covariates.test.build.expected(c(0L), c(2L), c(2L, 0L, 1L)), + all.ranges = network.covariates.test.build.expected(c(1L), c(2L), c(2L, 0L, 1L)), + project.cumulative = network.covariates.test.build.expected(c(1L), c(2L), c(2L, 0L, 1L)), + project.all.ranges = network.covariates.test.build.expected(c(2L), c(2L), c(2L, 0L, 1L)), + complete = network.covariates.test.build.expected(c(2L), c(2L), c(2L, 0L, 1L)) + ) + + ## Test issues only lapply(AGGREGATION.LEVELS, function(level) { networks.with.attr = add.vertex.attribute.issue.comment.count( - networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level + 
networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level, issue.type = "issues" ) actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "issue.comment.count") logging::logdebug(level) - expect_identical(expected.attributes[[level]], actual.attributes) + expect_identical(expected.attributes.issues.only[[level]], actual.attributes) + }) + + # Test PRs only + + lapply(AGGREGATION.LEVELS, function(level) { + networks.with.attr = add.vertex.attribute.issue.comment.count( + networks.and.data[["networks"]], networks.and.data[["project.data"]], aggregation.level = level, issue.type = "pull.requests", name = "pull.request.comment.count" + ) + + actual.attributes = lapply(networks.with.attr, igraph::get.vertex.attribute, name = "pull.request.comment.count") + + expect_identical(expected.attributes.prs.only[[level]], actual.attributes) }) }) diff --git a/util-core-peripheral.R b/util-core-peripheral.R index 97456002..807d9618 100644 --- a/util-core-peripheral.R +++ b/util-core-peripheral.R @@ -707,39 +707,51 @@ get.committer.or.author.commit.count = function(range.data) { } -#' Helper function that aggregates counts of things like commits, mails, ... on a per-author basis +#' Helper function that aggregates counts of things like commits, mails, ... on a per-author basis. +#' +#' For example, called with name="commit.count", data.extractor=function(proj.data) {return(proj.data$get.commits.filtered())}, +#' grouping.keys=c("committer.name"), distinctize=true and distinctize.key=c("hash"), the returned function will +#' +#' 1. get the data frame using the extractor (in this case, the commit df) +#' 2. remove duplicate entries so that there is only one entry per commit hash +#' 3. project away unneeded columns, leaving only committer.name +#' 4. count the commits grouped by the commiter name +#' 5. 
return a dataframe with cols commiter.name and freq, which contains the number of commits authored by each author #' #' @param name the name the function will be bound to, for logging -#' @param data.extractor a function to be called on the main argument that gets the dataframe from which to aggregate from +#' @param data.extractor a function which given the project data, extracts the relevant dataframe (i.e. the commit data.frame) from it #' @param grouping.keys the dataframe keys to group by -#' @param distrinctize Whether to remove duplicates -#' @param distrinctize.key if distinctize, then the key by which to remove duplicates +#' @param distinctize Whether to remove duplicates +#' @param distinctize.key if distinctize, then the key by which to remove duplicates #' #' @return A function that aggregates data according to the above specification contained in a given \code{ProjectData}. #' This function itself returns a dataframe consisting of |grouping.keys|+1 columns, the last holding the count, #' and the others the respective grouping #' group.data.by.key = function(name, data.extractor, grouping.keys, distinctize, distinctize.key) { - return (function(proj.data) { - logging::logdebug(paste(name, ": starting.", sep="")) - + return(function(proj.data) { + logging::logdebug(paste0(name, ": starting.")) + #get the data we want to group df = data.extractor(proj.data) + #if necessary, make sure that there is only one entry for each distinctizing key if (distinctize) { df = df[!duplicated(df[[distinctize.key]]), ] } + #throw away unnecessary columns df = df[grouping.keys] grouping.keys.formatted = paste(grouping.keys, sep="`, `") - stmt = paste("SELECT `",grouping.keys.formatted,"`, COUNT(*) as `freq` FROM `df` - GROUP BY `",grouping.keys.formatted,"` ORDER BY `freq` DESC, `",grouping.keys.formatted,"`",sep="") - logging::logdebug(paste(name, ": running SQL ", stmt,sep="")) + #execute a query that counts the number of occurrences of the grouping.keys + stmt = 
paste0("SELECT `", grouping.keys.formatted, "`, COUNT(*) as `freq` FROM `df` + GROUP BY `", grouping.keys.formatted, "` ORDER BY `freq` DESC, `", grouping.keys.formatted, "`") + logging::logdebug(paste0(name, ": running SQL ", stmt)) res = sqldf::sqldf(stmt) - logging::logdebug(paste(name, ": finished",sep="")) + logging::logdebug(paste0(name, ": finished")) return(res) }) } #' Get the commit count per committer in the given range data, where the committer -#' may match the author of the respective commits +#' may match the author of the respective commits. #' #' @param range.data The data to count on #' @@ -772,7 +784,7 @@ get.author.mail.count = group.data.by.key("get.author.mail.count", function(proj c("author.name"), TRUE, c("message.id")) #' Get the mail thread count for each author based on the mail data contained in the specified \code{ProjectData}. -#' This is the number of threads the author participated in, i.e. contributed at least one e-mail +#' This is the number of threads the author participated in, i.e. contributed at least one e-mail to. #' #' @param proj.data the \code{ProjectData} containing the mail data #' @@ -793,202 +805,144 @@ get.author.mail.thread.count = function(proj.data) { } ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / -## Issues-based classification --------------------------------------------- +## Issue-/PR-based classification --------------------------------------------- ## * Count-based classification -------------------------------------------- - -#' Get the issues count for each author based on the issues data contained in the specified \code{ProjectData}. -#' The issues count is the number of issues the author participated in (which can mean anything, -#' from commenting to closing to assigning the issue to others, to assigning tags, ...) +#' Gets and preprocesses issue data, removing unnecessary columns and rows were are not interested in. 
#' -#' Issues do not include pull requests +#' Retained rows are \code{author.name}, \code{issue.id} and \code{"event.type"} #' -#' @param proj.data the \code{ProjectData} containing the mail data -#' -#' @return a dataframe consisting of two columns, the first of which holding the authors' names and the second holding -#' their respective mail counts -get.author.issues.count = function(proj.data) { - logging::logdebug("get.author.issues.count: starting.") - - df = proj.data$get.issues.unfiltered() - df = df[c("author.name", "issue.id", "issue.type")] - df.new = data.frame(author.name=character(),issue.id=character()) - for (rnum in 1:nrow(df)) { - if ("issue" %in% df$issue.type[[rnum]]) { - df.new[nrow(df.new)+1,] <- list(df$author.name[[rnum]], df$issue.id[[rnum]]) - } - } - stmt = "SELECT `author.name`, COUNT( DISTINCT `issue.id`) as `freq` FROM `df.new` - GROUP BY `author.name` ORDER BY `freq` DESC, `author.name` ASC" - logging::logdebug(paste("get.author.issues.count: running SQL ", stmt,sep="")) - res = sqldf::sqldf(stmt) - logging::logdebug("get.author.issues.count: finished") - return(res) -} - -#' Get the issues count for each author based on the issues data contained in the specified \code{ProjectData}. -#' The issues count here is the number of issues the author participated in by commenting +#' Retained colums depend on type. If it is \code{"all"}, then all rows are retained. +#' Otherwise, only the rows containing information about either issues or pull requests are retained #' #' @param proj.data the \code{ProjectData} containing the mail data +#' @param type which issue type to consider. 
+#' One of \code{"issues"}, \code{"pull.requests"} or \{"all"} +#' [default: "all"] #' -#' @return a dataframe consisting of two columns, the first of which holding the authors' names and the second holding -#' their respective mail counts -get.author.issues.commented.in.count = function(proj.data) { - logging::logdebug("get.author.issues.commented.in.count: starting.") - - df = proj.data$get.issues.unfiltered() - df = df[c("author.name", "issue.id", "issue.type", "event.name")] - df.new = data.frame(author.name=character(),issue.id=character()) - for (rnum in 1:nrow(df)) { - if ("issue" %in% df$issue.type[[rnum]] && df$event.name[[rnum]] == "commented") { - df.new[nrow(df.new)+1,] <- list(df$author.name[[rnum]], df$issue.id[[rnum]]) - } - } - stmt = "SELECT `author.name`, COUNT( DISTINCT `issue.id`) as `freq` FROM `df.new` - GROUP BY `author.name` ORDER BY `freq` DESC, `author.name` ASC" - logging::logdebug(paste("get.author.issues.commented.in.count: running SQL ", stmt,sep="")) - res = sqldf::sqldf(stmt) - logging::logdebug("get.author.issues.commented.in.count: finished") - return(res) -} -#' Get the issues count for each author based on the issues data contained in the specified \code{ProjectData}. -#' The issues count is the number of issues the author commented in. 
-#' -#' @param proj.data the \code{ProjectData} containing the mail data #' -#' @return a dataframe consisting of two columns, the first of which holding the authors' names and the second holding -#' their respective mail counts -get.author.issue.comments.count = function(proj.data) { - logging::logdebug("get.author.issue.comments.count: starting.") - +preprocess.issue.data = function(proj.data, type = "all") { df = proj.data$get.issues.unfiltered() - df = df[c("author.name", "issue.id", "issue.type", "event.name")] - df.new = data.frame(author.name=character(),issue.id=character()) - for (rnum in 1:nrow(df)) { - if ("issue" %in% df$issue.type[[rnum]] && df$event.name[[rnum]] == "commented") { - df.new[nrow(df.new)+1,] <- list(df$author.name[[rnum]], df$issue.id[[rnum]]) - } + # if k is a list, and nrow(df) == 0, then df[k, ..] fails + # so we abort beforehand + if (nrow(df) == 0) { + return (df[c("author.name", "issue.id", "event.name")]); } - stmt = "SELECT `author.name`, COUNT(`issue.id`) as `freq` FROM `df.new` - GROUP BY `author.name` ORDER BY `freq` DESC, `author.name` ASC" - logging::logdebug(paste("get.author.issue.comments.count: running SQL ", stmt,sep="")) - res = sqldf::sqldf(stmt) - logging::logdebug("get.author.issue.comments.count: finished") - return(res) + switch (type, + all = { + df = df[c("author.name", "issue.id", "event.name")] + }, + issues = { + df = df[sapply(df[["issue.type"]], function (k) {return ("issue" %in% k)}), c("author.name", "issue.id", "event.name")] + }, + pull.requests = { + df = df[sapply(df[["issue.type"]], function (k) {return ("pull request" %in% k)}), c("author.name", "issue.id", "event.name")] + }, + stop("Unknown issue data kind " + type) + ) + return(df) } - -## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / -## Pull-Request-based classification --------------------------------------------- - -## * Count-based classification -------------------------------------------- - - -#' Get the pull 
request count for each author based on the issues data contained in the specified \code{ProjectData}. -#' The pull request count is the number of pull requests the author created. +#' Get the issue/pr count for each author based on the issues data contained in the specified \code{ProjectData}. +#' The issue count here is the number of issues the author participated in (which can mean anything, +#' from commenting to closing to assigning the issue to others, to assigning tags, referencing it in other issues, +#' adding commits, ...). +#' +#' The type argument specifies whether we count PRs alone, issues alone, or both (\code{"all"}). #' #' @param proj.data the \code{ProjectData} containing the mail data +#' @param type which issue type to consider. see \code{preprocess.issue.data} +#' One of \code{"issues"}, \code{"pull.requests"} or \{"all"} +#' [default: "all"] #' #' @return a dataframe consisting of two columns, the first of which holding the authors' names and the second holding -#' their respective mail counts -get.author.pull.requests.created.count = function(proj.data) { - logging::logdebug("get.author.pull.requests.created.count: starting.") - - df = proj.data$get.issues.unfiltered() - df = df[c("author.name", "issue.id", "issue.type", "event.name")] - df.new = data.frame(author.name=character(),issue.id=character()) - for (rnum in 1:nrow(df)) { - if ("pull request" %in% df$issue.type[[rnum]] && df$event.name[[rnum]] == "created") { - df.new[nrow(df.new)+1,] <- list(df$author.name[[rnum]], df$issue.id[[rnum]]) - } - } - stmt = "SELECT `author.name`, COUNT(DISTINCT `issue.id`) as `freq` FROM `df.new` +#' their respective issue counts +get.author.issue.count = function(proj.data, type = "all") { + logging::logdebug("get.author.issue.count: starting.") + df = preprocess.issue.data(proj.data, type) + #count distinct since an author may appear in multiple issues at the same time, + #or in the same issue multiple times + stmt = "SELECT `author.name`, COUNT( 
DISTINCT `issue.id`) as `freq` FROM `df` GROUP BY `author.name` ORDER BY `freq` DESC, `author.name` ASC" - logging::logdebug(paste("get.author.pull.requests.created.count: running SQL ", stmt,sep="")) res = sqldf::sqldf(stmt) - logging::logdebug("get.author.pull.requests.created.count: finished") + logging::logdebug("get.author.issue.count: finished") return(res) } -#' Get the pull request count for each author based on the issues data contained in the specified \code{ProjectData}. -#' The pull request count is the number of pull requests the author added a commit to. +#' Get the issue/pr count for each author based on the issues data contained in the specified \code{ProjectData}. +#' The issue count here is the number of issues the author created. +#' +#' The type argument specifies whether we count PRs alone, issues alone, or both (\code{"all"}). #' #' @param proj.data the \code{ProjectData} containing the mail data +#' @param type which issue type to consider. see \code{preprocess.issue.data} +#' One of \code{"issues"}, \code{"pull.requests"} or \{"all"} +#' [default: "all"] #' #' @return a dataframe consisting of two columns, the first of which holding the authors' names and the second holding -#' their respective mail counts -get.author.pull.requests.commited.in.count = function(proj.data) { - logging::logdebug("get.author.pull.requests.commited.in.count: starting.") - - df = proj.data$get.issues.unfiltered() - df = df[c("author.name", "issue.id", "issue.type", "event.name")] - df.new = data.frame(author.name=character(),issue.id=character()) - for (rnum in 1:nrow(df)) { - if ("pull request" %in% df$issue.type[[rnum]] && df$event.name[[rnum]] == "commit_added") { - df.new[nrow(df.new)+1,] <- list(df$author.name[[rnum]], df$issue.id[[rnum]]) - } - } - stmt = "SELECT `author.name`, COUNT(DISTINCT `issue.id`) as `freq` FROM `df.new` +#' their respective issue counts +get.author.issues.created.count = function(proj.data, type = "all") { + 
logging::logdebug("get.author.issues.created.count: starting.") + df = preprocess.issue.data(proj.data, type) + #count distinct since an author may appear in multiple issues at the same time, + #or in the same issue multiple times + stmt = "SELECT `author.name`, COUNT( DISTINCT `issue.id`) as `freq` FROM `df` + WHERE `event.name` = 'created' GROUP BY `author.name` ORDER BY `freq` DESC, `author.name` ASC" - logging::logdebug(paste("get.author.pull.requests.commited.in.count: running SQL ", stmt,sep="")) res = sqldf::sqldf(stmt) - logging::logdebug("get.author.pull.requests.commited.in.count: finished") + logging::logdebug("get.author.issues.created.count: finished") return(res) } -#' Get the pull request count for each author based on the issues data contained in the specified \code{ProjectData}. -#' The pull request count is the number of pull requests the author contributed to in any way (see get.author.issues.count) +#' Get the issue/pr count for each author based on the issues data contained in the specified \code{ProjectData}. +#' The issue count here is the number of issues the author commented in. +#' +#' The type argument specifies whether we count PRs alone, issues alone, or both (\code{"all"}). #' #' @param proj.data the \code{ProjectData} containing the mail data +#' @param type which issue type to consider. 
see \code{preprocess.issue.data} +#' One of \code{"issues"}, \code{"pull.requests"} or \{"all"} +#' [default: "all"] #' #' @return a dataframe consisting of two columns, the first of which holding the authors' names and the second holding -#' their respective mail counts -get.author.pull.requests.count = function(proj.data) { - logging::logdebug("get.author.pull.requests.count: starting.") - - df = proj.data$get.issues.unfiltered() - df = df[c("author.name", "issue.id", "issue.type", "event.name")] - df.new = data.frame(author.name=character(),issue.id=character()) - for (rnum in 1:nrow(df)) { - if ("pull request" %in% df$issue.type[[rnum]]) { - df.new[nrow(df.new)+1,] <- list(df$author.name[[rnum]], df$issue.id[[rnum]]) - } - } - stmt = "SELECT `author.name`, COUNT(DISTINCT `issue.id`) as `freq` FROM `df.new` +#' their respective issue counts +get.author.issues.commented.in.count = function(proj.data, type = "all") { + logging::logdebug("get.author.issues.commented.in.count: starting.") + df = preprocess.issue.data(proj.data, type) + #count distinct since an author may appear in multiple issues at the same time, + #or in the same issue multiple times + stmt = "SELECT `author.name`, COUNT( DISTINCT `issue.id`) as `freq` FROM `df` + WHERE `event.name` = 'commented' GROUP BY `author.name` ORDER BY `freq` DESC, `author.name` ASC" - logging::logdebug(paste("get.author.pull.requests.count: running SQL ", stmt,sep="")) res = sqldf::sqldf(stmt) - logging::logdebug("get.author.pull.requests.count: finished") + logging::logdebug("get.author.issues.commented.in.count: finished") return(res) } -#' Get the number of comments to pull requests for each author based on the issues data contained in the specified \code{ProjectData}. +#' Get the issue/pr comment count for each author based on the issues data contained in the specified \code{ProjectData}. 
+#' The issue comment count here is the number of comments the author created summed across all issues +#' +#' The type argument specifies whether we count PRs alone, issues alone, or both (\code{"all"}). #' #' @param proj.data the \code{ProjectData} containing the mail data +#' @param type which issue type to consider. see \code{preprocess.issue.data} +#' One of \code{"issues"}, \code{"pull.requests"} or \code{"all"} +#' [default: "all"] #' #' @return a dataframe consisting of two columns, the first of which holding the authors' names and the second holding -#' their respective mail counts -get.author.pull.request.comments.count = function(proj.data) { - logging::logdebug("get.author.pull.request.comments.count: starting.") - - df = proj.data$get.issues.unfiltered() - df = df[c("author.name", "issue.id", "issue.type", "event.name")] - df.new = data.frame(author.name=character(),issue.id=character()) - for (rnum in 1:nrow(df)) { - if ("pull request" %in% df$issue.type[[rnum]]) { - df.new[nrow(df.new)+1,] <- list(df$author.name[[rnum]], df$issue.id[[rnum]]) - } - } - stmt = "SELECT `author.name`, COUNT(`issue.id`) as `freq` FROM `df.new` +#' their respective comment counts +get.author.issue.comment.count = function(proj.data, type = "all") { + logging::logdebug("get.author.issue.comment.count: starting.") + df = preprocess.issue.data(proj.data, type) + stmt = "SELECT `author.name`, COUNT(*) as `freq` FROM `df` + WHERE `event.name` = 'commented' GROUP BY `author.name` ORDER BY `freq` DESC, `author.name` ASC" - logging::logdebug(paste("get.author.pull.request.comments.count: running SQL ", stmt,sep="")) res = sqldf::sqldf(stmt) - logging::logdebug("get.author.pull.request.comments.count: finished") + logging::logdebug("get.author.issue.comment.count: finished") return(res) } - ## * LOC-based classification ---------------------------------------------- #' Classify authors into "core" and "peripheral" based on authors' lines of code (LOC) and return the classification diff 
--git a/util-data.R b/util-data.R index d43759c1..45f9b4fa 100644 --- a/util-data.R +++ b/util-data.R @@ -112,6 +112,7 @@ ProjectData = R6::R6Class("ProjectData", mails.patchstacks = NULL, # list ## issues issues = NULL, #data.frame + issues.filtered = NULL, #data.frame ## authors authors = NULL, # data.frame ## additional data sources @@ -546,6 +547,7 @@ ProjectData = R6::R6Class("ProjectData", private$commit.messages = NULL private$mails = NULL private$issues = NULL + private$issues.filtered = NULL private$authors = NULL private$synchronicity = NULL private$pasta = NULL @@ -1067,7 +1069,7 @@ ProjectData = R6::R6Class("ProjectData", private$authors = data }, - #' Get the issue data. + #' Get the issue data, filtered #' If it does not already exist call the read method. #' #' @return the issue data @@ -1075,17 +1077,13 @@ ProjectData = R6::R6Class("ProjectData", logging::loginfo("Getting issue data") ## if issues have not been read yet do this - if (is.null(private$issues)) { - private$issues = read.issues(self$get.data.path.issues(), private$project.conf$get.value("issues.from.source")) - } - private$extract.timestamps(source = "issues") - - if (private$project.conf$get.value("issues.only.comments")) { - df = private$issues[private$issues[["event.name"]] == "commented", ] - return(df) - } else { - return(private$issues) + if (is.null(private$issues.filtered)) { + private$issues.filtered = self$get.issues.unfiltered() + if (private$project.conf$get.value("issues.only.comments")) { + private$issues.filtered = private$issues.filtered[private$issues.filtered[["event.name"]] == "commented", ] + } } + return(private$issues.filtered) }, #' Get the issue data, unfiltered. @@ -1115,6 +1113,7 @@ ProjectData = R6::R6Class("ProjectData", } private$issues = data + private$issues.filtered = NULL }, #' Get the list of artifacts from the given \code{data.source} of the project. 
diff --git a/util-networks-covariates.R b/util-networks-covariates.R index d914d21c..f2fa6b6f 100644 --- a/util-networks-covariates.R +++ b/util-networks-covariates.R @@ -349,7 +349,7 @@ add.vertex.attribute.commit.count.helper = function(list.of.networks, project.da ## * Mail count ---------------------------------------------------------- -#' Add mail-count attribute based on the total number of mails send where the person represented by the vertex is the author +#' Add mail-count attribute based on the total number of mails send where the person represented by the vertex is the author. #' #' @param list.of.networks The network list #' @param project.data The project data @@ -375,7 +375,7 @@ add.vertex.attribute.mail.count = function(list.of.networks, project.data, return(nets.with.attr) } -#' Add mail-count attribute based on the number of mail threads participated in where the person represented by the vertex is the author +#' Add mail-count attribute based on the number of mail threads participated in where the person represented by the vertex is the author. #' #' @param list.of.networks The network list #' @param project.data The project data @@ -402,9 +402,9 @@ add.vertex.attribute.mail.thread.count = function(list.of.networks, project.data return(nets.with.attr) } -## * Issue count -------------------------------------------------------------- +## * Issue / PR count -------------------------------------------------------------- -#' Add issues-count attribute based on the number of issues participated in where the person represented by the vertex is the author +#' Add issues-count attribute based on the number of issues participated in where the person represented by the vertex is the author. #' #' @param list.of.networks The network list #' @param project.data The project data @@ -415,181 +415,102 @@ add.vertex.attribute.mail.thread.count = function(list.of.networks, project.data #' \code{"complete"}. See \code{split.data.by.networks} for #' more details. 
[default: "range"] #' @param default.value The default value to add if a vertex has no matching value [default: 0L] +#' @param issue.type The issue kind,see \code{preprocess.issue.data} [default: "all"] #' #' @return A list of networks with the added attribute add.vertex.attribute.issue.count = function(list.of.networks, project.data, - name = "issues.count", + name = "issue.count", aggregation.level = c("range", "cumulative", "all.ranges", "project.cumulative", "project.all.ranges", "complete"), - default.value = 0L) { + default.value = 0L, issue.type = "all") { nets.with.attr = add.vertex.attribute.commit.count.helper( list.of.networks, project.data, name, aggregation.level, - default.value, get.author.issues.count, "author.name" + default.value, function(data) {return (get.author.issue.count(data, type = issue.type))}, "author.name" ) return(nets.with.attr) } -#' Add issues-count attribute based on the number of issues participated by commenting in where the person represented by the vertex is the author +#' Add issues-count attribute based on the number of issues participated by commenting in where the person represented by the vertex is the author. #' #' @param list.of.networks The network list #' @param project.data The project data -#' @param name The attribute name to add [default: "issues.count.by.commenting"] +#' @param name The attribute name to add [default: "issue.count.by.commenting"] #' @param aggregation.level Determines the data to use for the attribute calculation. #' One of \code{"range"}, \code{"cumulative"}, \code{"all.ranges"}, #' \code{"project.cumulative"}, \code{"project.all.ranges"}, and #' \code{"complete"}. See \code{split.data.by.networks} for #' more details. 
[default: "range"] #' @param default.value The default value to add if a vertex has no matching value [default: 0L] +#' @param issue.type The issue kind, see \code{preprocess.issue.data} [default: "all"] #' #' @return A list of networks with the added attribute add.vertex.attribute.issue.count.by.commenting = function(list.of.networks, project.data, - name = "issues.count.by.commenting", + name = "issue.count.by.commenting", aggregation.level = c("range", "cumulative", "all.ranges", "project.cumulative", "project.all.ranges", "complete"), - default.value = 0L) { + default.value = 0L, issue.type = "all") { nets.with.attr = add.vertex.attribute.commit.count.helper( list.of.networks, project.data, name, aggregation.level, - default.value, get.author.issues.commented.in.count, "author.name" + default.value, function(data) {return (get.author.issues.commented.in.count(data, type = issue.type))}, "author.name" ) return(nets.with.attr) } -#' Add issues-count attribute based on the number of issues participated by commenting in where the person represented by the vertex is the author +#' Add issues-count attribute based on the number of issues created where the person represented by the vertex is the author. #' #' @param list.of.networks The network list #' @param project.data The project data -#' @param name The attribute name to add [default: "issue.comment.count"] +#' @param name The attribute name to add [default: "issue.creation.count"] #' @param aggregation.level Determines the data to use for the attribute calculation. #' One of \code{"range"}, \code{"cumulative"}, \code{"all.ranges"}, #' \code{"project.cumulative"}, \code{"project.all.ranges"}, and #' \code{"complete"}. See \code{split.data.by.networks} for #' more details. 
[default: "range"] #' @param default.value The default value to add if a vertex has no matching value [default: 0L] +#' @param issue.type The issue kind,see \code{preprocess.issue.data} [default: "all"] #' #' @return A list of networks with the added attribute -add.vertex.attribute.issue.comment.count = function(list.of.networks, project.data, - name = "issue.comment.count", +add.vertex.attribute.issue.creation.count = function(list.of.networks, project.data, + name = "issue.creation.count", aggregation.level = c("range", "cumulative", "all.ranges", "project.cumulative", "project.all.ranges", "complete"), - default.value = 0L) { - nets.with.attr = add.vertex.attribute.commit.count.helper( - list.of.networks, project.data, name, aggregation.level, - default.value, get.author.issue.comments.count, "author.name" - ) - - return(nets.with.attr) -} - -## * Pull request count ------------------------------------------------------------- - -#' Add PR-count attribute based on the number of PR created where the person represented by the vertex is the author -#' -#' @param list.of.networks The network list -#' @param project.data The project data -#' @param name The attribute name to add [default: "pull.request.creation.count"] -#' @param aggregation.level Determines the data to use for the attribute calculation. -#' One of \code{"range"}, \code{"cumulative"}, \code{"all.ranges"}, -#' \code{"project.cumulative"}, \code{"project.all.ranges"}, and -#' \code{"complete"}. See \code{split.data.by.networks} for -#' more details. 
[default: "range"] -#' @param default.value The default value to add if a vertex has no matching value [default: 0L] -#' -#' @return A list of networks with the added attribute -add.vertex.attribute.pull.request.creation.count = function(list.of.networks, project.data, - name = "pull.request.creation.count", - aggregation.level = c("range", "cumulative", "all.ranges", - "project.cumulative", "project.all.ranges", - "complete"), - default.value = 0L) { - nets.with.attr = add.vertex.attribute.commit.count.helper( - list.of.networks, project.data, name, aggregation.level, - default.value, get.author.pull.requests.created.count, "author.name" - ) - - return(nets.with.attr) -} - -#' Add PR-count attribute based on the number of PRs participated in where the person represented by the vertex is the author -#' -#' @param list.of.networks The network list -#' @param project.data The project data -#' @param name The attribute name to add [default: "mail.thread.count"] -#' @param aggregation.level Determines the data to use for the attribute calculation. -#' One of \code{"range"}, \code{"cumulative"}, \code{"all.ranges"}, -#' \code{"project.cumulative"}, \code{"project.all.ranges"}, and -#' \code{"complete"}. See \code{split.data.by.networks} for -#' more details. 
[default: "range"] -#' @param default.value The default value to add if a vertex has no matching value [default: 0L] -#' -#' @return A list of networks with the added attribute -add.vertex.attribute.pull.request.count = function(list.of.networks, project.data, - name = "pull.request.count", - aggregation.level = c("range", "cumulative", "all.ranges", - "project.cumulative", "project.all.ranges", - "complete"), - default.value = 0L) { - nets.with.attr = add.vertex.attribute.commit.count.helper( - list.of.networks, project.data, name, aggregation.level, - default.value, get.author.pull.requests.count, "author.name" - ) - - return(nets.with.attr) -} - -#' Add PR-count attribute based on the number of PRs commits were added to where the person represented by the vertex is the author -#' -#' @param list.of.networks The network list -#' @param project.data The project data -#' @param name The attribute name to add [default: "mail.thread.count"] -#' @param aggregation.level Determines the data to use for the attribute calculation. -#' One of \code{"range"}, \code{"cumulative"}, \code{"all.ranges"}, -#' \code{"project.cumulative"}, \code{"project.all.ranges"}, and -#' \code{"complete"}. See \code{split.data.by.networks} for -#' more details. 
[default: "range"] -#' @param default.value The default value to add if a vertex has no matching value [default: 0L] -#' -#' @return A list of networks with the added attribute -add.vertex.attribute.pull.request.count.by.commits = function(list.of.networks, project.data, - name = "pull.request.count.by.commits", - aggregation.level = c("range", "cumulative", "all.ranges", - "project.cumulative", "project.all.ranges", - "complete"), - default.value = 0L) { + default.value = 0L, issue.type = "all") { nets.with.attr = add.vertex.attribute.commit.count.helper( list.of.networks, project.data, name, aggregation.level, - default.value, get.author.pull.requests.commited.in.count, "author.name" + default.value, function(data) {return (get.author.issues.created.count(data, type = issue.type))}, "author.name" ) return(nets.with.attr) } -#' Add PR-comment-count attribute based on the number of PRs comments written where the person represented by the vertex is the author +#' Add issues-count attribute based on the number of issues participated by commenting in where the person represented by the vertex is the author. #' #' @param list.of.networks The network list #' @param project.data The project data -#' @param name The attribute name to add [default: "mail.thread.count"] +#' @param name The attribute name to add [default: "issue.comment.count"] #' @param aggregation.level Determines the data to use for the attribute calculation. #' One of \code{"range"}, \code{"cumulative"}, \code{"all.ranges"}, #' \code{"project.cumulative"}, \code{"project.all.ranges"}, and #' \code{"complete"}. See \code{split.data.by.networks} for #' more details. 
[default: "range"] #' @param default.value The default value to add if a vertex has no matching value [default: 0L] +#' @param issue.type The issue kind,see \code{preprocess.issue.data} [default: "all"] #' #' @return A list of networks with the added attribute -add.vertex.attribute.pull.request.count.by.commits = function(list.of.networks, project.data, - name = "pull.request.comments", - aggregation.level = c("range", "cumulative", "all.ranges", - "project.cumulative", "project.all.ranges", - "complete"), - default.value = 0L) { +add.vertex.attribute.issue.comment.count = function(list.of.networks, project.data, + name = "issue.comment.count", + aggregation.level = c("range", "cumulative", "all.ranges", + "project.cumulative", "project.all.ranges", + "complete"), + default.value = 0L, issue.type = "all") { nets.with.attr = add.vertex.attribute.commit.count.helper( list.of.networks, project.data, name, aggregation.level, - default.value, get.author.pull.request.comments.count, "author.name" + default.value, function(data) {return (get.author.issue.comment.count(data, type = issue.type))}, "author.name" ) return(nets.with.attr)