Apply Review from @bockthom and @clhunsen

Apply the Review from @bockthom and @clhunsen on the previous changes. This includes compliance of coding conventions, update of copyright headers and improvement of documentation. Move the functions for 'get.author.names.from.networks' and 'get.expanded.adjacency' to new file 'util-networks-misc.R'. Also add two functions 'get.author.names.from.data' and 'convert.adjacency.matrix.list.to.array' from the 'dev-network-growth' project to the new file. Signed-off-by: fehnkera <[email protected]>
se-sic · Sep 22, 2020 · 3f91795 · 3f91795
1 parent e178cf2
commit 3f91795
Show file tree

Hide file tree

Showing 5 changed files with 384 additions and 206 deletions.
diff --git a/README.md b/README.md
@@ -129,6 +129,7 @@ Alternatively, you can run `Rscript install.R` to install the packages.
 - `viridis`: For plotting of networks with nice colors
 - `jsonlite`: For parsing the issue data
 - `rTensor`: For calculating EDCPTD centrality
+- `Matrix`: For sparse matrix representation of large adjacency matrices
 
 ### Submodule
 
@@ -410,6 +411,10 @@ Additionally, for more examples, the file `showcase.R` is worth a look.
     * Functionality to add vertex attributes to existing networks
 - `util-networks-metrics.R`
     * A set of network-metric functions
+- `util-networks-misc.R`
+    * Helper functions for network creation (e.g., create adjacency matrices)
+- `util-tensor.R`
+    * Functionality to build fourth-order tensors
 - `util-core-peripheral.R`
     * Author classification (core and peripheral) and related functions
 - `util-motifs.R`

diff --git a/showcase.R b/showcase.R
@@ -118,13 +118,13 @@ x = NetworkBuilder$new(project.data = x.data, network.conf = net.conf)
 ## Calculate EDCPTD centrality ---------------------------------------------
 
 ## get author networks for each relation
-author.networks = get.author.networks(x, c("cochange", "mail", "issue"))
+author.networks = get.author.networks.for.multiple.relations(x, c("cochange", "mail", "issue"))
 
-## create forth-order tensor
-forth.order.tensor = ForthOrderTensor$new(author.networks)
+## create fourth-order tensor
+fourth.order.tensor = FourthOrderTensor$new(author.networks)
 
 ## calculate EDCPTD scores
-edcptd.scores = calculate.EDCPTD.centrality(forth.order.tensor)
+edcptd.scores = calculate.EDCPTD.centrality(fourth.order.tensor)
 
 ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / /
 ## Range-level data --------------------------------------------------------

diff --git a/util-init.R b/util-init.R
@@ -60,4 +60,5 @@ source("util-plot.R")
 source("util-core-peripheral.R")
 source("util-networks-metrics.R")
 source("util-networks-covariates.R")
+source("util-networks-misc.R")
 source("util-tensor.R")
diff --git a/util-networks-misc.R b/util-networks-misc.R
@@ -0,0 +1,227 @@
+## This file is part of coronet, which is free software: you
+## can redistribute it and/or modify it under the terms of the GNU General
+## Public License as published by  the Free Software Foundation, version 2.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License along
+## with this program; if not, write to the Free Software Foundation, Inc.,
+## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+##
+## Copyright 2016 by Sofie Kemper <[email protected]>
+## Copyright 2016 by Claus Hunsen <[email protected]>
+## Copyright 2016-2018 by Thomas Bock <[email protected]>
+## Copyright 2017 by Angelika Schmid <[email protected]>
+## Copyright 2019 by Jakob Kronawitter <[email protected]>
+## Copyright 2019-2020 by Anselm Fehnker <[email protected]>
+## All Rights Reserved.
+
+
+## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / /
+## Libraries ---------------------------------------------------------------
+
+requireNamespace("parallel") # for parallel computation
+requireNamespace("igraph") # networks
+requireNamespace("Matrix") # for sparse matrices
+
+## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / /
+## Get active authors  -----------------------------------------------------
+
+#' Collect the names of all authors that occur as vertices in at least one of the given networks.
+#'
+#' @param networks the list of networks whose vertex names are collected
+#' @param globally if TRUE, the names of all networks are merged into one vector without duplicates,
+#'                 sorted in ascending order; if FALSE, one vector of names per network is returned
+#'                 [default: TRUE]
+#'
+#' @return the author names, either merged (globally) or as a list with one entry per network
+get.author.names.from.networks = function(networks, globally = TRUE) {
+
+    ## collect the vertex names of every network separately
+    names.per.network = lapply(networks, function(net) igraph::V(net)$name)
+
+    ## the per-network view is requested, so there is nothing left to merge
+    if (!globally) {
+        return(names.per.network)
+    }
+
+    ## flatten the per-network vectors into one vector and drop the element names
+    ## that 'unlist' derives from the list names
+    all.names = unname(unlist(names.per.network, recursive = FALSE))
+
+    ## keep each author once and sort ascending
+    return(sort(unique(all.names)))
+}
+
+#' Collect the names of all authors that are active in at least one of the data ranges.
+#'
+#' @param data.ranges the list of the data ranges
+#' @param is.mail.analysis if TRUE, the authors of the mails of each range are used;
+#'                         otherwise, the authors of the commits
+#' @param globally if TRUE, the names of all ranges are merged into one vector without duplicates,
+#'                 sorted in ascending order; if FALSE, one vector of names per range is returned
+#'                 [default: TRUE]
+#'
+#' @return the author names, either merged (globally) or as a list with one entry per range
+get.author.names.from.data = function(data.ranges, is.mail.analysis, globally = TRUE) {
+
+    ## per range, the active authors are the group names obtained when grouping
+    ## the mails or the commits of that range by the column "author.name"
+    names.per.range = lapply(data.ranges, function(range.data) {
+        artifact.type = if (is.mail.analysis) "mails" else "commits"
+        grouped.artifacts = range.data$group.artifacts.by.data.column(artifact.type, "author.name")
+        return(names(grouped.artifacts))
+    })
+
+    ## the per-range view is requested, so there is nothing left to merge
+    if (!globally) {
+        return(names.per.range)
+    }
+
+    ## flatten the per-range vectors into one vector and drop the element names
+    ## that 'unlist' derives from the list names
+    all.names = unname(unlist(names.per.range, recursive = FALSE))
+
+    ## keep each author once and sort ascending
+    return(sort(unique(all.names)))
+}
+
+## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / /
+## Adjacency matrices ----------------------------------------------------
+
+#' Build a sparse adjacency matrix for a network whose rows and columns span the given authors.
+#'
+#' @param network the network to convert
+#' @param authors the author names that are used as row and column names of the adjacency matrix
+#' @param weighted if TRUE, the edge attribute "weight" provides the matrix entries;
+#'                 if FALSE, all positive entries are set to 1 [default: FALSE]
+#'
+#' @return the sparse adjacency matrix of the network
+get.expanded.adjacency = function(network, authors, weighted = FALSE) {
+
+    ## start from an all-zero sparse matrix with one row and one column per author
+    author.count = length(authors)
+    adjacency = Matrix::sparseMatrix(i = c(), j = c(), dims = c(author.count, author.count), giveCsparse = FALSE)
+    adjacency = as(adjacency, "dgTMatrix")
+    rownames(adjacency) = authors
+    colnames(adjacency) = authors
+
+    ## a network without vertices leaves the matrix empty
+    if (igraph::vcount(network) == 0) {
+        return(adjacency)
+    }
+
+    ## retrieve the adjacency matrix of the network, using the edge weights only if requested
+    network.adjacency = if (weighted) {
+        igraph::get.adjacency(network, attr = "weight")
+    } else {
+        igraph::get.adjacency(network)
+    }
+    vertex.count = nrow(network.adjacency)
+
+    ## sort rows and columns by their names (skipped for a 1x1 matrix, for which ordering does not work)
+    if (vertex.count > 1) {
+        row.order = order(rownames(network.adjacency))
+        col.order = order(colnames(network.adjacency))
+        network.adjacency = network.adjacency[row.order, col.order]
+    }
+
+    ## copy the entries of the network into the cells that belong to the same authors
+    if (vertex.count > 0) {
+        adjacency[rownames(network.adjacency), colnames(network.adjacency)] = network.adjacency
+    }
+
+    ## in the unweighted case, reduce every positive entry to 1
+    if (!weighted) {
+        adjacency[adjacency > 0] = 1
+    }
+
+    return(adjacency)
+}
+
+#' Calculate a sparse adjacency matrix for each network in the list.
+#' All adjacency matrices are built over the same set of authors, namely all authors
+#' that occur in at least one of the networks.
+#'
+#' @param networks list of networks
+#' @param weighted decides if the adjacency matrices shall be weighted [default: FALSE]
+#'
+#' @return the list of adjacency matrices, one per network
+get.expanded.adjacency.matrices = function(networks, weighted = FALSE) {
+
+    ## collect the author names of all networks; they define the rows and columns of every matrix
+    authors = get.author.names.from.networks(networks)
+
+    ## build the matrices in parallel; the additional arguments are forwarded by name
+    ## to 'get.expanded.adjacency'
+    adjacency.matrices = parallel::mclapply(networks, get.expanded.adjacency,
+                                            authors = authors, weighted = weighted)
+
+    return(adjacency.matrices)
+}
+
+#' Convert a list of networks to sparse adjacency matrices and sum up these matrices cumulatively.
+#' This means that the first entry of the returned list is just the adjacency matrix of the first network,
+#' the second entry is the sum of the adjacency matrices of the first and the second network, and so on.
+#' In the unweighted case, the stored entries of each cumulated matrix are reset to 1.
+#'
+#' @param networks list of networks
+#' @param weighted decides if the adjacency matrices shall be weighted [default: FALSE]
+#'
+#' @return the list of cumulated adjacency matrices (an empty list if there are no networks)
+get.expanded.adjacency.cumulated = function(networks, weighted = FALSE) {
+    ## get expanded adjacency matrices first
+    matrices = get.expanded.adjacency.matrices(networks, weighted)
+
+    ## without any matrix, there is nothing to cumulate (and no first matrix to access below)
+    if (length(matrices) == 0) {
+        return(list())
+    }
+
+    ## running sum of matrices: cumulated[[m]] = cumulated[[m - 1]] + matrices[[m]]
+    ## (intermediate results consecutively stored in matrices.cumulated)
+    matrices.cumulated = list(matrices[[1]]) # first one is complete already
+
+    ## iterate over all matrices but the first one (no iteration for a single matrix)
+    for (m in seq_along(matrices)[-1]) {
+
+        matrices.cumulated[[m]] = matrices.cumulated[[m - 1]] + matrices[[m]]
+        rownames(matrices.cumulated[[m]]) = rownames(matrices.cumulated[[m - 1]])
+        colnames(matrices.cumulated[[m]]) = colnames(matrices.cumulated[[m - 1]])
+
+        if (!weighted) {
+            ## search for a non-zero entry and set it to an arbitrary number (e.g., 42)
+            ## to force that all non-zero entries are correctly set to 1 afterwards
+            ## NOTE(review): presumably the write makes the stored entries of the sparse
+            ## matrix consistent before they are overwritten -- confirm against 'Matrix'
+            not.zero.idxs = which(matrices.cumulated[[m]] >= 1, arr.ind = TRUE)
+            if (nrow(not.zero.idxs) > 0) {
+                first.not.zero.idx = not.zero.idxs[1, ]
+                names(first.not.zero.idx) = c("row", "col")
+                matrices.cumulated[[m]][first.not.zero.idx[["row"]], first.not.zero.idx[["col"]]] = 42
+                ## overwrite the values of all stored entries with 1
+                matrices.cumulated[[m]]@x = rep(1, length(matrices.cumulated[[m]]@i))
+            }
+        }
+    }
+
+    return(matrices.cumulated)
+}
+
+#' Convert a list of adjacency matrices to a 3-dimensional array.
+#' The dimensions and the row and column names of the array are taken from the first matrix
+#' of the list; the third dimension indexes the entries of the list.
+#'
+#' @param adjacency.list the list of adjacency matrices (must contain at least one matrix)
+#'
+#' @return the converted array, holding the i-th adjacency matrix in its i-th slice
+convert.adjacency.matrix.list.to.array = function(adjacency.list){
+
+    ## create a 3-dimensional array representing the adjacency matrices (SIENA data format) as result,
+    ## initially filled with zeros
+    first.adjacency = adjacency.list[[1]]
+    author.count = nrow(first.adjacency)
+    result = array(data = 0, dim = c(author.count, author.count, length(adjacency.list)))
+    rownames(result) = rownames(first.adjacency)
+    colnames(result) = colnames(first.adjacency)
+
+    ## copy the activity values from the adjacency matrices in the list to the corresponding array slices
+    for (i in seq_along(adjacency.list)) {
+        adjacency = adjacency.list[[i]]
+        activity.indices = which(adjacency != 0, arr.ind = TRUE)
+
+        ## 'seq_len' yields no iteration for a matrix without any non-zero entry
+        ## ('1:nrow(...)' would iterate over 1 and 0 in this case and fail)
+        for (j in seq_len(nrow(activity.indices))) {
+            row.idx = as.vector(activity.indices[j, 1])
+            col.idx = as.vector(activity.indices[j, 2])
+            result[row.idx, col.idx, i] = adjacency[row.idx, col.idx]
+        }
+    }
+
+    return(result)
+}