From 2e74c65ba646393e62ed36b7b10b21e768677a97 Mon Sep 17 00:00:00 2001 From: Daniel Himmelstein Date: Tue, 5 Apr 2022 09:12:58 -0400 Subject: [PATCH] apply pre-commit linting & blacken --- hetmatpy/degree_group.py | 100 +- hetmatpy/degree_weight.py | 466 ++++++--- hetmatpy/diffusion.py | 30 +- hetmatpy/hetmat/__init__.py | 203 ++-- hetmatpy/hetmat/archive.py | 61 +- hetmatpy/hetmat/caching.py | 58 +- hetmatpy/hetmat/tests/test_archive.py | 44 +- hetmatpy/hetmat/tests/test_caching.py | 54 +- hetmatpy/hetmat/tests/test_hetmat.py | 18 +- hetmatpy/matrix.py | 37 +- hetmatpy/pipeline.py | 90 +- hetmatpy/testing.py | 41 +- hetmatpy/tests/test_degree_weight.py | 1375 +++++++++++++++++-------- hetmatpy/tests/test_diffusion.py | 58 +- hetmatpy/tests/test_hetnetpy.py | 3 +- hetmatpy/tests/test_path_count.py | 53 +- hetmatpy/tests/test_pipeline.py | 310 +++--- hetmatpy/xarray.py | 10 +- setup.py | 32 +- 19 files changed, 1919 insertions(+), 1124 deletions(-) diff --git a/hetmatpy/degree_group.py b/hetmatpy/degree_group.py index cca32f8..25a7025 100644 --- a/hetmatpy/degree_group.py +++ b/hetmatpy/degree_group.py @@ -5,8 +5,8 @@ import pandas import scipy.sparse -from hetmatpy.matrix import metaedge_to_adjacency_matrix import hetmatpy.degree_weight +from hetmatpy.matrix import metaedge_to_adjacency_matrix def degrees_to_degree_to_ind(degrees): @@ -18,8 +18,12 @@ def degrees_to_degree_to_ind(degrees): def metapath_to_degree_dicts(graph, metapath): metapath = graph.metagraph.get_metapath(metapath) - _, _, source_adj_mat = metaedge_to_adjacency_matrix(graph, metapath[0], dense_threshold=0.7) - _, _, target_adj_mat = metaedge_to_adjacency_matrix(graph, metapath[-1], dense_threshold=0.7) + _, _, source_adj_mat = metaedge_to_adjacency_matrix( + graph, metapath[0], dense_threshold=0.7 + ) + _, _, target_adj_mat = metaedge_to_adjacency_matrix( + graph, metapath[-1], dense_threshold=0.7 + ) source_degrees = source_adj_mat.sum(axis=1).flat target_degrees = target_adj_mat.sum(axis=0).flat source_degree_to_ind = degrees_to_degree_to_ind(source_degrees) @@ -27,7 +31,9 @@ def metapath_to_degree_dicts(graph, metapath): return source_degree_to_ind, target_degree_to_ind -def generate_degree_group_stats(source_degree_to_ind, target_degree_to_ind, matrix, scale=False, scaler=1): +def generate_degree_group_stats( + source_degree_to_ind, target_degree_to_ind, matrix, scale=False, scaler=1 +): """ Yield dictionaries with degree grouped stats """ @@ -41,31 +47,37 @@ def generate_degree_group_stats(source_degree_to_ind, target_degree_to_ind, matr # row_matrix = scipy.sparse.csc_matrix(row_matrix) for target_degree, col_inds in target_degree_to_ind.items(): row = { - 'source_degree': source_degree, - 'target_degree': target_degree, + "source_degree": source_degree, + "target_degree": target_degree, } - row['n'] = len(row_inds) * len(col_inds) + row["n"] = len(row_inds) * len(col_inds) if source_degree == 0 or target_degree == 0: - row['sum'] = 0 - row['nnz'] = 0 - row['sum_of_squares'] = 0 + row["sum"] = 0 + row["nnz"] = 0 + row["sum_of_squares"] = 0 yield row continue slice_matrix = row_matrix[:, col_inds] - values = slice_matrix.data if scipy.sparse.issparse(slice_matrix) else slice_matrix + values = ( + slice_matrix.data + if scipy.sparse.issparse(slice_matrix) + else slice_matrix + ) if scale: values = numpy.arcsinh(values / scaler) - row['sum'] = values.sum() - row['sum_of_squares'] = (values ** 2).sum() + row["sum"] = values.sum() + row["sum_of_squares"] = (values**2).sum() if scipy.sparse.issparse(slice_matrix): - 
row['nnz'] = slice_matrix.nnz + row["nnz"] = slice_matrix.nnz else: - row['nnz'] = numpy.count_nonzero(slice_matrix) + row["nnz"] = numpy.count_nonzero(slice_matrix) yield row -def dwpc_to_degrees(graph, metapath, damping=0.5, ignore_zeros=False, ignore_redundant=True): +def dwpc_to_degrees( + graph, metapath, damping=0.5, ignore_zeros=False, ignore_redundant=True +): """ Yield a description of each cell in a DWPC matrix adding source and target node degree info as well as the corresponding path count. @@ -78,26 +90,32 @@ def dwpc_to_degrees(graph, metapath, damping=0.5, ignore_zeros=False, ignore_red the same DWPC. """ metapath = graph.metagraph.get_metapath(metapath) - _, _, source_adj_mat = metaedge_to_adjacency_matrix(graph, metapath[0], dense_threshold=0.7) - _, _, target_adj_mat = metaedge_to_adjacency_matrix(graph, metapath[-1], dense_threshold=0.7) + _, _, source_adj_mat = metaedge_to_adjacency_matrix( + graph, metapath[0], dense_threshold=0.7 + ) + _, _, target_adj_mat = metaedge_to_adjacency_matrix( + graph, metapath[-1], dense_threshold=0.7 + ) source_degrees = source_adj_mat.sum(axis=1).flat target_degrees = target_adj_mat.sum(axis=0).flat del source_adj_mat, target_adj_mat - source_path = graph.get_nodes_path(metapath.source(), file_format='tsv') - source_node_df = pandas.read_csv(source_path, sep='\t') - source_node_names = list(source_node_df['name']) + source_path = graph.get_nodes_path(metapath.source(), file_format="tsv") + source_node_df = pandas.read_csv(source_path, sep="\t") + source_node_names = list(source_node_df["name"]) - target_path = graph.get_nodes_path(metapath.target(), file_format='tsv') - target_node_df = pandas.read_csv(target_path, sep='\t') - target_node_names = list(target_node_df['name']) + target_path = graph.get_nodes_path(metapath.target(), file_format="tsv") + target_node_df = pandas.read_csv(target_path, sep="\t") + target_node_names = list(target_node_df["name"]) - row_names, col_names, dwpc_matrix = graph.read_path_counts(metapath, 'dwpc', damping) + row_names, col_names, dwpc_matrix = graph.read_path_counts( + metapath, "dwpc", damping + ) dwpc_matrix = numpy.arcsinh(dwpc_matrix / dwpc_matrix.mean()) if scipy.sparse.issparse(dwpc_matrix): dwpc_matrix = dwpc_matrix.toarray() - _, _, path_count = graph.read_path_counts(metapath, 'dwpc', 0.0) + _, _, path_count = graph.read_path_counts(metapath, "dwpc", 0.0) if scipy.sparse.issparse(path_count): path_count = path_count.toarray() @@ -110,14 +128,14 @@ def dwpc_to_degrees(graph, metapath, damping=0.5, ignore_zeros=False, ignore_red if ignore_zeros and dwpc_value == 0: continue row = { - 'source_id': row_names[row_ind], - 'target_id': col_names[col_ind], - 'source_name': source_node_names[row_ind], - 'target_name': target_node_names[col_ind], - 'source_degree': source_degrees[row_ind], - 'target_degree': target_degrees[col_ind], - 'path_count': path_count[row_ind, col_ind], - 'dwpc': dwpc_value, + "source_id": row_names[row_ind], + "target_id": col_names[col_ind], + "source_name": source_node_names[row_ind], + "target_name": target_node_names[col_ind], + "source_degree": source_degrees[row_ind], + "target_degree": target_degrees[col_ind], + "path_count": path_count[row_ind, col_ind], + "dwpc": dwpc_value, } yield collections.OrderedDict(row) @@ -127,13 +145,19 @@ def single_permutation_degree_group(permuted_hetmat, metapath, dwpc_mean, dampin Compute degree-grouped permutations for a single permuted_hetmat, for one metapath. 
""" - _, _, matrix = hetmatpy.degree_weight.dwpc(permuted_hetmat, metapath, damping=damping, dense_threshold=0.7) - source_deg_to_ind, target_deg_to_ind = hetmatpy.degree_group.metapath_to_degree_dicts(permuted_hetmat, metapath) + _, _, matrix = hetmatpy.degree_weight.dwpc( + permuted_hetmat, metapath, damping=damping, dense_threshold=0.7 + ) + ( + source_deg_to_ind, + target_deg_to_ind, + ) = hetmatpy.degree_group.metapath_to_degree_dicts(permuted_hetmat, metapath) row_generator = hetmatpy.degree_group.generate_degree_group_stats( - source_deg_to_ind, target_deg_to_ind, matrix, scale=True, scaler=dwpc_mean) + source_deg_to_ind, target_deg_to_ind, matrix, scale=True, scaler=dwpc_mean + ) degree_grouped_df = ( pandas.DataFrame(row_generator) - .set_index(['source_degree', 'target_degree']) + .set_index(["source_degree", "target_degree"]) .assign(n_perms=1) ) return degree_grouped_df diff --git a/hetmatpy/degree_weight.py b/hetmatpy/degree_weight.py index fe713bf..5c26079 100644 --- a/hetmatpy/degree_weight.py +++ b/hetmatpy/degree_weight.py @@ -5,36 +5,41 @@ import logging import numpy +from hetnetpy.matrix import sparsify_or_densify from scipy import sparse -from hetnetpy.matrix import ( - sparsify_or_densify, -) import hetmatpy.hetmat -from hetmatpy.hetmat.caching import path_count_cache import hetmatpy.matrix +from hetmatpy.hetmat.caching import path_count_cache def _category_to_function(category, dwwc_method): function_dictionary = { - 'no_repeats': dwwc_method, - 'disjoint': _dwpc_disjoint, - 'disjoint_groups': _dwpc_disjoint, - 'short_repeat': _dwpc_short_repeat, - 'four_repeat': _dwpc_baba, - 'long_repeat': _dwpc_general_case, - 'BAAB': _dwpc_baab, - 'BABA': _dwpc_baba, - 'repeat_around': _dwpc_repeat_around, - 'interior_complete_group': _dwpc_baba, - 'other': _dwpc_general_case, + "no_repeats": dwwc_method, + "disjoint": _dwpc_disjoint, + "disjoint_groups": _dwpc_disjoint, + "short_repeat": _dwpc_short_repeat, + "four_repeat": _dwpc_baba, + "long_repeat": _dwpc_general_case, + "BAAB": _dwpc_baab, + "BABA": _dwpc_baba, + "repeat_around": _dwpc_repeat_around, + "interior_complete_group": _dwpc_baba, + "other": _dwpc_general_case, } return function_dictionary[category] -@path_count_cache(metric='dwpc') -def dwpc(graph, metapath, damping=0.5, dense_threshold=0, approx_ok=False, - dtype=numpy.float64, dwwc_method=None): +@path_count_cache(metric="dwpc") +def dwpc( + graph, + metapath, + damping=0.5, + dense_threshold=0, + approx_ok=False, + dtype=numpy.float64, + dwwc_method=None, +): """ A unified function to compute the degree-weighted path count. This function will call get_segments, then the appropriate @@ -70,20 +75,29 @@ def dwpc(graph, metapath, damping=0.5, dense_threshold=0, approx_ok=False, """ category = categorize(metapath) dwpc_function = _category_to_function(category, dwwc_method=dwwc_method) - if category in ('long_repeat', 'other'): + if category in ("long_repeat", "other"): if approx_ok: dwpc_function = _dwpc_approx else: - logging.warning(f"Metapath {metapath} will use _dwpc_general_case, " - "which can require very long computations.") + logging.warning( + f"Metapath {metapath} will use _dwpc_general_case, " + "which can require very long computations." 
+ ) row_names, col_names, dwpc_matrix = dwpc_function( - graph, metapath, damping, dense_threshold=dense_threshold, - dtype=dtype) + graph, metapath, damping, dense_threshold=dense_threshold, dtype=dtype + ) return row_names, col_names, dwpc_matrix -@path_count_cache(metric='dwwc') -def dwwc(graph, metapath, damping=0.5, dense_threshold=0, dtype=numpy.float64, dwwc_method=None): +@path_count_cache(metric="dwwc") +def dwwc( + graph, + metapath, + damping=0.5, + dense_threshold=0, + dtype=numpy.float64, + dwwc_method=None, +): """ Compute the degree-weighted walk count (DWWC) in which nodes can be repeated within a path. @@ -110,7 +124,9 @@ def dwwc(graph, metapath, damping=0.5, dense_threshold=0, dtype=numpy.float64, d ) -def dwwc_sequential(graph, metapath, damping=0.5, dense_threshold=0, dtype=numpy.float64): +def dwwc_sequential( + graph, metapath, damping=0.5, dense_threshold=0, dtype=numpy.float64 +): """ Compute the degree-weighted walk count (DWWC) in which nodes can be repeated within a path. @@ -129,7 +145,8 @@ def dwwc_sequential(graph, metapath, damping=0.5, dense_threshold=0, dtype=numpy row_names = None for metaedge in metapath: rows, cols, adj_mat = hetmatpy.matrix.metaedge_to_adjacency_matrix( - graph, metaedge, dense_threshold=dense_threshold, dtype=dtype) + graph, metaedge, dense_threshold=dense_threshold, dtype=dtype + ) adj_mat = _degree_weight(adj_mat, damping, dtype=dtype) if dwwc_matrix is None: row_names = rows @@ -140,17 +157,25 @@ def dwwc_sequential(graph, metapath, damping=0.5, dense_threshold=0, dtype=numpy return row_names, cols, dwwc_matrix -def dwwc_recursive(graph, metapath, damping=0.5, dense_threshold=0, dtype=numpy.float64): +def dwwc_recursive( + graph, metapath, damping=0.5, dense_threshold=0, dtype=numpy.float64 +): """ Recursive DWWC implementation to take better advantage of caching. 
""" rows, cols, adj_mat = hetmatpy.matrix.metaedge_to_adjacency_matrix( - graph, metapath[0], dense_threshold=dense_threshold, dtype=dtype) + graph, metapath[0], dense_threshold=dense_threshold, dtype=dtype + ) adj_mat = _degree_weight(adj_mat, damping, dtype=dtype) if len(metapath) > 1: _, cols, dwwc_next = dwwc( - graph, metapath[1:], damping=damping, dense_threshold=dense_threshold, - dtype=dtype, dwwc_method=dwwc_recursive) + graph, + metapath[1:], + damping=damping, + dense_threshold=dense_threshold, + dtype=dtype, + dwwc_method=dwwc_recursive, + ) dwwc_matrix = adj_mat @ dwwc_next else: dwwc_matrix = adj_mat @@ -166,11 +191,15 @@ def _multi_dot(metapath, order, i, j, graph, damping, dense_threshold, dtype): """ if i == j: _, _, adj_mat = hetmatpy.matrix.metaedge_to_adjacency_matrix( - graph, metapath[i], dense_threshold=dense_threshold, dtype=dtype) + graph, metapath[i], dense_threshold=dense_threshold, dtype=dtype + ) adj_mat = _degree_weight(adj_mat, damping=damping, dtype=dtype) return adj_mat - return _multi_dot(metapath, order, i, order[i, j], graph, damping, dense_threshold, dtype) \ - @ _multi_dot(metapath, order, order[i, j] + 1, j, graph, damping, dense_threshold, dtype) + return _multi_dot( + metapath, order, i, order[i, j], graph, damping, dense_threshold, dtype + ) @ _multi_dot( + metapath, order, order[i, j] + 1, j, graph, damping, dense_threshold, dtype + ) def _dimensions_to_ordering(dimensions): @@ -183,7 +212,11 @@ def _dimensions_to_ordering(dimensions): j = i + l_ m[i, j] = numpy.inf for k in range(i, j): - q = m[i, k] + m[k + 1, j] + dimensions[i] * dimensions[k + 1] * dimensions[j + 1] + q = ( + m[i, k] + + m[k + 1, j] + + dimensions[i] * dimensions[k + 1] * dimensions[j + 1] + ) if q < m[i, j]: m[i, j] = q ordering[i, j] = k @@ -202,7 +235,9 @@ def dwwc_chain(graph, metapath, damping=0.5, dense_threshold=0, dtype=numpy.floa row_ids = hetmatpy.matrix.get_node_identifiers(graph, metapath.source()) columns_ids = hetmatpy.matrix.get_node_identifiers(graph, metapath.target()) ordering = _dimensions_to_ordering(array_dims) - dwwc_matrix = _multi_dot(metapath, ordering, 0, len(metapath) - 1, graph, damping, dense_threshold, dtype) + dwwc_matrix = _multi_dot( + metapath, ordering, 0, len(metapath) - 1, graph, damping, dense_threshold, dtype + ) dwwc_matrix = sparsify_or_densify(dwwc_matrix, dense_threshold) return row_ids, columns_ids, dwwc_matrix @@ -242,7 +277,7 @@ def categorize(metapath): repeated = {metanode for metanode, count in freq.items() if count > 1} if not repeated: - return 'no_repeats' + return "no_repeats" repeats_only = [node for node in metanodes if node in repeated] @@ -254,13 +289,13 @@ def categorize(metapath): # Identify if there is only one metanode if len(repeated) == 1: if max(freq.values()) < 4: - return 'short_repeat' + return "short_repeat" elif max(freq.values()) == 4: - return 'four_repeat' + return "four_repeat" else: - return 'long_repeat' + return "long_repeat" - return 'disjoint' + return "disjoint" assert len(repeats_only) > 3 @@ -268,31 +303,32 @@ def categorize(metapath): if len(repeats_only) == 4: if repeats_only[0] == repeats_only[-1]: assert repeats_only[1] == repeats_only[2] - return 'BAAB' + return "BAAB" else: - assert (repeats_only[0] == repeats_only[2] and - repeats_only[1] == repeats_only[3]) - return 'BABA' + assert ( + repeats_only[0] == repeats_only[2] + and repeats_only[1] == repeats_only[3] + ) + return "BABA" elif len(repeats_only) == 5 and max(map(len, grouped)) == 3: if repeats_only[0] == repeats_only[-1]: - return 
'BAAB' - elif repeats_only == list(reversed(repeats_only)) and \ - not len(repeats_only) % 2: - return 'BAAB' + return "BAAB" + elif repeats_only == list(reversed(repeats_only)) and not len(repeats_only) % 2: + return "BAAB" # 6 node paths with 3x2 repeats elif len(repeated) == 3 and len(metapath) == 5: if repeats_only[0] == repeats_only[-1]: - return 'repeat_around' + return "repeat_around" # AABCCB or AABCBC elif len(grouped[0]) == 2 or len(grouped[-1]) == 2: - return 'disjoint_groups' + return "disjoint_groups" # ABA CC B elif len(repeats_only) - len(grouped) == 1: - return 'interior_complete_group' + return "interior_complete_group" # most complicated len 6 else: - return 'other' + return "other" else: # Multi-repeats that aren't disjoint, eg. ABCBAC @@ -300,13 +336,13 @@ def categorize(metapath): logging.info( f"{metapath}: Only two overlapping repeats currently supported" ) - return 'other' + return "other" if len(metanodes) > 4: logging.info( - f"{metapath}: Complex metapaths of length > 4 are not yet " - f"supported") - return 'other' + f"{metapath}: Complex metapaths of length > 4 are not yet " f"supported" + ) + return "other" assert False @@ -354,24 +390,29 @@ def add_head_tail(metapath, indices): freq = collections.Counter(metanodes) repeated = {i for i in freq.keys() if freq[i] > 1} - if category == 'no_repeats': + if category == "no_repeats": return [metapath] - elif category == 'repeat_around': + elif category == "repeat_around": # Note this is hard-coded and will need to be updated for various # metapath lengths indices = [[0, 1], [1, 4], [4, 5]] - elif category == 'disjoint_groups': + elif category == "disjoint_groups": # CCBABA or CCBAAB or BABACC or BAABCC -> [CC, BABA], etc. metanodes = list(metapath.get_nodes()) grouped = [list(v) for k, v in itertools.groupby(metanodes)] - indices = [[0, 1], [1, 2], [2, 5]] if len(grouped[0]) == 2 else [ - [0, 3], [3, 4], [4, 5]] - - elif category in ('disjoint', 'short_repeat', 'long_repeat'): - indices = sorted([[metanodes.index(i), len(metapath) - list( - reversed(metanodes)).index(i)] for i in repeated]) + indices = ( + [[0, 1], [1, 2], [2, 5]] + if len(grouped[0]) == 2 + else [[0, 3], [3, 4], [4, 5]] + ) + + elif category in ("disjoint", "short_repeat", "long_repeat"): + indices = sorted( + [metanodes.index(i), len(metapath) - list(reversed(metanodes)).index(i)] + for i in repeated + ) indices = add_head_tail(metapath, indices) # handle middle cases with non-repeated nodes between disjoint regions # Eg. 
[[0,2], [3,4]] -> [[0,2],[2,3],[3,4]] @@ -382,22 +423,22 @@ def add_head_tail(metapath, indices): inds.append([v[-1], indices[i + 1][0]]) indices = inds + [indices[-1]] - elif category == 'four_repeat': + elif category == "four_repeat": nodes = set(metanodes) - repeat_indices = ( - [[i for i, v in enumerate(metanodes) - if v == metanode] for metanode in nodes]) + repeat_indices = [ + [i for i, v in enumerate(metanodes) if v == metanode] for metanode in nodes + ] repeat_indices = [i for i in repeat_indices if len(i) > 1] simple_repeats = [i for group in repeat_indices for i in group] seconds = simple_repeats[1:] + [simple_repeats[-1]] indices = list(zip(simple_repeats, seconds)) indices = add_head_tail(metapath, indices) - elif category in ('BAAB', 'BABA', 'other', 'interior_complete_group'): + elif category in ("BAAB", "BABA", "other", "interior_complete_group"): nodes = set(metanodes) - repeat_indices = ( - [[i for i, v in enumerate(metanodes) - if v == metanode] for metanode in nodes]) + repeat_indices = [ + [i for i, v in enumerate(metanodes) if v == metanode] for metanode in nodes + ] repeat_indices = [i for i in repeat_indices if len(i) > 1] simple_repeats = [i for group in repeat_indices for i in group] inds = [] @@ -408,8 +449,9 @@ def add_head_tail(metapath, indices): inds.append(i[0]) inds.append(i[-1]) for j in i[1:-1]: - if (j - 1 in simple_repeats and j + 1 in simple_repeats) \ - and not (j - 1 in i and j + 1 in i): + if (j - 1 in simple_repeats and j + 1 in simple_repeats) and not ( + j - 1 in i and j + 1 in i + ): inds.append(j) inds = sorted(inds) seconds = inds[1:] + [inds[-1]] @@ -417,11 +459,11 @@ def add_head_tail(metapath, indices): indices = [i for i in indices if len(set(i)) == 2] indices = add_head_tail(metapath, indices) - segments = [metapath[i[0]:i[1]] for i in indices] + segments = [metapath[i[0] : i[1]] for i in indices] segments = [i for i in segments if i] segments = [metagraph.get_metapath(metaedges) for metaedges in segments] # eg: B CC ABA - if category == 'interior_complete_group': + if category == "interior_complete_group": segs = [] for i, v in enumerate(segments[:-1]): if segments[i + 1].source() == segments[i + 1].target(): @@ -491,7 +533,11 @@ def order_segments(metagraph, metapaths, store_inverses=False): collections.Counter Number of times each metapath segment appears when getting all segments. 
""" - all_segments = [segment for metapath in metapaths for segment in get_all_segments(metagraph, metapath)] + all_segments = [ + segment + for metapath in metapaths + for segment in get_all_segments(metagraph, metapath) + ] if not store_inverses: # Change all instances of inverted segments to the same direction, using a first-seen ordering seen = set() @@ -519,13 +565,12 @@ def _degree_weight(matrix, damping, copy=True, dtype=numpy.float64): matrix = hetmatpy.matrix.copy_array(matrix, copy, dtype=dtype) row_sums = numpy.array(matrix.sum(axis=1), dtype=dtype).flatten() column_sums = numpy.array(matrix.sum(axis=0), dtype=dtype).flatten() - matrix = hetmatpy.matrix.normalize(matrix, row_sums, 'rows', damping) - matrix = hetmatpy.matrix.normalize(matrix, column_sums, 'columns', damping) + matrix = hetmatpy.matrix.normalize(matrix, row_sums, "rows", damping) + matrix = hetmatpy.matrix.normalize(matrix, column_sums, "columns", damping) return matrix -def _dwpc_approx(graph, metapath, damping=0.5, dense_threshold=0, - dtype=numpy.float64): +def _dwpc_approx(graph, metapath, damping=0.5, dense_threshold=0, dtype=numpy.float64): """ Compute an approximation of DWPC. Only removes the diagonal for the first repeated node, and any disjoint repetitions that follow the last occurrence @@ -540,38 +585,58 @@ def _dwpc_approx(graph, metapath, damping=0.5, dense_threshold=0, row_names = None # Find the first repeated metanode and where it occurs nodes = metapath.get_nodes() - repeated_nodes = [node for i, node in enumerate(nodes) if node in nodes[i + 1:]] + repeated_nodes = [node for i, node in enumerate(nodes) if node in nodes[i + 1 :]] first_repeat = repeated_nodes[0] repeated_indices = [i for i, v in enumerate(nodes) if v == first_repeat] for i, segment in enumerate(repeated_indices[1:]): - rows, cols, dwpc_matrix = dwpc(graph, metapath[repeated_indices[i]:segment], - damping=damping, dense_threshold=dense_threshold, - dtype=dtype) + rows, cols, dwpc_matrix = dwpc( + graph, + metapath[repeated_indices[i] : segment], + damping=damping, + dense_threshold=dense_threshold, + dtype=dtype, + ) if row_names is None: row_names = rows # Add head and tail segments, if applicable if repeated_indices[0] != 0: - row_names, _, head_seg = dwwc(graph, metapath[0:repeated_indices[0]], damping=damping, - dense_threshold=dense_threshold, dtype=dtype) + row_names, _, head_seg = dwwc( + graph, + metapath[0 : repeated_indices[0]], + damping=damping, + dense_threshold=dense_threshold, + dtype=dtype, + ) dwpc_matrix = head_seg @ dwpc_matrix if nodes[repeated_indices[-1]] != nodes[-1]: - _, cols, tail_seg = dwpc(graph, metapath[repeated_indices[-1]:], damping=damping, - dense_threshold=dense_threshold, dtype=dtype) + _, cols, tail_seg = dwpc( + graph, + metapath[repeated_indices[-1] :], + damping=damping, + dense_threshold=dense_threshold, + dtype=dtype, + ) dwpc_matrix = dwpc_matrix @ tail_seg dwpc_matrix = sparsify_or_densify(dwpc_matrix, dense_threshold) return row_names, cols, dwpc_matrix -def _dwpc_disjoint(graph, metapath, damping=0.5, dense_threshold=0, - dtype=numpy.float64): +def _dwpc_disjoint( + graph, metapath, damping=0.5, dense_threshold=0, dtype=numpy.float64 +): """DWPC for disjoint repeats or disjoint groups""" segments = get_segments(graph.metagraph, metapath) row_names = None col_names = None dwpc_matrix = None for segment in segments: - rows, cols, seg_matrix = dwpc(graph, segment, damping=damping, - dense_threshold=dense_threshold, dtype=dtype) + rows, cols, seg_matrix = dwpc( + graph, + segment, + 
damping=damping, + dense_threshold=dense_threshold, + dtype=dtype, + ) if row_names is None: row_names = rows if segment is segments[-1]: @@ -584,26 +649,41 @@ def _dwpc_disjoint(graph, metapath, damping=0.5, dense_threshold=0, return row_names, col_names, dwpc_matrix -def _dwpc_repeat_around(graph, metapath, damping=0.5, dense_threshold=0, - dtype=numpy.float64): +def _dwpc_repeat_around( + graph, metapath, damping=0.5, dense_threshold=0, dtype=numpy.float64 +): """ DWPC for situations in which we have a surrounding repeat like B----B, where the middle group is a more complicated group. The purpose of this function is just as an order-of-operations simplification """ segments = get_segments(graph.metagraph, metapath) - mid = dwpc(graph, segments[1], damping=damping, - dense_threshold=dense_threshold, dtype=dtype)[2] - row_names, cols, adj0 = dwpc(graph, segments[0], damping=damping, - dense_threshold=dense_threshold, dtype=dtype) - rows, col_names, adj1 = dwpc(graph, segments[-1], damping=damping, - dense_threshold=dense_threshold, dtype=dtype) + mid = dwpc( + graph, + segments[1], + damping=damping, + dense_threshold=dense_threshold, + dtype=dtype, + )[2] + row_names, cols, adj0 = dwpc( + graph, + segments[0], + damping=damping, + dense_threshold=dense_threshold, + dtype=dtype, + ) + rows, col_names, adj1 = dwpc( + graph, + segments[-1], + damping=damping, + dense_threshold=dense_threshold, + dtype=dtype, + ) dwpc_matrix = remove_diag(adj0 @ mid @ adj1, dtype=dtype) return row_names, col_names, dwpc_matrix -def _dwpc_baab(graph, metapath, damping=0.5, dense_threshold=0, - dtype=numpy.float64): +def _dwpc_baab(graph, metapath, damping=0.5, dense_threshold=0, dtype=numpy.float64): """ A function to handle metapath (segments) of the form BAAB. This function will handle arbitrary lengths of this repeated @@ -641,8 +721,8 @@ def _dwpc_baab(graph, metapath, damping=0.5, dense_threshold=0, mid_seg = s mid_ind = i rows, cols, dwpc_mid = dwpc( - graph, mid_seg, damping=damping, dense_threshold=dense_threshold, - dtype=dtype) + graph, mid_seg, damping=damping, dense_threshold=dense_threshold, dtype=dtype + ) dwpc_mid = remove_diag(dwpc_mid, dtype=dtype) # Get two indices for the segments ahead of and behind the middle region @@ -656,14 +736,22 @@ def _dwpc_baab(graph, metapath, damping=0.5, dense_threshold=0, # Multiply on the head if head is not None: row_names, cols, dwpc_head = dwpc( - graph, head, damping=damping, - dense_threshold=dense_threshold, dtype=dtype) + graph, + head, + damping=damping, + dense_threshold=dense_threshold, + dtype=dtype, + ) dwpc_mid = dwpc_head @ dwpc_mid # Multiply on the tail if tail is not None: rows, col_names, dwpc_tail = dwpc( - graph, tail, damping=damping, - dense_threshold=dense_threshold, dtype=dtype) + graph, + tail, + damping=damping, + dense_threshold=dense_threshold, + dtype=dtype, + ) dwpc_mid = dwpc_mid @ dwpc_tail # Remove the diagonal if the head and tail are repeats if head and tail: @@ -673,8 +761,7 @@ def _dwpc_baab(graph, metapath, damping=0.5, dense_threshold=0, return row_names, col_names, dwpc_mid -def _dwpc_baba(graph, metapath, damping=0.5, dense_threshold=0, - dtype=numpy.float64): +def _dwpc_baba(graph, metapath, damping=0.5, dense_threshold=0, dtype=numpy.float64): """ Computes the degree-weighted path count for overlapping metanode repeats of the form B-A-B-A. Supports random inserts. 
@@ -692,39 +779,60 @@ def _dwpc_baba(graph, metapath, damping=0.5, dense_threshold=0, seg_cda = segments[0] if i == 1 else None seg_bed = segments[-1] if segments[-1] != seg_azb else None # Collect segment DWPC and corrections - row_names, cols, axb = dwpc(graph, seg_axb, damping=damping, - dense_threshold=dense_threshold, dtype=dtype) - rows, cols, bya = dwpc(graph, seg_bya, damping=damping, - dense_threshold=dense_threshold, dtype=dtype) - rows, col_names, azb = dwpc(graph, seg_azb, damping=damping, - dense_threshold=dense_threshold, dtype=dtype) - - correction_a = numpy.diag((axb @ bya).diagonal()) @ azb if \ - not sparse.issparse(axb) else \ - sparse.diags((axb @ bya).diagonal()) @ azb - correction_b = axb @ numpy.diag((bya @ azb).diagonal()) if \ - not sparse.issparse(bya) else \ - axb @ sparse.diags((bya @ azb).diagonal()) - correction_c = axb * bya.T * azb if not sparse.issparse(bya) else \ - (axb.multiply(bya.T)).multiply(azb) + row_names, cols, axb = dwpc( + graph, seg_axb, damping=damping, dense_threshold=dense_threshold, dtype=dtype + ) + rows, cols, bya = dwpc( + graph, seg_bya, damping=damping, dense_threshold=dense_threshold, dtype=dtype + ) + rows, col_names, azb = dwpc( + graph, seg_azb, damping=damping, dense_threshold=dense_threshold, dtype=dtype + ) + + correction_a = ( + numpy.diag((axb @ bya).diagonal()) @ azb + if not sparse.issparse(axb) + else sparse.diags((axb @ bya).diagonal()) @ azb + ) + correction_b = ( + axb @ numpy.diag((bya @ azb).diagonal()) + if not sparse.issparse(bya) + else axb @ sparse.diags((bya @ azb).diagonal()) + ) + correction_c = ( + axb * bya.T * azb + if not sparse.issparse(bya) + else (axb.multiply(bya.T)).multiply(azb) + ) # Apply the corrections dwpc_matrix = axb @ bya @ azb - correction_a - correction_b + correction_c if seg_axb.source == seg_azb.target: dwpc_matrix = remove_diag(dwpc_matrix) # Account for possible head and tail segments outside the BABA group if seg_cda is not None: - row_names, cols, cda = dwpc(graph, seg_cda, damping=damping, - dense_threshold=dense_threshold, dtype=dtype) + row_names, cols, cda = dwpc( + graph, + seg_cda, + damping=damping, + dense_threshold=dense_threshold, + dtype=dtype, + ) dwpc_matrix = cda @ dwpc_matrix if seg_bed is not None: - rows, col_names, bed = dwpc(graph, seg_bed, damping=damping, - dense_threshold=dense_threshold, dtype=dtype) + rows, col_names, bed = dwpc( + graph, + seg_bed, + damping=damping, + dense_threshold=dense_threshold, + dtype=dtype, + ) dwpc_matrix = dwpc_matrix @ bed return row_names, col_names, dwpc_matrix -def _dwpc_short_repeat(graph, metapath, damping=0.5, dense_threshold=0, - dtype=numpy.float64): +def _dwpc_short_repeat( + graph, metapath, damping=0.5, dense_threshold=0, dtype=numpy.float64 +): """ One metanode repeated 3 or fewer times (A-A-A), not (A-A-A-A) This can include other random inserts, so long as they are not @@ -753,13 +861,14 @@ def _dwpc_short_repeat(graph, metapath, damping=0.5, dense_threshold=0, # Calculate DWPC for the middle ("repeat") segment repeated_metanode = repeat_segment.source() - index_of_repeats = [i for i, v in enumerate(repeat_segment.get_nodes()) if - v == repeated_metanode] + index_of_repeats = [ + i for i, v in enumerate(repeat_segment.get_nodes()) if v == repeated_metanode + ] - for metaedge in repeat_segment[:index_of_repeats[1]]: + for metaedge in repeat_segment[: index_of_repeats[1]]: rows, cols, adj = hetmatpy.matrix.metaedge_to_adjacency_matrix( - graph, metaedge, dtype=dtype, - dense_threshold=dense_threshold) + graph, metaedge, 
dtype=dtype, dense_threshold=dense_threshold + ) adj = _degree_weight(adj, damping, dtype=dtype) if dwpc_matrix is None: row_names = rows @@ -771,10 +880,10 @@ def _dwpc_short_repeat(graph, metapath, damping=0.5, dense_threshold=0, # Extra correction for random metanodes in the repeat segment if len(index_of_repeats) == 3: - for metaedge in repeat_segment[index_of_repeats[1]:]: + for metaedge in repeat_segment[index_of_repeats[1] :]: rows, cols, adj = hetmatpy.matrix.metaedge_to_adjacency_matrix( - graph, metaedge, dtype=dtype, - dense_threshold=dense_threshold) + graph, metaedge, dtype=dtype, dense_threshold=dense_threshold + ) adj = _degree_weight(adj, damping, dtype=dtype) if dwpc_tail is None: dwpc_tail = adj @@ -786,21 +895,30 @@ def _dwpc_short_repeat(graph, metapath, damping=0.5, dense_threshold=0, col_names = cols if head_segment: - row_names, cols, head_dwpc = dwpc(graph, head_segment, damping=damping, - dense_threshold=dense_threshold, - dtype=dtype) + row_names, cols, head_dwpc = dwpc( + graph, + head_segment, + damping=damping, + dense_threshold=dense_threshold, + dtype=dtype, + ) dwpc_matrix = head_dwpc @ dwpc_matrix if tail_segment: - rows, col_names, tail_dwpc = dwpc(graph, tail_segment, damping=damping, - dense_threshold=dense_threshold, - dtype=dtype) + rows, col_names, tail_dwpc = dwpc( + graph, + tail_segment, + damping=damping, + dense_threshold=dense_threshold, + dtype=dtype, + ) dwpc_matrix = dwpc_matrix @ tail_dwpc return row_names, col_names, dwpc_matrix -def _node_to_children(graph, metapath, node, metapath_index, damping=0, - history=None, dtype=numpy.float64): +def _node_to_children( + graph, metapath, node, metapath_index, damping=0, history=None, dtype=numpy.float64 +): """ Returns a history adjusted list of child nodes. Used in _dwpc_general_case. 
@@ -828,15 +946,19 @@ def _node_to_children(graph, metapath, node, metapath_index, damping=0, if history is None: history = { i.target: numpy.ones( - len(hetmatpy.matrix.metaedge_to_adjacency_matrix(graph, i)[1] - ), dtype=dtype) - for i in metapath if i.target in repeated + len(hetmatpy.matrix.metaedge_to_adjacency_matrix(graph, i)[1]), + dtype=dtype, + ) + for i in metapath + if i.target in repeated } history = history.copy() if metaedge.source in history: history[metaedge.source] -= numpy.array(node != 0, dtype=dtype) - rows, cols, adj = hetmatpy.matrix.metaedge_to_adjacency_matrix(graph, metaedge, dtype=dtype) + rows, cols, adj = hetmatpy.matrix.metaedge_to_adjacency_matrix( + graph, metaedge, dtype=dtype + ) adj = _degree_weight(adj, damping, dtype=dtype) vector = node @ adj @@ -844,8 +966,7 @@ def _node_to_children(graph, metapath, node, metapath_index, damping=0, vector *= history[metaedge.target] children = [i for i in numpy.diag(vector) if i.any()] - return {'children': children, 'history': history, - 'next_index': metapath_index + 1} + return {"children": children, "history": history, "next_index": metapath_index + 1} def _dwpc_general_case(graph, metapath, damping=0, dtype=numpy.float64): @@ -861,12 +982,16 @@ def _dwpc_general_case(graph, metapath, damping=0, dtype=numpy.float64): damping : float dtype : dtype object """ - dwpc_step = functools.partial(_node_to_children, graph=graph, - metapath=metapath, damping=damping, - dtype=dtype) + dwpc_step = functools.partial( + _node_to_children, graph=graph, metapath=metapath, damping=damping, dtype=dtype + ) - start_nodes, cols, adj = hetmatpy.matrix.metaedge_to_adjacency_matrix(graph, metapath[0]) - rows, fin_nodes, adj = hetmatpy.matrix.metaedge_to_adjacency_matrix(graph, metapath[-1]) + start_nodes, cols, adj = hetmatpy.matrix.metaedge_to_adjacency_matrix( + graph, metapath[0] + ) + rows, fin_nodes, adj = hetmatpy.matrix.metaedge_to_adjacency_matrix( + graph, metapath[-1] + ) number_start = len(start_nodes) number_end = len(fin_nodes) @@ -881,21 +1006,20 @@ def _dwpc_general_case(graph, metapath, damping=0, dtype=numpy.float64): k += 1 step2 = [] for group in step1: - for child in group['children']: - hist = copy.deepcopy(group['history']) - out = dwpc_step(node=child, - metapath_index=group['next_index'], - history=hist) - if out['children']: + for child in group["children"]: + hist = copy.deepcopy(group["history"]) + out = dwpc_step( + node=child, metapath_index=group["next_index"], history=hist + ) + if out["children"]: step2.append(out) step1 = step2 - final_children = [group for group in step2 - if group['children'] != []] + final_children = [group for group in step2 if group["children"] != []] end_nodes = sum( - [child for group in final_children - for child in group['children']]) + child for group in final_children for child in group["children"] + ) if type(end_nodes) not in (list, numpy.ndarray): end_nodes = numpy.zeros(number_end) dwpc_matrix.append(end_nodes) diff --git a/hetmatpy/diffusion.py b/hetmatpy/diffusion.py index dfd36c1..050345c 100644 --- a/hetmatpy/diffusion.py +++ b/hetmatpy/diffusion.py @@ -1,15 +1,9 @@ from collections import OrderedDict import numpy -from hetnetpy.matrix import ( - get_node_to_position, -) +from hetnetpy.matrix import get_node_to_position -from .matrix import ( - copy_array, - metaedge_to_adjacency_matrix, - normalize, -) +from .matrix import copy_array, metaedge_to_adjacency_matrix, normalize def diffusion_step(matrix, row_damping=0, column_damping=0): @@ -42,23 +36,17 @@ def 
diffusion_step(matrix, row_damping=0, column_damping=0): # Perform column normalization if column_damping != 0: column_sums = numpy.array(matrix.sum(axis=0)).flatten() - matrix = normalize(matrix, column_sums, 'columns', column_damping) + matrix = normalize(matrix, column_sums, "columns", column_damping) # Perform row normalization if row_damping != 0: row_sums = numpy.array(matrix.sum(axis=1)).flatten() - matrix = normalize(matrix, row_sums, 'rows', row_damping) + matrix = normalize(matrix, row_sums, "rows", row_damping) return matrix -def diffuse( - graph, - metapath, - source_node_weights, - column_damping=0, - row_damping=1 - ): +def diffuse(graph, metapath, source_node_weights, column_damping=0, row_damping=1): """ Performs diffusion from the specified source nodes. @@ -85,12 +73,12 @@ def diffuse( node_scores[i] = weight for metaedge in metapath: - row_names, column_names, adjacency_matrix = ( - metaedge_to_adjacency_matrix(graph, metaedge)) + row_names, column_names, adjacency_matrix = metaedge_to_adjacency_matrix( + graph, metaedge + ) # Row/column normalization with degree damping - adjacency_matrix = diffusion_step( - adjacency_matrix, row_damping, column_damping) + adjacency_matrix = diffusion_step(adjacency_matrix, row_damping, column_damping) node_scores = node_scores @ adjacency_matrix diff --git a/hetmatpy/hetmat/__init__.py b/hetmatpy/hetmat/__init__.py index 3a8fd9e..15b885c 100644 --- a/hetmatpy/hetmat/__init__.py +++ b/hetmatpy/hetmat/__init__.py @@ -17,7 +17,9 @@ import hetmatpy.matrix -def hetmat_from_graph(graph, path, save_metagraph=True, save_nodes=True, save_edges=True): +def hetmat_from_graph( + graph, path, save_metagraph=True, save_nodes=True, save_edges=True +): """ Create a hetmat.HetMat from a hetnetpy.hetnet.Graph. """ @@ -33,14 +35,16 @@ def hetmat_from_graph(graph, path, save_metagraph=True, save_nodes=True, save_ed node_to_position = hetnetpy.matrix.get_node_to_position(graph, metanode) for node, position in node_to_position.items(): rows.append((position, node.identifier, node.name)) - node_df = pandas.DataFrame(rows, columns=['position', 'identifier', 'name']) + node_df = pandas.DataFrame(rows, columns=["position", "identifier", "name"]) path = hetmat.get_nodes_path(metanode) - node_df.to_csv(path, index=False, sep='\t') + node_df.to_csv(path, index=False, sep="\t") # Save metaedges metaedges = list(graph.metagraph.get_edges(exclude_inverts=True)) for metaedge in metaedges: - rows, cols, matrix = hetnetpy.matrix.metaedge_to_adjacency_matrix(graph, metaedge, dense_threshold=1) + rows, cols, matrix = hetnetpy.matrix.metaedge_to_adjacency_matrix( + graph, metaedge, dense_threshold=1 + ) path = hetmat.get_edges_path(metaedge, file_format=None) save_matrix(matrix, path) return hetmat @@ -53,7 +57,11 @@ def hetmat_from_permuted_graph(hetmat, permutation_id, permuted_graph): """ permuted_hetmat = initialize_permutation_directory(hetmat, permutation_id) permuted_hetmat = hetmat_from_graph( - permuted_graph, permuted_hetmat.directory, save_metagraph=False, save_nodes=False) + permuted_graph, + permuted_hetmat.directory, + save_metagraph=False, + save_nodes=False, + ) return permuted_hetmat @@ -72,37 +80,37 @@ def initialize_permutation_directory(hetmat, permutation_id): """ if not hetmat.permutations_directory.is_dir(): hetmat.permutations_directory.mkdir() - directory = hetmat.permutations_directory.joinpath(f'{permutation_id}.hetmat') + directory = hetmat.permutations_directory.joinpath(f"{permutation_id}.hetmat") if directory.is_dir(): # If directory exists, 
back it up using a .bak extension - backup_directory = directory.with_name(directory.name + '.bak') + backup_directory = directory.with_name(directory.name + ".bak") if backup_directory.is_dir(): shutil.rmtree(backup_directory) shutil.move(directory, backup_directory) permuted_hetmat = HetMat(directory, initialize=True) permuted_hetmat.is_permutation = True - permuted_hetmat.metagraph_path.symlink_to('../../metagraph.json') + permuted_hetmat.metagraph_path.symlink_to("../../metagraph.json") permuted_hetmat.nodes_directory.rmdir() - permuted_hetmat.nodes_directory.symlink_to('../../nodes', target_is_directory=True) + permuted_hetmat.nodes_directory.symlink_to("../../nodes", target_is_directory=True) return permuted_hetmat -def read_matrix(path, file_format='infer'): +def read_matrix(path, file_format="infer"): path = str(path) - if file_format == 'infer': - if path.endswith('.sparse.npz'): - file_format = 'sparse.npz' - if path.endswith('.npy'): - file_format = 'npy' - if file_format == 'infer': - raise ValueError('Could not infer file_format for {path}') - if file_format == 'sparse.npz': + if file_format == "infer": + if path.endswith(".sparse.npz"): + file_format = "sparse.npz" + if path.endswith(".npy"): + file_format = "npy" + if file_format == "infer": + raise ValueError("Could not infer file_format for {path}") + if file_format == "sparse.npz": # https://docs.scipy.org/doc/scipy-1.0.0/reference/generated/scipy.sparse.load_npz.html return scipy.sparse.load_npz(path) - if file_format == 'npy': + if file_format == "npy": # https://docs.scipy.org/doc/numpy-1.14.0/reference/generated/numpy.load.html return numpy.load(path) - raise ValueError(f'file_format={file_format} is not supported.') + raise ValueError(f"file_format={file_format} is not supported.") def save_matrix(matrix, path): @@ -115,12 +123,12 @@ def save_matrix(matrix, path): path.parent.mkdir() path = str(path) if isinstance(matrix, numpy.ndarray): - if not path.endswith('.npy'): - path += '.npy' + if not path.endswith(".npy"): + path += ".npy" numpy.save(path, matrix) elif scipy.sparse.issparse(matrix): - if not path.endswith('.sparse.npz'): - path += '.sparse.npz' + if not path.endswith(".sparse.npz"): + path += ".sparse.npz" scipy.sparse.save_npz(path, matrix, compressed=True) @@ -138,34 +146,34 @@ def read_first_matrix(specs, delete_failures=False): """ paths = list() for spec in specs: - path = pathlib.Path(spec['path']) + path = pathlib.Path(spec["path"]) paths.append(str(path)) if not path.is_file(): continue - transpose = spec.get('transpose', False) - file_format = spec.get('file_format', 'infer') + transpose = spec.get("transpose", False) + file_format = spec.get("file_format", "infer") try: matrix = read_matrix(path, file_format=file_format) except Exception as error: - logging.warning(f'Error reading matrix at {path}:\n{error}') + logging.warning(f"Error reading matrix at {path}:\n{error}") if delete_failures: path.unlink() - logging.warning(f'Deleting file at {path}') + logging.warning(f"Deleting file at {path}") continue if transpose: matrix = matrix.transpose() return matrix raise FileNotFoundError( - 'No matrix files found at the specified paths:\n' + - '\n'.join(paths)) + "No matrix files found at the specified paths:\n" + "\n".join(paths) + ) compression_extension = { - 'gzip': '.gz', - 'bz2': '.bz2', - 'zip': '.zip', - 'xz': '.xz', - None: '', + "gzip": ".gz", + "bz2": ".bz2", + "zip": ".zip", + "xz": ".xz", + None: "", } @@ -173,7 +181,7 @@ class HetMat: # Supported formats for nodes files nodes_formats = { 
- 'tsv', + "tsv", # 'feather', # 'pickle', # 'json', @@ -181,8 +189,8 @@ class HetMat: # Supported formats for edges files edges_formats = { - 'npy', - 'sparse.npz', + "npy", + "sparse.npz", # 'tsv', } @@ -191,14 +199,14 @@ def __init__(self, directory, initialize=False): Initialize a HetMat with its MetaGraph. """ self.directory = pathlib.Path(directory) - self.metagraph_path = self.directory.joinpath('metagraph.json') - self.nodes_directory = self.directory.joinpath('nodes') - self.edges_directory = self.directory.joinpath('edges') - self.path_counts_directory = self.directory.joinpath('path-counts') + self.metagraph_path = self.directory.joinpath("metagraph.json") + self.nodes_directory = self.directory.joinpath("nodes") + self.edges_directory = self.directory.joinpath("edges") + self.path_counts_directory = self.directory.joinpath("path-counts") self.path_counts_cache = None # Permutations should set is_permutation=True self.is_permutation = False - self.permutations_directory = self.directory.joinpath('permutations') + self.permutations_directory = self.directory.joinpath("permutations") if initialize: self.initialize() @@ -226,17 +234,23 @@ def permutations(self): extension. """ permutations = {} - for directory in sorted(self.permutations_directory.glob('*.hetmat')): + for directory in sorted(self.permutations_directory.glob("*.hetmat")): if not directory.is_dir(): continue permutation = HetMat(directory) permutation.is_permutation = True - name, _ = directory.name.rsplit('.', 1) + name, _ = directory.name.rsplit(".", 1) permutations[name] = permutation return permutations - def permute_graph(self, num_new_permutations=None, namer=None, start_from=None, - multiplier=10, seed=0): + def permute_graph( + self, + num_new_permutations=None, + namer=None, + start_from=None, + multiplier=10, + seed=0, + ): """ Generate and save permutations of the HetMat adjacency matrices. 
@@ -257,7 +271,7 @@ def permute_graph(self, num_new_permutations=None, namer=None, start_from=None, """ if namer is None: # If no namer given, continue increasing names by one for new permutations - namer = (f'{x:03}' for x in itertools.count(start=1)) + namer = (f"{x:03}" for x in itertools.count(start=1)) stat_dfs = list() for _ in range(num_new_permutations): @@ -273,17 +287,21 @@ def permute_graph(self, num_new_permutations=None, namer=None, start_from=None, metaedges = list(self.metagraph.get_edges(exclude_inverts=True)) for metaedge in metaedges: rows, cols, original_matrix = start_from.metaedge_to_adjacency_matrix( - metaedge, dense_threshold=1) - is_directed = metaedge.direction != 'both' + metaedge, dense_threshold=1 + ) + is_directed = metaedge.direction != "both" permuted_matrix, stats = hetmatpy.matrix.permute_matrix( - original_matrix, directed=is_directed, multiplier=multiplier, - seed=seed) + original_matrix, + directed=is_directed, + multiplier=multiplier, + seed=seed, + ) path = new_hetmat.get_edges_path(metaedge, file_format=None) save_matrix(permuted_matrix, path) stat_df = pandas.DataFrame(stats) - stat_df['metaedge'] = metaedge - stat_df['abbrev'] = metaedge.get_abbrev() - stat_df['permutation'] = permutation_name + stat_df["metaedge"] = metaedge + stat_df["abbrev"] = metaedge.get_abbrev() + stat_df["permutation"] = permutation_name stat_dfs.append(stat_df) start_from = permutation_name seed += 1 @@ -309,26 +327,26 @@ def metagraph(self, metagraph): """ hetnetpy.readwrite.write_metagraph(metagraph, self.metagraph_path) - def get_nodes_path(self, metanode, file_format='tsv'): + def get_nodes_path(self, metanode, file_format="tsv"): """ Get the path for the nodes file for the specified metanode. Setting file_format=None returns the path without any extension suffix. """ metanode = self.metagraph.get_metanode(metanode) - path = self.nodes_directory.joinpath(f'{metanode}') + path = self.nodes_directory.joinpath(f"{metanode}") if file_format is not None: - path = path.with_name(f'{path.name}.{file_format}') + path = path.with_name(f"{path.name}.{file_format}") return path - def get_edges_path(self, metaedge, file_format='npy'): + def get_edges_path(self, metaedge, file_format="npy"): """ Get the path for the edges file for the specified metaedge. Setting file_format=None returns the path without any extension suffix. """ metaedge_abbrev = self.metagraph.get_metaedge(metaedge).get_abbrev() - path = self.edges_directory.joinpath(f'{metaedge_abbrev}') + path = self.edges_directory.joinpath(f"{metaedge_abbrev}") if file_format is not None: - path = path.with_name(f'{path.name}.{file_format}') + path = path.with_name(f"{path.name}.{file_format}") return path def get_path_counts_path(self, metapath, metric, damping, file_format): @@ -337,27 +355,36 @@ def get_path_counts_path(self, metapath, metric, damping, file_format): Supported metrics are 'dwpc' and 'dwwc'. """ damping = float(damping) - path = self.path_counts_directory.joinpath(f'{metric}-{damping}/{metapath}') + path = self.path_counts_directory.joinpath(f"{metric}-{damping}/{metapath}") if file_format is not None: - path = path.with_name(f'{path.name}.{file_format}') + path = path.with_name(f"{path.name}.{file_format}") return path - def get_running_degree_group_path(self, metapath, metric, damping, extension='.tsv.gz'): + def get_running_degree_group_path( + self, metapath, metric, damping, extension=".tsv.gz" + ): """ Get path for degree-grouped permutatation running metrics. Must specify extension. 
""" damping = float(damping) path = self.directory.joinpath( - 'adjusted-path-counts', f'{metric}-{damping}', - 'degree-grouped-permutations', f'{metapath}{extension}') + "adjusted-path-counts", + f"{metric}-{damping}", + "degree-grouped-permutations", + f"{metapath}{extension}", + ) return path def get_metapath_summary_path(self, metapath, metric, damping, compression=None): damping = float(damping) compr = compression_extension[compression] - path = self.directory.joinpath('adjusted-path-counts', f'{metric}-{damping}', - 'adjusted-dwpcs', f'{metapath}.tsv{compr}') + path = self.directory.joinpath( + "adjusted-path-counts", + f"{metric}-{damping}", + "adjusted-dwpcs", + f"{metapath}.tsv{compr}", + ) return path @functools.lru_cache() @@ -365,9 +392,9 @@ def get_node_identifiers(self, metanode): """ Returns a list of node identifiers for a metapath """ - path = self.get_nodes_path(metanode, file_format='tsv') - node_df = pandas.read_csv(path, sep='\t') - return list(node_df['identifier']) + path = self.get_nodes_path(metanode, file_format="tsv") + node_df = pandas.read_csv(path, sep="\t") + return list(node_df["identifier"]) @functools.lru_cache() def count_nodes(self, metanode): @@ -375,9 +402,12 @@ def count_nodes(self, metanode): return len(nodes) def metaedge_to_adjacency_matrix( - self, metaedge, - dtype=None, dense_threshold=None, - file_formats=['sparse.npz', 'npy']): + self, + metaedge, + dtype=None, + dense_threshold=None, + file_formats=["sparse.npz", "npy"], + ): """ file_formats sets the precedence of which file to read in """ @@ -389,11 +419,13 @@ def metaedge_to_adjacency_matrix( metaedge=metaedge.inverse if invert else metaedge, file_format=file_format, ) - spec = {'path': path, 'transpose': invert, 'file_format': file_format} + spec = {"path": path, "transpose": invert, "file_format": file_format} specs.append(spec) matrix = read_first_matrix(specs) if dense_threshold is not None: - matrix = hetnetpy.matrix.sparsify_or_densify(matrix, dense_threshold=dense_threshold) + matrix = hetnetpy.matrix.sparsify_or_densify( + matrix, dense_threshold=dense_threshold + ) if dtype is not None: matrix = matrix.astype(dtype) row_ids = self.get_node_identifiers(metaedge.source) @@ -401,18 +433,18 @@ def metaedge_to_adjacency_matrix( return row_ids, col_ids, matrix def read_path_counts( - self, metapath, metric, damping, - file_formats=['sparse.npz', 'npy']): + self, metapath, metric, damping, file_formats=["sparse.npz", "npy"] + ): """ Read matrix with values of a path-count-based metric. Attempts to locate any files with the matrix (or with trivial transformations). 
""" category = hetmatpy.degree_weight.categorize(metapath) metrics = [metric] - if metric == 'dwpc' and category == 'no_repeats': - metrics.append('dwwc') - if metric == 'dwwc' and category == 'no_repeats': - metrics.append('dwpc') + if metric == "dwpc" and category == "no_repeats": + metrics.append("dwwc") + if metric == "dwwc" and category == "no_repeats": + metrics.append("dwpc") specs = list() configurations = itertools.product( file_formats, @@ -421,11 +453,12 @@ def read_path_counts( ) for file_format, metric, invert in configurations: path = self.get_path_counts_path( - metapath=metapath.inverse if invert else metapath, metric=metric, + metapath=metapath.inverse if invert else metapath, + metric=metric, damping=damping, file_format=file_format, ) - spec = {'path': path, 'transpose': invert, 'file_format': file_format} + spec = {"path": path, "transpose": invert, "file_format": file_format} specs.append(spec) row_ids = self.get_node_identifiers(metapath.source()) col_ids = self.get_node_identifiers(metapath.target()) diff --git a/hetmatpy/hetmat/archive.py b/hetmatpy/hetmat/archive.py index 5981b48..e4f52a8 100644 --- a/hetmatpy/hetmat/archive.py +++ b/hetmatpy/hetmat/archive.py @@ -14,21 +14,27 @@ def create_hetmat_archive(hetmat, destination_path=None): directory with .zip appended. Returns the destination path. """ if destination_path is None: - destination_path = hetmat.directory.joinpath('..', hetmat.directory.absolute().name + '.zip') + destination_path = hetmat.directory.joinpath( + "..", hetmat.directory.absolute().name + ".zip" + ) create_archive_by_globs( destination_path=destination_path, root_directory=hetmat.directory, - include_globs=['nodes/*', 'edges/*'], - include_paths=['metagraph.json'], - zip_mode='w', + include_globs=["nodes/*", "edges/*"], + include_paths=["metagraph.json"], + zip_mode="w", ) return destination_path def create_archive_by_globs( - destination_path, root_directory, - include_globs=[], exclude_globs=[], include_paths=[], - **kwargs): + destination_path, + root_directory, + include_globs=[], + exclude_globs=[], + include_paths=[], + **kwargs, +): """ First, paths relative to root_directory are included according to include_globs. Second, paths relative to root_directory are excluded according to exclude_globs. @@ -46,8 +52,13 @@ def create_archive_by_globs( def create_archive( - destination_path, root_directory, source_paths, - zip_mode='x', compression=zipfile.ZIP_LZMA, split_size=None): + destination_path, + root_directory, + source_paths, + zip_mode="x", + compression=zipfile.ZIP_LZMA, + split_size=None, +): """ Create a zip archive of the source paths at the destination path. source_paths as paths relative to the hetmat root directory. @@ -57,13 +68,15 @@ def create_archive( path is a TSV with information on each archived file. 
""" root_directory = pathlib.Path(root_directory) - assert zip_mode in {'w', 'x', 'a'} + assert zip_mode in {"w", "x", "a"} source_paths = sorted(set(map(str, source_paths))) destination_path = pathlib.Path(destination_path) if split_size is None: zip_path = destination_path else: - zip_path_formatter = f'{destination_path.stem}-{{:04d}}{destination_path.suffix}'.format + zip_path_formatter = ( + f"{destination_path.stem}-{{:04d}}{destination_path.suffix}".format + ) split_num = 0 zip_path = destination_path.with_name(zip_path_formatter(split_num)) zip_paths = [zip_path] @@ -78,12 +91,14 @@ def create_archive( split_num += 1 zip_path = destination_path.with_name(zip_path_formatter(split_num)) zip_paths.append(zip_path) - zip_file = zipfile.ZipFile(zip_path, mode=zip_mode, compression=compression) + zip_file = zipfile.ZipFile( + zip_path, mode=zip_mode, compression=compression + ) zip_file.write(source_fs_path, source_path) zip_file.close() info_df = get_archive_info_df(zip_paths) - info_path = destination_path.with_name(destination_path.name + '-info.tsv') - info_df.to_csv(info_path, sep='\t', index=False) + info_path = destination_path.with_name(destination_path.name + "-info.tsv") + info_df.to_csv(info_path, sep="\t", index=False) return [info_path] + zip_paths @@ -92,11 +107,11 @@ def get_archive_info_df(zip_paths): Return member file info for a list of zip archives. """ fields = [ - 'filename', - 'file_size', - 'compress_type', - 'compress_size', - 'CRC', + "filename", + "file_size", + "compress_type", + "compress_size", + "CRC", ] rows = list() for path in zip_paths: @@ -105,7 +120,7 @@ def get_archive_info_df(zip_paths): infolist = zip_file.infolist() for info in infolist: row = collections.OrderedDict() - row['archive'] = path.name + row["archive"] = path.name for field in fields: row[field] = getattr(info, field) rows.append(row) @@ -120,10 +135,12 @@ def load_archive(archive_path, destination_dir, source_paths=None): If source_paths=None, all zipped files are extracted. Pass source_paths a list of specific paths within the zipfile to extract only those members. """ - is_url = isinstance(archive_path, str) and re.match('^(http|ftp)s?://', archive_path) + is_url = isinstance(archive_path, str) and re.match( + "^(http|ftp)s?://", archive_path + ) if is_url: archive_path, _ = urllib.request.urlretrieve(archive_path) - with zipfile.ZipFile(archive_path, mode='r') as zip_file: + with zipfile.ZipFile(archive_path, mode="r") as zip_file: zip_file.extractall(destination_dir, members=source_paths) if is_url: urllib.request.urlcleanup() diff --git a/hetmatpy/hetmat/caching.py b/hetmatpy/hetmat/caching.py index b229ea9..daa6586 100644 --- a/hetmatpy/hetmat/caching.py +++ b/hetmatpy/hetmat/caching.py @@ -17,6 +17,7 @@ def path_count_cache(metric): Decorator to apply caching to the DWWC and DWPC functions from hetmatpy.degree_weight. 
""" + def decorator(user_function): signature = inspect.signature(user_function) @@ -25,43 +26,47 @@ def wrapper(*args, **kwargs): bound_args = signature.bind(*args, **kwargs) bound_args.apply_defaults() arguments = bound_args.arguments - graph = arguments['graph'] - metapath = graph.metagraph.get_metapath(arguments['metapath']) - arguments['metapath'] = metapath - damping = arguments['damping'] + graph = arguments["graph"] + metapath = graph.metagraph.get_metapath(arguments["metapath"]) + arguments["metapath"] = metapath + damping = arguments["damping"] cached_result = None start = time.perf_counter() - supports_cache = isinstance(graph, hetmatpy.hetmat.HetMat) and graph.path_counts_cache + supports_cache = ( + isinstance(graph, hetmatpy.hetmat.HetMat) and graph.path_counts_cache + ) if supports_cache: - cache_key = {'metapath': metapath, 'metric': metric, 'damping': damping} + cache_key = {"metapath": metapath, "metric": metric, "damping": damping} cached_result = graph.path_counts_cache.get(**cache_key) if cached_result: row_names, col_names, matrix = cached_result - matrix = sparsify_or_densify(matrix, arguments['dense_threshold']) - matrix = matrix.astype(arguments['dtype']) + matrix = sparsify_or_densify(matrix, arguments["dense_threshold"]) + matrix = matrix.astype(arguments["dtype"]) if cached_result is None: - if arguments['dwwc_method'] is None: + if arguments["dwwc_method"] is None: # import default_dwwc_method here to avoid circular dependencies from hetmatpy.degree_weight import default_dwwc_method - arguments['dwwc_method'] = default_dwwc_method + + arguments["dwwc_method"] = default_dwwc_method row_names, col_names, matrix = user_function(**arguments) if supports_cache: runtime = time.perf_counter() - start graph.path_counts_cache.set(**cache_key, matrix=matrix, runtime=runtime) return row_names, col_names, matrix + return wrapper + return decorator class PathCountCache: - def __init__(self, hetmat): self.hetmat = hetmat self.cache = {} self.hits = { - 'memory': 0, - 'disk': 0, - 'absent': 0, + "memory": 0, + "disk": 0, + "absent": 0, } def get(self, metapath, metric, damping): @@ -78,16 +83,16 @@ def get(self, metapath, metric, damping): if invert: matrix = matrix.transpose() if matrix is not None: - self.hits['memory'] += 1 + self.hits["memory"] += 1 row_ids = self.hetmat.get_node_identifiers(metapath.source()) col_ids = self.hetmat.get_node_identifiers(metapath.target()) return row_ids, col_ids, matrix try: result = self.hetmat.read_path_counts(metapath, metric, damping) - self.hits['disk'] += 1 + self.hits["disk"] += 1 return result except FileNotFoundError: - self.hits['absent'] += 1 + self.hits["absent"] += 1 return None def set(self, metapath, metric, damping, matrix, runtime): @@ -102,16 +107,17 @@ def get_stats(self): """ Return a str with formatted stats about cache operations """ - hits_str = ', '.join(f'{kind} = {count:,}' for kind, count in self.hits.items()) - stats_str = textwrap.dedent(f'''\ + hits_str = ", ".join(f"{kind} = {count:,}" for kind, count in self.hits.items()) + stats_str = textwrap.dedent( + f"""\ {self.__class__.__name__} containing {len(self.cache):,} items total gets: {sum(self.hits.values()):,} - cache hits: {hits_str}''') + cache hits: {hits_str}""" + ) return stats_str class PathCountPriorityCache(PathCountCache): - def __init__(self, hetmat, allocate_GB): super().__init__(hetmat) self.bytes_per_gigabyte = 1_000_000_000 @@ -167,9 +173,15 @@ def get_matrix_size(matrix): return matrix.nbytes if scipy.sparse.isspmatrix_coo(matrix): return 
matrix.col.nbytes + matrix.row.nbytes + matrix.data.nbytes
-    if scipy.sparse.isspmatrix_csc(matrix) or scipy.sparse.isspmatrix_csr(matrix) or scipy.sparse.isspmatrix_bsr(matrix):  # noqa: E501
+    if (
+        scipy.sparse.isspmatrix_csc(matrix)
+        or scipy.sparse.isspmatrix_csr(matrix)
+        or scipy.sparse.isspmatrix_bsr(matrix)
+    ):  # noqa: E501
         return matrix.data.nbytes + matrix.indptr.nbytes + matrix.indices.nbytes
     if scipy.sparse.issparse(matrix):
         # Estimate size based on number of nonzeros for remaining sparse types
         return 2 * matrix.nnz * 4 + matrix.data.nbytes
-    raise NotImplementedError(f'cannot calculate get_matrix_size for type {type(matrix)}')
+    raise NotImplementedError(
+        f"cannot calculate get_matrix_size for type {type(matrix)}"
+    )
diff --git a/hetmatpy/hetmat/tests/test_archive.py b/hetmatpy/hetmat/tests/test_archive.py
index bd8f97f..27bae7f 100644
--- a/hetmatpy/hetmat/tests/test_archive.py
+++ b/hetmatpy/hetmat/tests/test_archive.py
@@ -13,8 +13,8 @@ def test_disease_gene_example_hetmat_archiving(tmpdir):
     Test archiving the hetmat corresponding to the hetnet in Figure 2C at
     https://doi.org/crz8.
     """
     tmpdir = pathlib.Path(tmpdir)
-    graph = get_graph('disease-gene-example')
-    hetmat_0_dir = tmpdir.joinpath('disease-gene-example-0.hetmat')
+    graph = get_graph("disease-gene-example")
+    hetmat_0_dir = tmpdir.joinpath("disease-gene-example-0.hetmat")
     hetmat = hetmatpy.hetmat.hetmat_from_graph(graph, hetmat_0_dir)

     # Test creating archive
@@ -22,35 +22,41 @@
     with zipfile.ZipFile(archive_path) as zip_file:
         name_list = zip_file.namelist()
     expected = [
-        'edges/DlT.sparse.npz',
-        'edges/GaD.sparse.npz',
-        'edges/GeT.sparse.npz',
-        'edges/GiG.sparse.npz',
-        'metagraph.json',
-        'nodes/Disease.tsv',
-        'nodes/Gene.tsv',
-        'nodes/Tissue.tsv',
+        "edges/DlT.sparse.npz",
+        "edges/GaD.sparse.npz",
+        "edges/GeT.sparse.npz",
+        "edges/GiG.sparse.npz",
+        "metagraph.json",
+        "nodes/Disease.tsv",
+        "nodes/Gene.tsv",
+        "nodes/Tissue.tsv",
     ]
     assert name_list == expected

     # Test round-tripped hetmat has same files
-    hetmat_1_dir = tmpdir.joinpath('disease-gene-example-1.hetmat')
+    hetmat_1_dir = tmpdir.joinpath("disease-gene-example-1.hetmat")
     hetmatpy.hetmat.archive.load_archive(archive_path, hetmat_1_dir)
-    match, mismatch, errors = filecmp.cmpfiles(hetmat_0_dir, hetmat_1_dir, common=expected, shallow=False)
+    match, mismatch, errors = filecmp.cmpfiles(
+        hetmat_0_dir, hetmat_1_dir, common=expected, shallow=False
+    )
     assert match == expected
     assert not mismatch
     assert not errors

     # Test round-tripped hetmat has same files when specifying a
     # subset of zip members in load_archive with source_paths.
-    hetmat_2_dir = tmpdir.joinpath('disease-gene-example-2.hetmat')
+    hetmat_2_dir = tmpdir.joinpath("disease-gene-example-2.hetmat")
     extract_only = [
-        'edges/GiG.sparse.npz',
-        'metagraph.json',
-        'nodes/Disease.tsv',
+        "edges/GiG.sparse.npz",
+        "metagraph.json",
+        "nodes/Disease.tsv",
     ]
-    hetmatpy.hetmat.archive.load_archive(archive_path, hetmat_2_dir, source_paths=extract_only)
+    hetmatpy.hetmat.archive.load_archive(
+        archive_path, hetmat_2_dir, source_paths=extract_only
+    )
-    match, mismatch, errors = filecmp.cmpfiles(hetmat_0_dir, hetmat_2_dir, common=expected, shallow=False)
+    match, mismatch, errors = filecmp.cmpfiles(
+        hetmat_0_dir, hetmat_2_dir, common=expected, shallow=False
+    )
     assert match == extract_only
     assert not mismatch
-    assert 'nodes/Tissue.tsv' in errors
+    assert "nodes/Tissue.tsv" in errors
diff --git a/hetmatpy/hetmat/tests/test_caching.py b/hetmatpy/hetmat/tests/test_caching.py
index 1517b40..755007d 100644
--- a/hetmatpy/hetmat/tests/test_caching.py
+++ b/hetmatpy/hetmat/tests/test_caching.py
@@ -5,12 +5,12 @@
 from hetmatpy.testing import get_graph


-@pytest.mark.parametrize('allocate_GB', [0, 0.1])
+@pytest.mark.parametrize("allocate_GB", [0, 0.1])
 def test_path_count_priority_cache(tmpdir, allocate_GB):
     """
     Test PathCountPriorityCache by running the same DWWC computation three times.
     """
-    hetmat = get_graph('bupropion-subgraph', hetmat=True, directory=tmpdir)
+    hetmat = get_graph("bupropion-subgraph", hetmat=True, directory=tmpdir)
     cache = hetmatpy.hetmat.caching.PathCountPriorityCache(hetmat, allocate_GB)
     hetmat.path_counts_cache = cache
     print(cache.get_stats())
@@ -18,49 +18,55 @@
     # First run
     assert sum(cache.hits.values()) == 0
     row_ids, col_ids, matrix = hetmatpy.degree_weight.dwwc(
-        graph=hetmat, metapath='CbGpPWpGaD', damping=0.5,
+        graph=hetmat,
+        metapath="CbGpPWpGaD",
+        damping=0.5,
         dwwc_method=hetmatpy.degree_weight.dwwc_recursive,
     )
     assert sum(cache.hits.values()) > 0
     if allocate_GB == 0:
-        assert cache.hits['memory'] == 0
-        assert cache.hits['disk'] == 0
-        assert cache.hits['absent'] == 4
+        assert cache.hits["memory"] == 0
+        assert cache.hits["disk"] == 0
+        assert cache.hits["absent"] == 4
     elif allocate_GB > 0:
-        assert cache.hits['memory'] == 0
-        assert cache.hits['disk'] == 0
-        assert cache.hits['absent'] == 4
+        assert cache.hits["memory"] == 0
+        assert cache.hits["disk"] == 0
+        assert cache.hits["absent"] == 4

     # Second run
     row_ids, col_ids, matrix = hetmatpy.degree_weight.dwwc(
-        graph=hetmat, metapath='CbGpPWpGaD', damping=0.5,
+        graph=hetmat,
+        metapath="CbGpPWpGaD",
+        damping=0.5,
         dwwc_method=hetmatpy.degree_weight.dwwc_recursive,
     )
     if allocate_GB == 0:
-        assert cache.hits['memory'] == 0
-        assert cache.hits['disk'] == 0
-        assert cache.hits['absent'] == 8
+        assert cache.hits["memory"] == 0
+        assert cache.hits["disk"] == 0
+        assert cache.hits["absent"] == 8
     elif allocate_GB > 0:
-        assert cache.hits['memory'] == 1
-        assert cache.hits['disk'] == 0
-        assert cache.hits['absent'] == 4
+        assert cache.hits["memory"] == 1
+        assert cache.hits["disk"] == 0
+        assert cache.hits["absent"] == 4

     # Save DWWC matrix
-    path = hetmat.get_path_counts_path('CbGpPWpGaD', 'dwwc', 0.5, 'npy')
+    path = hetmat.get_path_counts_path("CbGpPWpGaD", "dwwc", 0.5, "npy")
     path.parent.mkdir(parents=True)
     hetmatpy.hetmat.save_matrix(matrix, path)

     # Third run
     row_ids, col_ids, matrix = hetmatpy.degree_weight.dwwc(
-        graph=hetmat, metapath='CbGpPWpGaD', damping=0.5,
+        graph=hetmat,
+        metapath="CbGpPWpGaD",
+        damping=0.5,
         dwwc_method=hetmatpy.degree_weight.dwwc_recursive,
     )
    if allocate_GB == 0:
-        assert cache.hits['memory'] == 0
-        assert cache.hits['disk'] == 1
-        assert cache.hits['absent'] == 8
+        assert cache.hits["memory"] == 0
+        assert cache.hits["disk"] == 1
+        assert cache.hits["absent"] == 8
     elif allocate_GB > 0:
-        assert cache.hits['memory'] == 2
-        assert cache.hits['disk'] == 0
-        assert cache.hits['absent'] == 4
+        assert cache.hits["memory"] == 2
+        assert cache.hits["disk"] == 0
+        assert cache.hits["absent"] == 4

     print(cache.get_stats())
diff --git a/hetmatpy/hetmat/tests/test_hetmat.py b/hetmatpy/hetmat/tests/test_hetmat.py
index 3bf4537..662617d 100644
--- a/hetmatpy/hetmat/tests/test_hetmat.py
+++ b/hetmatpy/hetmat/tests/test_hetmat.py
@@ -10,20 +10,28 @@ def test_disease_gene_example_conversion_to_hetmat(tmpdir):
     Test converting the hetnet from Figure 2C of https://doi.org/crz8
     into a hetmat.
     """
-    graph = get_graph('disease-gene-example')
+    graph = get_graph("disease-gene-example")
     hetmat = hetmatpy.hetmat.hetmat_from_graph(graph, tmpdir)
     assert list(graph.metagraph.get_nodes()) == list(hetmat.metagraph.get_nodes())

     # Test GaD adjacency matrix
-    hetnet_adj = hetmatpy.matrix.metaedge_to_adjacency_matrix(graph, 'GaD', dense_threshold=0)
-    hetmat_adj = hetmatpy.matrix.metaedge_to_adjacency_matrix(hetmat, 'GaD', dense_threshold=0)
+    hetnet_adj = hetmatpy.matrix.metaedge_to_adjacency_matrix(
+        graph, "GaD", dense_threshold=0
+    )
+    hetmat_adj = hetmatpy.matrix.metaedge_to_adjacency_matrix(
+        hetmat, "GaD", dense_threshold=0
+    )
     assert hetnet_adj[0] == hetmat_adj[0]  # row identifiers
     assert hetnet_adj[1] == hetmat_adj[1]  # column identifiers
     assert numpy.array_equal(hetnet_adj[2], hetmat_adj[2])  # adj matrices

     # Test DaG adjacency matrix (hetmat only stores GaD and must transpose)
-    hetnet_adj = hetmatpy.matrix.metaedge_to_adjacency_matrix(graph, 'DaG', dense_threshold=0)
-    hetmat_adj = hetmatpy.matrix.metaedge_to_adjacency_matrix(hetmat, 'DaG', dense_threshold=0)
+    hetnet_adj = hetmatpy.matrix.metaedge_to_adjacency_matrix(
+        graph, "DaG", dense_threshold=0
+    )
+    hetmat_adj = hetmatpy.matrix.metaedge_to_adjacency_matrix(
+        hetmat, "DaG", dense_threshold=0
+    )
     assert hetnet_adj[0] == hetmat_adj[0]  # row identifiers
     assert hetnet_adj[1] == hetmat_adj[1]  # column identifiers
     assert numpy.array_equal(hetnet_adj[2], hetmat_adj[2])  # adj matrices
diff --git a/hetmatpy/matrix.py b/hetmatpy/matrix.py
index bce9d77..3952d5d 100644
--- a/hetmatpy/matrix.py
+++ b/hetmatpy/matrix.py
@@ -1,9 +1,8 @@
-import numpy
-import scipy.sparse
-
 import hetnetpy.hetnet
 import hetnetpy.matrix
 import hetnetpy.permute
+import numpy
+import scipy.sparse

 import hetmatpy.hetmat

@@ -17,8 +16,10 @@ def metaedge_to_adjacency_matrix(graph_or_hetmat, *args, **kwargs):
     if isinstance(graph_or_hetmat, hetmatpy.hetmat.HetMat):
         return graph_or_hetmat.metaedge_to_adjacency_matrix(*args, **kwargs)
     if isinstance(graph_or_hetmat, hetnetpy.hetnet.Graph):
-        return hetnetpy.matrix.metaedge_to_adjacency_matrix(graph_or_hetmat, *args, **kwargs)
-    raise TypeError(f'graph_or_hetmat is an unsupported type: {type(graph_or_hetmat)}')
+        return hetnetpy.matrix.metaedge_to_adjacency_matrix(
+            graph_or_hetmat, *args, **kwargs
+        )
+    raise TypeError(f"graph_or_hetmat is an unsupported type: {type(graph_or_hetmat)}")


 def get_node_identifiers(graph_or_hetmat, metanode):
@@ -30,7 +31,7 @@
     return graph_or_hetmat.get_node_identifiers(metanode)
     if isinstance(graph_or_hetmat, hetnetpy.hetnet.Graph):
         return
hetnetpy.matrix.get_node_identifiers(graph_or_hetmat, metanode) - raise TypeError(f'graph_or_hetmat is an unsupported type: {type(graph_or_hetmat)}') + raise TypeError(f"graph_or_hetmat is an unsupported type: {type(graph_or_hetmat)}") def normalize(matrix, vector, axis, damping_exponent): @@ -51,11 +52,11 @@ def normalize(matrix, vector, axis, damping_exponent): assert vector.ndim == 1 if damping_exponent == 0: return matrix - with numpy.errstate(divide='ignore'): + with numpy.errstate(divide="ignore"): vector **= -damping_exponent vector[numpy.isinf(vector)] = 0 vector = scipy.sparse.diags(vector) - if axis == 'rows': + if axis == "rows": # equivalent to `vector @ matrix` but returns scipy.sparse.csc not scipy.sparse.csr # noqa: E501 matrix = (matrix.transpose() @ vector).transpose() else: @@ -66,7 +67,7 @@ def normalize(matrix, vector, axis, damping_exponent): def copy_array(matrix, copy=True, dtype=numpy.float64): """Returns a newly allocated array if copy is True""" assert matrix.ndim == 2 - assert matrix.dtype != 'O' # Ensures no empty row + assert matrix.dtype != "O" # Ensures no empty row if not scipy.sparse.issparse(matrix): assert numpy.isfinite(matrix).all() # Checks NaN and Inf try: @@ -81,8 +82,9 @@ def copy_array(matrix, copy=True, dtype=numpy.float64): return matrix -def permute_matrix(adjacency_matrix, directed=False, multiplier=10, - excluded_pair_set=set(), seed=0): +def permute_matrix( + adjacency_matrix, directed=False, multiplier=10, excluded_pair_set=set(), seed=0 +): """ Perform a degree-preserving permutation on a given adjacency matrix. Assumes boolean matrix, and is incompatible with weighted edges. @@ -106,13 +108,18 @@ def permute_matrix(adjacency_matrix, directed=False, multiplier=10, """ edge_list = list(zip(*adjacency_matrix.nonzero())) permuted_edges, stats = hetnetpy.permute.permute_pair_list( - edge_list, directed=directed, multiplier=multiplier, - excluded_pair_set=excluded_pair_set, seed=seed) + edge_list, + directed=directed, + multiplier=multiplier, + excluded_pair_set=excluded_pair_set, + seed=seed, + ) edges = numpy.array(permuted_edges) ones = numpy.ones(len(edges), dtype=adjacency_matrix.dtype) - permuted_adjacency = scipy.sparse.csc_matrix((ones, (edges[:, 0], edges[:, 1])), - shape=adjacency_matrix.shape) + permuted_adjacency = scipy.sparse.csc_matrix( + (ones, (edges[:, 0], edges[:, 1])), shape=adjacency_matrix.shape + ) # Keep the same sparse type as adjacency_matrix if scipy.sparse.issparse(adjacency_matrix): diff --git a/hetmatpy/pipeline.py b/hetmatpy/pipeline.py index 98a02a4..93dd281 100644 --- a/hetmatpy/pipeline.py +++ b/hetmatpy/pipeline.py @@ -27,7 +27,7 @@ def calculate_sd(sum_of_squares, unsquared_sum, number_nonzero): if number_nonzero < 2: return None - squared_deviations = sum_of_squares - unsquared_sum ** 2 / number_nonzero + squared_deviations = sum_of_squares - unsquared_sum**2 / number_nonzero # If all the values in the row are the same we'll manually return zero, # because not doing so can lead to some issues with float imprecision @@ -46,21 +46,26 @@ def add_gamma_hurdle_to_dgp_df(dgp_df): """ # Validate dgp_df if not isinstance(dgp_df, pandas.DataFrame): - raise ValueError('add_gamma_hurdle_to_dgp_df: dgp_df must be a pandas.DataFrame') - missing = {'nnz', 'sum', 'sum_of_squares'} - set(dgp_df.columns) + raise ValueError( + "add_gamma_hurdle_to_dgp_df: dgp_df must be a pandas.DataFrame" + ) + missing = {"nnz", "sum", "sum_of_squares"} - set(dgp_df.columns) if missing: raise ValueError( - 'add_gamma_hurdle_to_dgp_df: ' - 
'dgp_df missing the following required columns: ' +
-            ', '.join(missing)
+            "add_gamma_hurdle_to_dgp_df: "
+            "dgp_df missing the following required columns: " + ", ".join(missing)
         )

     # Compute gamma-hurdle parameters
     # to_numeric prevents ZeroDivisionError when nnz is a column with object dtype
     # https://github.com/pandas-dev/pandas/issues/46292
-    dgp_df['mean_nz'] = dgp_df['sum'] / pandas.to_numeric(dgp_df['nnz'])
-    dgp_df['sd_nz'] = dgp_df[['sum_of_squares', 'sum', 'nnz']].apply(lambda row: calculate_sd(*row), raw=True, axis=1)
-    dgp_df['beta'] = (dgp_df['mean_nz'] / pandas.to_numeric(dgp_df['sd_nz'] ** 2)).replace(numpy.inf, numpy.nan)
-    dgp_df['alpha'] = dgp_df['mean_nz'] * dgp_df['beta']
+    dgp_df["mean_nz"] = dgp_df["sum"] / pandas.to_numeric(dgp_df["nnz"])
+    dgp_df["sd_nz"] = dgp_df[["sum_of_squares", "sum", "nnz"]].apply(
+        lambda row: calculate_sd(*row), raw=True, axis=1
+    )
+    dgp_df["beta"] = (
+        dgp_df["mean_nz"] / pandas.to_numeric(dgp_df["sd_nz"] ** 2)
+    ).replace(numpy.inf, numpy.nan)
+    dgp_df["alpha"] = dgp_df["mean_nz"] * dgp_df["beta"]
     return dgp_df


@@ -71,13 +76,17 @@ def calculate_gamma_hurdle_p_value(row):
     If beta and alpha gamma-hurdle parameters are missing, calculate them
     and add them to row.
     """
-    if 'beta' not in row:
-        row['beta'] = row['mean_nz'] / row['sd_nz'] ** 2
-        if numpy.isinf(row['beta']):
-            row['beta'] = numpy.nan
-    if 'alpha' not in row:
-        row['alpha'] = row['mean_nz'] * row['beta']
-    return row['nnz'] / row['n'] * scipy.special.gammaincc(row['alpha'], row['beta'] * row['dwpc'])
+    if "beta" not in row:
+        row["beta"] = row["mean_nz"] / row["sd_nz"] ** 2
+        if numpy.isinf(row["beta"]):
+            row["beta"] = numpy.nan
+    if "alpha" not in row:
+        row["alpha"] = row["mean_nz"] * row["beta"]
+    return (
+        row["nnz"]
+        / row["n"]
+        * scipy.special.gammaincc(row["alpha"], row["beta"] * row["dwpc"])
+    )


 def path_does_not_exist(row):
@@ -86,9 +95,9 @@ def path_does_not_exist(row):
     isn't a path if the row has a zero path count, or has a zero dwpc if the
     path count isn't present in the row
     """
-    if 'path_count' in row:
-        return row['path_count'] == 0
-    return row['dwpc'] == 0
+    if "path_count" in row:
+        return row["path_count"] == 0
+    return row["dwpc"] == 0


 def calculate_empirical_p_value(row):
@@ -98,16 +107,16 @@
     if path_does_not_exist(row):
         # No paths exist between the given source and target nodes
         return 1.0
-    if row['nnz'] == 0:
+    if row["nnz"] == 0:
         # No nonzero DWPCs are found in the permuted network, but paths are
         # observed in the true network
         return 0.0
-    if not sd_is_positive(row['sd_nz']):
+    if not sd_is_positive(row["sd_nz"]):
         # The DWPCs in the permuted network are identical
-        if row['dwpc'] <= row['mean_nz'] + FLOAT_ERROR_TOLERANCE:
+        if row["dwpc"] <= row["mean_nz"] + FLOAT_ERROR_TOLERANCE:
             # The DWPC you found in the true network is smaller than or equal
             # to those in the permuted network
-            return row['nnz'] / row['n']
+            return row["nnz"] / row["n"]
     # The DWPC you found in the true network is larger than those in the
     # permuted network
@@ -119,7 +128,7 @@ def calculate_p_value(row):
     """
     Calculate the p_value for a given metapath
     """
-    if row['nnz'] == 0 or path_does_not_exist(row) or not sd_is_positive(row['sd_nz']):
+    if row["nnz"] == 0 or path_does_not_exist(row) or not sd_is_positive(row["sd_nz"]):
         return calculate_empirical_p_value(row)
     else:
         return calculate_gamma_hurdle_p_value(row)
@@ -130,20 +139,25 @@ def combine_dwpc_dgp(graph, metapath, damping, ignore_zeros=False, max_p_value=1
     Combine DWPC information
with degree-grouped permutation summary metrics. Includes gamma-hurdle significance estimates. """ - stats_path = graph.get_running_degree_group_path(metapath, 'dwpc', damping, extension='.tsv.gz') - dgp_df = pandas.read_csv(stats_path, sep='\t') + stats_path = graph.get_running_degree_group_path( + metapath, "dwpc", damping, extension=".tsv.gz" + ) + dgp_df = pandas.read_csv(stats_path, sep="\t") dgp_df = add_gamma_hurdle_to_dgp_df(dgp_df) - degrees_to_dgp = dgp_df.set_index(['source_degree', 'target_degree']).to_dict(orient='index') + degrees_to_dgp = dgp_df.set_index(["source_degree", "target_degree"]).to_dict( + orient="index" + ) dwpc_row_generator = hetmatpy.degree_group.dwpc_to_degrees( - graph, metapath, damping=damping, ignore_zeros=ignore_zeros) + graph, metapath, damping=damping, ignore_zeros=ignore_zeros + ) for row in dwpc_row_generator: - degrees = row['source_degree'], row['target_degree'] + degrees = row["source_degree"], row["target_degree"] dgp = degrees_to_dgp[degrees] row.update(dgp) - row['p_value'] = calculate_p_value(row) - if row['p_value'] is not None and row['p_value'] > max_p_value: + row["p_value"] = calculate_p_value(row) + if row["p_value"] is not None and row["p_value"] > max_p_value: continue - for key in ['sum', 'sum_of_squares', 'beta', 'alpha']: + for key in ["sum", "sum_of_squares", "beta", "alpha"]: del row[key] yield row @@ -157,13 +171,15 @@ def grouper(iterable, group_size): while True: chunk = itertools.islice(iterable, group_size) try: - head = next(chunk), + head = (next(chunk),) except StopIteration: break yield itertools.chain(head, chunk) -def grouped_tsv_writer(row_generator, path, group_size=20_000, sep='\t', index=False, **kwargs): +def grouped_tsv_writer( + row_generator, path, group_size=20_000, sep="\t", index=False, **kwargs +): """ Write an iterable of dictionaries to a TSV, where each dictionary is a row. Uses pandas (extra keyword arguments are passed to DataFrame.to_csv) to @@ -173,6 +189,6 @@ def grouped_tsv_writer(row_generator, path, group_size=20_000, sep='\t', index=F chunks = grouper(row_generator, group_size=group_size) for i, chunk in enumerate(chunks): df = pandas.DataFrame.from_records(chunk) - kwargs['header'] = not bool(i) - kwargs['mode'] = 'a' if i else 'w' + kwargs["header"] = not bool(i) + kwargs["mode"] = "a" if i else "w" df.to_csv(path, sep=sep, index=index, **kwargs) diff --git a/hetmatpy/testing.py b/hetmatpy/testing.py index f52128a..83aed22 100644 --- a/hetmatpy/testing.py +++ b/hetmatpy/testing.py @@ -4,31 +4,28 @@ import hetmatpy.hetmat -format_github_url = 'https://github.com/{repo_slug}/raw/{commit}/{path}'.format +format_github_url = "https://github.com/{repo_slug}/raw/{commit}/{path}".format hetnet_urls = { # Figure 2D of Himmelstein & Baranzini # (2015) PLOS Comp Bio. https://doi.org/10.1371/journal.pcbi.1004259.g002 - 'disease-gene-example': - format_github_url( - repo_slug='hetio/hetnetpy', - commit='9dc747b8fc4e23ef3437829ffde4d047f2e1bdde', - path='test/data/disease-gene-example-graph.json', - ), + "disease-gene-example": format_github_url( + repo_slug="hetio/hetnetpy", + commit="9dc747b8fc4e23ef3437829ffde4d047f2e1bdde", + path="test/data/disease-gene-example-graph.json", + ), # The bupropion and nicotine dependence Hetionet v1.0 subgraph. 
- 'bupropion-subgraph': - format_github_url( - repo_slug='hetio/hetnetpy', - commit='30c6dbb18a17c05d71cb909cf57af7372e4d4908', - path='test/data/bupropion-CbGpPWpGaD-subgraph.json.xz', - ), + "bupropion-subgraph": format_github_url( + repo_slug="hetio/hetnetpy", + commit="30c6dbb18a17c05d71cb909cf57af7372e4d4908", + path="test/data/bupropion-CbGpPWpGaD-subgraph.json.xz", + ), # A random Hetionet v1.0 subgraph. - 'random-subgraph': - format_github_url( - repo_slug='hetio/hetnetpy', - commit='30c6dbb18a17c05d71cb909cf57af7372e4d4908', - path='test/data/random-subgraph.json.xz', - ), + "random-subgraph": format_github_url( + repo_slug="hetio/hetnetpy", + commit="30c6dbb18a17c05d71cb909cf57af7372e4d4908", + path="test/data/random-subgraph.json.xz", + ), } @@ -41,9 +38,9 @@ def get_graph(name, hetmat=False, directory=None): """ if name not in hetnet_urls: raise ValueError( - f'{name} is not a supported test hetnet.\n' - 'Choose from the following currently defined hetnets: ' + - ', '.join(hetnet_urls) + f"{name} is not a supported test hetnet.\n" + "Choose from the following currently defined hetnets: " + + ", ".join(hetnet_urls) ) if name not in hetnet_io_cache: url = hetnet_urls[name] diff --git a/hetmatpy/tests/test_degree_weight.py b/hetmatpy/tests/test_degree_weight.py index 83e8992..9fa11a1 100644 --- a/hetmatpy/tests/test_degree_weight.py +++ b/hetmatpy/tests/test_degree_weight.py @@ -12,108 +12,122 @@ categorize, dwpc, dwwc, - dwwc_sequential, - dwwc_recursive, dwwc_chain, + dwwc_recursive, + dwwc_sequential, get_segments, ) from hetmatpy.testing import get_graph -@pytest.mark.parametrize('dwwc_method', [ - None, - dwwc_sequential, - dwwc_recursive, - dwwc_chain, -]) +@pytest.mark.parametrize( + "dwwc_method", + [ + None, + dwwc_sequential, + dwwc_recursive, + dwwc_chain, + ], +) def test_disease_gene_example_dwwc(dwwc_method): """ Test the PC & DWWC computations in Figure 2D of Himmelstein & Baranzini (2015) PLOS Comp Bio. https://doi.org/10.1371/journal.pcbi.1004259.g002 """ - graph = get_graph('disease-gene-example') + graph = get_graph("disease-gene-example") metagraph = graph.metagraph # Compute GiGaD path count and DWWC matrices - metapath = metagraph.metapath_from_abbrev('GiGaD') + metapath = metagraph.metapath_from_abbrev("GiGaD") rows, cols, wc_matrix = dwwc(graph, metapath, damping=0, dwwc_method=dwwc_method) - rows, cols, dwwc_matrix = dwwc(graph, metapath, damping=0.5, dwwc_method=dwwc_method) + rows, cols, dwwc_matrix = dwwc( + graph, metapath, damping=0.5, dwwc_method=dwwc_method + ) # Check row and column name assignment - assert rows == ['CXCR4', 'IL2RA', 'IRF1', 'IRF8', 'ITCH', 'STAT3', 'SUMO1'] - assert cols == ["Crohn's Disease", 'Multiple Sclerosis'] + assert rows == ["CXCR4", "IL2RA", "IRF1", "IRF8", "ITCH", "STAT3", "SUMO1"] + assert cols == ["Crohn's Disease", "Multiple Sclerosis"] # Check concordance with https://doi.org/10.1371/journal.pcbi.1004259.g002 - i = rows.index('IRF1') - j = cols.index('Multiple Sclerosis') + i = rows.index("IRF1") + j = cols.index("Multiple Sclerosis") # Warning: the WC (walk count) and PC (path count) are only equivalent # because none of the GiGaD paths contain duplicate nodes. Since, GiGaD # contains duplicate metanodes, WC and PC are not guaranteed to be the # same. However, they happen to be equivalent for this example. 
assert wc_matrix[i, j] == 3 - assert dwwc_matrix[i, j] == pytest.approx(0.25 + 0.25 + 32 ** -0.5) + assert dwwc_matrix[i, j] == pytest.approx(0.25 + 0.25 + 32**-0.5) def get_nodes(metapath): node_dict = { - 'G': ['CXCR4', 'IL2RA', 'IRF1', 'IRF8', 'ITCH', 'STAT3', 'SUMO1'], - 'D': ["Crohn's Disease", 'Multiple Sclerosis'], - 'T': ['Leukocyte', 'Lung'] + "G": ["CXCR4", "IL2RA", "IRF1", "IRF8", "ITCH", "STAT3", "SUMO1"], + "D": ["Crohn's Disease", "Multiple Sclerosis"], + "T": ["Leukocyte", "Lung"], } exp_row = node_dict[metapath[0]] exp_col = node_dict[metapath[-1]] return exp_row, exp_col -@pytest.mark.parametrize('metapath,expected,path_type', [ - ('DaGeT', [[0.5, 0.5], - [0, 0]], 0), - ('DlTeG', [[0, 0, 0, 0, 0, 0, 0], - [0, 0, 0.70710678, 0, 0, 0, 0]], 0), - ('GeTlD', [[0, 0], - [0, 0], - [0, 0.70710678], - [0, 0], - [0, 0], - [0, 0], - [0, 0]], 0), - ('GaDlT', [[0.5, 0], - [0.5, 0], - [0, 0], - [0.5, 0], - [0, 0], - [0.35355339, 0], - [0, 0]], 0), - ('TeGaD', [[0.5, 0], - [0.5, 0]], 0), - ('TlDaG', [[0.5, 0.5, 0, 0.5, 0, 0.35355339, 0], - [0, 0, 0, 0, 0, 0, 0]], 0), - ('GiG', [[0., 0., 0.35355339, 0., 0.70710678, 0., 0.], - [0., 0., 0.5, 0., 0., 0., 0.], - [0.35355339, 0.5, 0., 0.5, 0., 0., 0.5], - [0., 0., 0.5, 0., 0., 0., 0.], - [0.70710678, 0., 0., 0., 0., 0., 0.], - [0., 0., 0., 0., 0., 0., 0.], - [0., 0., 0.5, 0., 0., 0., 0.]], 1), - ('GaDaG', [[0., 0.25, 0., 0.25, 0., 0.1767767, 0.], - [0.25, 0., 0., 0.25, 0., 0.1767767, 0.], - [0., 0., 0., 0., 0., 0.35355339, 0.], - [0.25, 0.25, 0., 0., 0., 0.1767767, 0.], - [0., 0., 0., 0., 0., 0., 0.], - [0.1767767, 0.1767767, 0.35355339, 0.1767767, 0., 0., 0.], - [0., 0., 0., 0., 0., 0., 0.]], 1), - ('GiGiG', [[0, 0.1767767, 0, 0.1767767, 0, 0, 0.1767767], - [0.1767767, 0, 0, 0.25, 0, 0, 0.25], - [0, 0, 0, 0, 0.25, 0, 0], - [0.1767767, 0.25, 0, 0, 0, 0, 0.25], - [0, 0, 0.25, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0], - [0.1767767, 0.25, 0, 0.25, 0, 0, 0]], 1) -]) +@pytest.mark.parametrize( + "metapath,expected,path_type", + [ + ("DaGeT", [[0.5, 0.5], [0, 0]], 0), + ("DlTeG", [[0, 0, 0, 0, 0, 0, 0], [0, 0, 0.70710678, 0, 0, 0, 0]], 0), + ("GeTlD", [[0, 0], [0, 0], [0, 0.70710678], [0, 0], [0, 0], [0, 0], [0, 0]], 0), + ( + "GaDlT", + [[0.5, 0], [0.5, 0], [0, 0], [0.5, 0], [0, 0], [0.35355339, 0], [0, 0]], + 0, + ), + ("TeGaD", [[0.5, 0], [0.5, 0]], 0), + ("TlDaG", [[0.5, 0.5, 0, 0.5, 0, 0.35355339, 0], [0, 0, 0, 0, 0, 0, 0]], 0), + ( + "GiG", + [ + [0.0, 0.0, 0.35355339, 0.0, 0.70710678, 0.0, 0.0], + [0.0, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0], + [0.35355339, 0.5, 0.0, 0.5, 0.0, 0.0, 0.5], + [0.0, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0], + [0.70710678, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0], + ], + 1, + ), + ( + "GaDaG", + [ + [0.0, 0.25, 0.0, 0.25, 0.0, 0.1767767, 0.0], + [0.25, 0.0, 0.0, 0.25, 0.0, 0.1767767, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.35355339, 0.0], + [0.25, 0.25, 0.0, 0.0, 0.0, 0.1767767, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [0.1767767, 0.1767767, 0.35355339, 0.1767767, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + ], + 1, + ), + ( + "GiGiG", + [ + [0, 0.1767767, 0, 0.1767767, 0, 0, 0.1767767], + [0.1767767, 0, 0, 0.25, 0, 0, 0.25], + [0, 0, 0, 0, 0.25, 0, 0], + [0.1767767, 0.25, 0, 0, 0, 0, 0.25], + [0, 0, 0.25, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0], + [0.1767767, 0.25, 0, 0.25, 0, 0, 0], + ], + 1, + ), + ], +) def test_no_and_short_repeat(metapath, expected, path_type): exp_row, exp_col = get_nodes(metapath) - graph = get_graph('disease-gene-example') + graph = 
get_graph("disease-gene-example") metapath = graph.metagraph.metapath_from_abbrev(metapath) func_dict = {0: dwwc, 1: _dwpc_short_repeat} @@ -125,69 +139,518 @@ def test_no_and_short_repeat(metapath, expected, path_type): assert col == exp_col -@pytest.mark.parametrize('metapath,exp_row,exp_col,exp_data,shape', [ - ('DrDaGiG', - [1, 9, 16, 26, 43, 68, 17, 21, 25, 29, 36, 38, 62, 73], - [21, 21, 21, 21, 21, 21, 42, 42, 42, 42, 42, 42, 42, 42], - [0.111803398875, 0.380039827693, 0.102062072616, 0.288675134595, - 0.204124145232, 0.144337567297, 0.037688918072, 0.058925565098, - 0.044194173824, 0.055901699437, 0.047245559126, 0.055901699437, - 0.039528470752, 0.05103103630], (104, 105)), - ('CrCpDrD', - [3, 44, 49, 44, 44, 22, 84, 22, 84, 44, 44, 44, 3, 22, 49, 84, 40, 51, - 84, 40, 51, 84, 40, 51, 84, 44, 44, 44, 44, 44, 44, 44, 44, 44], - [2, 2, 2, 13, 14, 16, 16, 30, 30, 32, 33, 37, 45, 45, 45, 45, 51, 51, - 51, 56, 56, 56, 57, 57, 57, 61, 74, 81, 85, 88, 89, 93, 97, 99], - [0.22360679775, 0.115470053838, 0.22360679775, 0.105409255339, - 0.0645497224368, 0.068041381744, 0.0833333333333, 0.0833333333333, - 0.102062072616, 0.105409255339, 0.0645497224368, 0.0873962324984, - 0.22360679775, 0.07453559925, 0.22360679775, 0.0912870929175, - 0.0589255650989, 0.0589255650989, 0.0589255650989, 0.0710669054519, - 0.0710669054519, 0.0710669054519, 0.0710669054519, 0.0710669054519, - 0.0710669054519, 0.0645497224368, 0.107038087531, 0.1490711985, - 0.182574185835, 0.0873962324984, 0.0589255650989, 0.0833333333333, - 0.0926977029746, 0.0416666666667], (103, 104)), - ('CrCpDrDaGiG', - [0], [0], [0], (103, 105)), - ('CrCrCpDrDrD', - [22, 22, 22, 22, 22, 22, 22, 22, 40, 40, 40, 40, 40, 40, 40, 40, 51, 51, - 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, - 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 61, 61, 61, 61, 61, 61, - 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, 61, - 61, 61, 61, 61, 61, 61, 61, 61, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, - 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, - 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84], - [1, 2, 13, 16, 30, 32, 43, 85, 1, 2, 13, 16, 30, 32, 43, 85, 5, 7, 8, 17, - 18, 20, 24, 25, 29, 34, 35, 36, 38, 48, 49, 50, 53, 56, 57, 59, 62, 63, - 65, 70, 72, 73, 75, 82, 83, 94, 95, 96, 5, 7, 8, 17, 18, 20, 24, 25, 29, - 34, 35, 36, 38, 48, 49, 50, 53, 56, 57, 59, 62, 63, 65, 70, 72, 73, 75, - 82, 83, 94, 95, 96, 1, 2, 5, 7, 8, 13, 16, 17, 18, 20, 24, 25, 29, 30, - 32, 34, 35, 36, 38, 43, 48, 49, 50, 53, 56, 57, 59, 62, 63, 65, 70, 72, - 73, 75, 82, 83, 85, 94, 95, 96], - [0.00621129, 0.00745355, 0.02097942, 0.00850517, 0.00694444, 0.02097942, - 0.00694444, 0.01178511, 0.00507150, 0.00608580, 0.01712962, 0.00694444, - 0.00567011, 0.01712962, 0.00567011, 0.00962250, 0.00260416, 0.00329403, - 0.00357124, 0.00157037, 0.00714249, 0.00535686, 0.00204287, 0.00184142, - 0.00338798, 0.00368284, 0.00276627, 0.00483193, 0.00338798, 0.00222084, - 0.00357124, 0.00323031, 0.00404941, 0.00323031, 0.00323031, 0.00260416, - 0.00404268, 0.00297145, 0.00404941, 0.00309279, 0.00173611, 0.00212629, - 0.00479132, 0.00196856, 0.00329403, 0.00479132, 0.00437386, 0.00245523, - 0.00368284, 0.00465847, 0.00505050, 0.00222084, 0.01010101, 0.00757575, - 0.00288906, 0.00260416, 0.00479132, 0.00520833, 0.00391210, 0.00683338, - 0.00479132, 0.00314074, 0.00505050, 0.00456835, 0.00572673, 0.00456835, - 0.00456835, 0.00368284, 0.00571721, 0.00420227, 0.00572673, 0.00437386, - 0.00245523, 0.00300703, 0.00677596, 
0.00278397, 0.00465847, 0.00677596, - 0.00618558, 0.00347222, 0.00507150, 0.00608580, 0.00260416, 0.00329403, - 0.00357124, 0.01712962, 0.00694444, 0.00157037, 0.00714249, 0.00535686, - 0.00204287, 0.00184142, 0.00338798, 0.00567011, 0.01712962, 0.00368284, - 0.00276627, 0.00483193, 0.00338798, 0.00567011, 0.00222084, 0.00357124, - 0.00323031, 0.00404941, 0.00323031, 0.00323031, 0.00260416, 0.00404268, - 0.00297145, 0.00404941, 0.00309279, 0.00173611, 0.00212629, 0.00479132, - 0.00196856, 0.00329403, 0.00962250, 0.00479132, 0.00437386, 0.00245523], - (103, 104)) -]) +@pytest.mark.parametrize( + "metapath,exp_row,exp_col,exp_data,shape", + [ + ( + "DrDaGiG", + [1, 9, 16, 26, 43, 68, 17, 21, 25, 29, 36, 38, 62, 73], + [21, 21, 21, 21, 21, 21, 42, 42, 42, 42, 42, 42, 42, 42], + [ + 0.111803398875, + 0.380039827693, + 0.102062072616, + 0.288675134595, + 0.204124145232, + 0.144337567297, + 0.037688918072, + 0.058925565098, + 0.044194173824, + 0.055901699437, + 0.047245559126, + 0.055901699437, + 0.039528470752, + 0.05103103630, + ], + (104, 105), + ), + ( + "CrCpDrD", + [ + 3, + 44, + 49, + 44, + 44, + 22, + 84, + 22, + 84, + 44, + 44, + 44, + 3, + 22, + 49, + 84, + 40, + 51, + 84, + 40, + 51, + 84, + 40, + 51, + 84, + 44, + 44, + 44, + 44, + 44, + 44, + 44, + 44, + 44, + ], + [ + 2, + 2, + 2, + 13, + 14, + 16, + 16, + 30, + 30, + 32, + 33, + 37, + 45, + 45, + 45, + 45, + 51, + 51, + 51, + 56, + 56, + 56, + 57, + 57, + 57, + 61, + 74, + 81, + 85, + 88, + 89, + 93, + 97, + 99, + ], + [ + 0.22360679775, + 0.115470053838, + 0.22360679775, + 0.105409255339, + 0.0645497224368, + 0.068041381744, + 0.0833333333333, + 0.0833333333333, + 0.102062072616, + 0.105409255339, + 0.0645497224368, + 0.0873962324984, + 0.22360679775, + 0.07453559925, + 0.22360679775, + 0.0912870929175, + 0.0589255650989, + 0.0589255650989, + 0.0589255650989, + 0.0710669054519, + 0.0710669054519, + 0.0710669054519, + 0.0710669054519, + 0.0710669054519, + 0.0710669054519, + 0.0645497224368, + 0.107038087531, + 0.1490711985, + 0.182574185835, + 0.0873962324984, + 0.0589255650989, + 0.0833333333333, + 0.0926977029746, + 0.0416666666667, + ], + (103, 104), + ), + ("CrCpDrDaGiG", [0], [0], [0], (103, 105)), + ( + "CrCrCpDrDrD", + [ + 22, + 22, + 22, + 22, + 22, + 22, + 22, + 22, + 40, + 40, + 40, + 40, + 40, + 40, + 40, + 40, + 51, + 51, + 51, + 51, + 51, + 51, + 51, + 51, + 51, + 51, + 51, + 51, + 51, + 51, + 51, + 51, + 51, + 51, + 51, + 51, + 51, + 51, + 51, + 51, + 51, + 51, + 51, + 51, + 51, + 51, + 51, + 51, + 61, + 61, + 61, + 61, + 61, + 61, + 61, + 61, + 61, + 61, + 61, + 61, + 61, + 61, + 61, + 61, + 61, + 61, + 61, + 61, + 61, + 61, + 61, + 61, + 61, + 61, + 61, + 61, + 61, + 61, + 61, + 61, + 84, + 84, + 84, + 84, + 84, + 84, + 84, + 84, + 84, + 84, + 84, + 84, + 84, + 84, + 84, + 84, + 84, + 84, + 84, + 84, + 84, + 84, + 84, + 84, + 84, + 84, + 84, + 84, + 84, + 84, + 84, + 84, + 84, + 84, + 84, + 84, + 84, + 84, + 84, + 84, + ], + [ + 1, + 2, + 13, + 16, + 30, + 32, + 43, + 85, + 1, + 2, + 13, + 16, + 30, + 32, + 43, + 85, + 5, + 7, + 8, + 17, + 18, + 20, + 24, + 25, + 29, + 34, + 35, + 36, + 38, + 48, + 49, + 50, + 53, + 56, + 57, + 59, + 62, + 63, + 65, + 70, + 72, + 73, + 75, + 82, + 83, + 94, + 95, + 96, + 5, + 7, + 8, + 17, + 18, + 20, + 24, + 25, + 29, + 34, + 35, + 36, + 38, + 48, + 49, + 50, + 53, + 56, + 57, + 59, + 62, + 63, + 65, + 70, + 72, + 73, + 75, + 82, + 83, + 94, + 95, + 96, + 1, + 2, + 5, + 7, + 8, + 13, + 16, + 17, + 18, + 20, + 24, + 25, + 29, + 30, + 32, + 34, + 35, + 36, + 38, + 43, + 48, + 49, + 
50, + 53, + 56, + 57, + 59, + 62, + 63, + 65, + 70, + 72, + 73, + 75, + 82, + 83, + 85, + 94, + 95, + 96, + ], + [ + 0.00621129, + 0.00745355, + 0.02097942, + 0.00850517, + 0.00694444, + 0.02097942, + 0.00694444, + 0.01178511, + 0.00507150, + 0.00608580, + 0.01712962, + 0.00694444, + 0.00567011, + 0.01712962, + 0.00567011, + 0.00962250, + 0.00260416, + 0.00329403, + 0.00357124, + 0.00157037, + 0.00714249, + 0.00535686, + 0.00204287, + 0.00184142, + 0.00338798, + 0.00368284, + 0.00276627, + 0.00483193, + 0.00338798, + 0.00222084, + 0.00357124, + 0.00323031, + 0.00404941, + 0.00323031, + 0.00323031, + 0.00260416, + 0.00404268, + 0.00297145, + 0.00404941, + 0.00309279, + 0.00173611, + 0.00212629, + 0.00479132, + 0.00196856, + 0.00329403, + 0.00479132, + 0.00437386, + 0.00245523, + 0.00368284, + 0.00465847, + 0.00505050, + 0.00222084, + 0.01010101, + 0.00757575, + 0.00288906, + 0.00260416, + 0.00479132, + 0.00520833, + 0.00391210, + 0.00683338, + 0.00479132, + 0.00314074, + 0.00505050, + 0.00456835, + 0.00572673, + 0.00456835, + 0.00456835, + 0.00368284, + 0.00571721, + 0.00420227, + 0.00572673, + 0.00437386, + 0.00245523, + 0.00300703, + 0.00677596, + 0.00278397, + 0.00465847, + 0.00677596, + 0.00618558, + 0.00347222, + 0.00507150, + 0.00608580, + 0.00260416, + 0.00329403, + 0.00357124, + 0.01712962, + 0.00694444, + 0.00157037, + 0.00714249, + 0.00535686, + 0.00204287, + 0.00184142, + 0.00338798, + 0.00567011, + 0.01712962, + 0.00368284, + 0.00276627, + 0.00483193, + 0.00338798, + 0.00567011, + 0.00222084, + 0.00357124, + 0.00323031, + 0.00404941, + 0.00323031, + 0.00323031, + 0.00260416, + 0.00404268, + 0.00297145, + 0.00404941, + 0.00309279, + 0.00173611, + 0.00212629, + 0.00479132, + 0.00196856, + 0.00329403, + 0.00962250, + 0.00479132, + 0.00437386, + 0.00245523, + ], + (103, 104), + ), + ], +) def test_disjoint_dwpc(metapath, exp_row, exp_col, exp_data, shape): - graph = get_graph('random-subgraph') + graph = get_graph("random-subgraph") metapath = graph.metagraph.metapath_from_abbrev(metapath) row, col, dwpc_matrix = dwpc(graph, metapath) @@ -197,27 +660,23 @@ def test_disjoint_dwpc(metapath, exp_row, exp_col, exp_data, shape): assert abs(dwpc_matrix - expected).max() == pytest.approx(0, abs=1e-7) -@pytest.mark.parametrize('metapath,expected', [ - ('DaGiGaD', [[0., 0.47855339], - [0.47855339, 0.]]), - ('TeGiGeT', [[0, 0], - [0, 0]]), - ('DaGiGeTlD', [[0, 0], - [0, 0]]), - ('DaGeTeGaD', [[0, 0], - [0, 0]]), - ('TlDaGiGeT', [[0., 0.47855339], - [0., 0.]]), - ('DaGiGaDlT', [[0.47855339, 0], - [0, 0]]) -]) +@pytest.mark.parametrize( + "metapath,expected", + [ + ("DaGiGaD", [[0.0, 0.47855339], [0.47855339, 0.0]]), + ("TeGiGeT", [[0, 0], [0, 0]]), + ("DaGiGeTlD", [[0, 0], [0, 0]]), + ("DaGeTeGaD", [[0, 0], [0, 0]]), + ("TlDaGiGeT", [[0.0, 0.47855339], [0.0, 0.0]]), + ("DaGiGaDlT", [[0.47855339, 0], [0, 0]]), + ], +) def test__dwpc_baab(metapath, expected): exp_row, exp_col = get_nodes(metapath) - graph = get_graph('disease-gene-example') + graph = get_graph("disease-gene-example") metapath = graph.metagraph.metapath_from_abbrev(metapath) - row, col, dwpc_matrix = _dwpc_baab(graph, metapath, damping=0.5, - dense_threshold=1) + row, col, dwpc_matrix = _dwpc_baab(graph, metapath, damping=0.5, dense_threshold=1) expected = numpy.array(expected, dtype=numpy.float64) @@ -228,70 +687,73 @@ def test__dwpc_baab(metapath, expected): def get_baba_matrices(metapath): node_dict = { - 'G': ['CXCR4', 'IL2RA', 'IRF1', 'IRF8', 'ITCH', 'STAT3', 'SUMO1'], - 'D': ["Crohn's Disease", 'Multiple Sclerosis'], - 'T': 
['Leukocyte', 'Lung'] + "G": ["CXCR4", "IL2RA", "IRF1", "IRF8", "ITCH", "STAT3", "SUMO1"], + "D": ["Crohn's Disease", "Multiple Sclerosis"], + "T": ["Leukocyte", "Lung"], } edge_dict = { - 0: [[0.08838835, 0], + 0: [ + [0.08838835, 0], [0.08838835, 0], [0, 0.125], [0.08838835, 0], - [0, 0], - [0, 0], - [0, 0]], - 1: [[0, 0], - [0, 0]], - 2: [[0, 0], [0, 0], [0, 0], [0, 0], - [0, 0], - [0, 0], - [0, 0]], - 3: [[0.25, 0.], - [0.25, 0.], - [0., 0.], - [0.25, 0.], - [0., 0.], - [0.1767767, 0.], - [0., 0.]], - 4: [[0., 0.], - [0., 0.], - [0.125, 0.], - [0., 0.], - [0., 0.], - [0., 0.], - [0., 0.]], - 5: [[0., 0.], - [0., 0.], - [0., 0.], - [0., 0.], - [0., 0.], - [0., 0.25], - [0., 0.]], - 6: [[0., 0.], - [0., 0.], - [0.125, 0.], - [0., 0.], - [0., 0.], - [0., 0.], - [0., 0.]] + ], + 1: [[0, 0], [0, 0]], + 2: [[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]], + 3: [ + [0.25, 0.0], + [0.25, 0.0], + [0.0, 0.0], + [0.25, 0.0], + [0.0, 0.0], + [0.1767767, 0.0], + [0.0, 0.0], + ], + 4: [ + [0.0, 0.0], + [0.0, 0.0], + [0.125, 0.0], + [0.0, 0.0], + [0.0, 0.0], + [0.0, 0.0], + [0.0, 0.0], + ], + 5: [ + [0.0, 0.0], + [0.0, 0.0], + [0.0, 0.0], + [0.0, 0.0], + [0.0, 0.0], + [0.0, 0.25], + [0.0, 0.0], + ], + 6: [ + [0.0, 0.0], + [0.0, 0.0], + [0.125, 0.0], + [0.0, 0.0], + [0.0, 0.0], + [0.0, 0.0], + [0.0, 0.0], + ], } mat_dict = { - 'GaDaGaD': (0, 0), - 'DaGaDaG': (0, 1), - 'DlTlDlT': (1, 0), - 'TlDlTlD': (1, 1), - 'GeTeGeT': (2, 0), - 'TeGeTeG': (2, 1), - 'GaDlTeGaD': (3, 0), - 'DaGeTlDaG': (3, 1), - 'GeTlDaGaD': (4, 0), - 'DaGaDlTeG': (4, 1), - 'GaDaGeTlD': (5, 0), - 'DlTeGaDaG': (5, 1), - 'TlDaGaDaG': (6, 1) + "GaDaGaD": (0, 0), + "DaGaDaG": (0, 1), + "DlTlDlT": (1, 0), + "TlDlTlD": (1, 1), + "GeTeGeT": (2, 0), + "TeGeTeG": (2, 1), + "GaDlTeGaD": (3, 0), + "DaGeTlDaG": (3, 1), + "GeTlDaGaD": (4, 0), + "DaGaDlTeG": (4, 1), + "GaDaGeTlD": (5, 0), + "DlTeGaDaG": (5, 1), + "TlDaGaDaG": (6, 1), } first = node_dict[metapath[0]] last = node_dict[metapath[-1]] @@ -302,18 +764,28 @@ def get_baba_matrices(metapath): return first, last, adj -@pytest.mark.parametrize('m_path', ('GaDaGaD', 'DaGaDaG', 'DlTlDlT', - 'TlDlTlD', 'GeTeGeT', 'TeGeTeG', - 'GaDlTeGaD', 'GeTlDaGaD', 'GaDaGeTlD', - 'TlDaGaDaG')) +@pytest.mark.parametrize( + "m_path", + ( + "GaDaGaD", + "DaGaDaG", + "DlTlDlT", + "TlDlTlD", + "GeTeGeT", + "TeGeTeG", + "GaDlTeGaD", + "GeTlDaGaD", + "GaDaGeTlD", + "TlDaGaDaG", + ), +) def test__dwpc_baba(m_path): - graph = get_graph('disease-gene-example') + graph = get_graph("disease-gene-example") metagraph = graph.metagraph metapath = metagraph.metapath_from_abbrev(m_path) row_sol, col_sol, adj_sol = get_baba_matrices(m_path) - row, col, dwpc_matrix = _dwpc_baba(graph, metapath, damping=0.5, - dense_threshold=0) + row, col, dwpc_matrix = _dwpc_baba(graph, metapath, damping=0.5, dense_threshold=0) assert row_sol == row assert col_sol == col @@ -321,45 +793,51 @@ def test__dwpc_baba(m_path): def get_general_solutions(length): - genes = ['CXCR4', 'IL2RA', 'IRF1', 'IRF8', 'ITCH', 'STAT3', 'SUMO1'] + genes = ["CXCR4", "IL2RA", "IRF1", "IRF8", "ITCH", "STAT3", "SUMO1"] mat_dict = { - 0: [[0, 0, 0.35355339, 0, 0.70710678, 0, 0], + 0: [ + [0, 0, 0.35355339, 0, 0.70710678, 0, 0], [0, 0, 0.5, 0, 0, 0, 0], [0.35355339, 0.5, 0, 0.5, 0, 0, 0.5], [0, 0, 0.5, 0, 0, 0, 0], [0.70710678, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], - [0, 0, 0.5, 0, 0, 0, 0]], - 1: [[0, 0.1767767, 0, 0.1767767, 0, 0, 0.1767767], + [0, 0, 0.5, 0, 0, 0, 0], + ], + 1: [ + [0, 0.1767767, 0, 0.1767767, 0, 0, 0.1767767], [0.1767767, 0, 0, 0.25, 0, 
0, 0.25], [0, 0, 0, 0, 0.25, 0, 0], [0.1767767, 0.25, 0, 0, 0, 0, 0.25], [0, 0, 0.25, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0], - [0.1767767, 0.25, 0, 0.25, 0, 0, 0]], - 2: [[0, 0, 0, 0, 0, 0, 0], + [0.1767767, 0.25, 0, 0.25, 0, 0, 0], + ], + 2: [ + [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0.125, 0, 0], [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0.125, 0, 0], [0, 0.125, 0, 0.125, 0, 0, 0.125], [0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0.125, 0, 0]], + [0, 0, 0, 0, 0.125, 0, 0], + ], 3: numpy.zeros((7, 7)), 4: numpy.zeros((7, 7)), - 5: numpy.zeros((7, 7)) + 5: numpy.zeros((7, 7)), } return genes, genes, mat_dict[length] -@pytest.mark.parametrize('length', list(range(6))) +@pytest.mark.parametrize("length", list(range(6))) def test__dwpc_general_case(length): """ Test the functionality of dwpc_same_metanode to find DWPC within a metapath (segment) of metanode and metaedge repeats. """ - graph = get_graph('disease-gene-example') + graph = get_graph("disease-gene-example") metagraph = graph.metagraph - m_path = 'GiG' + length * 'iG' + m_path = "GiG" + length * "iG" metapath = metagraph.metapath_from_abbrev(m_path) rows, cols, dwpc_mat = _dwpc_general_case(graph, metapath, damping=0.5) exp_row, exp_col, exp_dwpc = get_general_solutions(length) @@ -370,83 +848,90 @@ def test__dwpc_general_case(length): assert cols == exp_col -@pytest.mark.parametrize('metapath,solution', [ - ('GiG', 'short_repeat'), - ('GiGiGiG', 'four_repeat'), - ('G' + 10 * 'iG', 'long_repeat'), - ('GiGiGcGcG', 'long_repeat'), # iicc - ('GiGcGcGiG', 'long_repeat'), # icci - ('GcGiGcGaDrD', 'disjoint'), # cicDD - ('GcGiGaDrDrD', 'disjoint'), # ciDDD - ('CpDaG', 'no_repeats'), # ABC - ('DaGiGaDaG', 'other'), # ABBAB - ('DaGiGbC', 'short_repeat'), # ABBC - ('DaGiGaD', 'BAAB'), # ABBA - ('GeAlDlAeG', 'BAAB'), # ABCBA - ('CbGaDrDaGeA', 'BAAB'), # ABCCBD - ('AlDlAlD', 'BABA'), # ABAB - ('CrCbGbCbG', 'other'), # BBABA - ('CbGiGbCrC', 'other'), - ('CbGiGiGbC', 'BAAB'), - ('CbGbCbGbC', 'other'), - ('CrCbGiGbC', 'other'), - ('CrCbGbCbG', 'other'), - ('CbGaDaGeAlD', 'BABA'), # ABCBDC - ('AlDaGiG', 'short_repeat'), # ABCC - ('AeGaDaGiG', 'short_repeat'), # ABCB - ('CbGaDpCbGaD', 'other'), # ABCABC - ('DaGiGiGiGiGaD', 'other'), # ABBBBBA - ('CbGaDrDaGbC', 'BAAB'), # ABCCBA - ('DlAuGcGpBPpGaDlA', 'other'), # ABCCDCAB - ('CrCbGiGaDrD', 'disjoint'), # AABBCC - ('SEcCpDaGeAeGaDtC', 'BAAB'), - ('CbGiGiGbC', 'BAAB'), - ('CbGbCbGbC', 'other')]) # ABABA +@pytest.mark.parametrize( + "metapath,solution", + [ + ("GiG", "short_repeat"), + ("GiGiGiG", "four_repeat"), + ("G" + 10 * "iG", "long_repeat"), + ("GiGiGcGcG", "long_repeat"), # iicc + ("GiGcGcGiG", "long_repeat"), # icci + ("GcGiGcGaDrD", "disjoint"), # cicDD + ("GcGiGaDrDrD", "disjoint"), # ciDDD + ("CpDaG", "no_repeats"), # ABC + ("DaGiGaDaG", "other"), # ABBAB + ("DaGiGbC", "short_repeat"), # ABBC + ("DaGiGaD", "BAAB"), # ABBA + ("GeAlDlAeG", "BAAB"), # ABCBA + ("CbGaDrDaGeA", "BAAB"), # ABCCBD + ("AlDlAlD", "BABA"), # ABAB + ("CrCbGbCbG", "other"), # BBABA + ("CbGiGbCrC", "other"), + ("CbGiGiGbC", "BAAB"), + ("CbGbCbGbC", "other"), + ("CrCbGiGbC", "other"), + ("CrCbGbCbG", "other"), + ("CbGaDaGeAlD", "BABA"), # ABCBDC + ("AlDaGiG", "short_repeat"), # ABCC + ("AeGaDaGiG", "short_repeat"), # ABCB + ("CbGaDpCbGaD", "other"), # ABCABC + ("DaGiGiGiGiGaD", "other"), # ABBBBBA + ("CbGaDrDaGbC", "BAAB"), # ABCCBA + ("DlAuGcGpBPpGaDlA", "other"), # ABCCDCAB + ("CrCbGiGaDrD", "disjoint"), # AABBCC + ("SEcCpDaGeAeGaDtC", "BAAB"), + ("CbGiGiGbC", "BAAB"), + ("CbGbCbGbC", "other"), + ], +) # ABABA def 
test_categorize(metapath, solution): - url = 'https://github.com/hetio/hetnetpy/raw/{}/{}'.format( - '9dc747b8fc4e23ef3437829ffde4d047f2e1bdde', - 'test/data/hetionet-v1.0-metagraph.json', + url = "https://github.com/hetio/hetnetpy/raw/{}/{}".format( + "9dc747b8fc4e23ef3437829ffde4d047f2e1bdde", + "test/data/hetionet-v1.0-metagraph.json", ) metagraph = hetnetpy.readwrite.read_metagraph(url) metapath = metagraph.metapath_from_abbrev(metapath) assert categorize(metapath) == solution -@pytest.mark.parametrize('metapath,solution', [ - ('AeGiGaDaG', '[AeG, GiGaDaG]'), # short_repeat - ('AeGaDaGiG', '[AeG, GaDaGiG]'), # short_repeat other direction - ('CpDrDdGdD', '[CpD, DrDdGdD]'), - ('AeGiGeAlD', '[AeG, GiG, GeA, AlD]'), # BAABC - ('AeGiGaDlA', '[AeG, GiG, GaDlA]'), - ('DaGaDaG', '[DaG, GaD, DaG]'), # BABA - ('CbGeAlDaGbC', '[CbG, GeAlDaG, GbC]'), - ('SEcCpDaGeAeGaDtC', '[SEcC, CpD, DaG, GeAeG, GaD, DtC]'), - ('DlAeGaDaG', '[DlAeG, GaD, DaG]'), # BCABA - ('GaDlAeGaD', '[GaD, DlAeG, GaD]'), # BACBA - ('GiGiG', '[GiGiG]'), # short_repeat - ('GiGiGiG', '[GiG, GiG, GiG]'), # four_repeat - ('CrCbGiGiGaDrDlA', '[CrC, CbG, GiGiG, GaD, DrD, DlA]'), - ('CrCrCbGiGeAlDrD', '[CrCrC, CbG, GiG, GeAlD, DrD]'), - ('SEcCrCrCbGiGeAlDrDpS', '[SEcC, CrCrC, CbG, GiG, GeAlD, DrD, DpS]'), - ('SEcCrCrCrC', '[SEcC, CrC, CrC, CrC]'), - ('GiGaDaG', '[GiGaDaG]'), - ('CrCbGiGbC', '[CrC, CbG, GiG, GbC]'), # OTHER - ('GbCpDrDaG', '[GbCpD, DrD, DaG]'), - ('CbGiGiGbC', '[CbG, GiGiG, GbC]'), - ('CbGiGiGiGiGbC', '[CbG, GiGiGiGiG, GbC]'), # OTHER - ('CbGaDaGiGiGbCrC', '[CbG, GaDaGiGiG, GbC, CrC]'), # OTHER - ('CbGiGiGiGbCbG', '[CbG, GiGiGiG, GbC, CbG]'), - ('CbGiGiGbCpD', '[CbG, GiGiG, GbC, CpD]'), - ('CbGaDaGaDpC', '[CbG, GaDaGaD, DpC]'), - ('GaDaGaD', '[GaD, DaG, GaD]'), - ('CrCbGaDrDaG', '[CrC, CbG, GaDrDaG]'), - ('CrCbGaDaGaD', '[CrC, CbG, GaDaGaD]'), - ('DlAeGiGaDlA', '[DlA, AeGiGaD, DlA]') -]) +@pytest.mark.parametrize( + "metapath,solution", + [ + ("AeGiGaDaG", "[AeG, GiGaDaG]"), # short_repeat + ("AeGaDaGiG", "[AeG, GaDaGiG]"), # short_repeat other direction + ("CpDrDdGdD", "[CpD, DrDdGdD]"), + ("AeGiGeAlD", "[AeG, GiG, GeA, AlD]"), # BAABC + ("AeGiGaDlA", "[AeG, GiG, GaDlA]"), + ("DaGaDaG", "[DaG, GaD, DaG]"), # BABA + ("CbGeAlDaGbC", "[CbG, GeAlDaG, GbC]"), + ("SEcCpDaGeAeGaDtC", "[SEcC, CpD, DaG, GeAeG, GaD, DtC]"), + ("DlAeGaDaG", "[DlAeG, GaD, DaG]"), # BCABA + ("GaDlAeGaD", "[GaD, DlAeG, GaD]"), # BACBA + ("GiGiG", "[GiGiG]"), # short_repeat + ("GiGiGiG", "[GiG, GiG, GiG]"), # four_repeat + ("CrCbGiGiGaDrDlA", "[CrC, CbG, GiGiG, GaD, DrD, DlA]"), + ("CrCrCbGiGeAlDrD", "[CrCrC, CbG, GiG, GeAlD, DrD]"), + ("SEcCrCrCbGiGeAlDrDpS", "[SEcC, CrCrC, CbG, GiG, GeAlD, DrD, DpS]"), + ("SEcCrCrCrC", "[SEcC, CrC, CrC, CrC]"), + ("GiGaDaG", "[GiGaDaG]"), + ("CrCbGiGbC", "[CrC, CbG, GiG, GbC]"), # OTHER + ("GbCpDrDaG", "[GbCpD, DrD, DaG]"), + ("CbGiGiGbC", "[CbG, GiGiG, GbC]"), + ("CbGiGiGiGiGbC", "[CbG, GiGiGiGiG, GbC]"), # OTHER + ("CbGaDaGiGiGbCrC", "[CbG, GaDaGiGiG, GbC, CrC]"), # OTHER + ("CbGiGiGiGbCbG", "[CbG, GiGiGiG, GbC, CbG]"), + ("CbGiGiGbCpD", "[CbG, GiGiG, GbC, CpD]"), + ("CbGaDaGaDpC", "[CbG, GaDaGaD, DpC]"), + ("GaDaGaD", "[GaD, DaG, GaD]"), + ("CrCbGaDrDaG", "[CrC, CbG, GaDrDaG]"), + ("CrCbGaDaGaD", "[CrC, CbG, GaDaGaD]"), + ("DlAeGiGaDlA", "[DlA, AeGiGaD, DlA]"), + ], +) def test_get_segments(metapath, solution): - url = 'https://github.com/hetio/hetnetpy/raw/{}/{}'.format( - '9dc747b8fc4e23ef3437829ffde4d047f2e1bdde', - 'test/data/hetionet-v1.0-metagraph.json', + url = 
"https://github.com/hetio/hetnetpy/raw/{}/{}".format( + "9dc747b8fc4e23ef3437829ffde4d047f2e1bdde", + "test/data/hetionet-v1.0-metagraph.json", ) metagraph = hetnetpy.readwrite.read_metagraph(url) metapath = metagraph.metapath_from_abbrev(metapath) @@ -454,134 +939,157 @@ def test_get_segments(metapath, solution): assert output == solution -@pytest.mark.parametrize('dense_threshold', [0, 1]) -@pytest.mark.parametrize('metapath,expected', [ - ('DaGiGiG', [[0., 0., 0., 0., 0.1767767, 0., 0.], - [0.1767767, 0.21338835, 0., 0.21338835, 0., 0., 0.33838835]]), - ('DaGiGiGaD', [[0, 0], - [0, 0]]), - ('DaGiGaD', [[0., 0.47855339], - [0.47855339, 0.]]), - ('TeGiGeT', [[0, 0], - [0, 0]]), - ('DaGiGeTlD', [[0, 0], - [0, 0]]), - ('DaGeTeGaD', [[0, 0], - [0, 0]]), - ('TlDaGiGeT', [[0., 0.47855339], - [0., 0.]]), - ('DaGeT', [[0.5, 0.5], - [0, 0]]), - ('DlTeG', [[0, 0, 0, 0, 0, 0, 0], - [0, 0, 0.70710678, 0, 0, 0, 0]]), - ('GeTlD', [[0, 0], - [0, 0], - [0, 0.70710678], - [0, 0], - [0, 0], - [0, 0], - [0, 0]]), - ('GaDlT', [[0.5, 0], - [0.5, 0], - [0, 0], - [0.5, 0], - [0, 0], - [0.35355339, 0], - [0, 0]]), - ('TeGaD', [[0.5, 0], - [0.5, 0]]), - ('TlDaG', [[0.5, 0.5, 0, 0.5, 0, 0.35355339, 0], - [0, 0, 0, 0, 0, 0, 0]]), - ('GiG', [[0., 0., 0.35355339, 0., 0.70710678, 0., 0.], - [0., 0., 0.5, 0., 0., 0., 0.], - [0.35355339, 0.5, 0., 0.5, 0., 0., 0.5], - [0., 0., 0.5, 0., 0., 0., 0.], - [0.70710678, 0., 0., 0., 0., 0., 0.], - [0., 0., 0., 0., 0., 0., 0.], - [0., 0., 0.5, 0., 0., 0., 0.]]), - ('GaDaG', [[0., 0.25, 0., 0.25, 0., 0.1767767, 0.], - [0.25, 0., 0., 0.25, 0., 0.1767767, 0.], - [0., 0., 0., 0., 0., 0.35355339, 0.], - [0.25, 0.25, 0., 0., 0., 0.1767767, 0.], - [0., 0., 0., 0., 0., 0., 0.], - [0.1767767, 0.1767767, 0.35355339, 0.1767767, 0., 0., 0.], - [0., 0., 0., 0., 0., 0., 0.]]), - ('GiGiG', [[0, 0.1767767, 0, 0.1767767, 0, 0, 0.1767767], - [0.1767767, 0, 0, 0.25, 0, 0, 0.25], - [0, 0, 0, 0, 0.25, 0, 0], - [0.1767767, 0.25, 0, 0, 0, 0, 0.25], - [0, 0, 0.25, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0], - [0.1767767, 0.25, 0, 0.25, 0, 0, 0]]), - ('GiGiGiG', [[0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0.125, 0, 0], - [0, 0, 0, 0, 0, 0, 0], - [0, 0., 0, 0, 0.125, 0, 0], - [0, 0.125, 0, 0.125, 0, 0, 0.125], - [0, 0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0.125, 0, 0]]), - ('GaDaGaD', [[0.08838835, 0], # BABA - [0.08838835, 0], - [0, 0.125], - [0.08838835, 0], - [0, 0], - [0, 0], - [0, 0]]), - ('DlTlDlT', [[0, 0], # BABA - [0, 0]]), - ('TlDlTlD', [[0, 0], # BABA - [0, 0]]), - ('GeTeGeT', [[0, 0], # BABA - [0, 0], - [0, 0], - [0, 0], - [0, 0], - [0, 0], - [0, 0]]), - ('GaDlTeGaD', [[0.25, 0.], # BA C BA - [0.25, 0.], - [0., 0.], - [0.25, 0.], - [0., 0.], - [0.1767767, 0.], - [0., 0.]]), - ('GeTlDaGaD', [[0., 0.], # B C ABA - [0., 0.], - [0.125, 0.], - [0., 0.], - [0., 0.], - [0., 0.], - [0., 0.]]), - ('GaDaGeTlD', [[0., 0.], # BAB C A - [0., 0.], - [0., 0.], - [0., 0.], - [0., 0.], - [0., 0.25], - [0., 0.]]), - ('TlDaGaDaGeT', [[0, 0.0883883476], # C BABA C - [0, 0]]), - ('TlDaGiGaDlT', [[0, 0], # C BAAB C - [0, 0]]), - ('TeGiGaDlTlD', [[0, 0], - [0, 0]]), - ('TeGiGaD', [[0., 0.47855339], - [0., 0.47855339]]), - ('TeGaDaG', [[0., 0., 0., 0., 0., 0.25, 0.], - [0., 0., 0., 0., 0., 0.25, 0.]]) -]) +@pytest.mark.parametrize("dense_threshold", [0, 1]) +@pytest.mark.parametrize( + "metapath,expected", + [ + ( + "DaGiGiG", + [ + [0.0, 0.0, 0.0, 0.0, 0.1767767, 0.0, 0.0], + [0.1767767, 0.21338835, 0.0, 0.21338835, 0.0, 0.0, 0.33838835], + ], + ), + ("DaGiGiGaD", [[0, 0], [0, 0]]), + ("DaGiGaD", [[0.0, 0.47855339], 
[0.47855339, 0.0]]), + ("TeGiGeT", [[0, 0], [0, 0]]), + ("DaGiGeTlD", [[0, 0], [0, 0]]), + ("DaGeTeGaD", [[0, 0], [0, 0]]), + ("TlDaGiGeT", [[0.0, 0.47855339], [0.0, 0.0]]), + ("DaGeT", [[0.5, 0.5], [0, 0]]), + ("DlTeG", [[0, 0, 0, 0, 0, 0, 0], [0, 0, 0.70710678, 0, 0, 0, 0]]), + ("GeTlD", [[0, 0], [0, 0], [0, 0.70710678], [0, 0], [0, 0], [0, 0], [0, 0]]), + ( + "GaDlT", + [[0.5, 0], [0.5, 0], [0, 0], [0.5, 0], [0, 0], [0.35355339, 0], [0, 0]], + ), + ("TeGaD", [[0.5, 0], [0.5, 0]]), + ("TlDaG", [[0.5, 0.5, 0, 0.5, 0, 0.35355339, 0], [0, 0, 0, 0, 0, 0, 0]]), + ( + "GiG", + [ + [0.0, 0.0, 0.35355339, 0.0, 0.70710678, 0.0, 0.0], + [0.0, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0], + [0.35355339, 0.5, 0.0, 0.5, 0.0, 0.0, 0.5], + [0.0, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0], + [0.70710678, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0], + ], + ), + ( + "GaDaG", + [ + [0.0, 0.25, 0.0, 0.25, 0.0, 0.1767767, 0.0], + [0.25, 0.0, 0.0, 0.25, 0.0, 0.1767767, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.35355339, 0.0], + [0.25, 0.25, 0.0, 0.0, 0.0, 0.1767767, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + [0.1767767, 0.1767767, 0.35355339, 0.1767767, 0.0, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], + ], + ), + ( + "GiGiG", + [ + [0, 0.1767767, 0, 0.1767767, 0, 0, 0.1767767], + [0.1767767, 0, 0, 0.25, 0, 0, 0.25], + [0, 0, 0, 0, 0.25, 0, 0], + [0.1767767, 0.25, 0, 0, 0, 0, 0.25], + [0, 0, 0.25, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0], + [0.1767767, 0.25, 0, 0.25, 0, 0, 0], + ], + ), + ( + "GiGiGiG", + [ + [0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0.125, 0, 0], + [0, 0, 0, 0, 0, 0, 0], + [0, 0.0, 0, 0, 0.125, 0, 0], + [0, 0.125, 0, 0.125, 0, 0, 0.125], + [0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0.125, 0, 0], + ], + ), + ( + "GaDaGaD", + [ + [0.08838835, 0], # BABA + [0.08838835, 0], + [0, 0.125], + [0.08838835, 0], + [0, 0], + [0, 0], + [0, 0], + ], + ), + ("DlTlDlT", [[0, 0], [0, 0]]), # BABA + ("TlDlTlD", [[0, 0], [0, 0]]), # BABA + ("GeTeGeT", [[0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]]), # BABA + ( + "GaDlTeGaD", + [ + [0.25, 0.0], # BA C BA + [0.25, 0.0], + [0.0, 0.0], + [0.25, 0.0], + [0.0, 0.0], + [0.1767767, 0.0], + [0.0, 0.0], + ], + ), + ( + "GeTlDaGaD", + [ + [0.0, 0.0], # B C ABA + [0.0, 0.0], + [0.125, 0.0], + [0.0, 0.0], + [0.0, 0.0], + [0.0, 0.0], + [0.0, 0.0], + ], + ), + ( + "GaDaGeTlD", + [ + [0.0, 0.0], # BAB C A + [0.0, 0.0], + [0.0, 0.0], + [0.0, 0.0], + [0.0, 0.0], + [0.0, 0.25], + [0.0, 0.0], + ], + ), + ("TlDaGaDaGeT", [[0, 0.0883883476], [0, 0]]), # C BABA C + ("TlDaGiGaDlT", [[0, 0], [0, 0]]), # C BAAB C + ("TeGiGaDlTlD", [[0, 0], [0, 0]]), + ("TeGiGaD", [[0.0, 0.47855339], [0.0, 0.47855339]]), + ( + "TeGaDaG", + [ + [0.0, 0.0, 0.0, 0.0, 0.0, 0.25, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0, 0.25, 0.0], + ], + ), + ], +) def test_dwpc(metapath, expected, dense_threshold): if expected is not None: expected = numpy.array(expected, dtype=numpy.float64) - graph = get_graph('disease-gene-example') + graph = get_graph("disease-gene-example") metapath = graph.metagraph.metapath_from_abbrev(metapath) if expected is None: with pytest.raises(Exception): dwpc(graph, metapath, damping=0.5, dense_threshold=dense_threshold) else: - row, col, dwpc_matrix = dwpc(graph, metapath, damping=0.5, - dense_threshold=dense_threshold) + row, col, dwpc_matrix = dwpc( + graph, metapath, damping=0.5, dense_threshold=dense_threshold + ) assert abs(expected - dwpc_matrix).max() == pytest.approx(0, abs=1e-7) if dense_threshold == 1: assert sparse.issparse(dwpc_matrix) @@ -589,38 
             assert not sparse.issparse(dwpc_matrix)


-@pytest.mark.parametrize('metapath,dtype', [
-    ('TeGaDaG', numpy.float64),
-    ('TeGaDaG', numpy.float32),
-    # ('TeGaDaG', numpy.float16),  # fails due to https://github.com/scipy/scipy/issues/8903
-])
-@pytest.mark.parametrize('dwwc_method', [
-    None,
-    dwwc_sequential,
-    dwwc_recursive,
-    dwwc_chain,
-])
+@pytest.mark.parametrize(
+    "metapath,dtype",
+    [
+        ("TeGaDaG", numpy.float64),
+        ("TeGaDaG", numpy.float32),
+        # ('TeGaDaG', numpy.float16),  # fails due to https://github.com/scipy/scipy/issues/8903
+    ],
+)
+@pytest.mark.parametrize(
+    "dwwc_method",
+    [
+        None,
+        dwwc_sequential,
+        dwwc_recursive,
+        dwwc_chain,
+    ],
+)
 def test_dtype(metapath, dtype, dwwc_method):
-    graph = get_graph('disease-gene-example')
+    graph = get_graph("disease-gene-example")
     metapath = graph.metagraph.metapath_from_abbrev(metapath)
-    rows, cols, dwpc_matrix = dwpc(graph, metapath, dtype=dtype, dwwc_method=dwwc_method)
+    rows, cols, dwpc_matrix = dwpc(
+        graph, metapath, dtype=dtype, dwwc_method=dwwc_method
+    )
     assert dwpc_matrix.dtype == dtype


-@pytest.mark.parametrize('metapath,relative', [
-    ('DrDaGiG', 'equal'),
-    ('DaGiGaD', 'equal'),
-    ('DaGaDrDaGaD', 'not_equal'),
-    ('CrCpDrD', 'equal')
-])
+@pytest.mark.parametrize(
+    "metapath,relative",
+    [
+        ("DrDaGiG", "equal"),
+        ("DaGiGaD", "equal"),
+        ("DaGaDrDaGaD", "not_equal"),
+        ("CrCpDrD", "equal"),
+    ],
+)
 def test_dwpc_approx(metapath, relative):
-    graph = get_graph('random-subgraph')
+    graph = get_graph("random-subgraph")
     metapath = graph.metagraph.metapath_from_abbrev(metapath)
     rows, cols, dwpc_matrix = dwpc(graph, metapath)
     rows, cols, dwpc_approx = _dwpc_approx(graph, metapath)
     rows, cols, dwwc_matrix = dwwc(graph, metapath)
-    if relative == 'equal':
-        assert abs((dwpc_approx - dwpc_matrix)).max() == pytest.approx(0, abs=1e-7)
+    if relative == "equal":
+        assert abs(dwpc_approx - dwpc_matrix).max() == pytest.approx(0, abs=1e-7)
     else:
-        assert numpy.sum((dwpc_approx - dwpc_matrix)) >= 0
-        assert abs((dwwc_matrix - dwpc_approx)).max() >= 0
diff --git a/hetmatpy/tests/test_diffusion.py b/hetmatpy/tests/test_diffusion.py
index d29e213..b3618a0 100644
--- a/hetmatpy/tests/test_diffusion.py
+++ b/hetmatpy/tests/test_diffusion.py
@@ -9,7 +9,7 @@ class TestDualNormalize:
     Test hetmatpy.diffusion.dual_normalize()
     """

-    def get_clean_matrix(self, dtype='float64'):
+    def get_clean_matrix(self, dtype="float64"):
         """Return a newly allocated matrix."""
         matrix = [
             [1, 1, 1],
@@ -19,24 +19,24 @@ def get_clean_matrix(self, dtype='float64'):
         matrix = numpy.array(matrix, dtype=dtype)
         return matrix

-    @pytest.mark.parametrize('dtype', ['bool_', 'int8', 'float64'])
+    @pytest.mark.parametrize("dtype", ["bool_", "int8", "float64"])
     def test_diffusion_step_passthrough(self, dtype):
         """Should not change matrix"""
         matrix = self.get_clean_matrix(dtype)
         output = diffusion_step(matrix, 0.0, 0.0)
         assert numpy.array_equal(output, matrix)

-    @pytest.mark.parametrize('exponent', [0, 0.3, 0.5, 1, 2, 20])
+    @pytest.mark.parametrize("exponent", [0, 0.3, 0.5, 1, 2, 20])
     def test_diffusion_step_row_or_column_damping(self, exponent):
         """Test row, column damping individually"""
         # Create the matrix expected for single normalization
         p = exponent  # for easier reading
         expect = [
-            [1/3**p, 1/3**p, 1/3**p],
-            [1/2**p, 1/2**p, 0],
+            [1 / 3**p, 1 / 3**p, 1 / 3**p],
+            [1 / 2**p, 1 / 2**p, 0],
             [1, 0, 0],
         ]
-        expect = numpy.array(expect, dtype='float64')
+        expect = numpy.array(expect, dtype="float64")

         # Test row normalization works as expected
         input_matrix = self.get_clean_matrix()
@@ -48,25 +48,28 @@ def test_diffusion_step_row_or_column_damping(self, exponent):
         matrix = diffusion_step(input_matrix, column_damping=exponent)
         assert numpy.allclose(numpy.transpose(expect), matrix)

-    @pytest.mark.parametrize('row_damping', [0, 0.3, 0.5, 1, 2])
-    @pytest.mark.parametrize('column_damping', [0, 0.3, 0.5, 1, 2])
-    def test_diffusion_step_row_and_column_damping(
-            self, row_damping, column_damping):
+    @pytest.mark.parametrize("row_damping", [0, 0.3, 0.5, 1, 2])
+    @pytest.mark.parametrize("column_damping", [0, 0.3, 0.5, 1, 2])
+    def test_diffusion_step_row_and_column_damping(self, row_damping, column_damping):
         """Test simultaneous row and column damping"""
         input_matrix = self.get_clean_matrix()

         # Create the matrix expected for simultaneous dual normalization
         pr, pc = row_damping, column_damping
         expect = [
-            [(1/3**pc) / (1/3**pc + 1/2**pc + 1)**pr,
-             (1/2**pc) / (1/3**pc + 1/2**pc + 1)**pr,
-             1 / (1/3**pc + 1/2**pc + 1)**pr],
-            [(1/3**pc) / (1/3**pc + 1/2**pc)**pr,
-             (1/2**pc) / (1/3**pc + 1/2**pc)**pr,
-             0],
-            [(1/3**pc) / (1/3**pc)**pr, 0, 0],
+            [
+                (1 / 3**pc) / (1 / 3**pc + 1 / 2**pc + 1) ** pr,
+                (1 / 2**pc) / (1 / 3**pc + 1 / 2**pc + 1) ** pr,
+                1 / (1 / 3**pc + 1 / 2**pc + 1) ** pr,
+            ],
+            [
+                (1 / 3**pc) / (1 / 3**pc + 1 / 2**pc) ** pr,
+                (1 / 2**pc) / (1 / 3**pc + 1 / 2**pc) ** pr,
+                0,
+            ],
+            [(1 / 3**pc) / (1 / 3**pc) ** pr, 0, 0],
         ]
-        expect = numpy.array(expect, dtype='float64')
+        expect = numpy.array(expect, dtype="float64")

         matrix = diffusion_step(input_matrix, row_damping, column_damping)
         assert numpy.allclose(expect, matrix)
@@ -74,24 +77,25 @@ def test_diffusion_step_row_and_column_damping(
     def get_problem_matrix(type):
         """Return a problematic matrix of specified type"""
         matrix_dict = {
-            'empty_row': numpy.array([[1, 2], [3, 4], []], dtype=object),
-            'empty_matrix': numpy.array([[], [], []]),
-            'nan_matrix': numpy.array([[numpy.nan, numpy.nan], [1, 0.5]]),
-            'infinite_matrix': numpy.array([[numpy.inf, numpy.inf], [1, 0.5]]),
-            'zero_matrix': numpy.zeros((3, 3))
+            "empty_row": numpy.array([[1, 2], [3, 4], []], dtype=object),
+            "empty_matrix": numpy.array([[], [], []]),
+            "nan_matrix": numpy.array([[numpy.nan, numpy.nan], [1, 0.5]]),
+            "infinite_matrix": numpy.array([[numpy.inf, numpy.inf], [1, 0.5]]),
+            "zero_matrix": numpy.zeros((3, 3)),
         }
         return matrix_dict[type]

-    @pytest.mark.parametrize('corner_type', ['empty_row', 'empty_matrix',
-                                             'nan_matrix', 'infinite_matrix',
-                                             'zero_matrix'])
+    @pytest.mark.parametrize(
+        "corner_type",
+        ["empty_row", "empty_matrix", "nan_matrix", "infinite_matrix", "zero_matrix"],
+    )
     def test_corner_cases(self, corner_type):
         """
         Test that errors are returned for matrices with errors like NaN, Inf,
         emptiness, etc.
""" input_matrix = self.get_problem_matrix(corner_type) - if corner_type == 'zero_matrix': # Assert output is all zero + if corner_type == "zero_matrix": # Assert output is all zero assert not diffusion_step(input_matrix, 0.5, 0.5).any() else: with pytest.raises(AssertionError): diff --git a/hetmatpy/tests/test_hetnetpy.py b/hetmatpy/tests/test_hetnetpy.py index 94e9c0d..8801f61 100644 --- a/hetmatpy/tests/test_hetnetpy.py +++ b/hetmatpy/tests/test_hetnetpy.py @@ -3,7 +3,8 @@ def test_hetnetpy_imports(): Test hetnetpy module imports (module formerly named hetio) """ import hetnetpy - import hetnetpy.readwrite import hetnetpy.hetnet + import hetnetpy.readwrite + # Create an empty metagraph hetnetpy.hetnet.MetaGraph() diff --git a/hetmatpy/tests/test_path_count.py b/hetmatpy/tests/test_path_count.py index c4321a9..e076682 100644 --- a/hetmatpy/tests/test_path_count.py +++ b/hetmatpy/tests/test_path_count.py @@ -1,9 +1,9 @@ import platform +import hetnetpy.pathtools import numpy import pytest -import hetnetpy.pathtools from hetmatpy.degree_weight import dwpc from hetmatpy.testing import get_graph @@ -14,10 +14,10 @@ def test_CbGpPWpGaD_traversal(): metapath between bupropion and nicotine dependence. Expected values from the network traversal methods at https://git.io/vHBh2. """ - graph = get_graph('bupropion-subgraph') - compound = 'DB01156' # Bupropion - disease = 'DOID:0050742' # nicotine dependence - metapath = graph.metagraph.metapath_from_abbrev('CbGpPWpGaD') + graph = get_graph("bupropion-subgraph") + compound = "DB01156" # Bupropion + disease = "DOID:0050742" # nicotine dependence + metapath = graph.metagraph.metapath_from_abbrev("CbGpPWpGaD") rows, cols, pc_matrix = dwpc(graph, metapath, damping=0) rows, cols, dwpc_matrix = dwpc(graph, metapath, damping=0.4) i = rows.index(compound) @@ -33,14 +33,14 @@ def test_CbGiGiGaD_traversal(): intended to correspond to the values from the entire Hetionet v1.0. Hence, the expected values are generated using hetnetpy.pathtools. 
""" - graph = get_graph('bupropion-subgraph') - compound = 'DB01156' # Bupropion - disease = 'DOID:0050742' # nicotine dependence - metapath = graph.metagraph.metapath_from_abbrev('CbGiGiGaD') + graph = get_graph("bupropion-subgraph") + compound = "DB01156" # Bupropion + disease = "DOID:0050742" # nicotine dependence + metapath = graph.metagraph.metapath_from_abbrev("CbGiGiGaD") paths = hetnetpy.pathtools.paths_between( graph, - source=('Compound', compound), - target=('Disease', disease), + source=("Compound", compound), + target=("Disease", disease), metapath=metapath, duplicates=False, ) @@ -55,16 +55,19 @@ def test_CbGiGiGaD_traversal(): assert dwpc_matrix[i, j] == pytest.approx(hetnetpy_dwpc) -@pytest.mark.parametrize('metapath', [ - 'CbGaD', - 'CbGbCtD', - 'CrCtD', - 'CtDrD', - 'CuGr>GuD', - 'CuGGuD", + "CuG") - graph = get_graph('random-subgraph') + graph = get_graph("random-subgraph") graph_or_hetmat = graph if hetmat: - graph_or_hetmat = get_graph('random-subgraph', hetmat=hetmat, directory=tmpdir) + graph_or_hetmat = get_graph("random-subgraph", hetmat=hetmat, directory=tmpdir) metapath = graph.metagraph.metapath_from_abbrev(metapath) # Matrix computations @@ -92,8 +95,8 @@ def test_path_traversal(metapath, hetmat, tmpdir): # hetnetpy.pathtools computations paths = hetnetpy.pathtools.paths_between( graph, - source=('Compound', compound), - target=('Disease', disease), + source=("Compound", compound), + target=("Disease", disease), metapath=metapath, duplicates=False, ) diff --git a/hetmatpy/tests/test_pipeline.py b/hetmatpy/tests/test_pipeline.py index 84d0cb6..d819fcb 100644 --- a/hetmatpy/tests/test_pipeline.py +++ b/hetmatpy/tests/test_pipeline.py @@ -56,118 +56,147 @@ def test_grouper_length_1(): ] -@pytest.mark.parametrize('sum_of_squares, unsquared_sum, number_nonzero, expected_output', [ - # Sum of squares and unsquared sum are from a pair of the same number, so return zero - (32.0, 8.0, 2, 0.0), - # Sum of squares and unsquared sum are very close to the case above, so return zero - (32.0, 8.0 + 1e-6, 2, 0.0), - # Only one nonzero observation, so return None - (5.0, 5.0, 1, None), - # Test that the standard deviation of 5, 4, and 3 is 1 - (50.0, 12.0, 3, 1.0), - # Test no nonzero values - (0.0, 0.0, 0, None), -]) +@pytest.mark.parametrize( + "sum_of_squares, unsquared_sum, number_nonzero, expected_output", + [ + # Sum of squares and unsquared sum are from a pair of the same number, so return zero + (32.0, 8.0, 2, 0.0), + # Sum of squares and unsquared sum are very close to the case above, so return zero + (32.0, 8.0 + 1e-6, 2, 0.0), + # Only one nonzero observation, so return None + (5.0, 5.0, 1, None), + # Test that the standard deviation of 5, 4, and 3 is 1 + (50.0, 12.0, 3, 1.0), + # Test no nonzero values + (0.0, 0.0, 0, None), + ], +) def test_calculate_sd(sum_of_squares, unsquared_sum, number_nonzero, expected_output): - assert calculate_sd(sum_of_squares, unsquared_sum, number_nonzero) == expected_output - - -@pytest.mark.parametrize('row, expected_output', [ - # zero path count - ({ - 'path_count': 0, - 'sd_nz': 2.0, - 'dwpc': 4.0, - 'nnz': 1, - 'n': 4, - 'alpha': 1.0, - 'beta': 2.0, - 'sum': 1.0, - }, 1.0), - # zero standard deviation with dwpc lower than mean - ({ - 'path_count': 5, - 'sd_nz': 0.0, - 'dwpc': 2.0, - 'mean_nz': 3.0, - 'nnz': 3, - 'n': 8, - 'alpha': 1.0, - 'beta': 2.0, - 'sum': 1.0, - }, .375), - # zero standard deviation with dwpc higher than mean - ({ - 'path_count': 5, - 'sd_nz': 0.0, - 'dwpc': 4.0, - 'mean_nz': 3.0, - 'nnz': 1, - 'n': 4, - 
-        'alpha': 1.0,
-        'beta': 2.0,
-        'sum': 1.0,
-    }, 0.0),
-    # normal gamma hurdle case
-    ({
-        'path_count': 5,
-        'sd_nz': 1.0,
-        'dwpc': 2.5,
-        'nnz': 1,
-        'n': 10,
-        'alpha': 1.0,
-        'beta': 1.0,
-        'sum': 1.0,
-    }, .008208),
-    # number nonzero is itself zero
-    ({
-        'path_count': 5,
-        'sd_nz': 0.0,
-        'dwpc': 2.0,
-        'nnz': 0,
-        'n': 4,
-        'alpha': 1.0,
-        'beta': 2.0,
-        'sum': 0.0,
-    }, 0.0),
-    # dwpc slightly larger than mean_nz, but within float error tolerance
-    ({
-        'source_id': 'DB00193',
-        'target_id': 'DOID:0050425',
-        'source_name': 'Tramadol',
-        'target_name': 'restless legs syndrome',
-        'source_degree': 1,
-        'target_degree': 10,
-        'path_count': 1,
-        'dwpc': 7.323728709931218,
-        'n': 81600,
-        'nnz': 2086,
-        'n_perms': 200,
-        'mean_nz': 7.323728709931212,
-        'sd_nz': 0.0,
-    }, 0.02556372549),
-    # standard deviation is None
-    ({
-        'path_count': 5,
-        'sd_nz': None,
-        'dwpc': 1.5,
-        'nnz': 10,
-        'n': 100,
-        'alpha': 1.0,
-        'beta': 1.0,
-        'sum': 1.0,
-        'mean_nz': 2,
-    }, .1),
-])
+    assert (
+        calculate_sd(sum_of_squares, unsquared_sum, number_nonzero) == expected_output
+    )
+
+
+@pytest.mark.parametrize(
+    "row, expected_output",
+    [
+        # zero path count
+        (
+            {
+                "path_count": 0,
+                "sd_nz": 2.0,
+                "dwpc": 4.0,
+                "nnz": 1,
+                "n": 4,
+                "alpha": 1.0,
+                "beta": 2.0,
+                "sum": 1.0,
+            },
+            1.0,
+        ),
+        # zero standard deviation with dwpc lower than mean
+        (
+            {
+                "path_count": 5,
+                "sd_nz": 0.0,
+                "dwpc": 2.0,
+                "mean_nz": 3.0,
+                "nnz": 3,
+                "n": 8,
+                "alpha": 1.0,
+                "beta": 2.0,
+                "sum": 1.0,
+            },
+            0.375,
+        ),
+        # zero standard deviation with dwpc higher than mean
+        (
+            {
+                "path_count": 5,
+                "sd_nz": 0.0,
+                "dwpc": 4.0,
+                "mean_nz": 3.0,
+                "nnz": 1,
+                "n": 4,
+                "alpha": 1.0,
+                "beta": 2.0,
+                "sum": 1.0,
+            },
+            0.0,
+        ),
+        # normal gamma hurdle case
+        (
+            {
+                "path_count": 5,
+                "sd_nz": 1.0,
+                "dwpc": 2.5,
+                "nnz": 1,
+                "n": 10,
+                "alpha": 1.0,
+                "beta": 1.0,
+                "sum": 1.0,
+            },
+            0.008208,
+        ),
+        # number nonzero is itself zero
+        (
+            {
+                "path_count": 5,
+                "sd_nz": 0.0,
+                "dwpc": 2.0,
+                "nnz": 0,
+                "n": 4,
+                "alpha": 1.0,
+                "beta": 2.0,
+                "sum": 0.0,
+            },
+            0.0,
+        ),
+        # dwpc slightly larger than mean_nz, but within float error tolerance
+        (
+            {
+                "source_id": "DB00193",
+                "target_id": "DOID:0050425",
+                "source_name": "Tramadol",
+                "target_name": "restless legs syndrome",
+                "source_degree": 1,
+                "target_degree": 10,
+                "path_count": 1,
+                "dwpc": 7.323728709931218,
+                "n": 81600,
+                "nnz": 2086,
+                "n_perms": 200,
+                "mean_nz": 7.323728709931212,
+                "sd_nz": 0.0,
+            },
+            0.02556372549,
+        ),
+        # standard deviation is None
+        (
+            {
+                "path_count": 5,
+                "sd_nz": None,
+                "dwpc": 1.5,
+                "nnz": 10,
+                "n": 100,
+                "alpha": 1.0,
+                "beta": 1.0,
+                "sum": 1.0,
+                "mean_nz": 2,
+            },
+            0.1,
+        ),
+    ],
+)
 def test_calculate_p_value(row, expected_output):
     assert calculate_p_value(row) == pytest.approx(expected_output, rel=1e-4)


 def test_add_gamma_hurdle():
     df_dict = {
-        'nnz': [1, 3, 3],
-        'sum': [4.0, 4.0, 3.0],
-        'sum_of_squares': [4.0, 6.0, 3.0 + 1e-15],
+        "nnz": [1, 3, 3],
+        "sum": [4.0, 4.0, 3.0],
+        "sum_of_squares": [4.0, 6.0, 3.0 + 1e-15],
     }
     dgp_df = pandas.DataFrame(df_dict)
     dgp_df = add_gamma_hurdle_to_dgp_df(dgp_df)
@@ -175,40 +204,43 @@ def test_add_gamma_hurdle():
     # Test nnz = 1
     expected_mean_nz_0 = 4.0
     print(dgp_df)
-    assert expected_mean_nz_0 == dgp_df['mean_nz'][0]
-    assert pandas.isna(dgp_df['sd_nz'][0])
-    assert pandas.isna(dgp_df['beta'][0])
-    assert pandas.isna(dgp_df['alpha'][0])
+    assert expected_mean_nz_0 == dgp_df["mean_nz"][0]
+    assert pandas.isna(dgp_df["sd_nz"][0])
+    assert pandas.isna(dgp_df["beta"][0])
+    assert pandas.isna(dgp_df["alpha"][0])

     # Test a normal case
     expected_mean_nz_1 = 4 / 3
-    expected_sd_nz_1 = ((2 / 3) / 2) ** .5
-    expected_beta_1 = expected_mean_nz_1 / expected_sd_nz_1 ** 2
+    expected_sd_nz_1 = ((2 / 3) / 2) ** 0.5
+    expected_beta_1 = expected_mean_nz_1 / expected_sd_nz_1**2
     expected_alpha_1 = expected_mean_nz_1 * expected_beta_1
-    assert expected_mean_nz_1 == dgp_df['mean_nz'][1]
-    assert expected_sd_nz_1 == pytest.approx(dgp_df['sd_nz'][1])
-    assert expected_beta_1 == pytest.approx(dgp_df['beta'][1])
-    assert expected_alpha_1 == pytest.approx(dgp_df['alpha'][1])
+    assert expected_mean_nz_1 == dgp_df["mean_nz"][1]
+    assert expected_sd_nz_1 == pytest.approx(dgp_df["sd_nz"][1])
+    assert expected_beta_1 == pytest.approx(dgp_df["beta"][1])
+    assert expected_alpha_1 == pytest.approx(dgp_df["alpha"][1])

     # Test squared deviations ~ 0
     expected_mean_nz_2 = 1.0
     expected_sd_nz_2 = 0.0
-    assert expected_mean_nz_2 == dgp_df['mean_nz'][2]
-    assert expected_sd_nz_2 == dgp_df['sd_nz'][2]
-    assert pandas.isna(dgp_df['beta'][2])
-    assert pandas.isna(dgp_df['alpha'][2])
-
-
-@pytest.mark.parametrize('row, expected_output', [
-    # Path count is zero
-    ({'path_count': 0.0}, True),
-    # Path count is nonzero
-    ({'path_count': 1.0}, False),
-    # No path count, dwpc is zero
-    ({'dwpc': 0.0}, True),
-    # No path count, dwpc is nonzero
-    ({'dwpc': .01}, False),
-])
+    assert expected_mean_nz_2 == dgp_df["mean_nz"][2]
+    assert expected_sd_nz_2 == dgp_df["sd_nz"][2]
+    assert pandas.isna(dgp_df["beta"][2])
+    assert pandas.isna(dgp_df["alpha"][2])
+
+
+@pytest.mark.parametrize(
+    "row, expected_output",
+    [
+        # Path count is zero
+        ({"path_count": 0.0}, True),
+        # Path count is nonzero
+        ({"path_count": 1.0}, False),
+        # No path count, dwpc is zero
+        ({"dwpc": 0.0}, True),
+        # No path count, dwpc is nonzero
+        ({"dwpc": 0.01}, False),
+    ],
+)
 def test_path_does_not_exist(row, expected_output):
     assert path_does_not_exist(row) == expected_output

@@ -218,14 +250,14 @@ def test_calculate_gamma_hurdle_p_value_missing_beta_alpha():
     """
     Testing data from https://search.het.io/?source=1502&target=41593&metapaths=BPpGbCcSE
     """
     row = {
-        'n': 202_969_400,
-        'nnz': 1_447_786,
-        'mean_nz': 3.03334213264993,
-        'sd_nz': 0.986636206544574,
-        'path_count': 3,
-        'dwpc': 2.48807892922932,
+        "n": 202_969_400,
+        "nnz": 1_447_786,
+        "mean_nz": 3.03334213264993,
+        "sd_nz": 0.986636206544574,
+        "path_count": 3,
+        "dwpc": 2.48807892922932,
     }
     p_value = calculate_gamma_hurdle_p_value(row)
     assert pytest.approx(p_value) == 0.0048801093094888
-    assert pytest.approx(row['beta']) == 3.1160706804802087
-    assert pytest.approx(row['alpha']) == 9.452108483415754
+    assert pytest.approx(row["beta"]) == 3.1160706804802087
+    assert pytest.approx(row["alpha"]) == 9.452108483415754
diff --git a/hetmatpy/xarray.py b/hetmatpy/xarray.py
index 4acb563..73ed772 100644
--- a/hetmatpy/xarray.py
+++ b/hetmatpy/xarray.py
@@ -22,16 +22,14 @@ def metaedge_to_data_array(graph, metaedge, dtype=numpy.bool_):
     Return an xarray.DataArray that's an adjacency matrix where source nodes
     are columns and target nodes are rows.
""" - source_node_ids, target_node_ids, adjacency_matrix = ( - metaedge_to_adjacency_matrix(graph, metaedge, dtype=dtype)) + source_node_ids, target_node_ids, adjacency_matrix = metaedge_to_adjacency_matrix( + graph, metaedge, dtype=dtype + ) dims = metaedge.source.identifier, metaedge.target.identifier coords = source_node_ids, target_node_ids data_array = xarray.DataArray( - adjacency_matrix, - coords=coords, - dims=dims, - name=metaedge.get_unicode_str() + adjacency_matrix, coords=coords, dims=dims, name=metaedge.get_unicode_str() ) return data_array diff --git a/setup.py b/setup.py index 7e14edf..96f3772 100644 --- a/setup.py +++ b/setup.py @@ -16,11 +16,11 @@ long_description = readme_path.read_text(encoding="utf-8-sig") setuptools.setup( - name='hetmatpy', - description='Matrix implementations for hetnets and path-count-based measures', + name="hetmatpy", + description="Matrix implementations for hetnets and path-count-based measures", long_description_content_type="text/markdown", long_description=long_description, - url='https://github.com/hetio/hetmatpy', + url="https://github.com/hetio/hetmatpy", project_urls={ "Source": "https://github.com/hetio/hetmatpy", "Documentation": "https://hetio.github.io/hetmatpy", @@ -28,22 +28,22 @@ "Homepage": "https://het.io/software/", "Publication": "https://greenelab.github.io/connectivity-search-manuscript/", }, - license='BSD-2-Clause Plus Patent License', + license="BSD-2-Clause Plus Patent License", packages=setuptools.find_packages(), - python_requires='>=3.6', + python_requires=">=3.6", install_requires=[ - 'hetnetpy>=0.3.0', - 'numpy', - 'pandas', - 'scipy', + "hetnetpy>=0.3.0", + "numpy", + "pandas", + "scipy", ], extras_require={ - 'dev': [ - 'black', - 'flake8', - 'portray', - 'pytest', - 'xarray', + "dev": [ + "black", + "flake8", + "portray", + "pytest", + "xarray", ] - } + }, )