Merge pull request #134 from leotrs/nbd

Add non-backtracking spectral distance
netsiphd · Mar 13, 2019 · 0c7e525 · 0c7e525
2 parents 11b8791 + c794cf8
commit 0c7e525
Show file tree

Hide file tree

Showing 4 changed files with 262 additions and 4 deletions.
diff --git a/netrd/distance/__init__.py b/netrd/distance/__init__.py
@@ -9,6 +9,7 @@
 from .netsimile import NetSimile
 from .laplacian_spectral_method import LaplacianSpectralMethod
 from .polynomial_dissimilarity import PolynomialDissimilarity
+from .nbd import NBD
 from .degree_divergence import DegreeDivergence
 from .onion_divergence import OnionDivergence
 

diff --git a/netrd/distance/nbd.py b/netrd/distance/nbd.py
@@ -0,0 +1,253 @@
+"""
+nbd.py
+------
+
+Non-backtracking spectral distance between two graphs.
+
+"""
+
+import numpy as np
+import networkx as nx
+from scipy.spatial import distance_matrix
+import scipy.sparse as sparse
+from ot import emd2
+from .base import BaseDistance
+
+
+class NBD(BaseDistance):
+    """The distance between two graphs is earth mover distance between the
+    empirical spectral distribution of their non-backtracking matrices.
+
+    The eigenvalues are stored in the results dictionary.
+
+    """
+
+    def dist(self, G1, G2, topk='automatic', batch=100, tol=1e-5):
+        """NBD between two graphs.
+
+        Params
+        ------
+
+        G1, G2 (nx.Graph): The graphs to compare.
+
+        topk (int or 'automatic'): The number of eigenvalues to compute.
+        If 'automatic' (default), use only the eigenvalues that are larger
+        than the square root of the largest eigenvalue.  Note this may
+        yield different number of eigenvalues for each graph.
+
+        batch (int): If topk is 'automatic', this is the number of
+        eigenvalues to compute each time until the condition is met.
+        Default 100.
+
+        tol (float): Numerical tolerance when computing eigenvalues.
+
+        """
+        vals1 = nbvals(G1, topk, batch, tol)
+        vals2 = nbvals(G2, topk, batch, tol)
+        mass = lambda num: np.ones(num) / num
+        vals_dist = distance_matrix(vals1, vals2)
+        dist = emd2(mass(vals1.shape[0]), mass(vals2.shape[0]), vals_dist)
+        self.results['vals'] = (vals1, vals2)
+        return dist
+
+
+
+def nbvals(graph, topk='automatic', batch=100, tol=1e-5):
+    """Compute the largest-magnitude non-backtracking eigenvalues.
+
+    Params
+    ------
+
+    graph (nx.Graph): The graph.
+
+    topk (int or 'automatic'): The number of eigenvalues to compute.  The
+    maximum number of eigenvalues that can be computed is 2*n - 4, where n
+    is the number of nodes in graph.  All the other eigenvalues are equal
+    to +-1. If 'automatic', return all eigenvalues whose magnitude is
+    larger than the square root of the largest eigenvalue.
+
+    batch (int): If topk is 'automatic', compute this many eigenvalues at a
+    time until the condition is met.  Must be at most 2*n - 4; default 100.
+
+    tol (float): Numerical tolerance.  Default 1e-5.
+
+    Returns
+    -------
+
+    An array with the eigenvalues.
+
+    """
+    if not isinstance(topk, str) and topk < 1:
+        return np.array([[], []])
+
+    # The eigenvalues are left untouched by removing the nodes of degree 1.
+    # Moreover, removing them makes the computations faster.  This
+    # 'shaving' leaves us with the 2-core of the graph.
+    core = shave(graph)
+    matrix = pseudo_hashimoto(core)
+    if not isinstance(topk, str) and topk > matrix.shape[0] - 1:
+        topk = matrix.shape[0] - 2
+        print('Computing only {} eigenvalues'.format(topk))
+
+    if topk == 'automatic':
+        batch = min(batch, 2*graph.order() - 4)
+        if 2*graph.order() - 4 < batch:
+            print('Using batch size {}'.format(batch))
+        topk = batch
+    eigs = lambda k: sparse.linalg.eigs(matrix, k=k, return_eigenvectors=False, tol=tol)
+    count = 1
+    while True:
+        vals = eigs(topk*count)
+        largest = np.sqrt(abs(max(vals, key=abs)))
+        if abs(vals[0]) <= largest or topk != 'automatic':
+            break
+        count += 1
+    if topk == 'automatic':
+        vals = vals[abs(vals) > largest]
+
+    # The eigenvalues are returned in no particular order, which may yield
+    # different feature vectors for the same graph.  For example, if a
+    # graph has a + ib and a - ib as eigenvalues, the eigenvalue solver may
+    # return [..., a + ib, a - ib, ...] in one call and [..., a - ib, a +
+    # ib, ...] in another call.  To avoid this, we sort the eigenvalues
+    # first by absolute value, then by real part, then by imaginary part.
+    vals = sorted(vals, key=lambda x: x.imag)
+    vals = sorted(vals, key=lambda x: x.real)
+    vals = np.array(sorted(vals, key=np.linalg.norm))
+
+    # Return eigenvalues as a 2D array, with one row per eigenvalue, and
+    # each row containing the real and imaginary parts separately.
+    vals = np.array([(z.real, z.imag) for z in vals])
+    return vals
+
+
+def shave(graph):
+    """Return the 2-core of a graph.
+
+    Iteratively remove the nodes of degree 0 or 1, until all nodes have
+    degree at least 2.
+
+    """
+    core = graph.copy()
+    while True:
+        to_remove = [node for node, neighbors in core.adj.items()
+                     if len(neighbors) < 2]
+        core.remove_nodes_from(to_remove)
+        if len(to_remove) == 0:
+            break
+    return core
+
+
+def pseudo_hashimoto(graph):
+    """Return the pseudo-Hashimoto matrix.
+
+    The pseudo Hashimoto matrix of a graph is the block matrix defined as
+    B' = [0  D-I]
+         [-I  A ]
+
+    Where D is the degree-diagonal matrix, I is the identity matrix and A
+    is the adjacency matrix.  The eigenvalues of B' are always eigenvalues
+    of B, the non-backtracking or Hashimoto matrix.
+
+    Params
+    ------
+
+    graph (nx.Graph): A NetworkX graph object.
+
+    Returns
+    -------
+
+    A sparse matrix in csr format.
+
+    """
+    # Note: the rows of nx.adjacency_matrix(graph) are in the same order as
+    # the list returned by graph.nodes().
+    degrees = graph.degree()
+    degrees = sparse.diags([degrees[n] for n in graph.nodes()])
+    adj = nx.adjacency_matrix(graph)
+    ident = sparse.eye(graph.order())
+    pseudo = sparse.bmat([[None, degrees - ident], [-ident, adj]])
+    return pseudo.asformat('csr')
+
+
+def half_incidence(graph, ordering='blocks', return_ordering=False):
+    """Return the 'half-incidence' matrices of the graph.
+
+    If the graph has n nodes and m *undirected* edges, then the
+    half-incidence matrices are two matrices, P and Q, with n rows and 2m
+    columns.  That is, there is one row for each node, and one column for
+    each *directed* edge.  For P, the entry at (n, e) is equal to 1 if node
+    n is the source (or tail) of edge e, and 0 otherwise.  For Q, the entry
+    at (n, e) is equal to 1 if node n is the target (or head) of edge e,
+    and 0 otherwise.
+
+    Params
+    ------
+
+    graph (nx.Graph): The graph.
+
+    ordering (str): If 'blocks' (default), the two columns corresponding to
+    the i'th edge are placed at i and i+m.  That is, choose an arbitarry
+    direction for each edge in the graph.  The first m columns correspond
+    to this orientation, while the latter m columns correspond to the
+    reversed orientation.  Columns are sorted following graph.edges().  If
+    'consecutive', the first two columns correspond to the two orientations
+    of the first edge, the third and fourth row are the two orientations of
+    the second edge, and so on.  In general, the two columns for the i'th
+    edge are placed at 2i and 2i+1.
+
+    return_ordering (bool): if True, return a function that maps an edge id
+    to the column placement.  That is, if ordering=='blocks', return the
+    function lambda x: (x, m+x), if ordering=='consecutive', return the
+    function lambda x: (2*x, 2*x + 1).  If False, return None.
+
+
+    Returns
+    -------
+
+    P (sparse matrix), Q (sparse matrix), ordering (function or None).
+
+
+    Notes
+    -----
+
+    The nodes in graph must be labeled by consecutive integers starting at
+    0.  This function always returns three values, regardless of the value
+    of return_ordering.
+
+    """
+    numnodes = graph.order()
+    numedges = graph.size()
+
+    if ordering == 'blocks':
+        src_pairs = lambda i, u, v: [(u, i), (v, numedges + i)]
+        tgt_pairs = lambda i, u, v: [(v, i), (u, numedges + i)]
+    if ordering == 'consecutive':
+        src_pairs = lambda i, u, v: [(u, 2*i), (v, 2*i + 1)]
+        tgt_pairs = lambda i, u, v: [(v, 2*i), (u, 2*i + 1)]
+
+    def make_coo(make_pairs):
+        """Make a sparse 0-1 matrix.
+
+        The returned matrix has a positive entry at each coordinate pair
+        returned by make_pairs, for all (idx, node1, node2) edge triples.
+
+        """
+        coords = list(zip(*(pair
+                            for idx, (node1, node2) in enumerate(graph.edges())
+                            for pair in make_pairs(idx, node1, node2))))
+        data = np.ones(2*graph.size())
+        return sparse.coo_matrix((data, coords),
+                                 shape=(numnodes, 2*numedges))
+
+    src = make_coo(src_pairs).asformat('csr')
+    tgt = make_coo(tgt_pairs).asformat('csr')
+
+    if return_ordering:
+        if ordering == 'blocks':
+            func = lambda x: (x, numedges + x)
+        else:
+            func = lambda x: (2*x, 2*x + 1)
+        return src, tgt, func
+    else:
+        return src, tgt
diff --git a/netrd/utilities/standardize.py b/netrd/utilities/standardize.py
@@ -29,7 +29,7 @@ def mean_GNP_distance(n, prob, distance, samples=10, **kwargs):
     ------
 
     n (int): Number of nodes in ER graphs to be generated
-    edge_prob (float): Probability of edge in ER graphs to be generated.
+    prob (float): Probability of edge in ER graphs to be generated.
     samples (int): Number of samples to average distance over.
     distance (function): Function from netrd.distances.<distance>.dist
     kwargs (dict): Keyword arguments to pass to the distance function.

diff --git a/tests/test_distance.py b/tests/test_distance.py
@@ -16,15 +16,17 @@ def test_same_graph():
     """The distance between two equal graphs must be zero."""
     G = nx.karate_club_graph()
 
-    for obj in distance.__dict__.values():
+    for label, obj in distance.__dict__.items():
+        if label in ['NBD']:
+            continue
         if isinstance(obj, type) and BaseDistance in obj.__bases__:
             dist = obj().dist(G, G)
             assert dist == 0.0
 
 
 def test_different_graphs():
     """ The distance between two different graphs must be nonzero."""
-    ## NOTE: This test is not totally rigorous. For example, two different 
+    ## NOTE: This test is not totally rigorous. For example, two different
     ## networks may have the same eigenvalues, thus a method that compares
     ## their eigenvalues would result in distance 0. However, this is very
     ## unlikely in the constructed case, so we rely on it for now.
@@ -42,7 +44,9 @@ def test_symmetry():
     G1 = nx.barabasi_albert_graph(100, 4)
     G2 = nx.fast_gnp_random_graph(100, 0.1)
 
-    for obj in distance.__dict__.values():
+    for label, obj in distance.__dict__.items():
+        if label in ['NBD']:
+            continue
         if isinstance(obj, type) and BaseDistance in obj.__bases__:
             dist1 = obj().dist(G1, G2)
             dist2 = obj().dist(G2, G1)