Skip to content

Commit

Permalink
Merge pull request #134 from leotrs/nbd
Browse files Browse the repository at this point in the history
Add non-backtracking spectral distance
  • Loading branch information
sdmccabe authored Mar 13, 2019
2 parents 11b8791 + c794cf8 commit 0c7e525
Show file tree
Hide file tree
Showing 4 changed files with 262 additions and 4 deletions.
1 change: 1 addition & 0 deletions netrd/distance/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from .netsimile import NetSimile
from .laplacian_spectral_method import LaplacianSpectralMethod
from .polynomial_dissimilarity import PolynomialDissimilarity
from .nbd import NBD
from .degree_divergence import DegreeDivergence
from .onion_divergence import OnionDivergence

Expand Down
253 changes: 253 additions & 0 deletions netrd/distance/nbd.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,253 @@
"""
nbd.py
------
Non-backtracking spectral distance between two graphs.
"""

import numpy as np
import networkx as nx
from scipy.spatial import distance_matrix
import scipy.sparse as sparse
from ot import emd2
from .base import BaseDistance


class NBD(BaseDistance):
"""The distance between two graphs is earth mover distance between the
empirical spectral distribution of their non-backtracking matrices.
The eigenvalues are stored in the results dictionary.
"""

def dist(self, G1, G2, topk='automatic', batch=100, tol=1e-5):
"""NBD between two graphs.
Params
------
G1, G2 (nx.Graph): The graphs to compare.
topk (int or 'automatic'): The number of eigenvalues to compute.
If 'automatic' (default), use only the eigenvalues that are larger
than the square root of the largest eigenvalue. Note this may
yield different number of eigenvalues for each graph.
batch (int): If topk is 'automatic', this is the number of
eigenvalues to compute each time until the condition is met.
Default 100.
tol (float): Numerical tolerance when computing eigenvalues.
"""
vals1 = nbvals(G1, topk, batch, tol)
vals2 = nbvals(G2, topk, batch, tol)
mass = lambda num: np.ones(num) / num
vals_dist = distance_matrix(vals1, vals2)
dist = emd2(mass(vals1.shape[0]), mass(vals2.shape[0]), vals_dist)
self.results['vals'] = (vals1, vals2)
return dist



def nbvals(graph, topk='automatic', batch=100, tol=1e-5):
"""Compute the largest-magnitude non-backtracking eigenvalues.
Params
------
graph (nx.Graph): The graph.
topk (int or 'automatic'): The number of eigenvalues to compute. The
maximum number of eigenvalues that can be computed is 2*n - 4, where n
is the number of nodes in graph. All the other eigenvalues are equal
to +-1. If 'automatic', return all eigenvalues whose magnitude is
larger than the square root of the largest eigenvalue.
batch (int): If topk is 'automatic', compute this many eigenvalues at a
time until the condition is met. Must be at most 2*n - 4; default 100.
tol (float): Numerical tolerance. Default 1e-5.
Returns
-------
An array with the eigenvalues.
"""
if not isinstance(topk, str) and topk < 1:
return np.array([[], []])

# The eigenvalues are left untouched by removing the nodes of degree 1.
# Moreover, removing them makes the computations faster. This
# 'shaving' leaves us with the 2-core of the graph.
core = shave(graph)
matrix = pseudo_hashimoto(core)
if not isinstance(topk, str) and topk > matrix.shape[0] - 1:
topk = matrix.shape[0] - 2
print('Computing only {} eigenvalues'.format(topk))

if topk == 'automatic':
batch = min(batch, 2*graph.order() - 4)
if 2*graph.order() - 4 < batch:
print('Using batch size {}'.format(batch))
topk = batch
eigs = lambda k: sparse.linalg.eigs(matrix, k=k, return_eigenvectors=False, tol=tol)
count = 1
while True:
vals = eigs(topk*count)
largest = np.sqrt(abs(max(vals, key=abs)))
if abs(vals[0]) <= largest or topk != 'automatic':
break
count += 1
if topk == 'automatic':
vals = vals[abs(vals) > largest]

# The eigenvalues are returned in no particular order, which may yield
# different feature vectors for the same graph. For example, if a
# graph has a + ib and a - ib as eigenvalues, the eigenvalue solver may
# return [..., a + ib, a - ib, ...] in one call and [..., a - ib, a +
# ib, ...] in another call. To avoid this, we sort the eigenvalues
# first by absolute value, then by real part, then by imaginary part.
vals = sorted(vals, key=lambda x: x.imag)
vals = sorted(vals, key=lambda x: x.real)
vals = np.array(sorted(vals, key=np.linalg.norm))

# Return eigenvalues as a 2D array, with one row per eigenvalue, and
# each row containing the real and imaginary parts separately.
vals = np.array([(z.real, z.imag) for z in vals])
return vals


def shave(graph):
"""Return the 2-core of a graph.
Iteratively remove the nodes of degree 0 or 1, until all nodes have
degree at least 2.
"""
core = graph.copy()
while True:
to_remove = [node for node, neighbors in core.adj.items()
if len(neighbors) < 2]
core.remove_nodes_from(to_remove)
if len(to_remove) == 0:
break
return core


def pseudo_hashimoto(graph):
"""Return the pseudo-Hashimoto matrix.
The pseudo Hashimoto matrix of a graph is the block matrix defined as
B' = [0 D-I]
[-I A ]
Where D is the degree-diagonal matrix, I is the identity matrix and A
is the adjacency matrix. The eigenvalues of B' are always eigenvalues
of B, the non-backtracking or Hashimoto matrix.
Params
------
graph (nx.Graph): A NetworkX graph object.
Returns
-------
A sparse matrix in csr format.
"""
# Note: the rows of nx.adjacency_matrix(graph) are in the same order as
# the list returned by graph.nodes().
degrees = graph.degree()
degrees = sparse.diags([degrees[n] for n in graph.nodes()])
adj = nx.adjacency_matrix(graph)
ident = sparse.eye(graph.order())
pseudo = sparse.bmat([[None, degrees - ident], [-ident, adj]])
return pseudo.asformat('csr')


def half_incidence(graph, ordering='blocks', return_ordering=False):
"""Return the 'half-incidence' matrices of the graph.
If the graph has n nodes and m *undirected* edges, then the
half-incidence matrices are two matrices, P and Q, with n rows and 2m
columns. That is, there is one row for each node, and one column for
each *directed* edge. For P, the entry at (n, e) is equal to 1 if node
n is the source (or tail) of edge e, and 0 otherwise. For Q, the entry
at (n, e) is equal to 1 if node n is the target (or head) of edge e,
and 0 otherwise.
Params
------
graph (nx.Graph): The graph.
ordering (str): If 'blocks' (default), the two columns corresponding to
the i'th edge are placed at i and i+m. That is, choose an arbitarry
direction for each edge in the graph. The first m columns correspond
to this orientation, while the latter m columns correspond to the
reversed orientation. Columns are sorted following graph.edges(). If
'consecutive', the first two columns correspond to the two orientations
of the first edge, the third and fourth row are the two orientations of
the second edge, and so on. In general, the two columns for the i'th
edge are placed at 2i and 2i+1.
return_ordering (bool): if True, return a function that maps an edge id
to the column placement. That is, if ordering=='blocks', return the
function lambda x: (x, m+x), if ordering=='consecutive', return the
function lambda x: (2*x, 2*x + 1). If False, return None.
Returns
-------
P (sparse matrix), Q (sparse matrix), ordering (function or None).
Notes
-----
The nodes in graph must be labeled by consecutive integers starting at
0. This function always returns three values, regardless of the value
of return_ordering.
"""
numnodes = graph.order()
numedges = graph.size()

if ordering == 'blocks':
src_pairs = lambda i, u, v: [(u, i), (v, numedges + i)]
tgt_pairs = lambda i, u, v: [(v, i), (u, numedges + i)]
if ordering == 'consecutive':
src_pairs = lambda i, u, v: [(u, 2*i), (v, 2*i + 1)]
tgt_pairs = lambda i, u, v: [(v, 2*i), (u, 2*i + 1)]

def make_coo(make_pairs):
"""Make a sparse 0-1 matrix.
The returned matrix has a positive entry at each coordinate pair
returned by make_pairs, for all (idx, node1, node2) edge triples.
"""
coords = list(zip(*(pair
for idx, (node1, node2) in enumerate(graph.edges())
for pair in make_pairs(idx, node1, node2))))
data = np.ones(2*graph.size())
return sparse.coo_matrix((data, coords),
shape=(numnodes, 2*numedges))

src = make_coo(src_pairs).asformat('csr')
tgt = make_coo(tgt_pairs).asformat('csr')

if return_ordering:
if ordering == 'blocks':
func = lambda x: (x, numedges + x)
else:
func = lambda x: (2*x, 2*x + 1)
return src, tgt, func
else:
return src, tgt
2 changes: 1 addition & 1 deletion netrd/utilities/standardize.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def mean_GNP_distance(n, prob, distance, samples=10, **kwargs):
------
n (int): Number of nodes in ER graphs to be generated
edge_prob (float): Probability of edge in ER graphs to be generated.
prob (float): Probability of edge in ER graphs to be generated.
samples (int): Number of samples to average distance over.
distance (function): Function from netrd.distances.<distance>.dist
kwargs (dict): Keyword arguments to pass to the distance function.
Expand Down
10 changes: 7 additions & 3 deletions tests/test_distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,17 @@ def test_same_graph():
"""The distance between two equal graphs must be zero."""
G = nx.karate_club_graph()

for obj in distance.__dict__.values():
for label, obj in distance.__dict__.items():
if label in ['NBD']:
continue
if isinstance(obj, type) and BaseDistance in obj.__bases__:
dist = obj().dist(G, G)
assert dist == 0.0


def test_different_graphs():
""" The distance between two different graphs must be nonzero."""
## NOTE: This test is not totally rigorous. For example, two different
## NOTE: This test is not totally rigorous. For example, two different
## networks may have the same eigenvalues, thus a method that compares
## their eigenvalues would result in distance 0. However, this is very
## unlikely in the constructed case, so we rely on it for now.
Expand All @@ -42,7 +44,9 @@ def test_symmetry():
G1 = nx.barabasi_albert_graph(100, 4)
G2 = nx.fast_gnp_random_graph(100, 0.1)

for obj in distance.__dict__.values():
for label, obj in distance.__dict__.items():
if label in ['NBD']:
continue
if isinstance(obj, type) and BaseDistance in obj.__bases__:
dist1 = obj().dist(G1, G2)
dist2 = obj().dist(G2, G1)
Expand Down

0 comments on commit 0c7e525

Please sign in to comment.