graph.py
from collections.abc import Iterable
from copy import deepcopy
import gc
import networkx as nx
import numba
from numba import jit
import numpy as np
import os
import pandas as pd
from scipy import sparse
from sklearn.decomposition import TruncatedSVD
import time
import warnings
from csrgraph.methods import (
_row_norm, _node_degrees, _src_multiply, _dst_multiply
)
from csrgraph.random_walks import (
_random_walk, _node2vec_walks
)
from csrgraph import methods, random_walks
from csrgraph import ggvec, glove, grarep
UINT32_MAX = (2**32) - 1
class csrgraph():
"""
This top-level Python class either calls external JIT'ed methods
or JIT'ed methods on the graph's internal CSR arrays
"""
def __init__(self, data, nodenames=None, copy=True, threads=0):
"""
A class for larger graphs.
NOTE: depending on the input type and the `copy` flag, this class
may "steal data": if you pass a scipy CSR matrix or another csrgraph
with copy=False, this object will point to the same underlying arrays.
Parameters:
-------------------
data : The graph data. Can be one of:
**NetworkX Graph**
**Numpy dense matrix**
**CSR Matrix**
**(data, indices, indptr)**
**CSRGraph object**
nodenames (array of str or int) : Node names
The position in this array should correspond with the node ID
So if passing a CSR Matrix or raw data, it should be co-indexed
with the matrix/raw data arrays
copy : bool
Whether to copy the passed data when creating the new object.
If False, the object points to the data passed to the constructor.
For networkX graphs and numpy dense matrices a new object is created anyway.
threads : int
number of threads to use for methods on this graph.
WARNING: this sets the NUMBA_NUM_THREADS environment variable and
recompiles the JIT'ed methods whenever the value changes.
0 uses the numba default (usually all threads)
TODO: add numpy mmap support for very large on-disk graphs
Should be in a different class
This also requires routines to read/write edgelists, etc. from disk
TODO: subclass scipy.csr_matrix???
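Example
-------------
An illustrative sketch (the adjacency matrix and node names below are
made up for demonstration):
>>> import numpy as np
>>> from scipy import sparse
>>> adj = sparse.csr_matrix(np.array([[0., 1.], [1., 0.]], dtype=np.float32))
>>> G = csrgraph(adj, nodenames=['a', 'b'])
>>> G.nnodes  # 2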
"""
# If input is a CSRGraph, same object
if isinstance(data, csrgraph):
if copy:
self.mat = data.mat.copy()
self.names = deepcopy(data.names)
else:
self.mat = data.mat
self.names = data.names
if nodenames is None:
nodenames = self.names
else:
self.names = nodenames
# NetworkX Graph input
elif isinstance(data, nx.Graph):
mat = nx.adj_matrix(data)
mat.data = mat.data.astype(np.float32)
self.mat = mat
nodenames = list(data)
# CSR Matrix Input
elif isinstance(data, sparse.csr_matrix):
if copy: self.mat = data.copy()
else: self.mat = data
# Numpy Array Input
elif isinstance(data, np.ndarray):
self.mat = sparse.csr_matrix(data)
else:
raise ValueError(f"Incorrect input data type: {type(data).__name__}")
# Now that we have the core csr_matrix, alias underlying arrays
assert hasattr(self, 'mat')
self.weights = self.mat.data
self.src = self.mat.indptr
self.dst = self.mat.indices
# indptr has one more element than nnodes
self.nnodes = self.src.size - 1
# node name -> node ID
if nodenames is not None:
self.names = pd.Series(nodenames)
else:
self.names = pd.Series(np.arange(self.nnodes))
# Bounds check once here otherwise there be dragons later
max_idx = np.max(self.dst)
# Valid node IDs are 0..nnodes-1
if max_idx >= self.nnodes:
raise ValueError(f"""
Out of bounds node: {max_idx}, nnodes: {self.nnodes}
""")
self.set_threads(threads)
def set_threads(self, threads):
self.threads = threads
# Manage threading through Numba hack
if type(threads) is not int:
raise ValueError("Threads argument must be an int!")
if threads == 0:
threads = numba.config.NUMBA_DEFAULT_NUM_THREADS
threads = str(threads)
# If we change the number of threads, recompile
try:
prev_numba_value = os.environ['NUMBA_NUM_THREADS']
except KeyError:
prev_numba_value = threads
if threads != prev_numba_value:
os.environ['NUMBA_NUM_THREADS'] = threads
_random_walk.recompile()
_row_norm.recompile()
_node2vec_walks.recompile()
_node_degrees.recompile()
_src_multiply.recompile()
_dst_multiply.recompile()
def __getitem__(self, node):
"""
[] operator
like networkX, gets names of neighbor nodes
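An illustrative sketch (assuming a graph G whose node names include 'a'):
>>> G['a']  # array of the names of 'a's neighbor nodes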
"""
# Get node ID from names array
# This is O(n) by design -- we more often get names from IDs
# than we get IDs from names and we don't want to hold 2 maps
# TODO : replace names with a pd.Index and use get_loc
node_id = self.names[self.names == node].index[0]
edges = self.dst[
self.src[node_id] : self.src[node_id+1]
]
return self.names.iloc[edges].values
def nodes(self):
"""
Returns the graph's nodes, in order
"""
if self.names is not None:
return self.names
else:
return np.arange(self.nnodes)
def normalize(self, return_self=True):
"""
Normalizes edge weights per node.
For every node in the graph, the weights of its outgoing edges will sum to 1.
return_self : bool
whether to change the graph's values and return itself
this lets us call `G.normalize()` directly
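Illustrative sketch (assuming node 0 has at least one outgoing edge,
its normalized weights should sum to ~1):
>>> G = G.normalize()
>>> float(G.weights[G.src[0]:G.src[1]].sum())  # ~1.0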
"""
new_weights = _row_norm(self.weights, self.src)
if return_self:
self.mat = sparse.csr_matrix((new_weights, self.dst, self.src))
# Point objects to the correct places
self.weights = self.mat.data
self.src = self.mat.indptr
self.dst = self.mat.indices
gc.collect()
return self
else:
return csrgraph(sparse.csr_matrix(
(new_weights, self.dst, self.src)),
nodenames=self.names)
def random_walks(self,
walklen=10,
epochs=1,
start_nodes=None,
normalize_self=False,
return_weight=1.,
neighbor_weight=1.):
"""
Create random walks from the transition matrix of the graph
in CSR sparse format
Parameters
----------
walklen : int
length of the random walks
epochs : int
number of times to start a walk from each node
start_nodes : array of int, or None
Node IDs from which to start the walks.
Default (None) starts one set of walks from every node.
normalize_self : bool
Whether to normalize this graph's edge weights in place before walking.
If False, the walks are done on a normalized copy instead.
return_weight : float in (0, inf]
Weight on the probability of returning to the node we're coming from
Having this higher tends the walks to be
more like a Breadth-First Search.
Having this very high (> 2) makes search very local.
Equal to the inverse of p in the Node2Vec paper.
neighbor_weight : float in (0, inf]
Weight on the probability of visiting a neighbor node
of the one we're coming from in the random walk
Having this higher tends the walks to be
more like a Depth-First Search.
Having this very high makes search more outward.
Having this very low makes search very local.
Equal to the inverse of q in the Node2Vec paper.
Returns
-------
out : 2d np.array (n_walks, walklen)
A matrix where each row is a random walk,
and each entry is the ID of the node
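Example
-------
An illustrative sketch (5 epochs of length-20 walks from every node):
>>> walks = G.random_walks(walklen=20, epochs=5)
>>> walks.shape  # (G.nnodes * 5, 20)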
"""
# Make csr graph
if normalize_self:
self.normalize(return_self=True)
T = self
else:
T = self.normalize(return_self=False)
n_rows = T.nnodes
if start_nodes is None:
start_nodes = np.arange(n_rows)
sampling_nodes = np.tile(start_nodes, epochs)
# Node2Vec Biased walks if parameters specified
if (return_weight > 1. or return_weight < 1.
or neighbor_weight < 1. or neighbor_weight > 1.):
walks = _node2vec_walks(T.weights, T.src, T.dst,
sampling_nodes=sampling_nodes,
walklen=walklen,
return_weight=return_weight,
neighbor_weight=neighbor_weight)
# much faster implementation for regular walks
else:
walks = _random_walk(T.weights, T.src, T.dst,
sampling_nodes, walklen)
return walks
def ggvec(self, n_components=2,
learning_rate=0.05,
tol="auto",
max_epoch=500,
negative_ratio=1.0,
order=1,
negative_decay=0.,
exponent=0.5,
max_loss=30.,
tol_samples=100,
verbose=True):
"""
GGVec: Fast global first (and higher) order local embeddings.
This algorithm directly minimizes the distances between related nodes.
It alternates a relaxation pass (negative sampling) and a contraction pass
(loss minimization) to find stable embeddings based on the dot products of edge weights.
Parameters:
-------------
n_components (int):
Number of individual embedding dimensions.
order : int >= 1
Meta-level of the embeddings. Improves link prediction performance.
Setting this higher than 1 ~quadratically slows down algorithm
Order = 1 directly optimizes the graph.
Order = 2 optimizes graph plus 2nd order edges
(eg. neighbours of neighbours)
Order = 3 optimizes up to 3rd order edges
Higher order edges are automatically weighted using GraRep-style graph formation,
i.e. the higher-order graph comes from the stable high-order random walk distribution.
negative_ratio : float in [0, 1]
Negative sampling ratio.
Setting this higher will do more negative sampling.
This is slower, but can lead to higher quality embeddings.
exponent : float
Weighting exponent in the loss function.
Lowering this reduces the effect of large edge weights.
tol : float in [0, 1] or "auto"
Optimization early stopping criterion.
Stops when the average loss has been < tol for tol_samples epochs.
"auto" sets tol as a function of learning_rate
tol_samples : int
Optimization early stopping criterion.
This is the number of epochs to sample for loss stability.
Once loss is stable over this number of epochs we stop early.
negative_decay : float in [0, 1]
Decay on negative ratio.
If >0 then negative ratio will decay by (1-negative_decay) ** epoch
You should usually leave this to 0.
max_epoch : int
Stopping criterion.
learning_rate : float in [0, 1]
Optimization learning rate.
max_loss : float
Loss value ceiling for numerical stability.
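Example
-------------
An illustrative sketch (embedding dimension chosen arbitrarily):
>>> embeddings = G.ggvec(n_components=32, order=1)
>>> embeddings.shape  # (G.nnodes, 32)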
"""
if tol == 'auto':
tol = max(learning_rate / 2, 0.05)
# Higher order graph embeddings
# Method inspired by GraRep (powers of transition matrix)
if order > 1:
norm_weights = _row_norm(self.weights, self.src)
tranmat = sparse.csr_matrix((norm_weights, self.dst, self.src))
target_matrix = tranmat.copy()
res = np.zeros((self.nnodes, n_components))
for _ in range(order - 1):
target_matrix = target_matrix.dot(tranmat)
w = ggvec.ggvec_main(
data=target_matrix.data,
src=target_matrix.indptr,
dst=target_matrix.indices,
n_components=n_components, tol=tol,
tol_samples=tol_samples,
max_epoch=max_epoch, learning_rate=learning_rate,
negative_ratio=negative_ratio,
negative_decay=negative_decay,
exponent=exponent,
max_loss=max_loss, verbose=verbose)
res = np.sum([res, w], axis=0)
return res
else:
return ggvec.ggvec_main(
data=self.weights, src=self.src, dst=self.dst,
n_components=n_components, tol=tol,
tol_samples=tol_samples,
max_epoch=max_epoch, learning_rate=learning_rate,
negative_ratio=negative_ratio,
negative_decay=negative_decay,
exponent=exponent,
max_loss=max_loss, verbose=verbose)
def glove(self, n_components=2,
tol=0.0001, max_epoch=10_000,
max_count=50, exponent=0.5,
learning_rate=0.1, max_loss=10.,
verbose=True):
"""
Global first order embedding for positive, count-valued sparse matrices.
This algorithm is normally used in NLP on word co-occurrence matrices.
The algorithm fails if any value in the sparse matrix is < 1.
It is also a poor choice for matrices with homogeneous edge weights.
Parameters:
-------------
n_components (int):
Number of individual embedding dimensions.
tol : float in [0, 1]
Optimization early stopping criterion.
Stops when largest gradient is < tol
max_epoch : int
Stopping criterion.
max_count : int
Ceiling value on edge weights for numerical stability
exponent : float
Weighting exponent in the loss function.
Lowering this reduces the effect of large edge weights.
learning_rate : float in [0, 1]
Optimization learning rate.
max_loss : float
Loss value ceiling for numerical stability.
References:
-------------
Paper: https://nlp.stanford.edu/pubs/glove.pdf
Original implementation: https://github.com/stanfordnlp/GloVe/blob/master/src/glove.c
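Example
-------------
An illustrative sketch (assumes the graph's edge weights are counts >= 1):
>>> embeddings = G.glove(n_components=32)
>>> embeddings.shape  # (G.nnodes, 32)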
"""
w = glove.glove_main(weights=self.weights, dst=self.dst, src=self.src,
n_components=n_components, tol=tol,
max_epoch=max_epoch, learning_rate=learning_rate,
max_count=max_count, exponent=exponent,
max_loss=max_loss, verbose=verbose)
return w
def grarep(self, n_components=2,
order=5,
embedder=TruncatedSVD(
n_iter=10,
random_state=42),
verbose=True):
"""Implementation of GraRep: Embeddings of powers of the PMI matrix
of the graph adj matrix.
NOTE: Unlike GGVec and GLoVe, this method returns a LIST OF EMBEDDINGS
(one per order). You can sum them, or take the last one for embedding.
The recommended pooling is to sum them:
`lambda x : np.sum(x, axis=0)`
You can also take only the highest order embedding:
`lambda x : x[-1]`
Etc.
Original paper: https://dl.acm.org/citation.cfm?id=2806512
Parameters :
----------------
n_components (int):
Number of individual embedding dimensions.
order (int):
Number of PMI matrix powers.
Performance degrades roughly quadratically with this parameter.
Generally should be kept under 5.
embedder : (instance of sklearn API compatible model)
Should implement the `fit_transform` method:
https://scikit-learn.org/stable/glossary.html#term-fit-transform
The model should also have `n_components` as a parameter
for number of resulting embedding dimensions. See:
https://scikit-learn.org/stable/modules/manifold.html#manifold
If not compatible, set resulting dimensions in the model instance directly
Note: GraRep returns one embedding matrix per order; reduce them to a
single matrix yourself (e.g. with `np.sum(w_array, axis=0)`).
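Example
----------------
An illustrative sketch (order and dimensions chosen arbitrarily):
>>> embedding_list = G.grarep(n_components=32, order=3)
>>> len(embedding_list)  # 3 -- one embedding matrix per order
>>> w = np.sum(embedding_list, axis=0)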
"""
w_array = grarep.grarep_main(weights=self.weights, dst=self.dst, src=self.src,
n_components=n_components, order=order,
embedder=embedder, verbose=verbose)
return w_array
def random_walk_resample(self, walklen=4, epochs=30):
"""
Create a new graph from random walk co-occurrences.
First, we generate random walks on the graph.
Then, any nodes appearing together in a walk get an edge.
Edge weights are co-occurrence counts.
Recommendation: many short walks > fewer long walks
TODO: add node2vec walk parameters
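Example (illustrative sketch):
>>> G2 = G.random_walk_resample(walklen=4, epochs=30)
>>> G2.nnodes == G.nnodes  # True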
"""
walks = self.random_walks(walklen=walklen, epochs=epochs)
elist = random_walks.walks_to_edgelist(walks)
if 'weight' in elist.columns:
weights = elist.weight.to_numpy()
else:
weights = np.ones(elist.shape[0])
return methods._edgelist_to_graph(
elist.src.to_numpy(), elist.dst.to_numpy(),
weights, nnodes=self.nnodes, nodenames=self.names)
#
#
# TODO: Organize Graph methods here
# Layout nodes by their 1d embedding's position
# Also layout by hilbert space filling curve
#
#
def read_edgelist(f, directed=True, sep=r"\s+", header=None, **readcsvkwargs):
"""
Creates a csrgraph from an edgelist.
The edgelist should be in the form
[source destination]
or
[source destination edge_weight]
The first column needs to be the source, the second the destination.
If there is a third column it's assumed to be edge weights.
Any other keyword arguments are passed through to pandas.read_csv.
f : str
Filename to read
directed : bool
Whether the graph is directed or undirected.
All csrgraphs are directed; for undirected graphs we simply add the reverse ("return") edges.
sep : str
CSV-style separator. Eg. Use "," if comma separated
header : int or None
pandas read_csv parameter. Use if column names are present
readcsvkwargs : keyword arguments for pd.read_csv
Pass these kwargs as you would normally to pd.read_csv.
Returns : csrgraph
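Example
-------------
An illustrative sketch ("edges.csv" is a hypothetical comma-separated file
of source,destination[,weight] rows):
>>> G = read_edgelist("edges.csv", directed=False, sep=",")
>>> G.nnodes  # number of unique nodes in the file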
"""
# Read in csv correctly to each column
elist = pd.read_csv(f, sep=sep, header=header, **readcsvkwargs)
if len(elist.columns) == 2:
elist.columns = ['src', 'dst']
elist['weight'] = np.ones(elist.shape[0])
elif len(elist.columns) == 3:
elist.columns = ['src', 'dst', 'weight']
else:
raise ValueError(f"""
Invalid columns: {elist.columns}
Expected 2 (source, destination)
or 3 (source, destination, weight)
Read File: \n{elist.head(5)}
""")
# Create name mapping to normalize node IDs
# Somehow this is 1.5x faster than np.union1d. Shame on numpy.
allnodes = list(
set(elist.src.unique())
.union(set(elist.dst.unique())))
# Factor all nodes to unique IDs
names = (
pd.Series(allnodes).astype('category')
.cat.categories
)
nnodes = names.shape[0]
# Pick an integer dtype wide enough to hold all node IDs
if nnodes > UINT32_MAX:
dtype = np.uint64
else:
dtype = np.uint32
name_dict = dict(zip(names,
np.arange(names.shape[0], dtype=dtype)))
elist.src = elist.src.map(name_dict)
elist.dst = elist.dst.map(name_dict)
# clean up temp data
allnodes = None
name_dict = None
gc.collect()
# If undirected graph, append edgelist to reversed self
if not directed:
other_df = elist.copy()
other_df.columns = ['dst', 'src', 'weight']
elist = pd.concat([elist, other_df])
other_df = None
gc.collect()
# Need to sort by src for _edgelist_to_graph
elist = elist.sort_values(by='src')
# extract numpy arrays and clear memory
src = elist.src.to_numpy()
dst = elist.dst.to_numpy()
weight = elist.weight.to_numpy()
elist = None
gc.collect()
G = methods._edgelist_to_graph(
src, dst, weight,
nnodes, nodenames=names
)
return G