Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] dask personalization, fix df query #1237

Merged
merged 16 commits into from
Nov 20, 2020
4 changes: 2 additions & 2 deletions python/cugraph/dask/link_analysis/mg_pagerank_wrapper.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,8 @@ def mg_pagerank(input_df,

if personalization is not None:
sz = personalization['vertex'].shape[0]
personalization['vertex'] = personalization['vertex'].astype(np.int32)
personalization['values'] = personalization['values'].astype(df['pagerank'].dtype)
personalization['vertex'] = personalization['vertex'].astype(vertex_t)
personalization['values'] = personalization['values'].astype(weight_t)
c_pers_vtx = personalization['vertex'].__cuda_array_interface__['data'][0]
c_pers_val = personalization['values'].__cuda_array_interface__['data'][0]

Expand Down
48 changes: 30 additions & 18 deletions python/cugraph/dask/link_analysis/pagerank.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,9 +119,6 @@ def pagerank(input_graph,
"""
from cugraph.structure.graph import null_check

if personalization is not None:
raise Exception("Personalization not supported")

nstart = None

client = default_client()
Expand All @@ -141,21 +138,36 @@ def pagerank(input_graph,
if input_graph.renumbered is True:
personalization = input_graph.add_internal_vertex_id(
personalization, "vertex", "vertex"
).compute()

result = [client.submit(call_pagerank,
Comms.get_session_id(),
wf[1],
num_verts,
num_edges,
vertex_partition_offsets,
alpha,
max_iter,
tol,
personalization,
nstart,
workers=[wf[0]])
for idx, wf in enumerate(data.worker_to_parts.items())]
)
p_data = get_distributed_data(personalization)

result = [client.submit(call_pagerank,
Comms.get_session_id(),
wf[1],
num_verts,
num_edges,
vertex_partition_offsets,
alpha,
max_iter,
tol,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we need to discuss more about tol.

The code below is what NetworkX does: it compares err with N * tol. This requires setting tol to a smaller value as N gets large (tol here is effectively the tolerance for a single PageRank value, not for the entire PageRank vector).

https://github.com/networkx/networkx/blob/master/networkx/algorithms/link_analysis/pagerank_alg.py#L155

err = sum([abs(x[n] - xlast[n]) for n in x])
        if err < N * tol:
            return x

The new PageRank algorithm adopts this NetworkX logic to determine convergence. Should we keep following the NetworkX logic (and set tol to a smaller value for larger N), or would it be better to remove the N * factor from the new PageRank code?

p_data.worker_to_parts[wf[0]][0],
nstart,
workers=[wf[0]])
for idx, wf in enumerate(data.worker_to_parts.items())]
else:
result = [client.submit(call_pagerank,
Comms.get_session_id(),
wf[1],
num_verts,
num_edges,
vertex_partition_offsets,
alpha,
max_iter,
tol,
personalization,
nstart,
workers=[wf[0]])
for idx, wf in enumerate(data.worker_to_parts.items())]
wait(result)
ddf = dask_cudf.from_delayed(result)
if input_graph.renumbered:
Expand Down
4 changes: 2 additions & 2 deletions python/cugraph/structure/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -1081,7 +1081,7 @@ def degrees(self, vertex_subset=None):
df = self.unrenumber(df, "vertex")

if vertex_subset is not None:
df = df.query("`vertex` in @vertex_subset")
df = df[df['vertex'].isin(vertex_subset)]

return df

Expand All @@ -1095,7 +1095,7 @@ def _degree(self, vertex_subset, x=0):
df = self.unrenumber(df, "vertex")

if vertex_subset is not None:
df = df.query("`vertex` in @vertex_subset")
df = df[df['vertex'].isin(vertex_subset)]

return df

Expand Down
2 changes: 1 addition & 1 deletion python/cugraph/tests/dask/test_mg_pagerank.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def personalize(v, personalization_perc):
return cu_personalization


PERSONALIZATION_PERC = [0]
PERSONALIZATION_PERC = [0, 50]


@pytest.fixture
Expand Down