Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add bindings for triangle counting #2273

Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 72 additions & 10 deletions python/cugraph/cugraph/community/triangle_count.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,30 +11,44 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from cugraph.community import triangle_count_wrapper
from cugraph.utilities import ensure_cugraph_obj_for_nx
import cudf

from pylibcugraph import triangle_count as \
pylibcugraph_triangle_count

def triangles(G):
from pylibcugraph import (ResourceHandle,
GraphProperties,
SGGraph
)


# FIXME: rename this to triangle_count to match the MG implementation
def triangles(G, start_list=None):
"""
Compute the number of triangles (cycles of length three) in the
input graph.

Unlike NetworkX, this algorithm simply returns the total number of
triangle and not the number per vertex.

Parameters
----------
G : cugraph.graph or networkx.Graph
cuGraph graph descriptor, should contain the connectivity information,
(edge weights are not used in this algorithm).
The current implementation only supports undirected graphs.

start_list : list or cudf.Series (int32), optional (default=None)
list of vertices for triangle count. if None the entire set of vertices
in the graph is processed

Returns
-------
count : int64
A 64 bit integer whose value gives the number of triangles in the
graph.
result : cudf.DataFrame
GPU data frame containing 2 cudf.Series

ddf['vertex']: cudf.Series
Contains the triangle counting vertices
ddf['counts']: cudf.Series
Contains the triangle counting counts

Examples
--------
Expand All @@ -53,6 +67,54 @@ def triangles(G):
if G.is_directed():
raise ValueError("input graph must be undirected")

result = triangle_count_wrapper.triangles(G)
if start_list is not None:
if isinstance(start_list, int):
start_list = [start_list]
if isinstance(start_list, list):
start_list = cudf.Series(start_list)
if start_list.dtype != 'int32':
raise ValueError(f"'start_list' must have int32 values, "
f"got: {start_list.dtype}")
if not isinstance(start_list, cudf.Series):
raise TypeError(
f"'start_list' must be either a list or a cudf.Series,"
f"got: {start_list.dtype}")

if G.renumbered is True:
if isinstance(start_list, cudf.DataFrame):
start_list = G.lookup_internal_vertex_id(
start_list, start_list.columns)
else:
start_list = G.lookup_internal_vertex_id(start_list)

srcs = G.edgelist.edgelist_df['src']
dsts = G.edgelist.edgelist_df['dst']
weights = G.edgelist.edgelist_df['weights']

if srcs.dtype != 'int32':
raise ValueError(f"Graph vertices must have int32 values, "
f"got: {srcs.dtype}")

resource_handle = ResourceHandle()
graph_props = GraphProperties(is_multigraph=G.is_multigraph())
store_transposed = False

# FIXME: This should be based on the renumber parameter set when creating
# the graph
renumber = False
do_expensive_check = False

sg = SGGraph(resource_handle, graph_props, srcs, dsts, weights,
store_transposed, renumber, do_expensive_check)

vertex, counts = pylibcugraph_triangle_count(
resource_handle, sg, start_list, do_expensive_check)

df = cudf.DataFrame()
df["vertex"] = vertex
df["counts"] = counts

if G.renumbered:
df = G.unrenumber(df, "vertex")

return result
return df
1 change: 1 addition & 0 deletions python/cugraph/cugraph/dask/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,6 @@
from .traversal.sssp import sssp
from .common.read_utils import get_chunksize
from .community.louvain import louvain
from .community.triangle_count import triangle_count
from .centrality.katz_centrality import katz_centrality
from .components.connectivity import weakly_connected_components
3 changes: 2 additions & 1 deletion python/cugraph/cugraph/dask/community/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2021, NVIDIA CORPORATION.
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
Expand All @@ -12,3 +12,4 @@
# limitations under the License.

from .louvain import louvain
from .triangle_count import triangle_count
176 changes: 176 additions & 0 deletions python/cugraph/cugraph/dask/community/triangle_count.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
# Copyright (c) 2022, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from dask.distributed import wait, default_client
from cugraph.dask.common.input_utils import get_distributed_data

import cugraph.dask.comms.comms as Comms
import dask_cudf
import cudf

from pylibcugraph import triangle_count as \
pylibcugraph_triangle_count

from pylibcugraph import (ResourceHandle,
GraphProperties,
MGGraph
)


def call_triangles(sID,
                   data,
                   src_col_name,
                   dst_col_name,
                   graph_properties,
                   store_transposed,
                   num_edges,
                   do_expensive_check,
                   start_list
                   ):
    """
    Build an MGGraph from ``data[0]`` and run the pylibcugraph triangle
    count on it.

    Parameters
    ----------
    sID : comms session identifier
        Passed to ``Comms.get_handle`` to fetch the handle wrapped by the
        ``ResourceHandle``.

    data : sequence
        Only ``data[0]`` is read; it must expose ``.columns`` and column
        access by name (presumably the edge-list partition held by the
        worker running this task -- confirm against the caller).

    src_col_name, dst_col_name : str
        Names of the source and destination vertex columns in ``data[0]``.

    graph_properties : pylibcugraph.GraphProperties
        Forwarded unchanged to ``MGGraph``.

    store_transposed : bool
        Forwarded unchanged to ``MGGraph``.

    num_edges : int
        Forwarded unchanged to ``MGGraph``.

    do_expensive_check : bool
        Forwarded to both ``MGGraph`` and the triangle count call.

    start_list : cudf.Series or None
        Forwarded unchanged to the triangle count call.

    Returns
    -------
    Whatever ``pylibcugraph.triangle_count`` returns for this graph.
    """
    resource_handle = ResourceHandle(Comms.get_handle(sID).getHandle())

    edge_partition = data[0]
    src_vertices = edge_partition[src_col_name]
    dst_vertices = edge_partition[dst_col_name]
    # Edge weights are optional: use the "value" column only when present.
    edge_weights = (
        edge_partition['value'] if "value" in edge_partition.columns
        else None
    )

    graph = MGGraph(resource_handle,
                    graph_properties,
                    src_vertices,
                    dst_vertices,
                    edge_weights,
                    store_transposed,
                    num_edges,
                    do_expensive_check)

    return pylibcugraph_triangle_count(resource_handle,
                                       graph,
                                       start_list,
                                       do_expensive_check)


def convert_to_cudf(cp_arrays):
    """
    Package the pair of arrays produced by the pylibcugraph wrapper into a
    cudf DataFrame.

    Parameters
    ----------
    cp_arrays : pair of arrays
        Unpacked as ``(vertices, counts)``.

    Returns
    -------
    cudf.DataFrame
        Two columns, "vertex" and "counts", in that order.
    """
    vertices, counts = cp_arrays
    return cudf.DataFrame({"vertex": vertices, "counts": counts})


def triangle_count(input_graph,
                   start_list=None):
    """
    Computes the number of triangles (cycles of length three) and the number
    per vertex in the input graph.

    Parameters
    ----------
    input_graph : cugraph.graph
        cuGraph graph descriptor, should contain the connectivity information,
        (edge weights are not used in this algorithm).
        The current implementation only supports undirected graphs.

    start_list : list or cudf.Series (int32), optional (default=None)
        list of vertices for triangle count. if None the entire set of vertices
        in the graph is processed

    Returns
    -------
    result : dask_cudf.DataFrame
        GPU distributed data frame containing 2 dask_cudf.Series

        ddf['vertex']: dask_cudf.Series
            Contains the triangle counting vertices
        ddf['counts']: dask_cudf.Series
            Contains the triangle counting counts

    Raises
    ------
    TypeError
        If 'start_list' is neither an int, a list nor a cudf.Series.
    ValueError
        If the values of 'start_list' are not int32.
    """
    # Initialize dask client
    client = default_client()
    # In the future, once all the algos follow the C/Pylibcugraph path,
    # compute_renumber_edge_list will only be used for multicolumn and
    # string vertices since the renumbering will be done in pylibcugraph
    input_graph.compute_renumber_edge_list(
        transposed=False, legacy_renum_only=True)

    if start_list is not None:
        if isinstance(start_list, int):
            start_list = [start_list]
        if isinstance(start_list, list):
            # NOTE(review): a Series built from a plain Python list of ints
            # is presumably inferred as int64 and would then be rejected by
            # the int32 check below -- confirm, and consider casting here.
            start_list = cudf.Series(start_list)
        # Validate the container type before touching '.dtype': unsupported
        # inputs are not guaranteed to have that attribute, so checking the
        # dtype first would surface an AttributeError instead of the
        # intended TypeError.
        if not isinstance(start_list, cudf.Series):
            raise TypeError(
                f"'start_list' must be either a list or a cudf.Series, "
                f"got: {type(start_list).__name__}")
        if start_list.dtype != 'int32':
            raise ValueError(f"'start_list' must have int32 values, "
                             f"got: {start_list.dtype}")

        # start_list uses "external" vertex IDs, but since the graph has been
        # renumbered, the start vertex IDs must also be renumbered.
        if input_graph.renumbered:
            start_list = input_graph.lookup_internal_vertex_id(
                start_list).compute()

    edgelist_ddf = input_graph.edgelist.edgelist_df

    # FIXME: The parameter is_multigraph, store_transposed and
    # do_expensive_check must be derived from the input_graph.
    # For now, they are hardcoded.
    graph_properties = GraphProperties(
        is_multigraph=False)
    store_transposed = False
    do_expensive_check = True

    num_edges = len(edgelist_ddf)
    data = get_distributed_data(edgelist_ddf)

    src_col_name = input_graph.renumber_map.renumbered_src_col_name
    dst_col_name = input_graph.renumber_map.renumbered_dst_col_name

    # One task per worker, pinned to the worker that holds the partitions.
    result = [client.submit(call_triangles,
                            Comms.get_session_id(),
                            parts,
                            src_col_name,
                            dst_col_name,
                            graph_properties,
                            store_transposed,
                            num_edges,
                            do_expensive_check,
                            start_list,
                            workers=[worker])
              for worker, parts in data.worker_to_parts.items()]

    wait(result)

    # Turn each worker's (vertices, counts) arrays into a cudf DataFrame so
    # the futures can be assembled into a single dask_cudf DataFrame.
    cudf_result = [client.submit(convert_to_cudf,
                                 cp_arrays)
                   for cp_arrays in result]

    wait(cudf_result)

    ddf = dask_cudf.from_delayed(cudf_result)
    # Map internal vertex IDs back to the caller's external IDs.
    if input_graph.renumbered:
        ddf = input_graph.unrenumber(ddf, "vertex")

    return ddf
4 changes: 2 additions & 2 deletions python/cugraph/cugraph/dask/link_analysis/hits.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,8 +122,8 @@ def hits(input_graph, tol=1.0e-5, max_iter=100, nstart=None, normalized=True):
Returns
-------
HubsAndAuthorities : dask_cudf.DataFrame
GPU data frame containing three cudf.Series of size V: the vertex
identifiers and the corresponding hubs values and the corresponding
GPU distributed data frame containing three dask_cudf.Series of
size V: the vertex identifiers and the corresponding hubs and
authorities values.

df['vertex'] : dask_cudf.Series
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ def EXPERIMENTAL__uniform_neighborhood(input_graph,
Returns
-------
result : dask_cudf.DataFrame
GPU data frame containing two dask_cudf.Series
GPU distributed data frame containing 4 dask_cudf.Series

ddf['sources']: dask_cudf.Series
Contains the source vertices from the sampling result
Expand Down
Loading