From 98fd1b2aaa6ef2e5947f5fa108e00fc7b9c18f6c Mon Sep 17 00:00:00 2001 From: Joseph Nke Date: Wed, 18 May 2022 13:27:04 -0700 Subject: [PATCH 01/21] move the current implementation of mg neighborhood sampling to proto --- python/cugraph/cugraph/proto/__init__.py | 3 +- .../cugraph/proto/sampling/__init__.py | 13 ++ .../proto/sampling/neighborhood_sampling.py | 201 ++++++++++++++++++ .../pylibcugraph/proto/__init__.py | 18 ++ .../pylibcugraph/proto/sampling/__init__.py | 0 .../uniform_neighborhood_sampling.pyx | 179 ++++++++++++++++ 6 files changed, 413 insertions(+), 1 deletion(-) create mode 100644 python/cugraph/cugraph/proto/sampling/__init__.py create mode 100644 python/cugraph/cugraph/proto/sampling/neighborhood_sampling.py create mode 100644 python/pylibcugraph/pylibcugraph/proto/__init__.py create mode 100644 python/pylibcugraph/pylibcugraph/proto/sampling/__init__.py create mode 100644 python/pylibcugraph/pylibcugraph/proto/sampling/uniform_neighborhood_sampling.pyx diff --git a/python/cugraph/cugraph/proto/__init__.py b/python/cugraph/cugraph/proto/__init__.py index 65abc0cba5a..3ed778d68e8 100644 --- a/python/cugraph/cugraph/proto/__init__.py +++ b/python/cugraph/cugraph/proto/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -13,3 +13,4 @@ from cugraph.proto.components import strong_connected_component from cugraph.proto.structure import find_bicliques +from cugraph.proto.sampling import uniform_neighborhood \ No newline at end of file diff --git a/python/cugraph/cugraph/proto/sampling/__init__.py b/python/cugraph/cugraph/proto/sampling/__init__.py new file mode 100644 index 00000000000..5c8fbf27210 --- /dev/null +++ b/python/cugraph/cugraph/proto/sampling/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and + +from cugraph.proto.sampling.neighborhood_sampling import uniform_neighborhood \ No newline at end of file diff --git a/python/cugraph/cugraph/proto/sampling/neighborhood_sampling.py b/python/cugraph/cugraph/proto/sampling/neighborhood_sampling.py new file mode 100644 index 00000000000..277b27e91ab --- /dev/null +++ b/python/cugraph/cugraph/proto/sampling/neighborhood_sampling.py @@ -0,0 +1,201 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy +from dask.distributed import wait, default_client + +import dask_cudf +import cudf + +from pylibcugraph.proto import uniform_neighborhood_sampling +from pylibcugraph.experimental import (MGGraph, + ResourceHandle, + GraphProperties, + ) + + +from cugraph.dask.common.input_utils import get_distributed_data +from cugraph.dask.comms import comms as Comms + + +def call_nbr_sampling(sID, + data, + src_col_name, + dst_col_name, + num_edges, + do_expensive_check, + start_list, + info_list, + h_fan_out, + with_replacement): + + # Preparation for graph creation + handle = Comms.get_handle(sID) + handle = ResourceHandle(handle.getHandle()) + graph_properties = GraphProperties(is_symmetric=False, is_multigraph=False) + srcs = data[0][src_col_name] + dsts = data[0][dst_col_name] + weights = None + if "value" in data[0].columns: + weights = data[0]['value'] + + store_transposed = False + + mg = MGGraph(handle, + graph_properties, + srcs, + dsts, + weights, + store_transposed, + num_edges, + do_expensive_check) + + ret_val = uniform_neighborhood_sampling(handle, + mg, + start_list, + info_list, + h_fan_out, + with_replacement, + do_expensive_check) + return ret_val + + +def convert_to_cudf(cp_arrays): + """ + Creates a cudf DataFrame from cupy arrays from pylibcugraph wrapper + """ + cupy_sources, cupy_destinations, cupy_labels, cupy_indices = cp_arrays + # cupy_sources, cupy_destinations, cupy_labels, cupy_indices, + # cupy_counts = cp_arrays + df = cudf.DataFrame() + df["sources"] = cupy_sources + df["destinations"] = cupy_destinations + df["labels"] = cupy_labels + df["indices"] = cupy_indices + # df["counts"] = cupy_counts + return df + + +def uniform_neighborhood(input_graph, + start_info_list, + fanout_vals, + with_replacement=True): + """ + Does neighborhood sampling, which samples nodes from a graph based on the + current node's neighbors, with a corresponding fanout value at each hop. + + Parameters + ---------- + input_graph : cugraph.Graph + cuGraph graph, which contains connectivity information as dask cudf + edge list dataframe + + start_info_list : tuple of list or cudf.Series (int32) + Tuple of a list of starting vertices for sampling, along with a + corresponding list of label for reorganizing results after sending + the input to different callers. + + fanout_vals : list (int32) + List of branching out (fan-out) degrees per starting vertex for each + hop level. + + with_replacement: bool, optional (default=True) + Flag to specify if the random sampling is done with replacement + + Returns + ------- + result : dask_cudf.DataFrame + GPU data frame containing two dask_cudf.Series + + ddf['sources']: dask_cudf.Series + Contains the source vertices from the sampling result + ddf['destinations']: dask_cudf.Series + Contains the destination vertices from the sampling result + ddf['labels']: dask_cudf.Series + Contains the start labels from the sampling result + ddf['indices']: dask_cudf.Series + Contains the indices from the sampling result for path + reconstruction + """ + # Initialize dask client + client = default_client() + # FIXME: 'legacy_renum_only' will not trigger the C++ renumbering + # In the future, once all the algos follow the C/Pylibcugraph path, + # compute_renumber_edge_list will only be used for multicolumn and + # string vertices since the renumbering will be done in pylibcugraph + input_graph.compute_renumber_edge_list( + transposed=False, legacy_renum_only=True) + + start_list, info_list = start_info_list + + if isinstance(start_list, list): + start_list = cudf.Series(start_list) + if start_list.dtype != 'int32': + raise ValueError(f"'start_list' must have int32 values, " + f"got: {start_list.dtype}") + if isinstance(info_list, list): + info_list = cudf.Series(info_list) + if info_list.dtype != 'int32': + raise ValueError(f"'info_list' must have int32 values, " + f"got: {info_list.dtype}") + # fanout_vals must be a host array! + # FIXME: ensure other sequence types (eg. cudf Series) can be handled. + if isinstance(fanout_vals, list): + fanout_vals = numpy.asarray(fanout_vals, dtype="int32") + else: + raise TypeError("fanout_vals must be a list, " + f"got: {type(fanout_vals)}") + + ddf = input_graph.edgelist.edgelist_df + num_edges = len(ddf) + data = get_distributed_data(ddf) + + src_col_name = input_graph.renumber_map.renumbered_src_col_name + dst_col_name = input_graph.renumber_map.renumbered_dst_col_name + + # start_list uses "external" vertex IDs, but if the graph has been + # renumbered, the start vertex IDs must also be renumbered. + if input_graph.renumbered: + start_list = input_graph.lookup_internal_vertex_id( + start_list).compute() + do_expensive_check = True + + result = [client.submit(call_nbr_sampling, + Comms.get_session_id(), + wf[1], + src_col_name, + dst_col_name, + num_edges, + do_expensive_check, + start_list, + info_list, + fanout_vals, + with_replacement, + workers=[wf[0]]) + for idx, wf in enumerate(data.worker_to_parts.items())] + + wait(result) + + cudf_result = [client.submit(convert_to_cudf, + cp_arrays) + for cp_arrays in result] + + wait(cudf_result) + + ddf = dask_cudf.from_delayed(cudf_result) + if input_graph.renumbered: + ddf = input_graph.unrenumber(ddf, "sources") + ddf = input_graph.unrenumber(ddf, "destinations") + + return ddf diff --git a/python/pylibcugraph/pylibcugraph/proto/__init__.py b/python/pylibcugraph/pylibcugraph/proto/__init__.py new file mode 100644 index 00000000000..b0128e047a5 --- /dev/null +++ b/python/pylibcugraph/pylibcugraph/proto/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# FIXME: The directory proto should be deleted once the the experimental +# uniform neighborhood sampling is completed + +from .sampling.uniform_neighborhood_sampling import EXPERIMENTAL__uniform_neighborhood_sampling +uniform_neighborhood_sampling = EXPERIMENTAL__uniform_neighborhood_sampling diff --git a/python/pylibcugraph/pylibcugraph/proto/sampling/__init__.py b/python/pylibcugraph/pylibcugraph/proto/sampling/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pylibcugraph/pylibcugraph/proto/sampling/uniform_neighborhood_sampling.pyx b/python/pylibcugraph/pylibcugraph/proto/sampling/uniform_neighborhood_sampling.pyx new file mode 100644 index 00000000000..f9b8a6436b4 --- /dev/null +++ b/python/pylibcugraph/pylibcugraph/proto/sampling/uniform_neighborhood_sampling.pyx @@ -0,0 +1,179 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Have cython use python 3 syntax +# cython: language_level = 3 + +from libc.stdint cimport uintptr_t + +from pylibcugraph._cugraph_c.resource_handle cimport ( + bool_t, + data_type_id_t, + cugraph_resource_handle_t, +) +from pylibcugraph._cugraph_c.error cimport ( + cugraph_error_code_t, + cugraph_error_t, +) +from pylibcugraph._cugraph_c.array cimport ( + cugraph_type_erased_device_array_view_t, + cugraph_type_erased_device_array_view_create, + cugraph_type_erased_device_array_free, + cugraph_type_erased_host_array_view_t, + cugraph_type_erased_host_array_view_create +) +from pylibcugraph._cugraph_c.graph cimport ( + cugraph_graph_t, +) +from pylibcugraph._cugraph_c.algorithms cimport ( + cugraph_uniform_neighbor_sample, + cugraph_sample_result_t, + cugraph_sample_result_get_sources, + cugraph_sample_result_get_destinations, + cugraph_sample_result_get_start_labels, + cugraph_sample_result_get_index, + cugraph_sample_result_get_counts, + cugraph_sample_result_free, +) +from pylibcugraph.resource_handle cimport ( + ResourceHandle, +) +from pylibcugraph.graphs cimport ( + _GPUGraph, + MGGraph, +) +from pylibcugraph.utils cimport ( + assert_success, + copy_to_cupy_array, + assert_CAI_type, + assert_AI_type, + get_c_type_from_numpy_type, +) + + +def EXPERIMENTAL__uniform_neighborhood_sampling(ResourceHandle resource_handle, + MGGraph input_graph, + start_list, + labels_list, + h_fan_out, + bool_t with_replacement, + bool_t do_expensive_check): + """ + Does neighborhood sampling, which samples nodes from a graph based on the + current node's neighbors, with a corresponding fanout value at each hop. + + Parameters + ---------- + resource_handle: ResourceHandle + Handle to the underlying device and host resources needed for + referencing data and running algorithms. + + input_graph: MGGraph + The input graph, for Multi-GPU operations. + + start_list: device array type + Device array containing the list of starting vertices for sampling. + + labels_list: device array type + Device array containing the starting labels for reorganizing the + results after sending the input to different callers. + + h_fan_out: numpy array type + Device array containing the brancing out (fan-out) degrees per + starting vertex for each hop level. + + with_replacement: bool + If true, sampling procedure is done with replacement (the same vertex + can be selected multiple times in the same step). + + do_expensive_check: bool + If True, performs more extensive tests on the inputs to ensure + validitity, at the expense of increased run time. + + Returns + ------- + A tuple of device arrays, where the first and second items in the tuple + are device arrays containing the starting and ending vertices of each + walk respectively, the third item in the tuple is a device array + containing the start labels, the fourth item in the tuple is a device + array containing the indices for reconstructing paths. + + """ + cdef cugraph_resource_handle_t* c_resource_handle_ptr = \ + resource_handle.c_resource_handle_ptr + cdef cugraph_graph_t* c_graph_ptr = input_graph.c_graph_ptr + + assert_CAI_type(start_list, "start_list") + assert_CAI_type(labels_list, "labels_list") + assert_AI_type(h_fan_out, "h_fan_out") + + cdef cugraph_sample_result_t* result_ptr + cdef cugraph_error_code_t error_code + cdef cugraph_error_t* error_ptr + + cdef uintptr_t cai_start_ptr = \ + start_list.__cuda_array_interface__["data"][0] + cdef uintptr_t cai_labels_ptr = \ + labels_list.__cuda_array_interface__["data"][0] + cdef uintptr_t ai_fan_out_ptr = \ + h_fan_out.__array_interface__["data"][0] + + cdef cugraph_type_erased_device_array_view_t* start_ptr = \ + cugraph_type_erased_device_array_view_create( + cai_start_ptr, + len(start_list), + get_c_type_from_numpy_type(start_list.dtype)) + cdef cugraph_type_erased_device_array_view_t* start_labels_ptr = \ + cugraph_type_erased_device_array_view_create( + cai_labels_ptr, + len(labels_list), + get_c_type_from_numpy_type(labels_list.dtype)) + cdef cugraph_type_erased_host_array_view_t* fan_out_ptr = \ + cugraph_type_erased_host_array_view_create( + ai_fan_out_ptr, + len(h_fan_out), + get_c_type_from_numpy_type(h_fan_out.dtype)) + + error_code = cugraph_uniform_neighbor_sample(c_resource_handle_ptr, + c_graph_ptr, + start_ptr, + start_labels_ptr, + fan_out_ptr, + with_replacement, + do_expensive_check, + &result_ptr, + &error_ptr) + assert_success(error_code, error_ptr, "uniform_nbr_sample") + + # TODO: counts is a part of the output, but another copy_to_cupy array + # with appropriate host array types would likely be required. Also + # potential memory leak until this is covered + cdef cugraph_type_erased_device_array_view_t* src_ptr = \ + cugraph_sample_result_get_sources(result_ptr) + cdef cugraph_type_erased_device_array_view_t* dst_ptr = \ + cugraph_sample_result_get_destinations(result_ptr) + cdef cugraph_type_erased_device_array_view_t* labels_ptr = \ + cugraph_sample_result_get_start_labels(result_ptr) + cdef cugraph_type_erased_device_array_view_t* index_ptr = \ + cugraph_sample_result_get_index(result_ptr) + # cdef cugraph_type_erased_host_array_view_t* counts_ptr = \ + # cugraph_sample_result_get_counts(result_ptr) + + cupy_sources = copy_to_cupy_array(c_resource_handle_ptr, src_ptr) + cupy_destinations = copy_to_cupy_array(c_resource_handle_ptr, dst_ptr) + cupy_labels = copy_to_cupy_array(c_resource_handle_ptr, labels_ptr) + cupy_indices = copy_to_cupy_array(c_resource_handle_ptr, index_ptr) + # cupy_counts = copy_to_cupy_array(c_resource_handle_ptr, counts_ptr) + + return (cupy_sources, cupy_destinations, cupy_labels, cupy_indices) + # return (cupy_sources, cupy_destinations, cupy_labels, cupy_indices, cupy_counts) From 063e443124b1ab6594a9b63e19af7b8cb5fb2a9d Mon Sep 17 00:00:00 2001 From: Joseph Nke Date: Wed, 18 May 2022 13:33:22 -0700 Subject: [PATCH 02/21] remove experimental prefix --- python/pylibcugraph/pylibcugraph/proto/__init__.py | 5 +++-- .../proto/sampling/uniform_neighborhood_sampling.pyx | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/python/pylibcugraph/pylibcugraph/proto/__init__.py b/python/pylibcugraph/pylibcugraph/proto/__init__.py index b0128e047a5..cc2639a9519 100644 --- a/python/pylibcugraph/pylibcugraph/proto/__init__.py +++ b/python/pylibcugraph/pylibcugraph/proto/__init__.py @@ -14,5 +14,6 @@ # FIXME: The directory proto should be deleted once the the experimental # uniform neighborhood sampling is completed -from .sampling.uniform_neighborhood_sampling import EXPERIMENTAL__uniform_neighborhood_sampling -uniform_neighborhood_sampling = EXPERIMENTAL__uniform_neighborhood_sampling +#from .sampling.uniform_neighborhood_sampling import EXPERIMENTAL__uniform_neighborhood_sampling +from .sampling.uniform_neighborhood_sampling import uniform_neighborhood_sampling +#uniform_neighborhood_sampling = EXPERIMENTAL__uniform_neighborhood_sampling diff --git a/python/pylibcugraph/pylibcugraph/proto/sampling/uniform_neighborhood_sampling.pyx b/python/pylibcugraph/pylibcugraph/proto/sampling/uniform_neighborhood_sampling.pyx index f9b8a6436b4..98eb9c6d077 100644 --- a/python/pylibcugraph/pylibcugraph/proto/sampling/uniform_neighborhood_sampling.pyx +++ b/python/pylibcugraph/pylibcugraph/proto/sampling/uniform_neighborhood_sampling.pyx @@ -61,7 +61,7 @@ from pylibcugraph.utils cimport ( ) -def EXPERIMENTAL__uniform_neighborhood_sampling(ResourceHandle resource_handle, +def uniform_neighborhood_sampling(ResourceHandle resource_handle, MGGraph input_graph, start_list, labels_list, From e6ed99483f7a520dec70b845790cef57ebbdf428 Mon Sep 17 00:00:00 2001 From: Joseph Nke Date: Wed, 18 May 2022 15:57:21 -0700 Subject: [PATCH 03/21] refactor mg neighborhood sampling bindings --- .../dask/sampling/neighborhood_sampling.py | 28 ++------- python/cugraph/cugraph/proto/__init__.py | 2 +- .../pylibcugraph/_cugraph_c/algorithms.pxd | 17 ++++++ .../uniform_neighborhood_sampling.pyx | 57 +++++++------------ 4 files changed, 43 insertions(+), 61 deletions(-) diff --git a/python/cugraph/cugraph/dask/sampling/neighborhood_sampling.py b/python/cugraph/cugraph/dask/sampling/neighborhood_sampling.py index 20bd6571c14..196ac8995d5 100644 --- a/python/cugraph/cugraph/dask/sampling/neighborhood_sampling.py +++ b/python/cugraph/cugraph/dask/sampling/neighborhood_sampling.py @@ -34,7 +34,6 @@ def call_nbr_sampling(sID, num_edges, do_expensive_check, start_list, - info_list, h_fan_out, with_replacement): @@ -62,7 +61,6 @@ def call_nbr_sampling(sID, ret_val = uniform_neighborhood_sampling(handle, mg, start_list, - info_list, h_fan_out, with_replacement, do_expensive_check) @@ -73,20 +71,17 @@ def convert_to_cudf(cp_arrays): """ Creates a cudf DataFrame from cupy arrays from pylibcugraph wrapper """ - cupy_sources, cupy_destinations, cupy_labels, cupy_indices = cp_arrays - # cupy_sources, cupy_destinations, cupy_labels, cupy_indices, - # cupy_counts = cp_arrays + cupy_sources, cupy_destinations, cupy_indices = cp_arrays + df = cudf.DataFrame() df["sources"] = cupy_sources df["destinations"] = cupy_destinations - df["labels"] = cupy_labels df["indices"] = cupy_indices - # df["counts"] = cupy_counts return df def EXPERIMENTAL__uniform_neighborhood(input_graph, - start_info_list, + start_list, fanout_vals, with_replacement=True): """ @@ -99,10 +94,8 @@ def EXPERIMENTAL__uniform_neighborhood(input_graph, cuGraph graph, which contains connectivity information as dask cudf edge list dataframe - start_info_list : tuple of list or cudf.Series (int32) - Tuple of a list of starting vertices for sampling, along with a - corresponding list of label for reorganizing results after sending - the input to different callers. + start_info_list : list or cudf.Series (int32) + a list of starting vertices for sampling fanout_vals : list (int32) List of branching out (fan-out) degrees per starting vertex for each @@ -120,8 +113,6 @@ def EXPERIMENTAL__uniform_neighborhood(input_graph, Contains the source vertices from the sampling result ddf['destinations']: dask_cudf.Series Contains the destination vertices from the sampling result - ddf['labels']: dask_cudf.Series - Contains the start labels from the sampling result ddf['indices']: dask_cudf.Series Contains the indices from the sampling result for path reconstruction @@ -135,18 +126,12 @@ def EXPERIMENTAL__uniform_neighborhood(input_graph, input_graph.compute_renumber_edge_list( transposed=False, legacy_renum_only=True) - start_list, info_list = start_info_list - if isinstance(start_list, list): start_list = cudf.Series(start_list) if start_list.dtype != 'int32': raise ValueError(f"'start_list' must have int32 values, " f"got: {start_list.dtype}") - if isinstance(info_list, list): - info_list = cudf.Series(info_list) - if info_list.dtype != 'int32': - raise ValueError(f"'info_list' must have int32 values, " - f"got: {info_list.dtype}") + # fanout_vals must be a host array! # FIXME: ensure other sequence types (eg. cudf Series) can be handled. if isinstance(fanout_vals, list): @@ -177,7 +162,6 @@ def EXPERIMENTAL__uniform_neighborhood(input_graph, num_edges, do_expensive_check, start_list, - info_list, fanout_vals, with_replacement, workers=[wf[0]]) diff --git a/python/cugraph/cugraph/proto/__init__.py b/python/cugraph/cugraph/proto/__init__.py index 3ed778d68e8..95e5aaf3af9 100644 --- a/python/cugraph/cugraph/proto/__init__.py +++ b/python/cugraph/cugraph/proto/__init__.py @@ -13,4 +13,4 @@ from cugraph.proto.components import strong_connected_component from cugraph.proto.structure import find_bicliques -from cugraph.proto.sampling import uniform_neighborhood \ No newline at end of file +from cugraph.proto.sampling import uniform_neighborhood diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd index 7edb1435a9f..5d2cd9a40dd 100644 --- a/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd +++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd @@ -171,6 +171,7 @@ cdef extern from "cugraph_c/algorithms.h": cugraph_sample_result_t* result ) + # FIXME: This will be obsolete when the older mechanism is removed cdef cugraph_type_erased_device_array_view_t* \ cugraph_sample_result_get_start_labels( cugraph_sample_result_t* result @@ -181,6 +182,7 @@ cdef extern from "cugraph_c/algorithms.h": cugraph_sample_result_t* result ) + # FIXME: This will be obsolete when the older mechanism is removed cdef cugraph_type_erased_host_array_view_t* \ cugraph_sample_result_get_counts( cugraph_sample_result_t* result @@ -192,6 +194,8 @@ cdef extern from "cugraph_c/algorithms.h": ) # uniform neighborhood sampling + # FIXME: This older API will be phased out in favor of + # the experimental one below cdef cugraph_error_code_t \ cugraph_uniform_neighbor_sample( const cugraph_resource_handle_t* handle, @@ -204,3 +208,16 @@ cdef extern from "cugraph_c/algorithms.h": cugraph_sample_result_t** result, cugraph_error_t** error ) + + # uniform neighborhood sampling + cdef cugraph_error_code_t \ + cugraph_experimental_uniform_neighbor_sample( + const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + const cugraph_type_erased_device_array_view_t* start, + const cugraph_type_erased_host_array_view_t* fan_out, + bool_t without_replacement, + bool_t do_expensive_check, + cugraph_sample_result_t** result, + cugraph_error_t** error + ) diff --git a/python/pylibcugraph/pylibcugraph/uniform_neighborhood_sampling.pyx b/python/pylibcugraph/pylibcugraph/uniform_neighborhood_sampling.pyx index f9b8a6436b4..be14728d422 100644 --- a/python/pylibcugraph/pylibcugraph/uniform_neighborhood_sampling.pyx +++ b/python/pylibcugraph/pylibcugraph/uniform_neighborhood_sampling.pyx @@ -28,21 +28,20 @@ from pylibcugraph._cugraph_c.error cimport ( from pylibcugraph._cugraph_c.array cimport ( cugraph_type_erased_device_array_view_t, cugraph_type_erased_device_array_view_create, - cugraph_type_erased_device_array_free, + cugraph_type_erased_device_array_view_free, cugraph_type_erased_host_array_view_t, - cugraph_type_erased_host_array_view_create + cugraph_type_erased_host_array_view_create, + cugraph_type_erased_host_array_view_free, ) from pylibcugraph._cugraph_c.graph cimport ( cugraph_graph_t, ) from pylibcugraph._cugraph_c.algorithms cimport ( - cugraph_uniform_neighbor_sample, + cugraph_experimental_uniform_neighbor_sample, cugraph_sample_result_t, cugraph_sample_result_get_sources, cugraph_sample_result_get_destinations, - cugraph_sample_result_get_start_labels, cugraph_sample_result_get_index, - cugraph_sample_result_get_counts, cugraph_sample_result_free, ) from pylibcugraph.resource_handle cimport ( @@ -64,7 +63,6 @@ from pylibcugraph.utils cimport ( def EXPERIMENTAL__uniform_neighborhood_sampling(ResourceHandle resource_handle, MGGraph input_graph, start_list, - labels_list, h_fan_out, bool_t with_replacement, bool_t do_expensive_check): @@ -84,10 +82,6 @@ def EXPERIMENTAL__uniform_neighborhood_sampling(ResourceHandle resource_handle, start_list: device array type Device array containing the list of starting vertices for sampling. - labels_list: device array type - Device array containing the starting labels for reorganizing the - results after sending the input to different callers. - h_fan_out: numpy array type Device array containing the brancing out (fan-out) degrees per starting vertex for each hop level. @@ -114,7 +108,6 @@ def EXPERIMENTAL__uniform_neighborhood_sampling(ResourceHandle resource_handle, cdef cugraph_graph_t* c_graph_ptr = input_graph.c_graph_ptr assert_CAI_type(start_list, "start_list") - assert_CAI_type(labels_list, "labels_list") assert_AI_type(h_fan_out, "h_fan_out") cdef cugraph_sample_result_t* result_ptr @@ -123,8 +116,6 @@ def EXPERIMENTAL__uniform_neighborhood_sampling(ResourceHandle resource_handle, cdef uintptr_t cai_start_ptr = \ start_list.__cuda_array_interface__["data"][0] - cdef uintptr_t cai_labels_ptr = \ - labels_list.__cuda_array_interface__["data"][0] cdef uintptr_t ai_fan_out_ptr = \ h_fan_out.__array_interface__["data"][0] @@ -133,47 +124,37 @@ def EXPERIMENTAL__uniform_neighborhood_sampling(ResourceHandle resource_handle, cai_start_ptr, len(start_list), get_c_type_from_numpy_type(start_list.dtype)) - cdef cugraph_type_erased_device_array_view_t* start_labels_ptr = \ - cugraph_type_erased_device_array_view_create( - cai_labels_ptr, - len(labels_list), - get_c_type_from_numpy_type(labels_list.dtype)) cdef cugraph_type_erased_host_array_view_t* fan_out_ptr = \ cugraph_type_erased_host_array_view_create( ai_fan_out_ptr, len(h_fan_out), get_c_type_from_numpy_type(h_fan_out.dtype)) - error_code = cugraph_uniform_neighbor_sample(c_resource_handle_ptr, - c_graph_ptr, - start_ptr, - start_labels_ptr, - fan_out_ptr, - with_replacement, - do_expensive_check, - &result_ptr, - &error_ptr) + error_code = cugraph_experimental_uniform_neighbor_sample( + c_resource_handle_ptr, + c_graph_ptr, + start_ptr, + fan_out_ptr, + with_replacement, + do_expensive_check, + &result_ptr, + &error_ptr) assert_success(error_code, error_ptr, "uniform_nbr_sample") - # TODO: counts is a part of the output, but another copy_to_cupy array - # with appropriate host array types would likely be required. Also - # potential memory leak until this is covered cdef cugraph_type_erased_device_array_view_t* src_ptr = \ cugraph_sample_result_get_sources(result_ptr) cdef cugraph_type_erased_device_array_view_t* dst_ptr = \ cugraph_sample_result_get_destinations(result_ptr) - cdef cugraph_type_erased_device_array_view_t* labels_ptr = \ - cugraph_sample_result_get_start_labels(result_ptr) cdef cugraph_type_erased_device_array_view_t* index_ptr = \ cugraph_sample_result_get_index(result_ptr) - # cdef cugraph_type_erased_host_array_view_t* counts_ptr = \ - # cugraph_sample_result_get_counts(result_ptr) + cupy_sources = copy_to_cupy_array(c_resource_handle_ptr, src_ptr) cupy_destinations = copy_to_cupy_array(c_resource_handle_ptr, dst_ptr) - cupy_labels = copy_to_cupy_array(c_resource_handle_ptr, labels_ptr) cupy_indices = copy_to_cupy_array(c_resource_handle_ptr, index_ptr) - # cupy_counts = copy_to_cupy_array(c_resource_handle_ptr, counts_ptr) - return (cupy_sources, cupy_destinations, cupy_labels, cupy_indices) - # return (cupy_sources, cupy_destinations, cupy_labels, cupy_indices, cupy_counts) + cugraph_sample_result_free(result_ptr) + cugraph_type_erased_device_array_view_free(start_ptr) + cugraph_type_erased_host_array_view_free(fan_out_ptr) + + return (cupy_sources, cupy_destinations, cupy_indices) From 4581645aab6b5ebc506c43ba06812f1ef51a8bd2 Mon Sep 17 00:00:00 2001 From: Chuck Hastings Date: Thu, 19 May 2022 01:47:36 -0400 Subject: [PATCH 04/21] add and test mechanism for creating graph with edge index as weight --- cpp/include/cugraph_c/array.h | 24 ++++ cpp/src/c_api/array.cpp | 25 ++++ cpp/src/sampling/detail/graph_functions.hpp | 4 +- .../sampling/detail/sampling_utils_impl.cuh | 24 ++-- cpp/src/sampling/detail/sampling_utils_mg.cu | 24 ++-- cpp/src/sampling/detail/sampling_utils_sg.cu | 24 ++-- .../uniform_neighbor_sampling_impl.hpp | 3 +- cpp/tests/c_api/mg_test_utils.cpp | 99 +++++++++++++++ cpp/tests/c_api/mg_test_utils.h | 10 ++ .../c_api/mg_uniform_neighbor_sample_test.c | 37 ++---- .../c_api/uniform_neighbor_sample_test.c | 117 +++++++++++++++--- 11 files changed, 315 insertions(+), 76 deletions(-) diff --git a/cpp/include/cugraph_c/array.h b/cpp/include/cugraph_c/array.h index 273225dcc86..c563d36a5c8 100644 --- a/cpp/include/cugraph_c/array.h +++ b/cpp/include/cugraph_c/array.h @@ -87,6 +87,30 @@ void* cugraph_type_erased_device_array_release(cugraph_type_erased_device_array_ cugraph_type_erased_device_array_view_t* cugraph_type_erased_device_array_view( cugraph_type_erased_device_array_t* array); +/** + * @brief Create a type erased device array view with a different type + * + * Create a type erased device array view from + * a type erased device array treating the underlying + * pointer as a different type. + * + * Note: This is only viable when the underlying types are the same size. That + * is, you can switch between INT32 and FLOAT32, or between INT64 and FLOAT64. + * But if the types are different sizes this will be an error. + * + * @param [in] array Pointer to the type erased device array + * @param [in] dtype The type to cast the pointer to + * @param [out] result_view Address where to put the allocated device view + * @param [out] error Pointer to an error object storing details of any error. Will + * be populated if error code is not CUGRAPH_SUCCESS + * @return error code + */ +cugraph_error_code_t cugraph_type_erased_device_array_view_as_type( + cugraph_type_erased_device_array_t* array, + data_type_id_t dtype, + cugraph_type_erased_device_array_view_t** result_view, + cugraph_error_t** error); + /** * @brief Create a type erased device array view from * a raw device pointer. diff --git a/cpp/src/c_api/array.cpp b/cpp/src/c_api/array.cpp index 760a68d95fe..e538321bf48 100644 --- a/cpp/src/c_api/array.cpp +++ b/cpp/src/c_api/array.cpp @@ -364,3 +364,28 @@ extern "C" cugraph_error_code_t cugraph_type_erased_device_array_view_copy( return CUGRAPH_UNKNOWN_ERROR; } } + +extern "C" cugraph_error_code_t cugraph_type_erased_device_array_view_as_type( + cugraph_type_erased_device_array_t* array, + data_type_id_t dtype, + cugraph_type_erased_device_array_view_t** result_view, + cugraph_error_t** error) +{ + auto internal_pointer = + reinterpret_cast(array); + + if (data_type_sz[dtype] == data_type_sz[internal_pointer->type_]) { + *result_view = reinterpret_cast( + new cugraph::c_api::cugraph_type_erased_device_array_view_t{internal_pointer->data_.data(), + internal_pointer->size_, + internal_pointer->data_.size(), + dtype}); + return CUGRAPH_SUCCESS; + } else { + std::stringstream ss; + ss << "Could not treat type " << internal_pointer->type_ << " as type " << dtype; + auto tmp_error = new cugraph::c_api::cugraph_error_t{ss.str().c_str()}; + *error = reinterpret_cast(tmp_error); + return CUGRAPH_INVALID_INPUT; + } +} diff --git a/cpp/src/sampling/detail/graph_functions.hpp b/cpp/src/sampling/detail/graph_functions.hpp index 8cd37454741..d875958a6b9 100644 --- a/cpp/src/sampling/detail/graph_functions.hpp +++ b/cpp/src/sampling/detail/graph_functions.hpp @@ -143,7 +143,7 @@ rmm::device_uvector get_active_major_global_d template std::tuple, rmm::device_uvector, - thrust::optional>> + std::optional>> gather_local_edges( raft::handle_t const& handle, GraphViewType const& graph_view, @@ -169,7 +169,7 @@ gather_local_edges( template std::tuple, rmm::device_uvector, - thrust::optional>> + std::optional>> gather_one_hop_edgelist( raft::handle_t const& handle, GraphViewType const& graph_view, diff --git a/cpp/src/sampling/detail/sampling_utils_impl.cuh b/cpp/src/sampling/detail/sampling_utils_impl.cuh index d91fdb8f00c..65bd3e660d6 100644 --- a/cpp/src/sampling/detail/sampling_utils_impl.cuh +++ b/cpp/src/sampling/detail/sampling_utils_impl.cuh @@ -434,7 +434,7 @@ partition_information(raft::handle_t const& handle, GraphViewType const& graph_v template std::tuple, rmm::device_uvector, - thrust::optional>> + std::optional>> gather_local_edges( raft::handle_t const& handle, GraphViewType const& graph_view, @@ -451,10 +451,10 @@ gather_local_edges( rmm::device_uvector majors(edge_count, handle.get_stream()); rmm::device_uvector minors(edge_count, handle.get_stream()); - thrust::optional> weights = + std::optional> weights = graph_view.is_weighted() - ? thrust::make_optional(rmm::device_uvector(edge_count, handle.get_stream())) - : thrust::nullopt; + ? std::make_optional(rmm::device_uvector(edge_count, handle.get_stream())) + : std::nullopt; // FIXME: This should be the global constant vertex_t invalid_vertex_id = graph_view.number_of_vertices(); @@ -477,6 +477,7 @@ gather_local_edges( glbl_adj_list_offsets = global_adjacency_list_offsets.data(), majors = majors.data(), minors = minors.data(), + weights = weights ? weights->data() : nullptr, partitions = partitions.data(), hypersparse_begin = hypersparse_begin.data(), invalid_vertex_id, @@ -524,6 +525,10 @@ gather_local_edges( (g_dst_index < g_degree_offset + local_out_degree)) { minors[index] = adjacency_list[g_dst_index - g_degree_offset]; edge_index_first[index] = g_dst_index - g_degree_offset + glbl_adj_list_offsets[location]; + if (weights != nullptr) { + weight_t const* edge_weights = *(partitions[partition_id].weights()) + sparse_offset; + weights[index] = edge_weights[g_dst_index]; + } } else { minors[index] = invalid_vertex_id; } @@ -542,6 +547,7 @@ gather_local_edges( glbl_degree_offsets = global_degree_offsets.data(), majors = majors.data(), minors = minors.data(), + weights = weights ? weights->data() : nullptr, partitions = partitions.data(), hypersparse_begin = hypersparse_begin.data(), invalid_vertex_id, @@ -585,7 +591,11 @@ gather_local_edges( auto location = location_in_segment + vertex_count_offsets[partition_id]; auto g_dst_index = edge_index_first[index]; if (g_dst_index >= 0) { - minors[index] = adjacency_list[g_dst_index]; + minors[index] = adjacency_list[g_dst_index]; + if (weights != nullptr) { + weight_t const* edge_weights = *(partitions[partition_id].weights()) + sparse_offset; + weights[index] = edge_weights[g_dst_index]; + } edge_index_first[index] = g_dst_index; } else { minors[index] = invalid_vertex_id; @@ -758,7 +768,7 @@ void local_major_degree( template std::tuple, rmm::device_uvector, - thrust::optional>> + std::optional>> gather_one_hop_edgelist( raft::handle_t const& handle, GraphViewType const& graph_view, @@ -771,7 +781,7 @@ gather_one_hop_edgelist( rmm::device_uvector majors(0, handle.get_stream()); rmm::device_uvector minors(0, handle.get_stream()); - auto weights = thrust::make_optional>(0, handle.get_stream()); + auto weights = std::make_optional>(0, handle.get_stream()); if constexpr (GraphViewType::is_multi_gpu == true) { std::vector> active_majors_segments; diff --git a/cpp/src/sampling/detail/sampling_utils_mg.cu b/cpp/src/sampling/detail/sampling_utils_mg.cu index f21798322ed..ffcead02cf9 100644 --- a/cpp/src/sampling/detail/sampling_utils_mg.cu +++ b/cpp/src/sampling/detail/sampling_utils_mg.cu @@ -180,7 +180,7 @@ partition_information(raft::handle_t const& handle, template std::tuple, rmm::device_uvector, - thrust::optional>> + std::optional>> gather_local_edges(raft::handle_t const& handle, graph_view_t const& graph_view, const rmm::device_uvector& active_majors, @@ -191,7 +191,7 @@ gather_local_edges(raft::handle_t const& handle, template std::tuple, rmm::device_uvector, - thrust::optional>> + std::optional>> gather_local_edges(raft::handle_t const& handle, graph_view_t const& graph_view, const rmm::device_uvector& active_majors, @@ -202,7 +202,7 @@ gather_local_edges(raft::handle_t const& handle, template std::tuple, rmm::device_uvector, - thrust::optional>> + std::optional>> gather_local_edges(raft::handle_t const& handle, graph_view_t const& graph_view, const rmm::device_uvector& active_majors, @@ -213,7 +213,7 @@ gather_local_edges(raft::handle_t const& handle, template std::tuple, rmm::device_uvector, - thrust::optional>> + std::optional>> gather_local_edges(raft::handle_t const& handle, graph_view_t const& graph_view, const rmm::device_uvector& active_majors, @@ -224,7 +224,7 @@ gather_local_edges(raft::handle_t const& handle, template std::tuple, rmm::device_uvector, - thrust::optional>> + std::optional>> gather_local_edges(raft::handle_t const& handle, graph_view_t const& graph_view, const rmm::device_uvector& active_majors, @@ -235,7 +235,7 @@ gather_local_edges(raft::handle_t const& handle, template std::tuple, rmm::device_uvector, - thrust::optional>> + std::optional>> gather_local_edges(raft::handle_t const& handle, graph_view_t const& graph_view, const rmm::device_uvector& active_majors, @@ -246,42 +246,42 @@ gather_local_edges(raft::handle_t const& handle, template std::tuple, rmm::device_uvector, - thrust::optional>> + std::optional>> gather_one_hop_edgelist(raft::handle_t const& handle, graph_view_t const& graph_view, rmm::device_uvector const& active_majors); template std::tuple, rmm::device_uvector, - thrust::optional>> + std::optional>> gather_one_hop_edgelist(raft::handle_t const& handle, graph_view_t const& graph_view, rmm::device_uvector const& active_majors); template std::tuple, rmm::device_uvector, - thrust::optional>> + std::optional>> gather_one_hop_edgelist(raft::handle_t const& handle, graph_view_t const& graph_view, rmm::device_uvector const& active_majors); template std::tuple, rmm::device_uvector, - thrust::optional>> + std::optional>> gather_one_hop_edgelist(raft::handle_t const& handle, graph_view_t const& graph_view, rmm::device_uvector const& active_majors); template std::tuple, rmm::device_uvector, - thrust::optional>> + std::optional>> gather_one_hop_edgelist(raft::handle_t const& handle, graph_view_t const& graph_view, rmm::device_uvector const& active_majors); template std::tuple, rmm::device_uvector, - thrust::optional>> + std::optional>> gather_one_hop_edgelist(raft::handle_t const& handle, graph_view_t const& graph_view, rmm::device_uvector const& active_majors); diff --git a/cpp/src/sampling/detail/sampling_utils_sg.cu b/cpp/src/sampling/detail/sampling_utils_sg.cu index d05c861effd..64778511391 100644 --- a/cpp/src/sampling/detail/sampling_utils_sg.cu +++ b/cpp/src/sampling/detail/sampling_utils_sg.cu @@ -123,7 +123,7 @@ template rmm::device_uvector get_active_major_global_degrees( template std::tuple, rmm::device_uvector, - thrust::optional>> + std::optional>> gather_local_edges(raft::handle_t const& handle, graph_view_t const& graph_view, const rmm::device_uvector& active_majors, @@ -134,7 +134,7 @@ gather_local_edges(raft::handle_t const& handle, template std::tuple, rmm::device_uvector, - thrust::optional>> + std::optional>> gather_local_edges(raft::handle_t const& handle, graph_view_t const& graph_view, const rmm::device_uvector& active_majors, @@ -145,7 +145,7 @@ gather_local_edges(raft::handle_t const& handle, template std::tuple, rmm::device_uvector, - thrust::optional>> + std::optional>> gather_local_edges(raft::handle_t const& handle, graph_view_t const& graph_view, const rmm::device_uvector& active_majors, @@ -156,7 +156,7 @@ gather_local_edges(raft::handle_t const& handle, template std::tuple, rmm::device_uvector, - thrust::optional>> + std::optional>> gather_local_edges(raft::handle_t const& handle, graph_view_t const& graph_view, const rmm::device_uvector& active_majors, @@ -167,7 +167,7 @@ gather_local_edges(raft::handle_t const& handle, template std::tuple, rmm::device_uvector, - thrust::optional>> + std::optional>> gather_local_edges(raft::handle_t const& handle, graph_view_t const& graph_view, const rmm::device_uvector& active_majors, @@ -178,7 +178,7 @@ gather_local_edges(raft::handle_t const& handle, template std::tuple, rmm::device_uvector, - thrust::optional>> + std::optional>> gather_local_edges(raft::handle_t const& handle, graph_view_t const& graph_view, const rmm::device_uvector& active_majors, @@ -189,42 +189,42 @@ gather_local_edges(raft::handle_t const& handle, template std::tuple, rmm::device_uvector, - thrust::optional>> + std::optional>> gather_one_hop_edgelist(raft::handle_t const& handle, graph_view_t const& graph_view, rmm::device_uvector const& active_majors); template std::tuple, rmm::device_uvector, - thrust::optional>> + std::optional>> gather_one_hop_edgelist(raft::handle_t const& handle, graph_view_t const& graph_view, rmm::device_uvector const& active_majors); template std::tuple, rmm::device_uvector, - thrust::optional>> + std::optional>> gather_one_hop_edgelist(raft::handle_t const& handle, graph_view_t const& graph_view, rmm::device_uvector const& active_majors); template std::tuple, rmm::device_uvector, - thrust::optional>> + std::optional>> gather_one_hop_edgelist(raft::handle_t const& handle, graph_view_t const& graph_view, rmm::device_uvector const& active_majors); template std::tuple, rmm::device_uvector, - thrust::optional>> + std::optional>> gather_one_hop_edgelist(raft::handle_t const& handle, graph_view_t const& graph_view, rmm::device_uvector const& active_majors); template std::tuple, rmm::device_uvector, - thrust::optional>> + std::optional>> gather_one_hop_edgelist(raft::handle_t const& handle, graph_view_t const& graph_view, rmm::device_uvector const& active_majors); diff --git a/cpp/src/sampling/uniform_neighbor_sampling_impl.hpp b/cpp/src/sampling/uniform_neighbor_sampling_impl.hpp index 0050ba5ce28..310be6ec24d 100644 --- a/cpp/src/sampling/uniform_neighbor_sampling_impl.hpp +++ b/cpp/src/sampling/uniform_neighbor_sampling_impl.hpp @@ -86,8 +86,7 @@ uniform_nbr_sample_impl( rmm::device_uvector d_out_src(0, handle.get_stream()); rmm::device_uvector d_out_dst(0, handle.get_stream()); - auto d_out_indices = - thrust::make_optional(rmm::device_uvector(0, handle.get_stream())); + auto d_out_indices = std::make_optional(rmm::device_uvector(0, handle.get_stream())); if (k_level != 0) { // extract out-degs(sources): diff --git a/cpp/tests/c_api/mg_test_utils.cpp b/cpp/tests/c_api/mg_test_utils.cpp index 41bdcefaf3a..358b80afe44 100644 --- a/cpp/tests/c_api/mg_test_utils.cpp +++ b/cpp/tests/c_api/mg_test_utils.cpp @@ -191,3 +191,102 @@ extern "C" int create_mg_test_graph(const cugraph_resource_handle_t* handle, return test_ret_value; } +extern "C" int create_mg_test_graph_with_ids(const cugraph_resource_handle_t* handle, + int32_t* h_src, + int32_t* h_dst, + int32_t* h_idx, + size_t num_edges, + bool_t store_transposed, + bool_t is_symmetric, + cugraph_graph_t** p_graph, + cugraph_error_t** ret_error) +{ + int test_ret_value = 0; + cugraph_error_code_t ret_code; + cugraph_graph_properties_t properties; + + properties.is_symmetric = is_symmetric; + properties.is_multigraph = FALSE; + + data_type_id_t vertex_tid = INT32; + data_type_id_t edge_tid = INT32; + data_type_id_t weight_tid = FLOAT32; + + cugraph_type_erased_device_array_t* src; + cugraph_type_erased_device_array_t* dst; + cugraph_type_erased_device_array_t* idx; + cugraph_type_erased_device_array_view_t* src_view; + cugraph_type_erased_device_array_view_t* dst_view; + cugraph_type_erased_device_array_view_t* idx_view; + cugraph_type_erased_device_array_view_t* wgt_view; + + int rank = 0; + + rank = cugraph_resource_handle_get_rank(handle); + + if (rank == 0) { + ret_code = + cugraph_type_erased_device_array_create(handle, num_edges, vertex_tid, &src, ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "src create failed."); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(*ret_error)); + + ret_code = + cugraph_type_erased_device_array_create(handle, num_edges, vertex_tid, &dst, ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "dst create failed."); + + ret_code = + cugraph_type_erased_device_array_create(handle, num_edges, weight_tid, &idx, ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "idx create failed."); + } else { + ret_code = cugraph_type_erased_device_array_create(handle, 0, vertex_tid, &src, ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "src create failed."); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(*ret_error)); + + ret_code = cugraph_type_erased_device_array_create(handle, 0, vertex_tid, &dst, ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "dst create failed."); + + ret_code = cugraph_type_erased_device_array_create(handle, 0, weight_tid, &idx, ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "wgt create failed."); + } + + src_view = cugraph_type_erased_device_array_view(src); + dst_view = cugraph_type_erased_device_array_view(dst); + idx_view = cugraph_type_erased_device_array_view(idx); + + ret_code = cugraph_type_erased_device_array_view_copy_from_host( + handle, src_view, (byte_t*)h_src, ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "src copy_from_host failed."); + + ret_code = cugraph_type_erased_device_array_view_copy_from_host( + handle, dst_view, (byte_t*)h_dst, ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "dst copy_from_host failed."); + + ret_code = cugraph_type_erased_device_array_view_copy_from_host( + handle, idx_view, (byte_t*)h_idx, ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "wgt copy_from_host failed."); + + ret_code = cugraph_type_erased_device_array_view_as_type(idx, weight_tid, &wgt_view, ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "wgt cast from idx failed."); + + ret_code = cugraph_mg_graph_create(handle, + &properties, + src_view, + dst_view, + wgt_view, + store_transposed, + num_edges, + FALSE, + p_graph, + ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "graph creation failed."); + + cugraph_type_erased_device_array_view_free(wgt_view); + cugraph_type_erased_device_array_view_free(idx_view); + cugraph_type_erased_device_array_view_free(dst_view); + cugraph_type_erased_device_array_view_free(src_view); + cugraph_type_erased_device_array_free(idx); + cugraph_type_erased_device_array_free(dst); + cugraph_type_erased_device_array_free(src); + + return test_ret_value; +} diff --git a/cpp/tests/c_api/mg_test_utils.h b/cpp/tests/c_api/mg_test_utils.h index 827cfa5c885..a8fcd43cf16 100644 --- a/cpp/tests/c_api/mg_test_utils.h +++ b/cpp/tests/c_api/mg_test_utils.h @@ -66,3 +66,13 @@ int create_mg_test_graph(const cugraph_resource_handle_t* p_handle, bool_t is_symmetric, cugraph_graph_t** p_graph, cugraph_error_t** ret_error); + +int create_mg_test_graph_with_ids(const cugraph_resource_handle_t* p_handle, + int32_t* h_src, + int32_t* h_dst, + int32_t* h_idx, + size_t num_edges, + bool_t store_transposed, + bool_t is_symmetric, + cugraph_graph_t** p_graph, + cugraph_error_t** ret_error); diff --git a/cpp/tests/c_api/mg_uniform_neighbor_sample_test.c b/cpp/tests/c_api/mg_uniform_neighbor_sample_test.c index c92b968153d..edc28e22124 100644 --- a/cpp/tests/c_api/mg_uniform_neighbor_sample_test.c +++ b/cpp/tests/c_api/mg_uniform_neighbor_sample_test.c @@ -28,7 +28,7 @@ typedef float weight_t; int generic_experimental_uniform_neighbor_sample_test(const cugraph_resource_handle_t* handle, vertex_t* h_src, vertex_t* h_dst, - weight_t* h_wgt, + edge_t* h_idx, size_t num_vertices, size_t num_edges, vertex_t* h_start, @@ -50,8 +50,8 @@ int generic_experimental_uniform_neighbor_sample_test(const cugraph_resource_han cugraph_type_erased_device_array_view_t* d_start_view = NULL; cugraph_type_erased_host_array_view_t* h_fan_out_view = NULL; - ret_code = create_mg_test_graph( - handle, h_src, h_dst, h_wgt, num_edges, store_transposed, FALSE, &graph, &ret_error); + ret_code = create_mg_test_graph_with_ids( + handle, h_src, h_dst, h_idx, num_edges, store_transposed, FALSE, &graph, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "graph creation failed."); ret_code = @@ -69,8 +69,6 @@ int generic_experimental_uniform_neighbor_sample_test(const cugraph_resource_han ret_code = cugraph_experimental_uniform_neighbor_sample( handle, graph, d_start_view, h_fan_out_view, with_replacement, FALSE, &result, &ret_error); -#if 0 - // FIXME: cugraph_experimental_uniform_neighbor_sample is not implemented yet TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "uniform_neighbor_sample failed."); @@ -78,9 +76,9 @@ int generic_experimental_uniform_neighbor_sample_test(const cugraph_resource_han cugraph_type_erased_device_array_view_t* dsts; cugraph_type_erased_device_array_view_t* index; - srcs = cugraph_sample_result_get_sources(result); - dsts = cugraph_sample_result_get_destinations(result); - index = cugraph_sample_result_get_index(result); + srcs = cugraph_sample_result_get_sources(result); + dsts = cugraph_sample_result_get_destinations(result); + index = cugraph_sample_result_get_index(result); size_t result_size = cugraph_type_erased_device_array_view_size(srcs); @@ -105,31 +103,20 @@ int generic_experimental_uniform_neighbor_sample_test(const cugraph_resource_han // NOTE: The C++ tester does a more thorough validation. For our purposes // here we will do a simpler validation, merely checking that all edges // are actually part of the graph - weight_t M[num_vertices][num_vertices]; + edge_t M[num_vertices][num_vertices]; for (int i = 0; i < num_vertices; ++i) for (int j = 0; j < num_vertices; ++j) - M[i][j] = 0.0; + M[i][j] = -1; for (int i = 0; i < num_edges; ++i) - M[h_src[i]][h_dst[i]] = h_wgt[i]; + M[h_src[i]][h_dst[i]] = h_idx[i]; for (int i = 0; (i < result_size) && (test_ret_value == 0); ++i) { TEST_ASSERT(test_ret_value, - M[h_srcs[i]][h_dsts[i]] > 0.0, + M[h_srcs[i]][h_dsts[i]] >= 0, "uniform_neighbor_sample got edge that doesn't exist"); - - bool_t found = FALSE; - for (int j = 0; j < num_starts; ++j) - found = found || (h_labels[i] == h_start_label[j]); - - TEST_ASSERT(test_ret_value, found, "invalid label"); } -#else - TEST_ASSERT(test_ret_value, - ret_code != CUGRAPH_SUCCESS, - "cugraph_experimental_uniform_neighbor_sample expected to fail in SG test"); -#endif cugraph_type_erased_host_array_view_free(h_fan_out_view); @@ -308,14 +295,14 @@ int test_experimental_uniform_neighbor_sample(const cugraph_resource_handle_t* h vertex_t src[] = {0, 1, 1, 2, 2, 2, 3, 4}; vertex_t dst[] = {1, 3, 4, 0, 1, 3, 5, 5}; - weight_t wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + edge_t idx[] = {0, 1, 2, 3, 4, 5, 6, 7}; vertex_t start[] = {2, 2}; int fan_out[] = {1, 2}; return generic_experimental_uniform_neighbor_sample_test(handle, src, dst, - wgt, + idx, num_vertices, num_edges, start, diff --git a/cpp/tests/c_api/uniform_neighbor_sample_test.c b/cpp/tests/c_api/uniform_neighbor_sample_test.c index 6c0b3e14640..8928dc890ed 100644 --- a/cpp/tests/c_api/uniform_neighbor_sample_test.c +++ b/cpp/tests/c_api/uniform_neighbor_sample_test.c @@ -25,9 +25,94 @@ typedef int32_t vertex_t; typedef int32_t edge_t; typedef float weight_t; +int create_test_graph_with_ids(const cugraph_resource_handle_t* p_handle, + vertex_t* h_src, + vertex_t* h_dst, + edge_t* h_ids, + size_t num_edges, + bool_t store_transposed, + bool_t renumber, + bool_t is_symmetric, + cugraph_graph_t** p_graph, + cugraph_error_t** ret_error) +{ + int test_ret_value = 0; + cugraph_error_code_t ret_code; + cugraph_graph_properties_t properties; + + properties.is_symmetric = is_symmetric; + properties.is_multigraph = FALSE; + + data_type_id_t vertex_tid = INT32; + data_type_id_t edge_tid = INT32; + data_type_id_t weight_tid = FLOAT32; + + cugraph_type_erased_device_array_t* src; + cugraph_type_erased_device_array_t* dst; + cugraph_type_erased_device_array_t* ids; + cugraph_type_erased_device_array_view_t* src_view; + cugraph_type_erased_device_array_view_t* dst_view; + cugraph_type_erased_device_array_view_t* ids_view; + cugraph_type_erased_device_array_view_t* wgt_view; + + ret_code = + cugraph_type_erased_device_array_create(p_handle, num_edges, vertex_tid, &src, ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "src create failed."); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(*ret_error)); + + ret_code = + cugraph_type_erased_device_array_create(p_handle, num_edges, vertex_tid, &dst, ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "dst create failed."); + + ret_code = + cugraph_type_erased_device_array_create(p_handle, num_edges, edge_tid, &ids, ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "ids create failed."); + + src_view = cugraph_type_erased_device_array_view(src); + dst_view = cugraph_type_erased_device_array_view(dst); + ids_view = cugraph_type_erased_device_array_view(ids); + + ret_code = cugraph_type_erased_device_array_view_copy_from_host( + p_handle, src_view, (byte_t*)h_src, ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "src copy_from_host failed."); + + ret_code = cugraph_type_erased_device_array_view_copy_from_host( + p_handle, dst_view, (byte_t*)h_dst, ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "dst copy_from_host failed."); + + ret_code = cugraph_type_erased_device_array_view_copy_from_host( + p_handle, ids_view, (byte_t*)h_ids, ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "wgt copy_from_host failed."); + + ret_code = cugraph_type_erased_device_array_view_as_type(ids, weight_tid, &wgt_view, ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "wgt cast from ids failed."); + + ret_code = cugraph_sg_graph_create(p_handle, + &properties, + src_view, + dst_view, + wgt_view, + store_transposed, + renumber, + FALSE, + p_graph, + ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "graph creation failed."); + + cugraph_type_erased_device_array_view_free(wgt_view); + cugraph_type_erased_device_array_view_free(ids_view); + cugraph_type_erased_device_array_view_free(dst_view); + cugraph_type_erased_device_array_view_free(src_view); + cugraph_type_erased_device_array_free(ids); + cugraph_type_erased_device_array_free(dst); + cugraph_type_erased_device_array_free(src); + + return test_ret_value; +} + int generic_experimental_uniform_neighbor_sample_test(vertex_t* h_src, vertex_t* h_dst, - weight_t* h_wgt, + edge_t* h_ids, size_t num_vertices, size_t num_edges, vertex_t* h_start, @@ -54,8 +139,8 @@ int generic_experimental_uniform_neighbor_sample_test(vertex_t* h_src, handle = cugraph_create_resource_handle(NULL); TEST_ASSERT(test_ret_value, handle != NULL, "resource handle creation failed."); - ret_code = create_test_graph( - handle, h_src, h_dst, h_wgt, num_edges, store_transposed, renumber, FALSE, &graph, &ret_error); + ret_code = create_test_graph_with_ids( + handle, h_src, h_dst, h_ids, num_edges, store_transposed, renumber, FALSE, &graph, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "graph creation failed."); ret_code = @@ -80,9 +165,9 @@ int generic_experimental_uniform_neighbor_sample_test(vertex_t* h_src, cugraph_type_erased_device_array_view_t* dsts; cugraph_type_erased_device_array_view_t* index; - srcs = cugraph_sample_result_get_sources(result); - dsts = cugraph_sample_result_get_destinations(result); - index = cugraph_sample_result_get_index(result); + srcs = cugraph_sample_result_get_sources(result); + dsts = cugraph_sample_result_get_destinations(result); + index = cugraph_sample_result_get_index(result); size_t result_size = cugraph_type_erased_device_array_view_size(srcs); @@ -105,18 +190,18 @@ int generic_experimental_uniform_neighbor_sample_test(vertex_t* h_src, // NOTE: The C++ tester does a more thorough validation. For our purposes // here we will do a simpler validation, merely checking that all edges // are actually part of the graph - weight_t M[num_vertices][num_vertices]; + edge_t M[num_vertices][num_vertices]; for (int i = 0; i < num_vertices; ++i) for (int j = 0; j < num_vertices; ++j) - M[i][j] = 0.0; + M[i][j] = -1; for (int i = 0; i < num_edges; ++i) - M[h_src[i]][h_dst[i]] = h_wgt[i]; + M[h_src[i]][h_dst[i]] = h_ids[i]; for (int i = 0; (i < result_size) && (test_ret_value == 0); ++i) { TEST_ASSERT(test_ret_value, - M[h_srcs[i]][h_dsts[i]] > 0.0, + M[h_srcs[i]][h_dsts[i]] > 0, "uniform_neighbor_sample got edge that doesn't exist"); } @@ -306,15 +391,15 @@ int test_experimental_uniform_neighbor_sample() size_t fan_out_size = 2; size_t num_starts = 2; - vertex_t src[] = {0, 1, 1, 2, 2, 2, 3, 4}; - vertex_t dst[] = {1, 3, 4, 0, 1, 3, 5, 5}; - weight_t wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t start[] = {2, 2}; - int fan_out[] = {1, 2}; + vertex_t src[] = {0, 1, 1, 2, 2, 2, 3, 4}; + vertex_t dst[] = {1, 3, 4, 0, 1, 3, 5, 5}; + edge_t edge_ids[] = {0, 1, 2, 3, 4, 5, 6, 7}; + vertex_t start[] = {2, 2}; + int fan_out[] = {1, 2}; return generic_experimental_uniform_neighbor_sample_test(src, dst, - wgt, + edge_ids, num_vertices, num_edges, start, From 16cea303f0eadc3df532dcb57d5235ff0c8bca1f Mon Sep 17 00:00:00 2001 From: Chuck Hastings Date: Thu, 19 May 2022 13:56:35 -0400 Subject: [PATCH 05/21] rename create*_with_ids to create*_with_edge_ids --- cpp/tests/c_api/mg_test_utils.cpp | 18 +++++++-------- cpp/tests/c_api/mg_test_utils.h | 18 +++++++-------- .../c_api/mg_uniform_neighbor_sample_test.c | 2 +- .../c_api/uniform_neighbor_sample_test.c | 22 +++++++++---------- 4 files changed, 30 insertions(+), 30 deletions(-) diff --git a/cpp/tests/c_api/mg_test_utils.cpp b/cpp/tests/c_api/mg_test_utils.cpp index 358b80afe44..53ce87abe23 100644 --- a/cpp/tests/c_api/mg_test_utils.cpp +++ b/cpp/tests/c_api/mg_test_utils.cpp @@ -191,15 +191,15 @@ extern "C" int create_mg_test_graph(const cugraph_resource_handle_t* handle, return test_ret_value; } -extern "C" int create_mg_test_graph_with_ids(const cugraph_resource_handle_t* handle, - int32_t* h_src, - int32_t* h_dst, - int32_t* h_idx, - size_t num_edges, - bool_t store_transposed, - bool_t is_symmetric, - cugraph_graph_t** p_graph, - cugraph_error_t** ret_error) +extern "C" int create_mg_test_graph_with_edge_ids(const cugraph_resource_handle_t* handle, + int32_t* h_src, + int32_t* h_dst, + int32_t* h_idx, + size_t num_edges, + bool_t store_transposed, + bool_t is_symmetric, + cugraph_graph_t** p_graph, + cugraph_error_t** ret_error) { int test_ret_value = 0; cugraph_error_code_t ret_code; diff --git a/cpp/tests/c_api/mg_test_utils.h b/cpp/tests/c_api/mg_test_utils.h index a8fcd43cf16..2aecbc13cc6 100644 --- a/cpp/tests/c_api/mg_test_utils.h +++ b/cpp/tests/c_api/mg_test_utils.h @@ -67,12 +67,12 @@ int create_mg_test_graph(const cugraph_resource_handle_t* p_handle, cugraph_graph_t** p_graph, cugraph_error_t** ret_error); -int create_mg_test_graph_with_ids(const cugraph_resource_handle_t* p_handle, - int32_t* h_src, - int32_t* h_dst, - int32_t* h_idx, - size_t num_edges, - bool_t store_transposed, - bool_t is_symmetric, - cugraph_graph_t** p_graph, - cugraph_error_t** ret_error); +int create_mg_test_graph_with_edge_ids(const cugraph_resource_handle_t* p_handle, + int32_t* h_src, + int32_t* h_dst, + int32_t* h_idx, + size_t num_edges, + bool_t store_transposed, + bool_t is_symmetric, + cugraph_graph_t** p_graph, + cugraph_error_t** ret_error); diff --git a/cpp/tests/c_api/mg_uniform_neighbor_sample_test.c b/cpp/tests/c_api/mg_uniform_neighbor_sample_test.c index edc28e22124..8fbd80a90c0 100644 --- a/cpp/tests/c_api/mg_uniform_neighbor_sample_test.c +++ b/cpp/tests/c_api/mg_uniform_neighbor_sample_test.c @@ -50,7 +50,7 @@ int generic_experimental_uniform_neighbor_sample_test(const cugraph_resource_han cugraph_type_erased_device_array_view_t* d_start_view = NULL; cugraph_type_erased_host_array_view_t* h_fan_out_view = NULL; - ret_code = create_mg_test_graph_with_ids( + ret_code = create_mg_test_graph_with_edge_ids( handle, h_src, h_dst, h_idx, num_edges, store_transposed, FALSE, &graph, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "graph creation failed."); diff --git a/cpp/tests/c_api/uniform_neighbor_sample_test.c b/cpp/tests/c_api/uniform_neighbor_sample_test.c index 8928dc890ed..428ccbec7a9 100644 --- a/cpp/tests/c_api/uniform_neighbor_sample_test.c +++ b/cpp/tests/c_api/uniform_neighbor_sample_test.c @@ -25,16 +25,16 @@ typedef int32_t vertex_t; typedef int32_t edge_t; typedef float weight_t; -int create_test_graph_with_ids(const cugraph_resource_handle_t* p_handle, - vertex_t* h_src, - vertex_t* h_dst, - edge_t* h_ids, - size_t num_edges, - bool_t store_transposed, - bool_t renumber, - bool_t is_symmetric, - cugraph_graph_t** p_graph, - cugraph_error_t** ret_error) +int create_test_graph_with_edge_ids(const cugraph_resource_handle_t* p_handle, + vertex_t* h_src, + vertex_t* h_dst, + edge_t* h_ids, + size_t num_edges, + bool_t store_transposed, + bool_t renumber, + bool_t is_symmetric, + cugraph_graph_t** p_graph, + cugraph_error_t** ret_error) { int test_ret_value = 0; cugraph_error_code_t ret_code; @@ -139,7 +139,7 @@ int generic_experimental_uniform_neighbor_sample_test(vertex_t* h_src, handle = cugraph_create_resource_handle(NULL); TEST_ASSERT(test_ret_value, handle != NULL, "resource handle creation failed."); - ret_code = create_test_graph_with_ids( + ret_code = create_test_graph_with_edge_ids( handle, h_src, h_dst, h_ids, num_edges, store_transposed, renumber, FALSE, &graph, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "graph creation failed."); From 3f90963890cd184fddc12797831f962f1150419a Mon Sep 17 00:00:00 2001 From: Joseph Nke Date: Thu, 19 May 2022 19:26:09 -0700 Subject: [PATCH 06/21] update python bindings to create graph with edge index as weight --- python/pylibcugraph/pylibcugraph/graphs.pyx | 44 ++++++++++---- .../uniform_neighborhood_sampling.pyx | 13 ++++- python/pylibcugraph/pylibcugraph/utils.pxd | 8 +++ python/pylibcugraph/pylibcugraph/utils.pyx | 58 +++++++++++++++++++ 4 files changed, 110 insertions(+), 13 deletions(-) diff --git a/python/pylibcugraph/pylibcugraph/graphs.pyx b/python/pylibcugraph/pylibcugraph/graphs.pyx index 6413e0dab94..388317052be 100644 --- a/python/pylibcugraph/pylibcugraph/graphs.pyx +++ b/python/pylibcugraph/pylibcugraph/graphs.pyx @@ -15,6 +15,7 @@ # cython: language_level = 3 from libc.stdint cimport uintptr_t +import numpy from pylibcugraph._cugraph_c.resource_handle cimport ( bool_t, @@ -54,6 +55,7 @@ from pylibcugraph.utils cimport ( assert_success, assert_CAI_type, get_c_type_from_numpy_type, + get_c_weight_type_from_numpy_edge_ids_type, ) @@ -164,11 +166,22 @@ cdef class SGGraph(_GPUGraph): cdef uintptr_t cai_weights_ptr = \ weight_array.__cuda_array_interface__["data"][0] - cdef cugraph_type_erased_device_array_view_t* weights_view_ptr = \ - cugraph_type_erased_device_array_view_create( - cai_weights_ptr, - len(weight_array), - get_c_type_from_numpy_type(weight_array.dtype)) + + cdef cugraph_type_erased_device_array_view_t* weights_view_ptr + + if weight_array.dtype in [numpy.int32, numpy.int64]: + weights_view_ptr = \ + cugraph_type_erased_device_array_view_create( + cai_weights_ptr, + len(weight_array), + get_c_weight_type_from_numpy_edge_ids_type( + weight_array.dtype)) + else: + weights_view_ptr = \ + cugraph_type_erased_device_array_view_create( + cai_weights_ptr, + len(weight_array), + get_c_type_from_numpy_type(weight_array.dtype)) error_code = cugraph_sg_graph_create( resource_handle.c_resource_handle_ptr, @@ -284,11 +297,22 @@ cdef class MGGraph(_GPUGraph): cdef uintptr_t cai_weights_ptr = \ weight_array.__cuda_array_interface__["data"][0] - cdef cugraph_type_erased_device_array_view_t* weights_view_ptr = \ - cugraph_type_erased_device_array_view_create( - cai_weights_ptr, - len(weight_array), - get_c_type_from_numpy_type(weight_array.dtype)) + cdef cugraph_type_erased_device_array_view_t* weights_view_ptr + print("the weight array type is ", weight_array.dtype) + + if weight_array.dtype in [numpy.int32, numpy.int64]: + weights_view_ptr = \ + cugraph_type_erased_device_array_view_create( + cai_weights_ptr, + len(weight_array), + get_c_weight_type_from_numpy_edge_ids_type( + weight_array.dtype)) + else: + weights_view_ptr = \ + cugraph_type_erased_device_array_view_create( + cai_weights_ptr, + len(weight_array), + get_c_type_from_numpy_type(weight_array.dtype)) error_code = cugraph_mg_graph_create( resource_handle.c_resource_handle_ptr, diff --git a/python/pylibcugraph/pylibcugraph/uniform_neighborhood_sampling.pyx b/python/pylibcugraph/pylibcugraph/uniform_neighborhood_sampling.pyx index be14728d422..bbed79bfba3 100644 --- a/python/pylibcugraph/pylibcugraph/uniform_neighborhood_sampling.pyx +++ b/python/pylibcugraph/pylibcugraph/uniform_neighborhood_sampling.pyx @@ -16,6 +16,9 @@ from libc.stdint cimport uintptr_t +# FIXME: Added this +import numpy + from pylibcugraph._cugraph_c.resource_handle cimport ( bool_t, data_type_id_t, @@ -51,9 +54,11 @@ from pylibcugraph.graphs cimport ( _GPUGraph, MGGraph, ) +# FIXME: added copy_to_cupy_array_ from pylibcugraph.utils cimport ( assert_success, copy_to_cupy_array, + copy_to_cupy_array_ids, assert_CAI_type, assert_AI_type, get_c_type_from_numpy_type, @@ -147,11 +152,13 @@ def EXPERIMENTAL__uniform_neighborhood_sampling(ResourceHandle resource_handle, cugraph_sample_result_get_destinations(result_ptr) cdef cugraph_type_erased_device_array_view_t* index_ptr = \ cugraph_sample_result_get_index(result_ptr) - - + + cupy_sources = copy_to_cupy_array(c_resource_handle_ptr, src_ptr) cupy_destinations = copy_to_cupy_array(c_resource_handle_ptr, dst_ptr) - cupy_indices = copy_to_cupy_array(c_resource_handle_ptr, index_ptr) + cupy_indices = copy_to_cupy_array_ids(c_resource_handle_ptr, index_ptr) + #print("indices are \n", cupy_indices) + #print("type is ", cupy_indices.dtype) cugraph_sample_result_free(result_ptr) cugraph_type_erased_device_array_view_free(start_ptr) diff --git a/python/pylibcugraph/pylibcugraph/utils.pxd b/python/pylibcugraph/pylibcugraph/utils.pxd index 3f508b85fbb..21c6e66b8dc 100644 --- a/python/pylibcugraph/pylibcugraph/utils.pxd +++ b/python/pylibcugraph/pylibcugraph/utils.pxd @@ -39,6 +39,14 @@ cdef get_numpy_type_from_c_type(data_type_id_t c_type) cdef get_c_type_from_numpy_type(numpy_type) +cdef get_c_weight_type_from_numpy_edge_ids_type(numpy_type) + +cdef get_numpy_edge_ids_type_from_c_weight_type(data_type_id_t c_type) + cdef copy_to_cupy_array( cugraph_resource_handle_t* c_resource_handle_ptr, cugraph_type_erased_device_array_view_t* device_array_view_ptr) + +cdef copy_to_cupy_array_ids( + cugraph_resource_handle_t* c_resource_handle_ptr, + cugraph_type_erased_device_array_view_t* device_array_view_ptr) \ No newline at end of file diff --git a/python/pylibcugraph/pylibcugraph/utils.pyx b/python/pylibcugraph/pylibcugraph/utils.pyx index 54b39dc6843..121c8109660 100644 --- a/python/pylibcugraph/pylibcugraph/utils.pyx +++ b/python/pylibcugraph/pylibcugraph/utils.pyx @@ -102,6 +102,18 @@ cdef get_c_type_from_numpy_type(numpy_type): raise RuntimeError("Internal error: got invalid data type enum value " f"from Numpy: {numpy_type}") +cdef get_c_weight_type_from_numpy_edge_ids_type(numpy_type): + if numpy_type == numpy.int32: + return data_type_id_t.FLOAT32 + else: + return data_type_id_t.FLOAT64 + +cdef get_numpy_edge_ids_type_from_c_weight_type(data_type_id_t c_weight_type): + if c_weight_type == data_type_id_t.FLOAT32: + return numpy.int32 + else: + return numpy.int64 + cdef copy_to_cupy_array( cugraph_resource_handle_t* c_resource_handle_ptr, @@ -138,3 +150,49 @@ cdef copy_to_cupy_array( cugraph_type_erased_device_array_view_free(device_array_view_ptr) return cupy_array + +cdef copy_to_cupy_array_ids( + cugraph_resource_handle_t* c_resource_handle_ptr, + cugraph_type_erased_device_array_view_t* device_array_view_ptr): + """ + Copy the contents from a device array view as returned by various cugraph_* + APIs to a new cupy device array, typically intended to be used as a return + value from pylibcugraph APIs. + """ + + + cdef c_type = cugraph_type_erased_device_array_view_type( + device_array_view_ptr) + + #cdef c_type = data_type_id_t.INT32 + array_size = cugraph_type_erased_device_array_view_size( + device_array_view_ptr) + + cupy_array = cupy.zeros( + array_size, dtype=get_numpy_edge_ids_type_from_c_weight_type(c_type)) + + """ + cupy_array = cupy.zeros( + array_size, dtype=numpy.int32) + """ + + cdef uintptr_t cupy_array_ptr = \ + cupy_array.__cuda_array_interface__["data"][0] + + cdef cugraph_type_erased_device_array_view_t* cupy_array_view_ptr = \ + cugraph_type_erased_device_array_view_create( + cupy_array_ptr, array_size, get_c_type_from_numpy_type(cupy_array.dtype)) + + cdef cugraph_error_t* error_ptr + error_code = cugraph_type_erased_device_array_view_copy( + c_resource_handle_ptr, + cupy_array_view_ptr, + device_array_view_ptr, + &error_ptr) + assert_success(error_code, error_ptr, + "cugraph_type_erased_device_array_view_copy") + + cugraph_type_erased_device_array_view_free(device_array_view_ptr) + + return cupy_array + From d46019336e717d2aba0bbb468ba4d69c386ce84e Mon Sep 17 00:00:00 2001 From: Chuck Hastings Date: Fri, 20 May 2022 16:03:51 -0400 Subject: [PATCH 07/21] fix bug in MG case... cugraph_ops function doesn't handle an empty request properly --- .../sampling/uniform_neighbor_sampling_impl.hpp | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/cpp/src/sampling/uniform_neighbor_sampling_impl.hpp b/cpp/src/sampling/uniform_neighbor_sampling_impl.hpp index 310be6ec24d..0847abf9556 100644 --- a/cpp/src/sampling/uniform_neighbor_sampling_impl.hpp +++ b/cpp/src/sampling/uniform_neighbor_sampling_impl.hpp @@ -103,13 +103,16 @@ uniform_nbr_sample_impl( raft::random::RngState rng_state(seed); seed += d_rnd_indices.size() * row_comm_size; - cugraph_ops::get_sampling_index(d_rnd_indices.data(), - rng_state, - d_out_degs.data(), - static_cast(d_out_degs.size()), - static_cast(k_level), - with_replacement, - handle.get_stream()); + if (d_rnd_indices.size() > 0) { + // FIXME: This cugraph_ops function does not handle 0 inputs properly + cugraph_ops::get_sampling_index(d_rnd_indices.data(), + rng_state, + d_out_degs.data(), + static_cast(d_out_degs.size()), + static_cast(k_level), + with_replacement, + handle.get_stream()); + } std::tie(d_out_src, d_out_dst, d_out_indices) = gather_local_edges(handle, From 3b76a49b3270178a0d5955ebe17d428de1b5b491 Mon Sep 17 00:00:00 2001 From: Joseph Nke Date: Mon, 23 May 2022 23:12:34 -0700 Subject: [PATCH 08/21] add bindings for SG uniform_neighbor_sample --- python/cugraph/cugraph/__init__.py | 8 +- python/cugraph/cugraph/dask/__init__.py | 1 + ...sampling.py => uniform_neighbor_sample.py} | 66 ++++++---- .../cugraph/experimental/dask/__init__.py | 8 +- python/cugraph/cugraph/sampling/__init__.py | 2 + .../sampling/uniform_neighbor_sample.py | 123 ++++++++++++++++++ .../cugraph/structure/graph_classes.py | 15 ++- .../graph_implementation/simpleGraph.py | 10 +- .../cugraph/cugraph/structure/number_map.py | 58 +++++---- .../pylibcugraph/experimental/__init__.py | 4 +- python/pylibcugraph/pylibcugraph/graphs.pyx | 1 - .../uniform_neighborhood_sampling.pyx | 4 +- python/pylibcugraph/pylibcugraph/utils.pyx | 7 +- 13 files changed, 237 insertions(+), 70 deletions(-) rename python/cugraph/cugraph/dask/sampling/{neighborhood_sampling.py => uniform_neighbor_sample.py} (75%) create mode 100644 python/cugraph/cugraph/sampling/uniform_neighbor_sample.py diff --git a/python/cugraph/cugraph/__init__.py b/python/cugraph/cugraph/__init__.py index 5e459c59414..95664913390 100644 --- a/python/cugraph/cugraph/__init__.py +++ b/python/cugraph/cugraph/__init__.py @@ -108,7 +108,13 @@ from raft import raft_include_test from cugraph.dask.comms import comms -from cugraph.sampling import random_walks, rw_path, node2vec +from cugraph.sampling import ( + random_walks, + rw_path, + node2vec, + uniform_neighbor_sample, +) + from cugraph import experimental diff --git a/python/cugraph/cugraph/dask/__init__.py b/python/cugraph/cugraph/dask/__init__.py index 7e60315ffb5..b621ae35a88 100644 --- a/python/cugraph/cugraph/dask/__init__.py +++ b/python/cugraph/cugraph/dask/__init__.py @@ -19,3 +19,4 @@ from .community.louvain import louvain from .centrality.katz_centrality import katz_centrality from .components.connectivity import weakly_connected_components +from .sampling.uniform_neighbor_sample import uniform_neighbor_sample diff --git a/python/cugraph/cugraph/dask/sampling/neighborhood_sampling.py b/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py similarity index 75% rename from python/cugraph/cugraph/dask/sampling/neighborhood_sampling.py rename to python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py index 196ac8995d5..a6285685d78 100644 --- a/python/cugraph/cugraph/dask/sampling/neighborhood_sampling.py +++ b/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py @@ -18,17 +18,20 @@ import dask_cudf import cudf -from pylibcugraph.experimental import (MGGraph, - ResourceHandle, - GraphProperties, - uniform_neighborhood_sampling, - ) +from pylibcugraph.experimental import ( + MGGraph, + ResourceHandle, + GraphProperties, + uniform_neighbor_sample as pylibcugraph_uniform_neighbor_sample, + ) from cugraph.dask.common.input_utils import get_distributed_data from cugraph.dask.comms import comms as Comms def call_nbr_sampling(sID, data, + idx, + num_edges_per_partition, src_col_name, dst_col_name, num_edges, @@ -43,9 +46,15 @@ def call_nbr_sampling(sID, graph_properties = GraphProperties(is_symmetric=False, is_multigraph=False) srcs = data[0][src_col_name] dsts = data[0][dst_col_name] - weights = None + # Weights are not currently supported. Create an edge_ids + # column of the same type as the vertices. They will be + # ignored during the algo computation + # FIXME: Drop the edge_ids once weights are supported + edge_ids = None if "value" in data[0].columns: - weights = data[0]['value'] + start = sum(num_edges_per_partition[:idx]) + end = start + num_edges_per_partition[idx] + edge_ids = cudf.Series(range(start, end), dtype=srcs.dtype) store_transposed = False @@ -53,17 +62,17 @@ def call_nbr_sampling(sID, graph_properties, srcs, dsts, - weights, + edge_ids, store_transposed, num_edges, do_expensive_check) - ret_val = uniform_neighborhood_sampling(handle, - mg, - start_list, - h_fan_out, - with_replacement, - do_expensive_check) + ret_val = pylibcugraph_uniform_neighbor_sample(handle, + mg, + start_list, + h_fan_out, + with_replacement, + do_expensive_check) return ret_val @@ -80,10 +89,10 @@ def convert_to_cudf(cp_arrays): return df -def EXPERIMENTAL__uniform_neighborhood(input_graph, - start_list, - fanout_vals, - with_replacement=True): +def uniform_neighbor_sample(input_graph, + start_list, + fanout_vals, + with_replacement=True): """ Does neighborhood sampling, which samples nodes from a graph based on the current node's neighbors, with a corresponding fanout value at each hop. @@ -94,7 +103,7 @@ def EXPERIMENTAL__uniform_neighborhood(input_graph, cuGraph graph, which contains connectivity information as dask cudf edge list dataframe - start_info_list : list or cudf.Series (int32) + start_list : list or cudf.Series (int32) a list of starting vertices for sampling fanout_vals : list (int32) @@ -126,6 +135,9 @@ def EXPERIMENTAL__uniform_neighborhood(input_graph, input_graph.compute_renumber_edge_list( transposed=False, legacy_renum_only=True) + if isinstance(start_list, int): + start_list = [start_list] + if isinstance(start_list, list): start_list = cudf.Series(start_list) if start_list.dtype != 'int32': @@ -140,13 +152,17 @@ def EXPERIMENTAL__uniform_neighborhood(input_graph, raise TypeError("fanout_vals must be a list, " f"got: {type(fanout_vals)}") + # FIXME: Add graph property for multigraph ddf = input_graph.edgelist.edgelist_df - num_edges = len(ddf) - data = get_distributed_data(ddf) - src_col_name = input_graph.renumber_map.renumbered_src_col_name dst_col_name = input_graph.renumber_map.renumbered_dst_col_name + num_edges_per_partition = [len( + ddf.get_partition(p)) for p in range(ddf.npartitions)] + + num_edges = len(ddf) + data = get_distributed_data(ddf) + # start_list uses "external" vertex IDs, but if the graph has been # renumbered, the start vertex IDs must also be renumbered. if input_graph.renumbered: @@ -157,6 +173,8 @@ def EXPERIMENTAL__uniform_neighborhood(input_graph, result = [client.submit(call_nbr_sampling, Comms.get_session_id(), wf[1], + idx, + num_edges_per_partition, src_col_name, dst_col_name, num_edges, @@ -177,7 +195,7 @@ def EXPERIMENTAL__uniform_neighborhood(input_graph, ddf = dask_cudf.from_delayed(cudf_result) if input_graph.renumbered: - ddf = input_graph.unrenumber(ddf, "sources") - ddf = input_graph.unrenumber(ddf, "destinations") + ddf = input_graph.unrenumber(ddf, "sources", preserve_order=True) + ddf = input_graph.unrenumber(ddf, "destinations", preserve_order=True) return ddf diff --git a/python/cugraph/cugraph/experimental/dask/__init__.py b/python/cugraph/cugraph/experimental/dask/__init__.py index 059df21d487..9ec37e91f95 100644 --- a/python/cugraph/cugraph/experimental/dask/__init__.py +++ b/python/cugraph/cugraph/experimental/dask/__init__.py @@ -13,7 +13,7 @@ from cugraph.utilities.api_tools import experimental_warning_wrapper -from cugraph.dask.sampling.neighborhood_sampling import \ - EXPERIMENTAL__uniform_neighborhood -uniform_neighborhood_sampling = \ - experimental_warning_wrapper(EXPERIMENTAL__uniform_neighborhood) +from cugraph.dask.sampling.uniform_neighbor_sample import \ + uniform_neighbor_sample +uniform_neighbor_sample = \ + experimental_warning_wrapper(uniform_neighbor_sample) diff --git a/python/cugraph/cugraph/sampling/__init__.py b/python/cugraph/cugraph/sampling/__init__.py index df8c66f43a9..7b82e73f6cc 100644 --- a/python/cugraph/cugraph/sampling/__init__.py +++ b/python/cugraph/cugraph/sampling/__init__.py @@ -13,3 +13,5 @@ from cugraph.sampling.random_walks import random_walks, rw_path from cugraph.sampling.node2vec import node2vec +from cugraph.sampling.uniform_neighbor_sample import \ + uniform_neighbor_sample diff --git a/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py b/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py new file mode 100644 index 00000000000..55c0b1c5c8c --- /dev/null +++ b/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py @@ -0,0 +1,123 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pylibcugraph import (ResourceHandle, + GraphProperties, + SGGraph, + ) +from pylibcugraph.experimental import uniform_neighbor_sample as \ + pylibcugraph_uniform_neighbor_sample + +import numpy + +import cudf + + +def uniform_neighbor_sample(G, + start_list, + fanout_vals, + with_replacement=True): + """ + Does neighborhood sampling, which samples nodes from a graph based on the + current node's neighbors, with a corresponding fanout value at each hop. + + Parameters + ---------- + G : cugraph.Graph + cuGraph graph, which contains connectivity information as dask cudf + edge list dataframe + + start_list : list or cudf.Series (int32) + a list of starting vertices for sampling + + fanout_vals : list (int32) + List of branching out (fan-out) degrees per starting vertex for each + hop level. + + with_replacement: bool, optional (default=True) + Flag to specify if the random sampling is done with replacement + + Returns + ------- + result : cudf.DataFrame + GPU data frame containing two cudf.Series + + df['sources']: cudf.Series + Contains the source vertices from the sampling result + df['destinations']: cudf.Series + Contains the destination vertices from the sampling result + df['indices']: cudf.Series + Contains the indices from the sampling result for path + reconstruction + """ + + if isinstance(start_list, int): + start_list = [start_list] + + if isinstance(start_list, list): + start_list = cudf.Series(start_list, dtype='int32') + if start_list.dtype != 'int32': + raise ValueError(f"'start_list' must have int32 values, " + f"got: {start_list.dtype}") + + # fanout_vals must be a host array! + # FIXME: ensure other sequence types (eg. cudf Series) can be handled. + if isinstance(fanout_vals, list): + fanout_vals = numpy.asarray(fanout_vals, dtype="int32") + else: + raise TypeError("fanout_vals must be a list, " + f"got: {type(fanout_vals)}") + + if G.renumbered is True: + if isinstance(start_list, cudf.DataFrame): + start_list = G.lookup_internal_vertex_id( + start_list, start_list.columns) + else: + start_list = G.lookup_internal_vertex_id(start_list) + + srcs = G.edgelist.edgelist_df['src'] + dsts = G.edgelist.edgelist_df['dst'] + # Weights are not currently supported. Create an edge_ids + # column of type same type as the vertices which will be + # ignored when computing the algo + # FIXME: Drop the edge_ids once weights are supported + edge_ids = cudf.Series(range(len(srcs)), dtype=srcs.dtype) + + if srcs.dtype != 'int32': + raise ValueError(f"Graph vertices must have int32 values, " + f"got: {srcs.dtype}") + + resource_handle = ResourceHandle() + graph_props = GraphProperties(is_multigraph=G.is_multigraph()) + store_transposed = False + renumber = False + do_expensive_check = False + + sg = SGGraph(resource_handle, graph_props, srcs, dsts, edge_ids, + store_transposed, renumber, do_expensive_check) + + sources, destinations, indices = \ + pylibcugraph_uniform_neighbor_sample(resource_handle, sg, start_list, + fanout_vals, with_replacement, + do_expensive_check) + + df = cudf.DataFrame() + df["sources"] = sources + df["destinations"] = destinations + df["indices"] = indices + + if G.renumbered: + df = G.unrenumber(df, "sources", preserve_order=True) + df = G.unrenumber(df, "destinations", preserve_order=True) + + return df diff --git a/python/cugraph/cugraph/structure/graph_classes.py b/python/cugraph/cugraph/structure/graph_classes.py index 4b6c03882e8..cd2b6ff0565 100644 --- a/python/cugraph/cugraph/structure/graph_classes.py +++ b/python/cugraph/cugraph/structure/graph_classes.py @@ -98,7 +98,8 @@ def from_cudf_edgelist( source="source", destination="destination", edge_attr=None, - renumber=True + renumber=True, + legacy_renum_only=False ): """ Initialize a graph from the edge list. It is an error to call this @@ -150,11 +151,13 @@ def from_cudf_edgelist( elif (self._Impl.edgelist is not None or self._Impl.adjlist is not None): raise RuntimeError("Graph already has values") - self._Impl._simpleGraphImpl__from_edgelist(input_df, - source=source, - destination=destination, - edge_attr=edge_attr, - renumber=renumber) + self._Impl._simpleGraphImpl__from_edgelist( + input_df, + source=source, + destination=destination, + edge_attr=edge_attr, + renumber=renumber, + legacy_renum_only=legacy_renum_only) def from_cudf_adjlist(self, offset_col, index_col, value_col=None): """ diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py index 2b57736562b..2bed6abc7b8 100644 --- a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py +++ b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py @@ -88,6 +88,7 @@ def __from_edgelist( destination="destination", edge_attr=None, renumber=True, + legacy_renum_only=False, ): # Verify column names present in input DataFrame @@ -146,11 +147,16 @@ def __from_edgelist( if renumber: # FIXME: Should SG do lazy evaluation like MG? elist, renumber_map = NumberMap.renumber( - elist, source, destination, store_transposed=False + elist, source, destination, store_transposed=False, + legacy_renum_only=legacy_renum_only ) source = renumber_map.renumbered_src_col_name destination = renumber_map.renumbered_dst_col_name - self.properties.renumbered = True + # Use renumber_map to figure out if renumbering was skipped or not + # This was added to handle 'legacy_renum_only' which may + # will skip the old renumbering when running the + # pylibcugraph/C algos + self.properties.renumbered = renumber_map.implementation.numbered self.renumber_map = renumber_map else: if type(source) is list and type(destination) is list: diff --git a/python/cugraph/cugraph/structure/number_map.py b/python/cugraph/cugraph/structure/number_map.py index 10de74cd744..367dbbe9829 100644 --- a/python/cugraph/cugraph/structure/number_map.py +++ b/python/cugraph/cugraph/structure/number_map.py @@ -63,6 +63,7 @@ def to_internal_vertex_id(self, df, col_names): ) index_name = NumberMap.generate_unused_column_name(df.columns) tmp_df[index_name] = tmp_df.index + return ( self.df.merge(tmp_df, on=self.col_names, how="right") .sort_values(index_name) @@ -182,6 +183,8 @@ def __init__( self.numbered = False def to_internal_vertex_id(self, ddf, col_names): + print("mapping is \n", self.ddf.compute()) + print("Original df is \n", ddf.compute()) tmp_ddf = ddf[col_names].rename( columns=dict(zip(col_names, self.col_names))) for name in self.col_names: @@ -501,6 +504,7 @@ def renumber_and_segment( df, src_col_names, dst_col_names, preserve_order=False, store_transposed=False, legacy_renum_only=False ): + renumbered = True # FIXME: Drop the renumber_type 'experimental' once all the # algos follow the C/Pylibcugraph path @@ -519,6 +523,7 @@ def renumber_and_segment( if legacy_renum_only and renumber_type == 'experimental': # The original dataframe will be returned. renumber_type = 'skip_renumbering' + renumbered = False renumber_map = NumberMap() if not isinstance(src_col_names, list): @@ -545,6 +550,8 @@ def renumber_and_segment( else: raise TypeError("df must be cudf.DataFrame or dask_cudf.DataFrame") + renumber_map.implementation.numbered = renumbered + if renumber_type == 'legacy': indirection_map = renumber_map.implementation.\ indirection_map(df, @@ -652,34 +659,41 @@ def get_renumbered_df(id_type, data): return df, renumber_map, None else: - renumbering_map, segment_offsets, renumbered_df = \ - c_renumber.renumber(df, - renumber_map.renumbered_src_col_name, - renumber_map.renumbered_dst_col_name, - num_edges, - 0, - Comms.get_default_handle(), - is_mnmg, - store_transposed) - if renumber_type == 'legacy': - renumber_map.implementation.df = indirection_map.\ - merge(renumbering_map, - right_on='original_ids', left_on='id').\ - drop(columns=['id', 'original_ids'])\ - .rename(columns={'new_ids': 'id'}, copy=False) - else: - renumber_map.implementation.df = renumbering_map.rename( - columns={'original_ids': '0', 'new_ids': 'id'}, copy=False) + # Do not renumber the algos following the C/Pylibcugraph path + if renumber_type in ['legacy', 'experimental']: + renumbering_map, segment_offsets, renumbered_df = \ + c_renumber.renumber(df, + renumber_map.renumbered_src_col_name, + renumber_map.renumbered_dst_col_name, + num_edges, + 0, + Comms.get_default_handle(), + is_mnmg, + store_transposed) + if renumber_type == 'legacy': + renumber_map.implementation.df = indirection_map.merge( + renumbering_map, + right_on='original_ids', + left_on='id').drop(columns=['id', 'original_ids'])\ + .rename(columns={'new_ids': 'id'}, copy=False) + else: + renumber_map.implementation.df = renumbering_map.rename( + columns={ + 'original_ids': '0', 'new_ids': 'id'}, copy=False) - renumber_map.implementation.numbered = True - return renumbered_df, renumber_map, segment_offsets + renumber_map.implementation.numbered = True + return renumbered_df, renumber_map, segment_offsets + else: + # There is no aggregate_segment_offsets since the + # C++ renumbering is skipped + return df, renumber_map, None @staticmethod def renumber(df, src_col_names, dst_col_names, preserve_order=False, - store_transposed=False): + store_transposed=False, legacy_renum_only=False): return NumberMap.renumber_and_segment( df, src_col_names, dst_col_names, - preserve_order, store_transposed)[0:2] + preserve_order, store_transposed, legacy_renum_only)[0:2] def unrenumber(self, df, column_name, preserve_order=False, get_column_names=False): diff --git a/python/pylibcugraph/pylibcugraph/experimental/__init__.py b/python/pylibcugraph/pylibcugraph/experimental/__init__.py index 2587b2457a1..d17815d5fbd 100644 --- a/python/pylibcugraph/pylibcugraph/experimental/__init__.py +++ b/python/pylibcugraph/pylibcugraph/experimental/__init__.py @@ -66,8 +66,8 @@ from pylibcugraph.node2vec import node2vec node2vec = promoted_experimental_warning_wrapper(node2vec) -from pylibcugraph.uniform_neighborhood_sampling import EXPERIMENTAL__uniform_neighborhood_sampling -uniform_neighborhood_sampling = experimental_warning_wrapper(EXPERIMENTAL__uniform_neighborhood_sampling) +from pylibcugraph.uniform_neighborhood_sampling import EXPERIMENTAL__uniform_neighbor_sample +uniform_neighbor_sample = experimental_warning_wrapper(EXPERIMENTAL__uniform_neighbor_sample) from pylibcugraph.katz_centrality import EXPERIMENTAL__katz_centrality katz_centrality = experimental_warning_wrapper(EXPERIMENTAL__katz_centrality) diff --git a/python/pylibcugraph/pylibcugraph/graphs.pyx b/python/pylibcugraph/pylibcugraph/graphs.pyx index 732a7dca53c..574aac9bc5d 100644 --- a/python/pylibcugraph/pylibcugraph/graphs.pyx +++ b/python/pylibcugraph/pylibcugraph/graphs.pyx @@ -298,7 +298,6 @@ cdef class MGGraph(_GPUGraph): cdef uintptr_t cai_weights_ptr = \ weight_array.__cuda_array_interface__["data"][0] cdef cugraph_type_erased_device_array_view_t* weights_view_ptr - print("the weight array type is ", weight_array.dtype) if weight_array.dtype in [numpy.int32, numpy.int64]: weights_view_ptr = \ diff --git a/python/pylibcugraph/pylibcugraph/uniform_neighborhood_sampling.pyx b/python/pylibcugraph/pylibcugraph/uniform_neighborhood_sampling.pyx index bbed79bfba3..6b6c468eccf 100644 --- a/python/pylibcugraph/pylibcugraph/uniform_neighborhood_sampling.pyx +++ b/python/pylibcugraph/pylibcugraph/uniform_neighborhood_sampling.pyx @@ -65,8 +65,8 @@ from pylibcugraph.utils cimport ( ) -def EXPERIMENTAL__uniform_neighborhood_sampling(ResourceHandle resource_handle, - MGGraph input_graph, +def EXPERIMENTAL__uniform_neighbor_sample(ResourceHandle resource_handle, + _GPUGraph input_graph, start_list, h_fan_out, bool_t with_replacement, diff --git a/python/pylibcugraph/pylibcugraph/utils.pyx b/python/pylibcugraph/pylibcugraph/utils.pyx index 121c8109660..79d8ebe2579 100644 --- a/python/pylibcugraph/pylibcugraph/utils.pyx +++ b/python/pylibcugraph/pylibcugraph/utils.pyx @@ -154,6 +154,7 @@ cdef copy_to_cupy_array( cdef copy_to_cupy_array_ids( cugraph_resource_handle_t* c_resource_handle_ptr, cugraph_type_erased_device_array_view_t* device_array_view_ptr): + # FIXME: Update this docstring """ Copy the contents from a device array view as returned by various cugraph_* APIs to a new cupy device array, typically intended to be used as a return @@ -164,18 +165,12 @@ cdef copy_to_cupy_array_ids( cdef c_type = cugraph_type_erased_device_array_view_type( device_array_view_ptr) - #cdef c_type = data_type_id_t.INT32 array_size = cugraph_type_erased_device_array_view_size( device_array_view_ptr) cupy_array = cupy.zeros( array_size, dtype=get_numpy_edge_ids_type_from_c_weight_type(c_type)) - """ - cupy_array = cupy.zeros( - array_size, dtype=numpy.int32) - """ - cdef uintptr_t cupy_array_ptr = \ cupy_array.__cuda_array_interface__["data"][0] From 9933120bf1fe8bd5c767c0155325c7e3da8d4bc6 Mon Sep 17 00:00:00 2001 From: Joseph Nke Date: Mon, 23 May 2022 23:52:11 -0700 Subject: [PATCH 09/21] remove debug print --- python/cugraph/cugraph/structure/number_map.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/python/cugraph/cugraph/structure/number_map.py b/python/cugraph/cugraph/structure/number_map.py index 367dbbe9829..4f6edf5fcf2 100644 --- a/python/cugraph/cugraph/structure/number_map.py +++ b/python/cugraph/cugraph/structure/number_map.py @@ -183,8 +183,6 @@ def __init__( self.numbered = False def to_internal_vertex_id(self, ddf, col_names): - print("mapping is \n", self.ddf.compute()) - print("Original df is \n", ddf.compute()) tmp_ddf = ddf[col_names].rename( columns=dict(zip(col_names, self.col_names))) for name in self.col_names: @@ -650,7 +648,6 @@ def get_renumbered_df(id_type, data): else: renumber_map.implementation.ddf = renumbering_map.rename( columns={'original_ids': '0', 'new_ids': 'global_id'}) - renumber_map.implementation.numbered = True return renumbered_df, renumber_map, aggregate_segment_offsets else: @@ -681,7 +678,6 @@ def get_renumbered_df(id_type, data): columns={ 'original_ids': '0', 'new_ids': 'id'}, copy=False) - renumber_map.implementation.numbered = True return renumbered_df, renumber_map, segment_offsets else: # There is no aggregate_segment_offsets since the From 604ab0fada35f1fbb37eceaf27913bf084e8be4e Mon Sep 17 00:00:00 2001 From: Joseph Nke Date: Tue, 24 May 2022 05:14:03 -0700 Subject: [PATCH 10/21] update pylibcugraph uniform_neighbor_sample tests because of the API changes --- .../tests/test_neighborhood_sampling.py | 45 +++++++++---------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/python/pylibcugraph/pylibcugraph/tests/test_neighborhood_sampling.py b/python/pylibcugraph/pylibcugraph/tests/test_neighborhood_sampling.py index aa3dd849529..1afb44b1a99 100644 --- a/python/pylibcugraph/pylibcugraph/tests/test_neighborhood_sampling.py +++ b/python/pylibcugraph/pylibcugraph/tests/test_neighborhood_sampling.py @@ -19,7 +19,7 @@ ResourceHandle, GraphProperties, ) -from pylibcugraph.experimental import uniform_neighborhood_sampling +from pylibcugraph.experimental import uniform_neighbor_sample # ============================================================================= @@ -34,15 +34,16 @@ def check_edges(result, srcs, dsts, weights, num_verts, num_edges, num_seeds): - result_srcs, result_dsts, result_labels, result_indices = result + # FIXME: Update the result retrieval as the API changed + result_srcs, result_dsts, result_indices = result h_src_arr = srcs.get() h_dst_arr = dsts.get() h_wgt_arr = weights.get() h_result_srcs = result_srcs.get() h_result_dsts = result_dsts.get() - h_result_labels = result_labels.get() - h_result_indices = result_indices.get() + # FIXME: Variable not used + # h_result_indices = result_indices.get() # Following the C validation, we will check that all edges are part of the # graph @@ -53,10 +54,12 @@ def check_edges(result, srcs, dsts, weights, num_verts, num_edges, num_seeds): for edge in range(h_result_srcs): assert M[h_result_srcs[edge]][h_result_dsts[edge]] > 0.0 - found = False + # found = False for j in range(num_seeds): - # Revise, this is not correct - found = found or (h_result_labels[edge] == h_result_indices[j]) + # FIXME: Revise, this is not correct. + # Labels are no longer supported. + # found = found or (h_result_labels[edge] == h_result_indices[j]) + pass # TODO: Refactor after creating a helper within conftest.py to pass in an @@ -71,7 +74,6 @@ def test_neighborhood_sampling_cupy(): device_weights = cp.asarray([0.1, 2.1, 1.1, 5.1, 3.1, 4.1, 7.2, 3.2], dtype=np.float32) start_list = cp.asarray([2, 2], dtype=np.int32) - info_list = cp.asarray([0, 1], dtype=np.int32) fanout_vals = cp.asarray([1, 2], dtype=np.int32) mg = MGGraph(resource_handle, @@ -83,13 +85,12 @@ def test_neighborhood_sampling_cupy(): num_edges=8, do_expensive_check=False) - result = uniform_neighborhood_sampling(resource_handle, - mg, - start_list, - info_list, - fanout_vals, - with_replacement=True, - do_expensive_check=False) + result = uniform_neighbor_sample(resource_handle, + mg, + start_list, + fanout_vals, + with_replacement=True, + do_expensive_check=False) check_edges(result, device_srcs, device_dsts, device_weights, 6, 8, 2) @@ -104,7 +105,6 @@ def test_neighborhood_sampling_cudf(): device_weights = cudf.Series([0.1, 2.1, 1.1, 5.1, 3.1, 4.1, 7.2, 3.2], dtype=np.float32) start_list = cudf.Series([2, 2], dtype=np.int32) - info_list = cudf.Series([0, 1], dtype=np.int32) fanout_vals = cudf.Series([1, 2], dtype=np.int32) mg = MGGraph(resource_handle, @@ -116,12 +116,11 @@ def test_neighborhood_sampling_cudf(): num_edges=8, do_expensive_check=False) - result = uniform_neighborhood_sampling(resource_handle, - mg, - start_list, - info_list, - fanout_vals, - with_replacement=True, - do_expensive_check=False) + result = uniform_neighbor_sample(resource_handle, + mg, + start_list, + fanout_vals, + with_replacement=True, + do_expensive_check=False) check_edges(result, device_srcs, device_dsts, device_weights, 6, 8, 2) From c579261c53061130cc0c398ba1264fa4b6e75af8 Mon Sep 17 00:00:00 2001 From: Joseph Nke Date: Tue, 24 May 2022 07:12:15 -0700 Subject: [PATCH 11/21] drop the directory proto --- python/cugraph/cugraph/dask/__init__.py | 2 ++ .../sampling/neighborhood_sampling.py | 33 ++++++++++--------- .../cugraph/experimental/dask/__init__.py | 4 +++ python/cugraph/cugraph/proto/__init__.py | 1 - .../cugraph/proto/sampling/__init__.py | 13 -------- python/pylibcugraph/cufile.log | 3 ++ .../pylibcugraph/experimental/__init__.py | 5 ++- ...sampling.pyx => neighborhood_sampling.pyx} | 19 +++++++---- .../pylibcugraph/proto/__init__.py | 19 ----------- .../pylibcugraph/proto/sampling/__init__.py | 0 ...mpling.pyx => uniform_neighbor_sample.pyx} | 0 11 files changed, 42 insertions(+), 57 deletions(-) rename python/cugraph/cugraph/{proto => dask}/sampling/neighborhood_sampling.py (88%) delete mode 100644 python/cugraph/cugraph/proto/sampling/__init__.py create mode 100644 python/pylibcugraph/cufile.log rename python/pylibcugraph/pylibcugraph/{proto/sampling/uniform_neighborhood_sampling.pyx => neighborhood_sampling.pyx} (93%) delete mode 100644 python/pylibcugraph/pylibcugraph/proto/__init__.py delete mode 100644 python/pylibcugraph/pylibcugraph/proto/sampling/__init__.py rename python/pylibcugraph/pylibcugraph/{uniform_neighborhood_sampling.pyx => uniform_neighbor_sample.pyx} (100%) diff --git a/python/cugraph/cugraph/dask/__init__.py b/python/cugraph/cugraph/dask/__init__.py index b621ae35a88..48325e4eaf6 100644 --- a/python/cugraph/cugraph/dask/__init__.py +++ b/python/cugraph/cugraph/dask/__init__.py @@ -20,3 +20,5 @@ from .centrality.katz_centrality import katz_centrality from .components.connectivity import weakly_connected_components from .sampling.uniform_neighbor_sample import uniform_neighbor_sample +# FIXME: This call is deprecated and will be removed next release +from .sampling.neighborhood_sampling import neighborhood_sampling diff --git a/python/cugraph/cugraph/proto/sampling/neighborhood_sampling.py b/python/cugraph/cugraph/dask/sampling/neighborhood_sampling.py similarity index 88% rename from python/cugraph/cugraph/proto/sampling/neighborhood_sampling.py rename to python/cugraph/cugraph/dask/sampling/neighborhood_sampling.py index 277b27e91ab..cd87951eaec 100644 --- a/python/cugraph/cugraph/proto/sampling/neighborhood_sampling.py +++ b/python/cugraph/cugraph/dask/sampling/neighborhood_sampling.py @@ -18,11 +18,12 @@ import dask_cudf import cudf -from pylibcugraph.proto import uniform_neighborhood_sampling -from pylibcugraph.experimental import (MGGraph, - ResourceHandle, - GraphProperties, - ) +from pylibcugraph.experimental import ( + MGGraph, + ResourceHandle, + GraphProperties, + neighborhood_sampling as pylibcugraph_neighborhood_sampling + ) from cugraph.dask.common.input_utils import get_distributed_data @@ -61,13 +62,13 @@ def call_nbr_sampling(sID, num_edges, do_expensive_check) - ret_val = uniform_neighborhood_sampling(handle, - mg, - start_list, - info_list, - h_fan_out, - with_replacement, - do_expensive_check) + ret_val = pylibcugraph_neighborhood_sampling(handle, + mg, + start_list, + info_list, + h_fan_out, + with_replacement, + do_expensive_check) return ret_val @@ -87,10 +88,10 @@ def convert_to_cudf(cp_arrays): return df -def uniform_neighborhood(input_graph, - start_info_list, - fanout_vals, - with_replacement=True): +def neighborhood_sampling(input_graph, + start_info_list, + fanout_vals, + with_replacement=True): """ Does neighborhood sampling, which samples nodes from a graph based on the current node's neighbors, with a corresponding fanout value at each hop. diff --git a/python/cugraph/cugraph/experimental/dask/__init__.py b/python/cugraph/cugraph/experimental/dask/__init__.py index 9ec37e91f95..4e8ef08736c 100644 --- a/python/cugraph/cugraph/experimental/dask/__init__.py +++ b/python/cugraph/cugraph/experimental/dask/__init__.py @@ -17,3 +17,7 @@ uniform_neighbor_sample uniform_neighbor_sample = \ experimental_warning_wrapper(uniform_neighbor_sample) + +# FIXME: This call is deprecated and will be removed in the next release +from cugraph.dask.sampling.neighborhood_sampling import \ + neighborhood_sampling diff --git a/python/cugraph/cugraph/proto/__init__.py b/python/cugraph/cugraph/proto/__init__.py index 95e5aaf3af9..6ed5e96cff0 100644 --- a/python/cugraph/cugraph/proto/__init__.py +++ b/python/cugraph/cugraph/proto/__init__.py @@ -13,4 +13,3 @@ from cugraph.proto.components import strong_connected_component from cugraph.proto.structure import find_bicliques -from cugraph.proto.sampling import uniform_neighborhood diff --git a/python/cugraph/cugraph/proto/sampling/__init__.py b/python/cugraph/cugraph/proto/sampling/__init__.py deleted file mode 100644 index 5c8fbf27210..00000000000 --- a/python/cugraph/cugraph/proto/sampling/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and - -from cugraph.proto.sampling.neighborhood_sampling import uniform_neighborhood \ No newline at end of file diff --git a/python/pylibcugraph/cufile.log b/python/pylibcugraph/cufile.log new file mode 100644 index 00000000000..fb60a50c9de --- /dev/null +++ b/python/pylibcugraph/cufile.log @@ -0,0 +1,3 @@ + 24-05-2022 11:57:57:955 [pid=3446 tid=3446] ERROR cufio-drv:632 nvidia-fs.ko driver not loaded + 24-05-2022 12:02:27:919 [pid=3468 tid=3468] ERROR cufio-drv:632 nvidia-fs.ko driver not loaded + 24-05-2022 12:12:09:520 [pid=13919 tid=13919] ERROR cufio-drv:632 nvidia-fs.ko driver not loaded diff --git a/python/pylibcugraph/pylibcugraph/experimental/__init__.py b/python/pylibcugraph/pylibcugraph/experimental/__init__.py index d17815d5fbd..2961ea3f962 100644 --- a/python/pylibcugraph/pylibcugraph/experimental/__init__.py +++ b/python/pylibcugraph/pylibcugraph/experimental/__init__.py @@ -66,8 +66,11 @@ from pylibcugraph.node2vec import node2vec node2vec = promoted_experimental_warning_wrapper(node2vec) -from pylibcugraph.uniform_neighborhood_sampling import EXPERIMENTAL__uniform_neighbor_sample +from pylibcugraph.uniform_neighbor_sample import EXPERIMENTAL__uniform_neighbor_sample uniform_neighbor_sample = experimental_warning_wrapper(EXPERIMENTAL__uniform_neighbor_sample) +# FIXME: This call is deprecated and will be removed in the next release +from pylibcugraph.neighborhood_sampling import neighborhood_sampling + from pylibcugraph.katz_centrality import EXPERIMENTAL__katz_centrality katz_centrality = experimental_warning_wrapper(EXPERIMENTAL__katz_centrality) diff --git a/python/pylibcugraph/pylibcugraph/proto/sampling/uniform_neighborhood_sampling.pyx b/python/pylibcugraph/pylibcugraph/neighborhood_sampling.pyx similarity index 93% rename from python/pylibcugraph/pylibcugraph/proto/sampling/uniform_neighborhood_sampling.pyx rename to python/pylibcugraph/pylibcugraph/neighborhood_sampling.pyx index 98eb9c6d077..73e51720532 100644 --- a/python/pylibcugraph/pylibcugraph/proto/sampling/uniform_neighborhood_sampling.pyx +++ b/python/pylibcugraph/pylibcugraph/neighborhood_sampling.pyx @@ -14,6 +14,7 @@ # Have cython use python 3 syntax # cython: language_level = 3 +import warnings from libc.stdint cimport uintptr_t from pylibcugraph._cugraph_c.resource_handle cimport ( @@ -61,13 +62,13 @@ from pylibcugraph.utils cimport ( ) -def uniform_neighborhood_sampling(ResourceHandle resource_handle, - MGGraph input_graph, - start_list, - labels_list, - h_fan_out, - bool_t with_replacement, - bool_t do_expensive_check): +def neighborhood_sampling(ResourceHandle resource_handle, + MGGraph input_graph, + start_list, + labels_list, + h_fan_out, + bool_t with_replacement, + bool_t do_expensive_check): """ Does neighborhood sampling, which samples nodes from a graph based on the current node's neighbors, with a corresponding fanout value at each hop. @@ -109,6 +110,10 @@ def uniform_neighborhood_sampling(ResourceHandle resource_handle, array containing the indices for reconstructing paths. """ + warning_msg = ("This call is deprecated and will be removed" + "in the next release") + warnings.warn(warning_msg, PendingDeprecationWarning) + cdef cugraph_resource_handle_t* c_resource_handle_ptr = \ resource_handle.c_resource_handle_ptr cdef cugraph_graph_t* c_graph_ptr = input_graph.c_graph_ptr diff --git a/python/pylibcugraph/pylibcugraph/proto/__init__.py b/python/pylibcugraph/pylibcugraph/proto/__init__.py deleted file mode 100644 index cc2639a9519..00000000000 --- a/python/pylibcugraph/pylibcugraph/proto/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# FIXME: The directory proto should be deleted once the the experimental -# uniform neighborhood sampling is completed - -#from .sampling.uniform_neighborhood_sampling import EXPERIMENTAL__uniform_neighborhood_sampling -from .sampling.uniform_neighborhood_sampling import uniform_neighborhood_sampling -#uniform_neighborhood_sampling = EXPERIMENTAL__uniform_neighborhood_sampling diff --git a/python/pylibcugraph/pylibcugraph/proto/sampling/__init__.py b/python/pylibcugraph/pylibcugraph/proto/sampling/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/pylibcugraph/pylibcugraph/uniform_neighborhood_sampling.pyx b/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx similarity index 100% rename from python/pylibcugraph/pylibcugraph/uniform_neighborhood_sampling.pyx rename to python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx From 626a8330c2f301d74539a76bd972e984ffccf7bb Mon Sep 17 00:00:00 2001 From: Joseph Nke Date: Wed, 25 May 2022 14:39:58 -0700 Subject: [PATCH 12/21] enable support for weigths --- .../dask/sampling/uniform_neighbor_sample.py | 37 ++++++++++--------- .../sampling/uniform_neighbor_sample.py | 25 ++++++++----- .../graph_implementation/simpleGraph.py | 5 +-- python/pylibcugraph/cufile.log | 3 -- .../pylibcugraph/uniform_neighbor_sample.pyx | 18 +++++++-- python/pylibcugraph/pylibcugraph/utils.pyx | 2 + 6 files changed, 52 insertions(+), 38 deletions(-) delete mode 100644 python/pylibcugraph/cufile.log diff --git a/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py b/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py index a6285685d78..425ee7aaf45 100644 --- a/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py +++ b/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py @@ -30,15 +30,14 @@ def call_nbr_sampling(sID, data, - idx, - num_edges_per_partition, src_col_name, dst_col_name, num_edges, do_expensive_check, start_list, h_fan_out, - with_replacement): + with_replacement, + is_edge_ids): # Preparation for graph creation handle = Comms.get_handle(sID) @@ -46,15 +45,9 @@ def call_nbr_sampling(sID, graph_properties = GraphProperties(is_symmetric=False, is_multigraph=False) srcs = data[0][src_col_name] dsts = data[0][dst_col_name] - # Weights are not currently supported. Create an edge_ids - # column of the same type as the vertices. They will be - # ignored during the algo computation - # FIXME: Drop the edge_ids once weights are supported - edge_ids = None + weights = None if "value" in data[0].columns: - start = sum(num_edges_per_partition[:idx]) - end = start + num_edges_per_partition[idx] - edge_ids = cudf.Series(range(start, end), dtype=srcs.dtype) + weights = data[0]['value'] store_transposed = False @@ -62,7 +55,7 @@ def call_nbr_sampling(sID, graph_properties, srcs, dsts, - edge_ids, + weights, store_transposed, num_edges, do_expensive_check) @@ -72,6 +65,7 @@ def call_nbr_sampling(sID, start_list, h_fan_out, with_replacement, + is_edge_ids, do_expensive_check) return ret_val @@ -92,7 +86,8 @@ def convert_to_cudf(cp_arrays): def uniform_neighbor_sample(input_graph, start_list, fanout_vals, - with_replacement=True): + with_replacement=True, + is_edge_ids=False): """ Does neighborhood sampling, which samples nodes from a graph based on the current node's neighbors, with a corresponding fanout value at each hop. @@ -113,6 +108,10 @@ def uniform_neighbor_sample(input_graph, with_replacement: bool, optional (default=True) Flag to specify if the random sampling is done with replacement + is_edge_ids: bool, (default=False) + Flag to specify if the weights were passed as edge_ids. + If true, the input graph's weight will be treated as edge ids + Returns ------- result : dask_cudf.DataFrame @@ -140,7 +139,7 @@ def uniform_neighbor_sample(input_graph, if isinstance(start_list, list): start_list = cudf.Series(start_list) - if start_list.dtype != 'int32': + if start_list.dtype != "int32": raise ValueError(f"'start_list' must have int32 values, " f"got: {start_list.dtype}") @@ -157,8 +156,11 @@ def uniform_neighbor_sample(input_graph, src_col_name = input_graph.renumber_map.renumbered_src_col_name dst_col_name = input_graph.renumber_map.renumbered_dst_col_name - num_edges_per_partition = [len( - ddf.get_partition(p)) for p in range(ddf.npartitions)] + weight_t = ddf["value"].dtype + + if is_edge_ids and weight_t not in ["int32", "in64"]: + raise ValueError(f"Graph weights must have int32 or int64 values " + f"if they are edge ids, got: {weight_t}") num_edges = len(ddf) data = get_distributed_data(ddf) @@ -173,8 +175,6 @@ def uniform_neighbor_sample(input_graph, result = [client.submit(call_nbr_sampling, Comms.get_session_id(), wf[1], - idx, - num_edges_per_partition, src_col_name, dst_col_name, num_edges, @@ -182,6 +182,7 @@ def uniform_neighbor_sample(input_graph, start_list, fanout_vals, with_replacement, + is_edge_ids, workers=[wf[0]]) for idx, wf in enumerate(data.worker_to_parts.items())] diff --git a/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py b/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py index 55c0b1c5c8c..9b31575066c 100644 --- a/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py +++ b/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py @@ -26,7 +26,8 @@ def uniform_neighbor_sample(G, start_list, fanout_vals, - with_replacement=True): + with_replacement=True, + is_edge_ids=False): """ Does neighborhood sampling, which samples nodes from a graph based on the current node's neighbors, with a corresponding fanout value at each hop. @@ -47,6 +48,10 @@ def uniform_neighbor_sample(G, with_replacement: bool, optional (default=True) Flag to specify if the random sampling is done with replacement + is_edge_ids: bool, (default=False) + Flag to specify if the input graph's weights were passed as edge_ids. + If true, the input graph's weight will be treated as edge ids + Returns ------- result : cudf.DataFrame @@ -65,8 +70,8 @@ def uniform_neighbor_sample(G, start_list = [start_list] if isinstance(start_list, list): - start_list = cudf.Series(start_list, dtype='int32') - if start_list.dtype != 'int32': + start_list = cudf.Series(start_list, dtype="int32") + if start_list.dtype != "int32": raise ValueError(f"'start_list' must have int32 values, " f"got: {start_list.dtype}") @@ -87,11 +92,11 @@ def uniform_neighbor_sample(G, srcs = G.edgelist.edgelist_df['src'] dsts = G.edgelist.edgelist_df['dst'] - # Weights are not currently supported. Create an edge_ids - # column of type same type as the vertices which will be - # ignored when computing the algo - # FIXME: Drop the edge_ids once weights are supported - edge_ids = cudf.Series(range(len(srcs)), dtype=srcs.dtype) + weights = G.edgelist.edgelist_df['weights'] + + if is_edge_ids and weights.dtype not in ['int32', 'in64']: + raise ValueError(f"Graph weights must have int32 or int64 values " + f"if they are edge ids, got: {weights.dtype}") if srcs.dtype != 'int32': raise ValueError(f"Graph vertices must have int32 values, " @@ -103,13 +108,13 @@ def uniform_neighbor_sample(G, renumber = False do_expensive_check = False - sg = SGGraph(resource_handle, graph_props, srcs, dsts, edge_ids, + sg = SGGraph(resource_handle, graph_props, srcs, dsts, weights, store_transposed, renumber, do_expensive_check) sources, destinations, indices = \ pylibcugraph_uniform_neighbor_sample(resource_handle, sg, start_list, fanout_vals, with_replacement, - do_expensive_check) + is_edge_ids, do_expensive_check) df = cudf.DataFrame() df["sources"] = sources diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py index 2bed6abc7b8..06e76bee49e 100644 --- a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py +++ b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py @@ -153,9 +153,8 @@ def __from_edgelist( source = renumber_map.renumbered_src_col_name destination = renumber_map.renumbered_dst_col_name # Use renumber_map to figure out if renumbering was skipped or not - # This was added to handle 'legacy_renum_only' which may - # will skip the old renumbering when running the - # pylibcugraph/C algos + # This was added to handle 'legacy_renum_only' which will skip the + # old C++ renumbering when running the pylibcugraph/C algos self.properties.renumbered = renumber_map.implementation.numbered self.renumber_map = renumber_map else: diff --git a/python/pylibcugraph/cufile.log b/python/pylibcugraph/cufile.log deleted file mode 100644 index fb60a50c9de..00000000000 --- a/python/pylibcugraph/cufile.log +++ /dev/null @@ -1,3 +0,0 @@ - 24-05-2022 11:57:57:955 [pid=3446 tid=3446] ERROR cufio-drv:632 nvidia-fs.ko driver not loaded - 24-05-2022 12:02:27:919 [pid=3468 tid=3468] ERROR cufio-drv:632 nvidia-fs.ko driver not loaded - 24-05-2022 12:12:09:520 [pid=13919 tid=13919] ERROR cufio-drv:632 nvidia-fs.ko driver not loaded diff --git a/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx b/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx index 6b6c468eccf..36ca14f71be 100644 --- a/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx +++ b/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx @@ -35,6 +35,7 @@ from pylibcugraph._cugraph_c.array cimport ( cugraph_type_erased_host_array_view_t, cugraph_type_erased_host_array_view_create, cugraph_type_erased_host_array_view_free, + cugraph_type_erased_device_array_view_type, ) from pylibcugraph._cugraph_c.graph cimport ( cugraph_graph_t, @@ -70,6 +71,7 @@ def EXPERIMENTAL__uniform_neighbor_sample(ResourceHandle resource_handle, start_list, h_fan_out, bool_t with_replacement, + bool_t is_edge_ids, bool_t do_expensive_check): """ Does neighborhood sampling, which samples nodes from a graph based on the @@ -94,6 +96,10 @@ def EXPERIMENTAL__uniform_neighbor_sample(ResourceHandle resource_handle, with_replacement: bool If true, sampling procedure is done with replacement (the same vertex can be selected multiple times in the same step). + + is_edge_ids: bool + Flag to specify if the weights were passed as edge_ids. + If true, the input graph's weight will be treated as edge ids do_expensive_check: bool If True, performs more extensive tests on the inputs to ensure @@ -144,7 +150,7 @@ def EXPERIMENTAL__uniform_neighbor_sample(ResourceHandle resource_handle, do_expensive_check, &result_ptr, &error_ptr) - assert_success(error_code, error_ptr, "uniform_nbr_sample") + assert_success(error_code, error_ptr, "cugraph_uniform_neighbor_sample") cdef cugraph_type_erased_device_array_view_t* src_ptr = \ cugraph_sample_result_get_sources(result_ptr) @@ -156,9 +162,13 @@ def EXPERIMENTAL__uniform_neighbor_sample(ResourceHandle resource_handle, cupy_sources = copy_to_cupy_array(c_resource_handle_ptr, src_ptr) cupy_destinations = copy_to_cupy_array(c_resource_handle_ptr, dst_ptr) - cupy_indices = copy_to_cupy_array_ids(c_resource_handle_ptr, index_ptr) - #print("indices are \n", cupy_indices) - #print("type is ", cupy_indices.dtype) + + + if is_edge_ids: + cupy_indices = copy_to_cupy_array_ids(c_resource_handle_ptr, index_ptr) + else: + cupy_indices = copy_to_cupy_array(c_resource_handle_ptr, index_ptr) + cugraph_sample_result_free(result_ptr) cugraph_type_erased_device_array_view_free(start_ptr) diff --git a/python/pylibcugraph/pylibcugraph/utils.pyx b/python/pylibcugraph/pylibcugraph/utils.pyx index 79d8ebe2579..af658c44b5a 100644 --- a/python/pylibcugraph/pylibcugraph/utils.pyx +++ b/python/pylibcugraph/pylibcugraph/utils.pyx @@ -81,8 +81,10 @@ cdef get_numpy_type_from_c_type(data_type_id_t c_type): elif c_type == data_type_id_t.INT64: return numpy.int64 elif c_type == data_type_id_t.FLOAT32: + print("This is a float32") return numpy.float32 elif c_type == data_type_id_t.FLOAT64: + print("this is a float64") return numpy.float64 else: raise RuntimeError("Internal error: got invalid data type enum value " From 6d91c3964af8858179254ad13910bae9a57ddcd4 Mon Sep 17 00:00:00 2001 From: Joseph Nke Date: Wed, 25 May 2022 22:08:51 -0700 Subject: [PATCH 13/21] remove debug prints, address PR comments --- .../dask/sampling/uniform_neighbor_sample.py | 14 ++++++++------ .../pylibcugraph/uniform_neighbor_sample.pyx | 5 +---- python/pylibcugraph/pylibcugraph/utils.pyx | 2 -- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py b/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py index 425ee7aaf45..3693bf4a0bd 100644 --- a/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py +++ b/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py @@ -18,12 +18,14 @@ import dask_cudf import cudf -from pylibcugraph.experimental import ( - MGGraph, - ResourceHandle, - GraphProperties, - uniform_neighbor_sample as pylibcugraph_uniform_neighbor_sample, - ) +from pylibcugraph import (ResourceHandle, + GraphProperties, + MGGraph + ) + +from pylibcugraph.experimental import \ + uniform_neighbor_sample as pylibcugraph_uniform_neighbor_sample + from cugraph.dask.common.input_utils import get_distributed_data from cugraph.dask.comms import comms as Comms diff --git a/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx b/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx index 36ca14f71be..50f3182ebf1 100644 --- a/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx +++ b/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx @@ -158,18 +158,15 @@ def EXPERIMENTAL__uniform_neighbor_sample(ResourceHandle resource_handle, cugraph_sample_result_get_destinations(result_ptr) cdef cugraph_type_erased_device_array_view_t* index_ptr = \ cugraph_sample_result_get_index(result_ptr) - - + cupy_sources = copy_to_cupy_array(c_resource_handle_ptr, src_ptr) cupy_destinations = copy_to_cupy_array(c_resource_handle_ptr, dst_ptr) - if is_edge_ids: cupy_indices = copy_to_cupy_array_ids(c_resource_handle_ptr, index_ptr) else: cupy_indices = copy_to_cupy_array(c_resource_handle_ptr, index_ptr) - cugraph_sample_result_free(result_ptr) cugraph_type_erased_device_array_view_free(start_ptr) cugraph_type_erased_host_array_view_free(fan_out_ptr) diff --git a/python/pylibcugraph/pylibcugraph/utils.pyx b/python/pylibcugraph/pylibcugraph/utils.pyx index af658c44b5a..79d8ebe2579 100644 --- a/python/pylibcugraph/pylibcugraph/utils.pyx +++ b/python/pylibcugraph/pylibcugraph/utils.pyx @@ -81,10 +81,8 @@ cdef get_numpy_type_from_c_type(data_type_id_t c_type): elif c_type == data_type_id_t.INT64: return numpy.int64 elif c_type == data_type_id_t.FLOAT32: - print("This is a float32") return numpy.float32 elif c_type == data_type_id_t.FLOAT64: - print("this is a float64") return numpy.float64 else: raise RuntimeError("Internal error: got invalid data type enum value " From f0386884c2b107e19ff6297bde841afa5133a799 Mon Sep 17 00:00:00 2001 From: Joseph Nke Date: Thu, 26 May 2022 21:15:50 -0700 Subject: [PATCH 14/21] move uniform_neighbor_sample to stable API, convert edge_ids to weights in cugraph instead of pylibcugraph --- python/cugraph/cugraph/dask/__init__.py | 2 - .../dask/sampling/neighborhood_sampling.py | 202 ------------------ .../dask/sampling/uniform_neighbor_sample.py | 32 ++- .../cugraph/experimental/dask/__init__.py | 11 - .../sampling/uniform_neighbor_sample.py | 18 +- .../tests/mg/test_mg_neighborhood_sampling.py | 22 +- python/pylibcugraph/pylibcugraph/__init__.py | 2 + .../pylibcugraph/experimental/__init__.py | 6 - python/pylibcugraph/pylibcugraph/graphs.pyx | 45 +--- .../tests/test_neighborhood_sampling.py | 2 +- .../pylibcugraph/uniform_neighbor_sample.pyx | 23 +- python/pylibcugraph/pylibcugraph/utils.pyx | 5 +- 12 files changed, 60 insertions(+), 310 deletions(-) delete mode 100644 python/cugraph/cugraph/dask/sampling/neighborhood_sampling.py diff --git a/python/cugraph/cugraph/dask/__init__.py b/python/cugraph/cugraph/dask/__init__.py index 48325e4eaf6..b621ae35a88 100644 --- a/python/cugraph/cugraph/dask/__init__.py +++ b/python/cugraph/cugraph/dask/__init__.py @@ -20,5 +20,3 @@ from .centrality.katz_centrality import katz_centrality from .components.connectivity import weakly_connected_components from .sampling.uniform_neighbor_sample import uniform_neighbor_sample -# FIXME: This call is deprecated and will be removed next release -from .sampling.neighborhood_sampling import neighborhood_sampling diff --git a/python/cugraph/cugraph/dask/sampling/neighborhood_sampling.py b/python/cugraph/cugraph/dask/sampling/neighborhood_sampling.py deleted file mode 100644 index cd87951eaec..00000000000 --- a/python/cugraph/cugraph/dask/sampling/neighborhood_sampling.py +++ /dev/null @@ -1,202 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy -from dask.distributed import wait, default_client - -import dask_cudf -import cudf - -from pylibcugraph.experimental import ( - MGGraph, - ResourceHandle, - GraphProperties, - neighborhood_sampling as pylibcugraph_neighborhood_sampling - ) - - -from cugraph.dask.common.input_utils import get_distributed_data -from cugraph.dask.comms import comms as Comms - - -def call_nbr_sampling(sID, - data, - src_col_name, - dst_col_name, - num_edges, - do_expensive_check, - start_list, - info_list, - h_fan_out, - with_replacement): - - # Preparation for graph creation - handle = Comms.get_handle(sID) - handle = ResourceHandle(handle.getHandle()) - graph_properties = GraphProperties(is_symmetric=False, is_multigraph=False) - srcs = data[0][src_col_name] - dsts = data[0][dst_col_name] - weights = None - if "value" in data[0].columns: - weights = data[0]['value'] - - store_transposed = False - - mg = MGGraph(handle, - graph_properties, - srcs, - dsts, - weights, - store_transposed, - num_edges, - do_expensive_check) - - ret_val = pylibcugraph_neighborhood_sampling(handle, - mg, - start_list, - info_list, - h_fan_out, - with_replacement, - do_expensive_check) - return ret_val - - -def convert_to_cudf(cp_arrays): - """ - Creates a cudf DataFrame from cupy arrays from pylibcugraph wrapper - """ - cupy_sources, cupy_destinations, cupy_labels, cupy_indices = cp_arrays - # cupy_sources, cupy_destinations, cupy_labels, cupy_indices, - # cupy_counts = cp_arrays - df = cudf.DataFrame() - df["sources"] = cupy_sources - df["destinations"] = cupy_destinations - df["labels"] = cupy_labels - df["indices"] = cupy_indices - # df["counts"] = cupy_counts - return df - - -def neighborhood_sampling(input_graph, - start_info_list, - fanout_vals, - with_replacement=True): - """ - Does neighborhood sampling, which samples nodes from a graph based on the - current node's neighbors, with a corresponding fanout value at each hop. - - Parameters - ---------- - input_graph : cugraph.Graph - cuGraph graph, which contains connectivity information as dask cudf - edge list dataframe - - start_info_list : tuple of list or cudf.Series (int32) - Tuple of a list of starting vertices for sampling, along with a - corresponding list of label for reorganizing results after sending - the input to different callers. - - fanout_vals : list (int32) - List of branching out (fan-out) degrees per starting vertex for each - hop level. - - with_replacement: bool, optional (default=True) - Flag to specify if the random sampling is done with replacement - - Returns - ------- - result : dask_cudf.DataFrame - GPU data frame containing two dask_cudf.Series - - ddf['sources']: dask_cudf.Series - Contains the source vertices from the sampling result - ddf['destinations']: dask_cudf.Series - Contains the destination vertices from the sampling result - ddf['labels']: dask_cudf.Series - Contains the start labels from the sampling result - ddf['indices']: dask_cudf.Series - Contains the indices from the sampling result for path - reconstruction - """ - # Initialize dask client - client = default_client() - # FIXME: 'legacy_renum_only' will not trigger the C++ renumbering - # In the future, once all the algos follow the C/Pylibcugraph path, - # compute_renumber_edge_list will only be used for multicolumn and - # string vertices since the renumbering will be done in pylibcugraph - input_graph.compute_renumber_edge_list( - transposed=False, legacy_renum_only=True) - - start_list, info_list = start_info_list - - if isinstance(start_list, list): - start_list = cudf.Series(start_list) - if start_list.dtype != 'int32': - raise ValueError(f"'start_list' must have int32 values, " - f"got: {start_list.dtype}") - if isinstance(info_list, list): - info_list = cudf.Series(info_list) - if info_list.dtype != 'int32': - raise ValueError(f"'info_list' must have int32 values, " - f"got: {info_list.dtype}") - # fanout_vals must be a host array! - # FIXME: ensure other sequence types (eg. cudf Series) can be handled. - if isinstance(fanout_vals, list): - fanout_vals = numpy.asarray(fanout_vals, dtype="int32") - else: - raise TypeError("fanout_vals must be a list, " - f"got: {type(fanout_vals)}") - - ddf = input_graph.edgelist.edgelist_df - num_edges = len(ddf) - data = get_distributed_data(ddf) - - src_col_name = input_graph.renumber_map.renumbered_src_col_name - dst_col_name = input_graph.renumber_map.renumbered_dst_col_name - - # start_list uses "external" vertex IDs, but if the graph has been - # renumbered, the start vertex IDs must also be renumbered. - if input_graph.renumbered: - start_list = input_graph.lookup_internal_vertex_id( - start_list).compute() - do_expensive_check = True - - result = [client.submit(call_nbr_sampling, - Comms.get_session_id(), - wf[1], - src_col_name, - dst_col_name, - num_edges, - do_expensive_check, - start_list, - info_list, - fanout_vals, - with_replacement, - workers=[wf[0]]) - for idx, wf in enumerate(data.worker_to_parts.items())] - - wait(result) - - cudf_result = [client.submit(convert_to_cudf, - cp_arrays) - for cp_arrays in result] - - wait(cudf_result) - - ddf = dask_cudf.from_delayed(cudf_result) - if input_graph.renumbered: - ddf = input_graph.unrenumber(ddf, "sources") - ddf = input_graph.unrenumber(ddf, "destinations") - - return ddf diff --git a/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py b/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py index 3693bf4a0bd..58a59a4b56a 100644 --- a/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py +++ b/python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py @@ -23,7 +23,7 @@ MGGraph ) -from pylibcugraph.experimental import \ +from pylibcugraph import \ uniform_neighbor_sample as pylibcugraph_uniform_neighbor_sample from cugraph.dask.common.input_utils import get_distributed_data @@ -38,8 +38,7 @@ def call_nbr_sampling(sID, do_expensive_check, start_list, h_fan_out, - with_replacement, - is_edge_ids): + with_replacement): # Preparation for graph creation handle = Comms.get_handle(sID) @@ -67,12 +66,11 @@ def call_nbr_sampling(sID, start_list, h_fan_out, with_replacement, - is_edge_ids, do_expensive_check) return ret_val -def convert_to_cudf(cp_arrays): +def convert_to_cudf(cp_arrays, weight_t): """ Creates a cudf DataFrame from cupy arrays from pylibcugraph wrapper """ @@ -82,14 +80,19 @@ def convert_to_cudf(cp_arrays): df["sources"] = cupy_sources df["destinations"] = cupy_destinations df["indices"] = cupy_indices + + if weight_t == "int32": + df.indices = df.indices.astype("int32") + elif weight_t == "int64": + df.indices = df.indices.astype("int64") + return df def uniform_neighbor_sample(input_graph, start_list, fanout_vals, - with_replacement=True, - is_edge_ids=False): + with_replacement=True): """ Does neighborhood sampling, which samples nodes from a graph based on the current node's neighbors, with a corresponding fanout value at each hop. @@ -110,9 +113,6 @@ def uniform_neighbor_sample(input_graph, with_replacement: bool, optional (default=True) Flag to specify if the random sampling is done with replacement - is_edge_ids: bool, (default=False) - Flag to specify if the weights were passed as edge_ids. - If true, the input graph's weight will be treated as edge ids Returns ------- @@ -153,16 +153,15 @@ def uniform_neighbor_sample(input_graph, raise TypeError("fanout_vals must be a list, " f"got: {type(fanout_vals)}") - # FIXME: Add graph property for multigraph ddf = input_graph.edgelist.edgelist_df src_col_name = input_graph.renumber_map.renumbered_src_col_name dst_col_name = input_graph.renumber_map.renumbered_dst_col_name weight_t = ddf["value"].dtype - - if is_edge_ids and weight_t not in ["int32", "in64"]: - raise ValueError(f"Graph weights must have int32 or int64 values " - f"if they are edge ids, got: {weight_t}") + if weight_t == "int32": + ddf = ddf.astype({"value": "float32"}) + elif weight_t == "int64": + ddf = ddf.astype({"value": "float64"}) num_edges = len(ddf) data = get_distributed_data(ddf) @@ -184,14 +183,13 @@ def uniform_neighbor_sample(input_graph, start_list, fanout_vals, with_replacement, - is_edge_ids, workers=[wf[0]]) for idx, wf in enumerate(data.worker_to_parts.items())] wait(result) cudf_result = [client.submit(convert_to_cudf, - cp_arrays) + cp_arrays, weight_t) for cp_arrays in result] wait(cudf_result) diff --git a/python/cugraph/cugraph/experimental/dask/__init__.py b/python/cugraph/cugraph/experimental/dask/__init__.py index 4e8ef08736c..b04c7e4b5f5 100644 --- a/python/cugraph/cugraph/experimental/dask/__init__.py +++ b/python/cugraph/cugraph/experimental/dask/__init__.py @@ -10,14 +10,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from cugraph.utilities.api_tools import experimental_warning_wrapper - -from cugraph.dask.sampling.uniform_neighbor_sample import \ - uniform_neighbor_sample -uniform_neighbor_sample = \ - experimental_warning_wrapper(uniform_neighbor_sample) - -# FIXME: This call is deprecated and will be removed in the next release -from cugraph.dask.sampling.neighborhood_sampling import \ - neighborhood_sampling diff --git a/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py b/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py index 9b31575066c..cd9d9c47fce 100644 --- a/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py +++ b/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py @@ -15,7 +15,7 @@ GraphProperties, SGGraph, ) -from pylibcugraph.experimental import uniform_neighbor_sample as \ +from pylibcugraph import uniform_neighbor_sample as \ pylibcugraph_uniform_neighbor_sample import numpy @@ -93,10 +93,12 @@ def uniform_neighbor_sample(G, srcs = G.edgelist.edgelist_df['src'] dsts = G.edgelist.edgelist_df['dst'] weights = G.edgelist.edgelist_df['weights'] + weight_t = weights.dtype - if is_edge_ids and weights.dtype not in ['int32', 'in64']: - raise ValueError(f"Graph weights must have int32 or int64 values " - f"if they are edge ids, got: {weights.dtype}") + if weight_t == "int32": + weights = weights.astype("float32") + if weight_t == "int64": + weights = weights.astype("float64") if srcs.dtype != 'int32': raise ValueError(f"Graph vertices must have int32 values, " @@ -114,12 +116,18 @@ def uniform_neighbor_sample(G, sources, destinations, indices = \ pylibcugraph_uniform_neighbor_sample(resource_handle, sg, start_list, fanout_vals, with_replacement, - is_edge_ids, do_expensive_check) + do_expensive_check) df = cudf.DataFrame() df["sources"] = sources df["destinations"] = destinations df["indices"] = indices + if weight_t == "int32": + df["indices"] = indices.astype("int32") + elif weight_t == "int64": + df["indices"] = indices.astype("int64") + else: + df["indices"] = indices if G.renumbered: df = G.unrenumber(df, "sources", preserve_order=True) diff --git a/python/cugraph/cugraph/tests/mg/test_mg_neighborhood_sampling.py b/python/cugraph/cugraph/tests/mg/test_mg_neighborhood_sampling.py index 29810fd4ad4..403adf68c13 100644 --- a/python/cugraph/cugraph/tests/mg/test_mg_neighborhood_sampling.py +++ b/python/cugraph/cugraph/tests/mg/test_mg_neighborhood_sampling.py @@ -53,7 +53,7 @@ def _get_param_args(param_name, param_values): @pytest.mark.parametrize("directed", IS_DIRECTED) def test_mg_neighborhood_sampling_simple(dask_client, directed): - from cugraph.experimental.dask import uniform_neighborhood_sampling + from cugraph.dask import uniform_neighbor_sample df = cudf.DataFrame({"src": cudf.Series([0, 1, 1, 2, 2, 2, 3, 4], dtype="int32"), @@ -71,13 +71,12 @@ def test_mg_neighborhood_sampling_simple(dask_client, directed): # TODO: Incomplete, include more testing for tree graph as well as # for larger graphs start_list = cudf.Series([0, 1], dtype="int32") - info_list = cudf.Series([0, 0], dtype="int32") fanout_vals = [1, 1] with_replacement = True - result_nbr = uniform_neighborhood_sampling(G, - (start_list, info_list), - fanout_vals, - with_replacement) + result_nbr = uniform_neighbor_sample(G, + start_list, + fanout_vals, + with_replacement) result_nbr = result_nbr.compute() # Since the validity of results have (probably) been tested at both the C++ @@ -99,7 +98,7 @@ def test_mg_neighborhood_sampling_simple(dask_client, directed): @pytest.mark.skip(reason="Currently hangs, awaiting fix in algo") def test_mg_neighborhood_sampling_tree(dask_client, directed): - from cugraph.experimental.dask import uniform_neighborhood_sampling + from cugraph.dask import uniform_neighbor_sample input_data_path = (utils.RAPIDS_DATASET_ROOT_DIR_PATH / "small_tree.csv").as_posix() @@ -119,13 +118,12 @@ def test_mg_neighborhood_sampling_tree(dask_client, directed): # TODO: Incomplete, include more testing for tree graph as well as # for larger graphs start_list = cudf.Series([0, 0], dtype="int32") - info_list = cudf.Series([0, 0], dtype="int32") fanout_vals = [4, 1, 3] with_replacement = True - result_nbr = uniform_neighborhood_sampling(G, - (start_list, info_list), - fanout_vals, - with_replacement) + result_nbr = uniform_neighbor_sample(G, + start_list, + fanout_vals, + with_replacement) result_nbr = result_nbr.compute() # Since the validity of results have (probably) been tested at both the C++ diff --git a/python/pylibcugraph/pylibcugraph/__init__.py b/python/pylibcugraph/pylibcugraph/__init__.py index 0a5bdc7cb66..f9dc4e1992f 100644 --- a/python/pylibcugraph/pylibcugraph/__init__.py +++ b/python/pylibcugraph/pylibcugraph/__init__.py @@ -36,3 +36,5 @@ from pylibcugraph.node2vec import node2vec from pylibcugraph.bfs import bfs + +from pylibcugraph.uniform_neighbor_sample import uniform_neighbor_sample diff --git a/python/pylibcugraph/pylibcugraph/experimental/__init__.py b/python/pylibcugraph/pylibcugraph/experimental/__init__.py index 2961ea3f962..fa0a975885f 100644 --- a/python/pylibcugraph/pylibcugraph/experimental/__init__.py +++ b/python/pylibcugraph/pylibcugraph/experimental/__init__.py @@ -66,11 +66,5 @@ from pylibcugraph.node2vec import node2vec node2vec = promoted_experimental_warning_wrapper(node2vec) -from pylibcugraph.uniform_neighbor_sample import EXPERIMENTAL__uniform_neighbor_sample -uniform_neighbor_sample = experimental_warning_wrapper(EXPERIMENTAL__uniform_neighbor_sample) - -# FIXME: This call is deprecated and will be removed in the next release -from pylibcugraph.neighborhood_sampling import neighborhood_sampling - from pylibcugraph.katz_centrality import EXPERIMENTAL__katz_centrality katz_centrality = experimental_warning_wrapper(EXPERIMENTAL__katz_centrality) diff --git a/python/pylibcugraph/pylibcugraph/graphs.pyx b/python/pylibcugraph/pylibcugraph/graphs.pyx index 574aac9bc5d..212e50863ec 100644 --- a/python/pylibcugraph/pylibcugraph/graphs.pyx +++ b/python/pylibcugraph/pylibcugraph/graphs.pyx @@ -15,7 +15,6 @@ # cython: language_level = 3 from libc.stdint cimport uintptr_t -import numpy from pylibcugraph._cugraph_c.resource_handle cimport ( bool_t, @@ -55,7 +54,6 @@ from pylibcugraph.utils cimport ( assert_success, assert_CAI_type, get_c_type_from_numpy_type, - get_c_weight_type_from_numpy_edge_ids_type, ) @@ -166,22 +164,11 @@ cdef class SGGraph(_GPUGraph): cdef uintptr_t cai_weights_ptr = \ weight_array.__cuda_array_interface__["data"][0] - - cdef cugraph_type_erased_device_array_view_t* weights_view_ptr - - if weight_array.dtype in [numpy.int32, numpy.int64]: - weights_view_ptr = \ - cugraph_type_erased_device_array_view_create( - cai_weights_ptr, - len(weight_array), - get_c_weight_type_from_numpy_edge_ids_type( - weight_array.dtype)) - else: - weights_view_ptr = \ - cugraph_type_erased_device_array_view_create( - cai_weights_ptr, - len(weight_array), - get_c_type_from_numpy_type(weight_array.dtype)) + cdef cugraph_type_erased_device_array_view_t* weights_view_ptr = \ + cugraph_type_erased_device_array_view_create( + cai_weights_ptr, + len(weight_array), + get_c_type_from_numpy_type(weight_array.dtype)) error_code = cugraph_sg_graph_create( resource_handle.c_resource_handle_ptr, @@ -297,21 +284,11 @@ cdef class MGGraph(_GPUGraph): cdef uintptr_t cai_weights_ptr = \ weight_array.__cuda_array_interface__["data"][0] - cdef cugraph_type_erased_device_array_view_t* weights_view_ptr - - if weight_array.dtype in [numpy.int32, numpy.int64]: - weights_view_ptr = \ - cugraph_type_erased_device_array_view_create( - cai_weights_ptr, - len(weight_array), - get_c_weight_type_from_numpy_edge_ids_type( - weight_array.dtype)) - else: - weights_view_ptr = \ - cugraph_type_erased_device_array_view_create( - cai_weights_ptr, - len(weight_array), - get_c_type_from_numpy_type(weight_array.dtype)) + cdef cugraph_type_erased_device_array_view_t* weights_view_ptr = \ + cugraph_type_erased_device_array_view_create( + cai_weights_ptr, + len(weight_array), + get_c_type_from_numpy_type(weight_array.dtype)) error_code = cugraph_mg_graph_create( resource_handle.c_resource_handle_ptr, @@ -334,4 +311,4 @@ cdef class MGGraph(_GPUGraph): def __dealloc__(self): if self.c_graph_ptr is not NULL: - cugraph_mg_graph_free(self.c_graph_ptr) + cugraph_mg_graph_free(self.c_graph_ptr) \ No newline at end of file diff --git a/python/pylibcugraph/pylibcugraph/tests/test_neighborhood_sampling.py b/python/pylibcugraph/pylibcugraph/tests/test_neighborhood_sampling.py index 1afb44b1a99..a6b5bea1c5f 100644 --- a/python/pylibcugraph/pylibcugraph/tests/test_neighborhood_sampling.py +++ b/python/pylibcugraph/pylibcugraph/tests/test_neighborhood_sampling.py @@ -19,7 +19,7 @@ ResourceHandle, GraphProperties, ) -from pylibcugraph.experimental import uniform_neighbor_sample +from pylibcugraph import uniform_neighbor_sample # ============================================================================= diff --git a/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx b/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx index 50f3182ebf1..4ed412a3127 100644 --- a/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx +++ b/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx @@ -66,13 +66,12 @@ from pylibcugraph.utils cimport ( ) -def EXPERIMENTAL__uniform_neighbor_sample(ResourceHandle resource_handle, - _GPUGraph input_graph, - start_list, - h_fan_out, - bool_t with_replacement, - bool_t is_edge_ids, - bool_t do_expensive_check): +def uniform_neighbor_sample(ResourceHandle resource_handle, + _GPUGraph input_graph, + start_list, + h_fan_out, + bool_t with_replacement, + bool_t do_expensive_check): """ Does neighborhood sampling, which samples nodes from a graph based on the current node's neighbors, with a corresponding fanout value at each hop. @@ -96,10 +95,6 @@ def EXPERIMENTAL__uniform_neighbor_sample(ResourceHandle resource_handle, with_replacement: bool If true, sampling procedure is done with replacement (the same vertex can be selected multiple times in the same step). - - is_edge_ids: bool - Flag to specify if the weights were passed as edge_ids. - If true, the input graph's weight will be treated as edge ids do_expensive_check: bool If True, performs more extensive tests on the inputs to ensure @@ -161,11 +156,7 @@ def EXPERIMENTAL__uniform_neighbor_sample(ResourceHandle resource_handle, cupy_sources = copy_to_cupy_array(c_resource_handle_ptr, src_ptr) cupy_destinations = copy_to_cupy_array(c_resource_handle_ptr, dst_ptr) - - if is_edge_ids: - cupy_indices = copy_to_cupy_array_ids(c_resource_handle_ptr, index_ptr) - else: - cupy_indices = copy_to_cupy_array(c_resource_handle_ptr, index_ptr) + cupy_indices = copy_to_cupy_array(c_resource_handle_ptr, index_ptr) cugraph_sample_result_free(result_ptr) cugraph_type_erased_device_array_view_free(start_ptr) diff --git a/python/pylibcugraph/pylibcugraph/utils.pyx b/python/pylibcugraph/pylibcugraph/utils.pyx index 79d8ebe2579..8ae9e680c5d 100644 --- a/python/pylibcugraph/pylibcugraph/utils.pyx +++ b/python/pylibcugraph/pylibcugraph/utils.pyx @@ -154,14 +154,11 @@ cdef copy_to_cupy_array( cdef copy_to_cupy_array_ids( cugraph_resource_handle_t* c_resource_handle_ptr, cugraph_type_erased_device_array_view_t* device_array_view_ptr): - # FIXME: Update this docstring """ Copy the contents from a device array view as returned by various cugraph_* APIs to a new cupy device array, typically intended to be used as a return - value from pylibcugraph APIs. + value from pylibcugraph APIs then convert float to int """ - - cdef c_type = cugraph_type_erased_device_array_view_type( device_array_view_ptr) From d97ce6721e0dda8b30a2e2f555ce75153f7d06ac Mon Sep 17 00:00:00 2001 From: Joseph Nke Date: Mon, 30 May 2022 01:06:33 -0700 Subject: [PATCH 15/21] update uniform neighborhood sampling tests --- .../sampling/uniform_neighbor_sample.py | 4 - .../tests/mg/test_mg_neighborhood_sampling.py | 189 ++++++++++----- .../tests/test_neighborhood_sampling.py | 218 ++++++++++++++++++ .../pylibcugraph/neighborhood_sampling.pyx | 184 --------------- 4 files changed, 345 insertions(+), 250 deletions(-) create mode 100644 python/cugraph/cugraph/tests/test_neighborhood_sampling.py delete mode 100644 python/pylibcugraph/pylibcugraph/neighborhood_sampling.pyx diff --git a/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py b/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py index cd9d9c47fce..21125601dcb 100644 --- a/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py +++ b/python/cugraph/cugraph/sampling/uniform_neighbor_sample.py @@ -48,10 +48,6 @@ def uniform_neighbor_sample(G, with_replacement: bool, optional (default=True) Flag to specify if the random sampling is done with replacement - is_edge_ids: bool, (default=False) - Flag to specify if the input graph's weights were passed as edge_ids. - If true, the input graph's weight will be treated as edge ids - Returns ------- result : cudf.DataFrame diff --git a/python/cugraph/cugraph/tests/mg/test_mg_neighborhood_sampling.py b/python/cugraph/cugraph/tests/mg/test_mg_neighborhood_sampling.py index 403adf68c13..ebaf6a8880f 100644 --- a/python/cugraph/cugraph/tests/mg/test_mg_neighborhood_sampling.py +++ b/python/cugraph/cugraph/tests/mg/test_mg_neighborhood_sampling.py @@ -10,96 +10,145 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import pandas as pd import gc import pytest import cugraph.dask as dcg import cugraph import dask_cudf import cudf -from cugraph.dask.common.mg_utils import is_single_gpu from cugraph.testing import utils +from cugraph.dask import uniform_neighbor_sample +import random # ============================================================================= -# Test helpers +# Pytest Setup / Teardown - called for each test function # ============================================================================= def setup_function(): gc.collect() +# ============================================================================= +# Pytest fixtures +# ============================================================================= IS_DIRECTED = [True, False] +# FIXME: Do more testing for this datasets +# [utils.RAPIDS_DATASET_ROOT_DIR_PATH/"email-Eu-core.csv"] +datasets = utils.DATASETS_UNDIRECTED + +fixture_params = utils.genFixtureParamsProduct( + (datasets, "graph_file"), + (IS_DIRECTED, "directed"), + ([False, True], "with_replacement"), + (["int32", "float32"], "indices_type") + ) -# datasets = utils.RAPIDS_DATASET_ROOT_DIR_PATH/"karate.csv" -datasets = utils.DATASETS_SMALL -fixture_params = utils.genFixtureParamsProduct((datasets, "graph_file")) - - -def _get_param_args(param_name, param_values): +@pytest.fixture(scope="module", params=fixture_params) +def input_combo(request): """ - Returns a tuple of (, ) which can be applied - as the args to pytest.mark.parametrize(). The pytest.param list also - contains param id string formed from the param name and values. + Simply return the current combination of params as a dictionary for use in + tests or other parameterized fixtures. """ - return (param_name, - [pytest.param(v, id=f"{param_name}={v}") for v in param_values]) + parameters = dict(zip(("graph_file", + "directed", + "with_replacement", + "indices_type"), request.param)) + indices_type = parameters["indices_type"] -# @pytest.mark.skipif( -# is_single_gpu(), reason="skipping MG testing on Single GPU system" -# ) -@pytest.mark.skip(reason="Currently hangs, awaiting fix in algo") -@pytest.mark.parametrize("directed", IS_DIRECTED) -def test_mg_neighborhood_sampling_simple(dask_client, directed): + input_data_path = parameters["graph_file"] + directed = parameters["directed"] - from cugraph.dask import uniform_neighbor_sample + chunksize = dcg.get_chunksize(input_data_path) + ddf = dask_cudf.read_csv( + input_data_path, + chunksize=chunksize, + delimiter=" ", + names=["src", "dst", "value"], + dtype=["int32", "int32", indices_type], + ) - df = cudf.DataFrame({"src": cudf.Series([0, 1, 1, 2, 2, 2, 3, 4], - dtype="int32"), - "dst": cudf.Series([1, 3, 4, 0, 1, 3, 5, 5], - dtype="int32"), - "value": cudf.Series([0.1, 2.1, 1.1, 5.1, 3.1, - 4.1, 7.2, 3.2], - dtype="float32"), - }) - ddf = dask_cudf.from_cudf(df, npartitions=2) + dg = cugraph.Graph(directed=directed) + dg.from_dask_cudf_edgelist( + ddf, source='src', destination='dst', edge_attr='value') - G = cugraph.Graph(directed=directed) - G.from_dask_cudf_edgelist(ddf, "src", "dst", "value") + parameters["MGGraph"] = dg - # TODO: Incomplete, include more testing for tree graph as well as - # for larger graphs - start_list = cudf.Series([0, 1], dtype="int32") - fanout_vals = [1, 1] - with_replacement = True - result_nbr = uniform_neighbor_sample(G, - start_list, - fanout_vals, - with_replacement) - result_nbr = result_nbr.compute() + # sample k vertices from the cuGraph graph + k = random.randint(1, 10) + srcs = dg.input_df["src"] + dsts = dg.input_df["dst"] - # Since the validity of results have (probably) been tested at both the C++ - # and C layers, simply test that the python interface and conversions were - # done correctly. - assert result_nbr['sources'].dtype == "int32" - assert result_nbr['destinations'].dtype == "int32" - assert result_nbr['labels'].dtype == "int32" - assert result_nbr['indices'].dtype == "int32" + vertices = dask_cudf.concat([srcs, dsts]).drop_duplicates().compute() + start_list = vertices.sample(k) + + # Generate a random fanout_vals list of length k + fanout_vals = [random.randint(1, k) for _ in range(k)] + + # These prints are for debugging purposes since the vertices and the + # fanout_vals are randomly sampled/chosen + print("start_list: \n", start_list) + print("fanout_vals: ", fanout_vals) + + parameters["start_list"] = start_list + parameters["fanout_vals"] = fanout_vals + + return parameters + + +def test_mg_neighborhood_sampling_simple(dask_client, input_combo): + + dg = input_combo["MGGraph"] - # ALl labels should be 0 or 1 - assert result_nbr['labels'].isin([0, 1]).all() + input_df = dg.input_df + result_nbr = uniform_neighbor_sample(dg, + input_combo["start_list"], + input_combo["fanout_vals"], + input_combo["with_replacement"]) + + # multi edges are dropped to easily verify that each edge in the + # results is present in the input dataframe + result_nbr = result_nbr.drop_duplicates() + + # FIXME: The indices are not included in the comparison because garbage + # value are intermittently retuned. This observation is observed when + # passing float weights + join = result_nbr.merge( + input_df, left_on=[*result_nbr.columns[:2]], + right_on=[*input_df.columns[:2]]) + if len(result_nbr) != len(join): + join2 = input_df.merge( + result_nbr, how='left', left_on=[*input_df.columns], + right_on=[*result_nbr.columns]) + pd.set_option('display.max_rows', 500) + print('df1 = \n', input_df.sort_values([*input_df.columns])) + print('df2 = \n', result_nbr.sort_values( + [*result_nbr.columns]).compute()) + print('join2 = \n', join2.sort_values( + [*input_df.columns]).compute().to_pandas().query( + 'sources.isnull()', engine='python')) + + assert len(join) == len(result_nbr) + # Ensure the right indices type is returned + assert result_nbr['indices'].dtype == input_combo["indices_type"] + + start_list = input_combo["start_list"].to_pandas() + result_nbr_vertices = dask_cudf.concat( + [result_nbr["sources"], result_nbr["destinations"]]). \ + drop_duplicates().compute().reset_index(drop=True) + + result_nbr_vertices = result_nbr_vertices.to_pandas() + + # The vertices in start_list must be a subsets of the vertices + # in the result + assert set(start_list).issubset(set(result_nbr_vertices)) -@pytest.mark.skipif( - is_single_gpu(), reason="skipping MG testing on Single GPU system" -) @pytest.mark.parametrize("directed", IS_DIRECTED) -@pytest.mark.skip(reason="Currently hangs, awaiting fix in algo") def test_mg_neighborhood_sampling_tree(dask_client, directed): - from cugraph.dask import uniform_neighbor_sample - input_data_path = (utils.RAPIDS_DATASET_ROOT_DIR_PATH / "small_tree.csv").as_posix() chunksize = dcg.get_chunksize(input_data_path) @@ -124,15 +173,31 @@ def test_mg_neighborhood_sampling_tree(dask_client, directed): start_list, fanout_vals, with_replacement) - result_nbr = result_nbr.compute() + result_nbr = result_nbr.drop_duplicates() + + # input_df != ddf if 'directed = False' because ddf will be symmetrized + # internally. + input_df = G.input_df + join = result_nbr.merge( + input_df, left_on=[*result_nbr.columns[:2]], + right_on=[*input_df.columns[:2]]) + + assert len(join) == len(result_nbr) # Since the validity of results have (probably) been tested at both the C++ # and C layers, simply test that the python interface and conversions were # done correctly. assert result_nbr['sources'].dtype == "int32" assert result_nbr['destinations'].dtype == "int32" - assert result_nbr['labels'].dtype == "int32" - assert result_nbr['indices'].dtype == "int32" + assert result_nbr['indices'].dtype == "float32" + + result_nbr_vertices = dask_cudf.concat( + [result_nbr["sources"], result_nbr["destinations"]]). \ + drop_duplicates().compute().reset_index(drop=True) + + result_nbr_vertices = result_nbr_vertices.to_pandas() + start_list = start_list.to_pandas() - # All labels should be 0 - assert (result_nbr['labels'] == 0).all() + # The vertices in start_list must be a subsets of the vertices + # in the result + assert set(start_list).issubset(set(result_nbr_vertices)) diff --git a/python/cugraph/cugraph/tests/test_neighborhood_sampling.py b/python/cugraph/cugraph/tests/test_neighborhood_sampling.py new file mode 100644 index 00000000000..03ab70d4f3a --- /dev/null +++ b/python/cugraph/cugraph/tests/test_neighborhood_sampling.py @@ -0,0 +1,218 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pandas as pd +import gc +import pytest +import cugraph +import cudf +from cugraph.testing import utils +from cugraph import uniform_neighbor_sample +import random + + +# ============================================================================= +# Pytest Setup / Teardown - called for each test function +# ============================================================================= +def setup_function(): + gc.collect() + + +# ============================================================================= +# Pytest fixtures +# ============================================================================= +IS_DIRECTED = [True, False] +# FIXME: Do more testing for this datasets +# [utils.RAPIDS_DATASET_ROOT_DIR_PATH/"email-Eu-core.csv"] +datasets = utils.DATASETS_UNDIRECTED + +fixture_params = utils.genFixtureParamsProduct( + (datasets, "graph_file"), + (IS_DIRECTED, "directed"), + ([False, True], "with_replacement"), + (["int32", "float32"], "indices_type") + ) + + +@pytest.fixture(scope="module", params=fixture_params) +def input_combo(request): + """ + Simply return the current combination of params as a dictionary for use in + tests or other parameterized fixtures. + """ + parameters = dict(zip(("graph_file", + "directed", + "with_replacement", + "indices_type"), request.param)) + + indices_type = parameters["indices_type"] + + input_data_path = parameters["graph_file"] + directed = parameters["directed"] + + df = cudf.read_csv( + input_data_path, + delimiter=" ", + names=["src", "dst", "value"], + dtype=["int32", "int32", indices_type], + ) + + G = cugraph.Graph(directed=directed) + G.from_cudf_edgelist( + df, source='src', destination='dst', + edge_attr='value', legacy_renum_only=True) + + parameters["Graph"] = G + + # sample k vertices from the cuGraph graph + k = random.randint(1, 10) + srcs = G.view_edge_list()["src"] + dsts = G.view_edge_list()["dst"] + + vertices = cudf.concat([srcs, dsts]).drop_duplicates() + + start_list = vertices.sample(k) + # Generate a random fanout_vals list of length k + fanout_vals = [random.randint(1, k) for _ in range(k)] + + # These prints are for debugging purposes since the vertices and + # the fanout_vals are randomly sampled/chosen + print("start_list: \n", start_list) + print("fanout_vals: ", fanout_vals) + + parameters["start_list"] = start_list + parameters["fanout_vals"] = fanout_vals + + return parameters + + +def test_neighborhood_sampling_simple(input_combo): + + G = input_combo["Graph"] + + # + # Make sure the old C++ renumbering was skipped because: + # 1) Pylibcugraph already does renumbering + # 2) Uniform neighborhood sampling allows int32 weights + # which are not supported by the C++ renumbering + # This should be 'True' only for string vertices and multi columns vertices + # + + assert G.renumbered is False + # Retrieve the input dataframe. + # FIXME: in simpleGraph and simpleDistributedGraph, G.edgelist.edgelist_df + # should be 'None' if the datasets was never renumbered + input_df = G.edgelist.edgelist_df + + result_nbr = uniform_neighbor_sample(G, + input_combo["start_list"], + input_combo["fanout_vals"], + input_combo["with_replacement"]) + + # multi edges are dropped to easily verify that each edge in the + # results is present in the input dataframe + result_nbr = result_nbr.drop_duplicates() + + # FIXME: The indices are not included in the comparison because garbage + # value are intermittently retuned. This observation is observed + # when passing float weights + join = result_nbr.merge( + input_df, left_on=[*result_nbr.columns[:2]], + right_on=[*input_df.columns[:2]]) + + if len(result_nbr) != len(join): + join2 = input_df.merge( + result_nbr, how='left', left_on=[*input_df.columns], + right_on=[*result_nbr.columns]) + + pd.set_option('display.max_rows', 500) + print('df1 = \n', input_df.sort_values([*input_df.columns])) + print('df2 = \n', result_nbr.sort_values([*result_nbr.columns])) + print('join2 = \n', join2.sort_values([*input_df.columns]) + .to_pandas().query('sources.isnull()', engine='python')) + + assert len(join) == len(result_nbr) + # Ensure the right indices type is returned + assert result_nbr['indices'].dtype == input_combo["indices_type"] + + start_list = input_combo["start_list"] + + result_nbr_vertices = cudf.concat( + [result_nbr["sources"], result_nbr["destinations"]]) \ + .drop_duplicates().reset_index(drop=True) + + assert set( + start_list.to_pandas()).issubset( + set(result_nbr_vertices.to_pandas())) + + +@pytest.mark.parametrize("directed", IS_DIRECTED) +def test_mg_neighborhood_sampling_tree(directed): + + input_data_path = (utils.RAPIDS_DATASET_ROOT_DIR_PATH / + "small_tree.csv").as_posix() + + df = cudf.read_csv( + input_data_path, + delimiter=" ", + names=["src", "dst", "value"], + dtype=["int32", "int32", "float32"], + ) + + G = cugraph.Graph(directed=directed) + G.from_cudf_edgelist(df, "src", "dst", "value", legacy_renum_only=True) + + # + # Make sure the old C++ renumbering was skipped because: + # 1) Pylibcugraph already does renumbering + # 2) Uniform neighborhood sampling allows int32 weights + # which are not supported by the C++ renumbering + # This should be 'True' only for string vertices and multi columns vertices + # + + assert G.renumbered is False + + # Retrieve the input dataframe. + # input_df != df if 'directed = False' because df will be symmetrized + # internally. + input_df = G.edgelist.edgelist_df + + # TODO: Incomplete, include more testing for tree graph as well as + # for larger graphs + start_list = cudf.Series([0, 0], dtype="int32") + fanout_vals = [4, 1, 3] + with_replacement = True + result_nbr = uniform_neighbor_sample(G, + start_list, + fanout_vals, + with_replacement) + + result_nbr = result_nbr.drop_duplicates() + + join = result_nbr.merge( + input_df, left_on=[*result_nbr.columns[:2]], + right_on=[*input_df.columns[:2]]) + + assert len(join) == len(result_nbr) + # Since the validity of results have (probably) been tested at both the C++ + # and C layers, simply test that the python interface and conversions were + # done correctly. + assert result_nbr['sources'].dtype == "int32" + assert result_nbr['destinations'].dtype == "int32" + assert result_nbr['indices'].dtype == "float32" + + result_nbr_vertices = cudf.concat( + [result_nbr["sources"], result_nbr["destinations"]]). \ + drop_duplicates().reset_index(drop=True) + + assert set( + start_list.to_pandas()).issubset(set(result_nbr_vertices.to_pandas())) diff --git a/python/pylibcugraph/pylibcugraph/neighborhood_sampling.pyx b/python/pylibcugraph/pylibcugraph/neighborhood_sampling.pyx deleted file mode 100644 index 73e51720532..00000000000 --- a/python/pylibcugraph/pylibcugraph/neighborhood_sampling.pyx +++ /dev/null @@ -1,184 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Have cython use python 3 syntax -# cython: language_level = 3 - -import warnings -from libc.stdint cimport uintptr_t - -from pylibcugraph._cugraph_c.resource_handle cimport ( - bool_t, - data_type_id_t, - cugraph_resource_handle_t, -) -from pylibcugraph._cugraph_c.error cimport ( - cugraph_error_code_t, - cugraph_error_t, -) -from pylibcugraph._cugraph_c.array cimport ( - cugraph_type_erased_device_array_view_t, - cugraph_type_erased_device_array_view_create, - cugraph_type_erased_device_array_free, - cugraph_type_erased_host_array_view_t, - cugraph_type_erased_host_array_view_create -) -from pylibcugraph._cugraph_c.graph cimport ( - cugraph_graph_t, -) -from pylibcugraph._cugraph_c.algorithms cimport ( - cugraph_uniform_neighbor_sample, - cugraph_sample_result_t, - cugraph_sample_result_get_sources, - cugraph_sample_result_get_destinations, - cugraph_sample_result_get_start_labels, - cugraph_sample_result_get_index, - cugraph_sample_result_get_counts, - cugraph_sample_result_free, -) -from pylibcugraph.resource_handle cimport ( - ResourceHandle, -) -from pylibcugraph.graphs cimport ( - _GPUGraph, - MGGraph, -) -from pylibcugraph.utils cimport ( - assert_success, - copy_to_cupy_array, - assert_CAI_type, - assert_AI_type, - get_c_type_from_numpy_type, -) - - -def neighborhood_sampling(ResourceHandle resource_handle, - MGGraph input_graph, - start_list, - labels_list, - h_fan_out, - bool_t with_replacement, - bool_t do_expensive_check): - """ - Does neighborhood sampling, which samples nodes from a graph based on the - current node's neighbors, with a corresponding fanout value at each hop. - - Parameters - ---------- - resource_handle: ResourceHandle - Handle to the underlying device and host resources needed for - referencing data and running algorithms. - - input_graph: MGGraph - The input graph, for Multi-GPU operations. - - start_list: device array type - Device array containing the list of starting vertices for sampling. - - labels_list: device array type - Device array containing the starting labels for reorganizing the - results after sending the input to different callers. - - h_fan_out: numpy array type - Device array containing the brancing out (fan-out) degrees per - starting vertex for each hop level. - - with_replacement: bool - If true, sampling procedure is done with replacement (the same vertex - can be selected multiple times in the same step). - - do_expensive_check: bool - If True, performs more extensive tests on the inputs to ensure - validitity, at the expense of increased run time. - - Returns - ------- - A tuple of device arrays, where the first and second items in the tuple - are device arrays containing the starting and ending vertices of each - walk respectively, the third item in the tuple is a device array - containing the start labels, the fourth item in the tuple is a device - array containing the indices for reconstructing paths. - - """ - warning_msg = ("This call is deprecated and will be removed" - "in the next release") - warnings.warn(warning_msg, PendingDeprecationWarning) - - cdef cugraph_resource_handle_t* c_resource_handle_ptr = \ - resource_handle.c_resource_handle_ptr - cdef cugraph_graph_t* c_graph_ptr = input_graph.c_graph_ptr - - assert_CAI_type(start_list, "start_list") - assert_CAI_type(labels_list, "labels_list") - assert_AI_type(h_fan_out, "h_fan_out") - - cdef cugraph_sample_result_t* result_ptr - cdef cugraph_error_code_t error_code - cdef cugraph_error_t* error_ptr - - cdef uintptr_t cai_start_ptr = \ - start_list.__cuda_array_interface__["data"][0] - cdef uintptr_t cai_labels_ptr = \ - labels_list.__cuda_array_interface__["data"][0] - cdef uintptr_t ai_fan_out_ptr = \ - h_fan_out.__array_interface__["data"][0] - - cdef cugraph_type_erased_device_array_view_t* start_ptr = \ - cugraph_type_erased_device_array_view_create( - cai_start_ptr, - len(start_list), - get_c_type_from_numpy_type(start_list.dtype)) - cdef cugraph_type_erased_device_array_view_t* start_labels_ptr = \ - cugraph_type_erased_device_array_view_create( - cai_labels_ptr, - len(labels_list), - get_c_type_from_numpy_type(labels_list.dtype)) - cdef cugraph_type_erased_host_array_view_t* fan_out_ptr = \ - cugraph_type_erased_host_array_view_create( - ai_fan_out_ptr, - len(h_fan_out), - get_c_type_from_numpy_type(h_fan_out.dtype)) - - error_code = cugraph_uniform_neighbor_sample(c_resource_handle_ptr, - c_graph_ptr, - start_ptr, - start_labels_ptr, - fan_out_ptr, - with_replacement, - do_expensive_check, - &result_ptr, - &error_ptr) - assert_success(error_code, error_ptr, "uniform_nbr_sample") - - # TODO: counts is a part of the output, but another copy_to_cupy array - # with appropriate host array types would likely be required. Also - # potential memory leak until this is covered - cdef cugraph_type_erased_device_array_view_t* src_ptr = \ - cugraph_sample_result_get_sources(result_ptr) - cdef cugraph_type_erased_device_array_view_t* dst_ptr = \ - cugraph_sample_result_get_destinations(result_ptr) - cdef cugraph_type_erased_device_array_view_t* labels_ptr = \ - cugraph_sample_result_get_start_labels(result_ptr) - cdef cugraph_type_erased_device_array_view_t* index_ptr = \ - cugraph_sample_result_get_index(result_ptr) - # cdef cugraph_type_erased_host_array_view_t* counts_ptr = \ - # cugraph_sample_result_get_counts(result_ptr) - - cupy_sources = copy_to_cupy_array(c_resource_handle_ptr, src_ptr) - cupy_destinations = copy_to_cupy_array(c_resource_handle_ptr, dst_ptr) - cupy_labels = copy_to_cupy_array(c_resource_handle_ptr, labels_ptr) - cupy_indices = copy_to_cupy_array(c_resource_handle_ptr, index_ptr) - # cupy_counts = copy_to_cupy_array(c_resource_handle_ptr, counts_ptr) - - return (cupy_sources, cupy_destinations, cupy_labels, cupy_indices) - # return (cupy_sources, cupy_destinations, cupy_labels, cupy_indices, cupy_counts) From 720b05d9c9b89ce75f7b0b5d3d25a5622d890919 Mon Sep 17 00:00:00 2001 From: Joseph Nke Date: Mon, 30 May 2022 01:37:28 -0700 Subject: [PATCH 16/21] remove uniform neighbor sample older mechanism --- .../pylibcugraph/_cugraph_c/algorithms.pxd | 30 +------------------ 1 file changed, 1 insertion(+), 29 deletions(-) diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd index 5d2cd9a40dd..396b73afee5 100644 --- a/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd +++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd @@ -171,44 +171,16 @@ cdef extern from "cugraph_c/algorithms.h": cugraph_sample_result_t* result ) - # FIXME: This will be obsolete when the older mechanism is removed - cdef cugraph_type_erased_device_array_view_t* \ - cugraph_sample_result_get_start_labels( - cugraph_sample_result_t* result - ) - cdef cugraph_type_erased_device_array_view_t* \ cugraph_sample_result_get_index( cugraph_sample_result_t* result ) - # FIXME: This will be obsolete when the older mechanism is removed - cdef cugraph_type_erased_host_array_view_t* \ - cugraph_sample_result_get_counts( - cugraph_sample_result_t* result - ) - cdef void \ cugraph_sample_result_free( cugraph_sample_result_t* result ) - - # uniform neighborhood sampling - # FIXME: This older API will be phased out in favor of - # the experimental one below - cdef cugraph_error_code_t \ - cugraph_uniform_neighbor_sample( - const cugraph_resource_handle_t* handle, - cugraph_graph_t* graph, - const cugraph_type_erased_device_array_view_t* start, - const cugraph_type_erased_device_array_view_t* start_labels, - const cugraph_type_erased_host_array_view_t* fan_out, - bool_t without_replacement, - bool_t do_expensive_check, - cugraph_sample_result_t** result, - cugraph_error_t** error - ) - + # uniform neighborhood sampling cdef cugraph_error_code_t \ cugraph_experimental_uniform_neighbor_sample( From 78f7dd6adb0e75de98d28ddc44cebf16bd76752f Mon Sep 17 00:00:00 2001 From: Joseph Nke Date: Mon, 30 May 2022 01:42:38 -0700 Subject: [PATCH 17/21] add end of line --- python/pylibcugraph/pylibcugraph/graphs.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pylibcugraph/pylibcugraph/graphs.pyx b/python/pylibcugraph/pylibcugraph/graphs.pyx index 212e50863ec..96cb224eb75 100644 --- a/python/pylibcugraph/pylibcugraph/graphs.pyx +++ b/python/pylibcugraph/pylibcugraph/graphs.pyx @@ -311,4 +311,4 @@ cdef class MGGraph(_GPUGraph): def __dealloc__(self): if self.c_graph_ptr is not NULL: - cugraph_mg_graph_free(self.c_graph_ptr) \ No newline at end of file + cugraph_mg_graph_free(self.c_graph_ptr) From 7fdc09db546bd940d08f04507e44f307112120aa Mon Sep 17 00:00:00 2001 From: root Date: Wed, 1 Jun 2022 18:04:46 +0000 Subject: [PATCH 18/21] remove merge labels --- python/pylibcugraph/pylibcugraph/experimental/__init__.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/python/pylibcugraph/pylibcugraph/experimental/__init__.py b/python/pylibcugraph/pylibcugraph/experimental/__init__.py index 24237689451..be08848d0a0 100644 --- a/python/pylibcugraph/pylibcugraph/experimental/__init__.py +++ b/python/pylibcugraph/pylibcugraph/experimental/__init__.py @@ -65,12 +65,7 @@ from pylibcugraph.node2vec import node2vec node2vec = promoted_experimental_warning_wrapper(node2vec) -<<<<<<< HEAD -======= - -from pylibcugraph.uniform_neighborhood_sampling import EXPERIMENTAL__uniform_neighborhood_sampling -uniform_neighborhood_sampling = experimental_warning_wrapper(EXPERIMENTAL__uniform_neighborhood_sampling) from pylibcugraph.triangle_count import EXPERIMENTAL__triangle_count triangle_count = experimental_warning_wrapper(EXPERIMENTAL__triangle_count) ->>>>>>> upstream/branch-22.06 + From b629ca983a4d1bdef26f6a41cfa7a8ecf9825b6f Mon Sep 17 00:00:00 2001 From: root Date: Wed, 1 Jun 2022 18:09:15 +0000 Subject: [PATCH 19/21] remove outdated fixme --- python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx | 5 ----- 1 file changed, 5 deletions(-) diff --git a/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx b/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx index 4ed412a3127..ae2a42a3185 100644 --- a/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx +++ b/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx @@ -16,9 +16,6 @@ from libc.stdint cimport uintptr_t -# FIXME: Added this -import numpy - from pylibcugraph._cugraph_c.resource_handle cimport ( bool_t, data_type_id_t, @@ -55,11 +52,9 @@ from pylibcugraph.graphs cimport ( _GPUGraph, MGGraph, ) -# FIXME: added copy_to_cupy_array_ from pylibcugraph.utils cimport ( assert_success, copy_to_cupy_array, - copy_to_cupy_array_ids, assert_CAI_type, assert_AI_type, get_c_type_from_numpy_type, From 8a8f06392630d40a0465de4ec37f11447eb135bf Mon Sep 17 00:00:00 2001 From: root Date: Wed, 1 Jun 2022 18:14:03 +0000 Subject: [PATCH 20/21] remove unused import --- python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx b/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx index ae2a42a3185..8dfea32d821 100644 --- a/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx +++ b/python/pylibcugraph/pylibcugraph/uniform_neighbor_sample.pyx @@ -32,7 +32,6 @@ from pylibcugraph._cugraph_c.array cimport ( cugraph_type_erased_host_array_view_t, cugraph_type_erased_host_array_view_create, cugraph_type_erased_host_array_view_free, - cugraph_type_erased_device_array_view_type, ) from pylibcugraph._cugraph_c.graph cimport ( cugraph_graph_t, From 1af8e7b668bb7d11bf119aa7cb43fe21cd641ebf Mon Sep 17 00:00:00 2001 From: Joseph Nke Date: Wed, 1 Jun 2022 11:15:15 -0700 Subject: [PATCH 21/21] add end of line --- python/pylibcugraph/pylibcugraph/utils.pxd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pylibcugraph/pylibcugraph/utils.pxd b/python/pylibcugraph/pylibcugraph/utils.pxd index 21c6e66b8dc..83f534c297e 100644 --- a/python/pylibcugraph/pylibcugraph/utils.pxd +++ b/python/pylibcugraph/pylibcugraph/utils.pxd @@ -49,4 +49,4 @@ cdef copy_to_cupy_array( cdef copy_to_cupy_array_ids( cugraph_resource_handle_t* c_resource_handle_ptr, - cugraph_type_erased_device_array_view_t* device_array_view_ptr) \ No newline at end of file + cugraph_type_erased_device_array_view_t* device_array_view_ptr)