From 728ffd05783c45acdf2ad42c5188e05ccefbb378 Mon Sep 17 00:00:00 2001 From: Joseph Nke <76006812+jnke2016@users.noreply.github.com> Date: Wed, 13 Mar 2024 17:25:00 -0500 Subject: [PATCH] Enforce matching type (#4161) This PR enforces that the graph's vertex type and the type of `start_vertices` match. On a mismatch, the Python `node2vec` API emits a `UserWarning` and casts `start_vertices` to the graph's vertex type, while the C API random-walk functions report a `CUGRAPH_INVALID_INPUT` error. Closes #4094 Authors: - Joseph Nke (https://github.com/jnke2016) Approvers: - Chuck Hastings (https://github.com/ChuckHastings) - Seunghwa Kang (https://github.com/seunghwak) - Naim (https://github.com/naimnv) - Rick Ratzel (https://github.com/rlratzel) URL: https://github.com/rapidsai/cugraph/pull/4161 --- cpp/src/c_api/random_walks.cpp | 32 +++++++++++++++++++ python/cugraph/cugraph/sampling/node2vec.py | 26 ++++++++++++++- .../cugraph/tests/sampling/test_node2vec.py | 27 ++++++++++++---- 3 files changed, 77 insertions(+), 8 deletions(-) diff --git a/cpp/src/c_api/random_walks.cpp b/cpp/src/c_api/random_walks.cpp index fd340a2c8e3..b9a2c8e4f60 100644 --- a/cpp/src/c_api/random_walks.cpp +++ b/cpp/src/c_api/random_walks.cpp @@ -475,6 +475,14 @@ cugraph_error_code_t cugraph_node2vec(const cugraph_resource_handle_t* handle, cugraph_random_walk_result_t** result, cugraph_error_t** error) { + CAPI_EXPECTS(reinterpret_cast(graph)->vertex_type_ == + reinterpret_cast( + start_vertices) + ->type_, + CUGRAPH_INVALID_INPUT, + "vertex type of graph and start_vertices must match", + *error); + cugraph::c_api::node2vec_functor functor( handle, graph, start_vertices, max_length, compress_results, p, q); @@ -528,6 +536,14 @@ cugraph_error_code_t cugraph_uniform_random_walks( cugraph_random_walk_result_t** result, cugraph_error_t** error) { + CAPI_EXPECTS(reinterpret_cast(graph)->vertex_type_ == + reinterpret_cast( + start_vertices) + ->type_, + CUGRAPH_INVALID_INPUT, + "vertex type of graph and start_vertices must match", + *error); + uniform_random_walks_functor functor(handle, graph, 
start_vertices, max_length); return cugraph::c_api::run_algorithm(graph, functor, result, error); @@ -541,6 +557,14 @@ cugraph_error_code_t cugraph_biased_random_walks( cugraph_random_walk_result_t** result, cugraph_error_t** error) { + CAPI_EXPECTS(reinterpret_cast(graph)->vertex_type_ == + reinterpret_cast( + start_vertices) + ->type_, + CUGRAPH_INVALID_INPUT, + "vertex type of graph and start_vertices must match", + *error); + biased_random_walks_functor functor(handle, graph, start_vertices, max_length); return cugraph::c_api::run_algorithm(graph, functor, result, error); @@ -556,6 +580,14 @@ cugraph_error_code_t cugraph_node2vec_random_walks( cugraph_random_walk_result_t** result, cugraph_error_t** error) { + CAPI_EXPECTS(reinterpret_cast(graph)->vertex_type_ == + reinterpret_cast( + start_vertices) + ->type_, + CUGRAPH_INVALID_INPUT, + "vertex type of graph and start_vertices must match", + *error); + node2vec_random_walks_functor functor(handle, graph, start_vertices, max_length, p, q); return cugraph::c_api::run_algorithm(graph, functor, result, error); diff --git a/python/cugraph/cugraph/sampling/node2vec.py b/python/cugraph/cugraph/sampling/node2vec.py index bc9b88250af..71fc2969f86 100644 --- a/python/cugraph/cugraph/sampling/node2vec.py +++ b/python/cugraph/cugraph/sampling/node2vec.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -16,10 +16,32 @@ node2vec as pylibcugraph_node2vec, ) from cugraph.utilities import ensure_cugraph_obj_for_nx +import warnings import cudf +# FIXME: Move this function to the utility module so that it can be +# shared by other algos +def ensure_valid_dtype(input_graph, start_vertices): + vertex_dtype = input_graph.edgelist.edgelist_df.dtypes.iloc[0] + if isinstance(start_vertices, cudf.Series): + start_vertices_dtype = start_vertices.dtype + else: + start_vertices_dtype = start_vertices.dtypes.iloc[0] + + if start_vertices_dtype != vertex_dtype: + warning_msg = ( + "Node2vec requires 'start_vertices' to match the graph's " + f"'vertex' type. input graph's vertex type is: {vertex_dtype} and got " + f"'start_vertices' of type: {start_vertices_dtype}." + ) + warnings.warn(warning_msg, UserWarning) + start_vertices = start_vertices.astype(vertex_dtype) + + return start_vertices + + def node2vec(G, start_vertices, max_depth=1, compress_result=True, p=1.0, q=1.0): """ Computes random walks for each node in 'start_vertices', under the @@ -120,6 +142,8 @@ def node2vec(G, start_vertices, max_depth=1, compress_result=True, p=1.0, q=1.0) else: start_vertices = G.lookup_internal_vertex_id(start_vertices) + start_vertices = ensure_valid_dtype(G, start_vertices) + vertex_set, edge_set, sizes = pylibcugraph_node2vec( resource_handle=ResourceHandle(), graph=G._plc_graph, diff --git a/python/cugraph/cugraph/tests/sampling/test_node2vec.py b/python/cugraph/cugraph/tests/sampling/test_node2vec.py index 0bfdd460cae..00c32705338 100644 --- a/python/cugraph/cugraph/tests/sampling/test_node2vec.py +++ b/python/cugraph/cugraph/tests/sampling/test_node2vec.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -27,6 +27,7 @@ # ============================================================================= DIRECTED_GRAPH_OPTIONS = [False, True] COMPRESSED = [False, True] +START_VERTICES_TYPE = ["int32", "int64"] LINE = small_line KARATE = karate @@ -150,11 +151,8 @@ def test_node2vec_line(graph_file, directed): @pytest.mark.parametrize(*_get_param_args("graph_file", SMALL_DATASETS)) @pytest.mark.parametrize(*_get_param_args("directed", DIRECTED_GRAPH_OPTIONS)) @pytest.mark.parametrize(*_get_param_args("compress", COMPRESSED)) -def test_node2vec( - graph_file, - directed, - compress, -): +@pytest.mark.parametrize(*_get_param_args("start_vertices_type", START_VERTICES_TYPE)) +def test_node2vec(graph_file, directed, compress, start_vertices_type): dataset_path = graph_file.get_path() cu_M = utils.read_csv_file(dataset_path) @@ -165,8 +163,23 @@ def test_node2vec( ) num_verts = G.number_of_vertices() k = random.randint(6, 12) - start_vertices = cudf.Series(random.sample(range(num_verts), k), dtype="int32") + # FIXME: Random sample can make it hard to debug + start_vertices = cudf.Series( + random.sample(range(num_verts), k), dtype=start_vertices_type + ) max_depth = 5 + + if start_vertices_type == "int64": + warning_msg = ( + "Node2vec requires 'start_vertices' to match the graph's " + "'vertex' type. input graph's vertex type is: int32 and " + "got 'start_vertices' of type: int64." + ) + with pytest.warns(UserWarning, match=warning_msg): + calc_node2vec( + G, start_vertices, max_depth, compress_result=compress, p=0.8, q=0.5 + ) + result, seeds = calc_node2vec( G, start_vertices, max_depth, compress_result=compress, p=0.8, q=0.5 )