From e56ca085b6b508067972d6be39a6e0f227b36894 Mon Sep 17 00:00:00 2001 From: Erik Welch Date: Tue, 9 Jul 2024 04:49:17 -0700 Subject: [PATCH 1/5] Further optimize `from_pandas_edgelist` with cudf --- .../nx-cugraph/nx_cugraph/convert_matrix.py | 28 ++++++++++++---- .../nx-cugraph/nx_cugraph/tests/test_utils.py | 20 +++++++++++- python/nx-cugraph/nx_cugraph/utils/misc.py | 32 +++++++++++++++++++ 3 files changed, 72 insertions(+), 8 deletions(-) diff --git a/python/nx-cugraph/nx_cugraph/convert_matrix.py b/python/nx-cugraph/nx_cugraph/convert_matrix.py index 67f6386987b..f5fd34a8d37 100644 --- a/python/nx-cugraph/nx_cugraph/convert_matrix.py +++ b/python/nx-cugraph/nx_cugraph/convert_matrix.py @@ -15,7 +15,7 @@ import numpy as np from .generators._utils import _create_using_class -from .utils import index_dtype, networkx_algorithm +from .utils import _cp_iscopied_asarray, index_dtype, networkx_algorithm __all__ = [ "from_pandas_edgelist", @@ -36,14 +36,26 @@ def from_pandas_edgelist( """cudf.DataFrame inputs also supported; value columns with str is unsuppported.""" graph_class, inplace = _create_using_class(create_using) # Try to be optimal whether using pandas, cudf, or cudf.pandas - src_array = df[source].to_numpy() - dst_array = df[target].to_numpy() + src_series = df[source] + dst_series = df[target] try: # Optimistically try to use cupy, but fall back to numpy if necessary - src_array = cp.asarray(src_array) - dst_array = cp.asarray(dst_array) + src_array = src_series.to_cupy() + dst_array = dst_series.to_cupy() + except (AttributeError, ValueError, NotImplementedError): + src_array = src_series.to_numpy() + dst_array = dst_series.to_numpy() + try: + # Minimize unnecessary data copies by tracking whether we copy or not + is_src_copied, src_array = _cp_iscopied_asarray( + src_array, orig_object=src_series + ) + is_dst_copied, dst_array = _cp_iscopied_asarray( + dst_array, orig_object=dst_series + ) np_or_cp = cp except ValueError: + is_src_copied = is_dst_copied = False src_array = np.asarray(src_array) dst_array = np.asarray(dst_array) np_or_cp = np @@ -65,8 +77,10 @@ def from_pandas_edgelist( src_indices = cp.asarray(np_or_cp.searchsorted(nodes, src_array), index_dtype) dst_indices = cp.asarray(np_or_cp.searchsorted(nodes, dst_array), index_dtype) else: - src_indices = cp.array(src_array) - dst_indices = cp.array(dst_array) + if not is_src_copied: + src_indices = cp.array(src_array) + if not is_dst_copied: + dst_indices = cp.array(dst_array) if not graph_class.is_directed(): # Symmetrize the edges diff --git a/python/nx-cugraph/nx_cugraph/tests/test_utils.py b/python/nx-cugraph/nx_cugraph/tests/test_utils.py index fdd0c91995c..9d8236053d1 100644 --- a/python/nx-cugraph/nx_cugraph/tests/test_utils.py +++ b/python/nx-cugraph/nx_cugraph/tests/test_utils.py @@ -10,10 +10,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import cupy as cp import numpy as np import pytest -from nx_cugraph.utils import _get_int_dtype +from nx_cugraph.utils import _cp_iscopied_asarray, _get_int_dtype def test_get_int_dtype(): @@ -85,3 +86,20 @@ def test_get_int_dtype(): _get_int_dtype(7, signed=True, unsigned=True) assert _get_int_dtype(7, signed=True, unsigned=False) == np.int8 assert _get_int_dtype(7, signed=False, unsigned=True) == np.uint8 + + +def test_cp_iscopied_asarray(): + # We don't yet run doctest, so do simple copy/paste test here. + # + # >>> is_copied, a = _cp_iscopied_asarray([1, 2, 3]) + # >>> is_copied + # True + # >>> a + # array([1, 2, 3]) + # >>> _cp_iscopied_asarray(a) + # (False, array([1, 2, 3])) + is_copied, a = _cp_iscopied_asarray([1, 2, 3]) + assert is_copied is True + assert isinstance(a, cp.ndarray) + assert repr(a) == "array([1, 2, 3])" + assert _cp_iscopied_asarray(a)[0] is False diff --git a/python/nx-cugraph/nx_cugraph/utils/misc.py b/python/nx-cugraph/nx_cugraph/utils/misc.py index eab4b42c2cc..8526524f1de 100644 --- a/python/nx-cugraph/nx_cugraph/utils/misc.py +++ b/python/nx-cugraph/nx_cugraph/utils/misc.py @@ -45,6 +45,7 @@ def pairwise(it): "_get_int_dtype", "_get_float_dtype", "_dtype_param", + "_cp_iscopied_asarray", ] # This may switch to np.uint32 at some point @@ -206,3 +207,34 @@ def _get_float_dtype( f"Dtype {dtype} cannot be safely promoted to float32 or float64" ) return rv + + +def _cp_iscopied_asarray(a, *args, orig_object=None, **kwargs): + """Like ``cp.asarray``, but also returns whether the input was copied. + + Use this to avoid unnecessary copies. If given, ``orig_object`` will + also be inspected to determine if it was copied. + + >>> is_copied, a = _cp_iscopied_asarray([1, 2, 3]) + >>> is_copied + True + >>> a + array([1, 2, 3]) + >>> _cp_iscopied_asarray(a) + (False, array([1, 2, 3])) + """ + arr = cp.asarray(a, *args, **kwargs) + ptr = arr.__cuda_array_interface__["data"][0] + if ( + hasattr(a, "__cuda_array_interface__") + and a.__cuda_array_interface__["data"][0] == ptr + and ( + orig_object is None + or hasattr(orig_object, "__cuda_array_interface__") + and orig_object.__cuda_array_interface__["data"][0] == ptr + ) + # Should we also check device_id? + # and getattr(getattr(a, "data", None), "device_id", None) == arr.data.device_id + ): + return False, arr + return True, arr From 62619691884a09c403273f74931e92a78037ac2c Mon Sep 17 00:00:00 2001 From: Erik Welch Date: Tue, 9 Jul 2024 05:01:55 -0700 Subject: [PATCH 2/5] Oops fix copyright (and install/run pre-commit) --- python/nx-cugraph/nx_cugraph/tests/test_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/nx-cugraph/nx_cugraph/tests/test_utils.py b/python/nx-cugraph/nx_cugraph/tests/test_utils.py index 9d8236053d1..d38a286fa5d 100644 --- a/python/nx-cugraph/nx_cugraph/tests/test_utils.py +++ b/python/nx-cugraph/nx_cugraph/tests/test_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at From 768b44d57b66e1e6a3cb2779439c8915d1682ee5 Mon Sep 17 00:00:00 2001 From: Erik Welch Date: Tue, 9 Jul 2024 08:10:55 -0700 Subject: [PATCH 3/5] Add basic tests that smoked out a couple issues :) --- .../nx-cugraph/nx_cugraph/convert_matrix.py | 10 ++- .../nx_cugraph/tests/test_convert_matrix.py | 82 +++++++++++++++++++ 2 files changed, 89 insertions(+), 3 deletions(-) create mode 100644 python/nx-cugraph/nx_cugraph/tests/test_convert_matrix.py diff --git a/python/nx-cugraph/nx_cugraph/convert_matrix.py b/python/nx-cugraph/nx_cugraph/convert_matrix.py index f5fd34a8d37..e00463fbc7b 100644 --- a/python/nx-cugraph/nx_cugraph/convert_matrix.py +++ b/python/nx-cugraph/nx_cugraph/convert_matrix.py @@ -42,7 +42,7 @@ def from_pandas_edgelist( # Optimistically try to use cupy, but fall back to numpy if necessary src_array = src_series.to_cupy() dst_array = dst_series.to_cupy() - except (AttributeError, ValueError, NotImplementedError): + except (AttributeError, TypeError, ValueError, NotImplementedError): src_array = src_series.to_numpy() dst_array = dst_series.to_numpy() try: @@ -77,9 +77,13 @@ def from_pandas_edgelist( src_indices = cp.asarray(np_or_cp.searchsorted(nodes, src_array), index_dtype) dst_indices = cp.asarray(np_or_cp.searchsorted(nodes, dst_array), index_dtype) else: - if not is_src_copied: + if is_src_copied: + src_indices = src_array + else: src_indices = cp.array(src_array) - if not is_dst_copied: + if is_dst_copied: + dst_indices = dst_array + else: dst_indices = cp.array(dst_array) if not graph_class.is_directed(): diff --git a/python/nx-cugraph/nx_cugraph/tests/test_convert_matrix.py b/python/nx-cugraph/nx_cugraph/tests/test_convert_matrix.py new file mode 100644 index 00000000000..ee6aca9e0ea --- /dev/null +++ b/python/nx-cugraph/nx_cugraph/tests/test_convert_matrix.py @@ -0,0 +1,82 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pandas as pd +import pytest + +import nx_cugraph as nxcg +from nx_cugraph.utils import _cp_iscopied_asarray + +try: + import cudf +except ModuleNotFoundError: + cudf = None + + +data = [ + {"source": [0, 1], "target": [1, 2]}, # nodes are 0, 1, 2 + {"source": [0, 1], "target": [1, 3]}, # nodes are 0, 1, 3 (need renumbered!) + {"source": ["a", "b"], "target": ["b", "c"]}, # nodes are 'a', 'b', 'c' +] + + +@pytest.mark.skipif("not cudf") +@pytest.mark.parametrize("data", data) +def test_from_cudf_edgelist(data): + df = cudf.DataFrame(data) + nxcg.from_pandas_edgelist(df) # Basic smoke test + source = df["source"] + if source.dtype == int: + is_copied, src_array = _cp_iscopied_asarray(source) + assert is_copied is False + is_copied, src_array = _cp_iscopied_asarray(source.to_cupy()) + assert is_copied is False + is_copied, src_array = _cp_iscopied_asarray(source, orig_object=source) + assert is_copied is False + is_copied, src_array = _cp_iscopied_asarray( + source.to_cupy(), orig_object=source + ) + assert is_copied is False + # to numpy + is_copied, src_array = _cp_iscopied_asarray(source.to_numpy()) + assert is_copied is True + is_copied, src_array = _cp_iscopied_asarray( + source.to_numpy(), orig_object=source + ) + assert is_copied is True + else: + with pytest.raises(TypeError): + _cp_iscopied_asarray(source) + with pytest.raises(TypeError): + _cp_iscopied_asarray(source.to_cupy()) + with pytest.raises(ValueError, match="Unsupported dtype"): + _cp_iscopied_asarray(source.to_numpy()) + with pytest.raises(ValueError, match="Unsupported dtype"): + _cp_iscopied_asarray(source.to_numpy(), orig_object=source) + + +@pytest.mark.parametrize("data", data) +def test_from_pandas_edgelist(data): + df = pd.DataFrame(data) + nxcg.from_pandas_edgelist(df) # Basic smoke test + source = df["source"] + if source.dtype == int: + is_copied, src_array = _cp_iscopied_asarray(source) + assert is_copied is True + is_copied, src_array = _cp_iscopied_asarray(source, orig_object=source) + assert is_copied is True + is_copied, src_array = _cp_iscopied_asarray(source.to_numpy()) + assert is_copied is True + is_copied, src_array = _cp_iscopied_asarray( + source.to_numpy(), orig_object=source + ) + assert is_copied is True From 530b482a5ac104623bb1e797ba56b587c0aff6ed Mon Sep 17 00:00:00 2001 From: Erik Welch Date: Tue, 9 Jul 2024 08:16:58 -0700 Subject: [PATCH 4/5] Test `create_using` too --- .../nx_cugraph/tests/test_convert_matrix.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/python/nx-cugraph/nx_cugraph/tests/test_convert_matrix.py b/python/nx-cugraph/nx_cugraph/tests/test_convert_matrix.py index ee6aca9e0ea..0a9cc087ce0 100644 --- a/python/nx-cugraph/nx_cugraph/tests/test_convert_matrix.py +++ b/python/nx-cugraph/nx_cugraph/tests/test_convert_matrix.py @@ -10,6 +10,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import networkx as nx import pandas as pd import pytest @@ -22,18 +23,20 @@ cudf = None -data = [ +DATA = [ {"source": [0, 1], "target": [1, 2]}, # nodes are 0, 1, 2 {"source": [0, 1], "target": [1, 3]}, # nodes are 0, 1, 3 (need renumbered!) {"source": ["a", "b"], "target": ["b", "c"]}, # nodes are 'a', 'b', 'c' ] +CREATE_USING = [nx.Graph, nx.DiGraph, nx.MultiGraph, nx.MultiDiGraph] @pytest.mark.skipif("not cudf") -@pytest.mark.parametrize("data", data) -def test_from_cudf_edgelist(data): +@pytest.mark.parametrize("data", DATA) +@pytest.mark.parametrize("create_using", CREATE_USING) +def test_from_cudf_edgelist(data, create_using): df = cudf.DataFrame(data) - nxcg.from_pandas_edgelist(df) # Basic smoke test + nxcg.from_pandas_edgelist(df, create_using=create_using) # Basic smoke test source = df["source"] if source.dtype == int: is_copied, src_array = _cp_iscopied_asarray(source) @@ -64,10 +67,11 @@ def test_from_cudf_edgelist(data): _cp_iscopied_asarray(source.to_numpy(), orig_object=source) -@pytest.mark.parametrize("data", data) -def test_from_pandas_edgelist(data): +@pytest.mark.parametrize("data", DATA) +@pytest.mark.parametrize("create_using", CREATE_USING) +def test_from_pandas_edgelist(data, create_using): df = pd.DataFrame(data) - nxcg.from_pandas_edgelist(df) # Basic smoke test + nxcg.from_pandas_edgelist(df, create_using=create_using) # Basic smoke test source = df["source"] if source.dtype == int: is_copied, src_array = _cp_iscopied_asarray(source) From 3b047ecc2abcb8a80dada359e6507848a96bf1f4 Mon Sep 17 00:00:00 2001 From: Erik Welch Date: Thu, 25 Jul 2024 06:34:03 -0700 Subject: [PATCH 5/5] Add comments about not sharing ownership of arrays. NetworkX doesn't share ownership, so neither should we --- python/nx-cugraph/nx_cugraph/convert_matrix.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/nx-cugraph/nx_cugraph/convert_matrix.py b/python/nx-cugraph/nx_cugraph/convert_matrix.py index e00463fbc7b..0c414a2f11f 100644 --- a/python/nx-cugraph/nx_cugraph/convert_matrix.py +++ b/python/nx-cugraph/nx_cugraph/convert_matrix.py @@ -34,6 +34,8 @@ def from_pandas_edgelist( edge_key=None, ): """cudf.DataFrame inputs also supported; value columns with str is unsuppported.""" + # This function never shares ownership of the underlying arrays of the DataFrame + # columns. We will perform a copy if necessary even if given e.g. a cudf.DataFrame. graph_class, inplace = _create_using_class(create_using) # Try to be optimal whether using pandas, cudf, or cudf.pandas src_series = df[source] @@ -77,6 +79,7 @@ def from_pandas_edgelist( src_indices = cp.asarray(np_or_cp.searchsorted(nodes, src_array), index_dtype) dst_indices = cp.asarray(np_or_cp.searchsorted(nodes, dst_array), index_dtype) else: + # Copy if necessary so we don't share ownership of input arrays. if is_src_copied: src_indices = src_array else: