From 97ae73a5dd1ef3c8f4c5b9284f7dac1a51d88c82 Mon Sep 17 00:00:00 2001 From: Rick Ratzel Date: Thu, 4 Nov 2021 11:33:02 -0500 Subject: [PATCH 1/4] Updates to support correct comparisons of cuDF Series with different names. --- .../cugraph/tests/test_random_walks.py | 5 ++- python/cugraph/cugraph/tests/test_renumber.py | 41 ++++++++++--------- 2 files changed, 24 insertions(+), 22 deletions(-) diff --git a/python/cugraph/cugraph/tests/test_random_walks.py b/python/cugraph/cugraph/tests/test_random_walks.py index 3e10f94c599..f79607b1d97 100644 --- a/python/cugraph/cugraph/tests/test_random_walks.py +++ b/python/cugraph/cugraph/tests/test_random_walks.py @@ -12,12 +12,13 @@ # limitations under the License. import gc +import random import pytest +from cudf.testing import assert_series_equal from cugraph.tests import utils import cugraph -import random # ============================================================================= @@ -158,7 +159,7 @@ def test_random_walks_coalesced( v_offsets = [0] + path_data[2].cumsum()[:-1].to_array().tolist() w_offsets = [0] + (path_data[2]-1).cumsum()[:-1].to_array().tolist() - assert df['weight_sizes'].equals(path_data[2]-1) + assert_series_equal(df['weight_sizes'], path_data[2]-1, check_names=False) assert df['vertex_offsets'].to_array().tolist() == v_offsets assert df['weight_offsets'].to_array().tolist() == w_offsets diff --git a/python/cugraph/cugraph/tests/test_renumber.py b/python/cugraph/cugraph/tests/test_renumber.py index 129bd667621..f336f48a503 100644 --- a/python/cugraph/cugraph/tests/test_renumber.py +++ b/python/cugraph/cugraph/tests/test_renumber.py @@ -18,6 +18,7 @@ import pandas as pd import pytest import cudf +from cudf.testing import assert_series_equal from cugraph.structure.number_map import NumberMap from cugraph.tests import utils @@ -53,8 +54,8 @@ def test_renumber_ips(): check_dst = renumber_map.from_internal_vertex_id(renumbered_gdf['dst'] )["0"] - assert check_src.equals(gdf["source_as_int"]) - assert check_dst.equals(gdf["dest_as_int"]) + assert_series_equal(check_src, gdf["source_as_int"], check_names=False) + assert_series_equal(check_dst, gdf["dest_as_int"], check_names=False) def test_renumber_ips_cols(): @@ -88,8 +89,8 @@ def test_renumber_ips_cols(): check_dst = renumber_map.from_internal_vertex_id(renumbered_gdf['dst'] )["0"] - assert check_src.equals(gdf["source_as_int"]) - assert check_dst.equals(gdf["dest_as_int"]) + assert_series_equal(check_src, gdf["source_as_int"], check_names=False) + assert_series_equal(check_dst, gdf["dest_as_int"], check_names=False) @pytest.mark.skip(reason="temporarily dropped string support") @@ -121,8 +122,8 @@ def test_renumber_ips_str_cols(): check_dst = renumber_map.from_internal_vertex_id(renumbered_gdf['dst'] )["0"] - assert check_src.equals(gdf["source_list"]) - assert check_dst.equals(gdf["dest_list"]) + assert_series_equal(check_src, gdf["source_list"], check_names=False) + assert_series_equal(check_dst, gdf["dest_list"], check_names=False) def test_renumber_negative(): @@ -142,8 +143,8 @@ def test_renumber_negative(): check_dst = renumber_map.from_internal_vertex_id(renumbered_gdf['dst'] )["0"] - assert check_src.equals(gdf["source_list"]) - assert check_dst.equals(gdf["dest_list"]) + assert_series_equal(check_src, gdf["source_list"], check_names=False) + assert_series_equal(check_dst, gdf["dest_list"], check_names=False) def test_renumber_negative_col(): @@ -163,8 +164,8 @@ def test_renumber_negative_col(): check_dst = renumber_map.from_internal_vertex_id(renumbered_gdf['dst'] )["0"] - assert check_src.equals(gdf["source_list"]) - assert check_dst.equals(gdf["dest_list"]) + assert_series_equal(check_src, gdf["source_list"], check_names=False) + assert_series_equal(check_dst, gdf["dest_list"], check_names=False) @pytest.mark.skip(reason="dropped renumbering from series support") @@ -200,8 +201,8 @@ def test_renumber_series(graph_file): check_dst = numbering_series_2.from_internal_vertex_id(renumbered_dst, "dst_id") - assert check_src["0_y"].equals(check_src["0_x"]) - assert check_dst["0_y"].equals(check_dst["0_x"]) + assert_series_equal(check_src["0_y"], check_src["0_x"], check_names=False) + assert_series_equal(check_dst["0_y"], check_dst["0_x"], check_names=False) @pytest.mark.parametrize("graph_file", utils.DATASETS) @@ -233,8 +234,8 @@ def test_renumber_files(graph_file): unrenumbered_df = renumber_map.unrenumber(unrenumbered_df, "dst", preserve_order=True) - assert exp_src.equals(unrenumbered_df["src"]) - assert exp_dst.equals(unrenumbered_df["dst"]) + assert_series_equal(exp_src, unrenumbered_df["src"], check_names=False) + assert_series_equal(exp_dst, unrenumbered_df["dst"], check_names=False) @pytest.mark.parametrize("graph_file", utils.DATASETS) @@ -265,8 +266,8 @@ def test_renumber_files_col(graph_file): unrenumbered_df = renumber_map.unrenumber(unrenumbered_df, "dst", preserve_order=True) - assert exp_src.equals(unrenumbered_df["src"]) - assert exp_dst.equals(unrenumbered_df["dst"]) + assert_series_equal(exp_src, unrenumbered_df["src"], check_names=False) + assert_series_equal(exp_dst, unrenumbered_df["dst"], check_names=False) @pytest.mark.parametrize("graph_file", utils.DATASETS) @@ -295,7 +296,7 @@ def test_renumber_files_multi_col(graph_file): unrenumbered_df = renumber_map.unrenumber(unrenumbered_df, "dst", preserve_order=True) - assert gdf["src"].equals(unrenumbered_df["0_src"]) - assert gdf["src_old"].equals(unrenumbered_df["1_src"]) - assert gdf["dst"].equals(unrenumbered_df["0_dst"]) - assert gdf["dst_old"].equals(unrenumbered_df["1_dst"]) + assert_series_equal(gdf["src"], unrenumbered_df["0_src"], check_names=False) + assert_series_equal(gdf["src_old"], unrenumbered_df["1_src"], check_names=False) + assert_series_equal(gdf["dst"], unrenumbered_df["0_dst"], check_names=False) + assert_series_equal(gdf["dst_old"], unrenumbered_df["1_dst"], check_names=False) From d973699ae0bd3567cb01dbb4a56fa09cd933da3e Mon Sep 17 00:00:00 2001 From: Rick Ratzel Date: Thu, 4 Nov 2021 11:35:54 -0500 Subject: [PATCH 2/4] flake8 fixes. --- python/cugraph/cugraph/tests/test_renumber.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/python/cugraph/cugraph/tests/test_renumber.py b/python/cugraph/cugraph/tests/test_renumber.py index f336f48a503..b9077b512bd 100644 --- a/python/cugraph/cugraph/tests/test_renumber.py +++ b/python/cugraph/cugraph/tests/test_renumber.py @@ -296,7 +296,11 @@ def test_renumber_files_multi_col(graph_file): unrenumbered_df = renumber_map.unrenumber(unrenumbered_df, "dst", preserve_order=True) - assert_series_equal(gdf["src"], unrenumbered_df["0_src"], check_names=False) - assert_series_equal(gdf["src_old"], unrenumbered_df["1_src"], check_names=False) - assert_series_equal(gdf["dst"], unrenumbered_df["0_dst"], check_names=False) - assert_series_equal(gdf["dst_old"], unrenumbered_df["1_dst"], check_names=False) + assert_series_equal(gdf["src"], unrenumbered_df["0_src"], + check_names=False) + assert_series_equal(gdf["src_old"], unrenumbered_df["1_src"], + check_names=False) + assert_series_equal(gdf["dst"], unrenumbered_df["0_dst"], + check_names=False) + assert_series_equal(gdf["dst_old"], unrenumbered_df["1_dst"], + check_names=False) From 8dfe1c7ffb4d2d6da80cb7efb5cb9ffda5b12496 Mon Sep 17 00:00:00 2001 From: Rick Ratzel Date: Thu, 4 Nov 2021 12:16:29 -0500 Subject: [PATCH 3/4] Updated jaccard to use cudf series compare test utility. --- python/cugraph/cugraph/tests/test_jaccard.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/cugraph/cugraph/tests/test_jaccard.py b/python/cugraph/cugraph/tests/test_jaccard.py index 8b66f88e8f4..9e3326a2b26 100644 --- a/python/cugraph/cugraph/tests/test_jaccard.py +++ b/python/cugraph/cugraph/tests/test_jaccard.py @@ -15,6 +15,8 @@ import pytest import cudf +from cudf.testing import assert_series_equal + import cugraph from cugraph.tests import utils @@ -252,4 +254,6 @@ def test_jaccard_multi_column(read_csv): df_exp = cugraph.jaccard(G2, vertex_pair[["src_0", "dst_0"]]) # Calculating mismatch - assert df_res["jaccard_coeff"].equals(df_exp["jaccard_coeff"]) + actual = df_res.sort_values("0_source").reset_index() + expected = df_exp.sort_values("source").reset_index() + assert_series_equal(actual["jaccard_coeff"], expected["jaccard_coeff"]) From fb7bf3e4471b30fdec970054ef307127922870d2 Mon Sep 17 00:00:00 2001 From: Rick Ratzel Date: Thu, 4 Nov 2021 13:24:11 -0500 Subject: [PATCH 4/4] Updated remaining tests to use cuDF test assertions. --- ci/gpu/build.sh | 2 +- .../cugraph/tests/dask/test_mg_degree.py | 11 +++--- .../cugraph/tests/dask/test_mg_renumber.py | 19 ++++++---- .../cugraph/tests/dask/test_mg_replication.py | 35 ++++++------------- python/cugraph/cugraph/tests/test_overlap.py | 6 +++- python/cugraph/cugraph/tests/test_sorensen.py | 6 +++- python/cugraph/cugraph/tests/test_wjaccard.py | 6 +++- python/cugraph/cugraph/tests/test_woverlap.py | 5 ++- .../cugraph/cugraph/tests/test_wsorensen.py | 6 +++- 9 files changed, 54 insertions(+), 42 deletions(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index d20ea25530b..4e1c4c54ce0 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -103,7 +103,7 @@ else CONDA_FILE=`basename "$CONDA_FILE" .tar.bz2` #get filename without extension CONDA_FILE=${CONDA_FILE//-/=} #convert to conda install echo "Installing $CONDA_FILE" - conda install -c ${CONDA_ARTIFACT_PATH} "$CONDA_FILE" + gpuci_mamba_retry install -c ${CONDA_ARTIFACT_PATH} "$CONDA_FILE" gpuci_logger "Install the master version of dask and distributed" pip install "git+https://github.com/dask/distributed.git" --upgrade --no-deps diff --git a/python/cugraph/cugraph/tests/dask/test_mg_degree.py b/python/cugraph/cugraph/tests/dask/test_mg_degree.py index ab7285eebfa..f8af0d0e87d 100644 --- a/python/cugraph/cugraph/tests/dask/test_mg_degree.py +++ b/python/cugraph/cugraph/tests/dask/test_mg_degree.py @@ -14,8 +14,10 @@ import gc import pytest import cudf -import cugraph import dask_cudf +from cudf.testing import assert_series_equal + +import cugraph from cugraph.dask.common.mg_utils import is_single_gpu from cugraph.tests.utils import RAPIDS_DATASET_ROOT_DIR_PATH @@ -65,6 +67,7 @@ def test_dask_mg_degree(dask_client): .compute() ) - assert merge_df_in["degree_dg"].equals(merge_df_in["degree_g"]) - assert merge_df_out["degree_dg"].equals( - merge_df_out["degree_g"]) + assert_series_equal(merge_df_in["degree_dg"], merge_df_in["degree_g"], + check_names=False) + assert_series_equal(merge_df_out["degree_dg"], merge_df_out["degree_g"], + check_names=False) diff --git a/python/cugraph/cugraph/tests/dask/test_mg_renumber.py b/python/cugraph/cugraph/tests/dask/test_mg_renumber.py index 7d6c0add767..29ee0a68c1e 100644 --- a/python/cugraph/cugraph/tests/dask/test_mg_renumber.py +++ b/python/cugraph/cugraph/tests/dask/test_mg_renumber.py @@ -18,12 +18,13 @@ import pandas import numpy as np - -import cugraph.dask as dcg -import cugraph import dask_cudf import dask import cudf +from cudf.testing import assert_series_equal + +import cugraph.dask as dcg +import cugraph from cugraph.tests import utils from cugraph.structure.number_map import NumberMap from cugraph.dask.common.mg_utils import is_single_gpu @@ -72,10 +73,14 @@ def test_mg_renumber(graph_file, dask_client): "0_dst", "1_dst"]) unrenumbered_df = unrenumbered_df.reset_index() - assert gdf["src"].equals(unrenumbered_df["0_src"]) - assert gdf["src_old"].equals(unrenumbered_df["1_src"]) - assert gdf["dst"].equals(unrenumbered_df["0_dst"]) - assert gdf["dst_old"].equals(unrenumbered_df["1_dst"]) + assert_series_equal(gdf["src"], unrenumbered_df["0_src"], + check_names=False) + assert_series_equal(gdf["src_old"], unrenumbered_df["1_src"], + check_names=False) + assert_series_equal(gdf["dst"], unrenumbered_df["0_dst"], + check_names=False) + assert_series_equal(gdf["dst_old"], unrenumbered_df["1_dst"], + check_names=False) @pytest.mark.skipif( diff --git a/python/cugraph/cugraph/tests/dask/test_mg_replication.py b/python/cugraph/cugraph/tests/dask/test_mg_replication.py index ccda75c8ba2..462b0bda184 100644 --- a/python/cugraph/cugraph/tests/dask/test_mg_replication.py +++ b/python/cugraph/cugraph/tests/dask/test_mg_replication.py @@ -15,6 +15,7 @@ import gc import cudf +from cudf.testing import assert_series_equal, assert_frame_equal import cugraph import cugraph.dask.structure.replication as replication @@ -46,9 +47,7 @@ def test_replicate_cudf_dataframe_with_weights( worker_to_futures = replication.replicate_cudf_dataframe(df) for worker in worker_to_futures: replicated_df = worker_to_futures[worker].result() - assert df.equals(replicated_df), ( - "There is a mismatch in one " "of the replications" - ) + assert_frame_equal(df, replicated_df) @pytest.mark.skipif( @@ -68,9 +67,7 @@ def test_replicate_cudf_dataframe_no_weights(input_data_path, dask_client): worker_to_futures = replication.replicate_cudf_dataframe(df) for worker in worker_to_futures: replicated_df = worker_to_futures[worker].result() - assert df.equals(replicated_df), ( - "There is a mismatch in one " "of the replications" - ) + assert_frame_equal(df, replicated_df) @pytest.mark.skipif( @@ -92,9 +89,7 @@ def test_replicate_cudf_series(input_data_path, dask_client): worker_to_futures = replication.replicate_cudf_series(series) for worker in worker_to_futures: replicated_series = worker_to_futures[worker].result() - assert series.equals(replicated_series), ( - "There is a " "mismatch in one of the replications" - ) + assert_series_equal(series, replicated_series, check_names=False) # FIXME: If we do not clear this dictionary, when comparing # results for the 2nd column, one of the workers still # has a value from the 1st column @@ -225,7 +220,7 @@ def test_enable_batch_edgelist_replication( df = G.edgelist.edgelist_df for worker in G.batch_edgelists: replicated_df = G.batch_edgelists[worker].result() - assert df.equals(replicated_df), "Replication of edgelist failed" + assert_frame_equal(df, replicated_df) @pytest.mark.skipif( @@ -257,15 +252,9 @@ def test_enable_batch_adjlist_replication_weights( weights = adjlist.weights for worker in G.batch_adjlists: (rep_offsets, rep_indices, rep_weights) = G.batch_adjlists[worker] - assert offsets.equals(rep_offsets.result()), ( - "Replication of " "adjlist offsets failed" - ) - assert indices.equals(rep_indices.result()), ( - "Replication of " "adjlist indices failed" - ) - assert weights.equals(rep_weights.result()), ( - "Replication of " "adjlist weights failed" - ) + assert_series_equal(offsets, rep_offsets.result(), check_names=False) + assert_series_equal(indices, rep_indices.result(), check_names=False) + assert_series_equal(weights, rep_weights.result(), check_names=False) @pytest.mark.skipif( @@ -295,10 +284,6 @@ def test_enable_batch_adjlist_replication_no_weights( weights = adjlist.weights for worker in G.batch_adjlists: (rep_offsets, rep_indices, rep_weights) = G.batch_adjlists[worker] - assert offsets.equals(rep_offsets.result()), ( - "Replication of " "adjlist offsets failed" - ) - assert indices.equals(rep_indices.result()), ( - "Replication of " "adjlist indices failed" - ) + assert_series_equal(offsets, rep_offsets.result(), check_names=False) + assert_series_equal(indices, rep_indices.result(), check_names=False) assert weights is None and rep_weights is None diff --git a/python/cugraph/cugraph/tests/test_overlap.py b/python/cugraph/cugraph/tests/test_overlap.py index 5a45fbff554..abf012ac3b9 100644 --- a/python/cugraph/cugraph/tests/test_overlap.py +++ b/python/cugraph/cugraph/tests/test_overlap.py @@ -17,6 +17,8 @@ import scipy import cudf +from cudf.testing import assert_series_equal + import cugraph from cugraph.tests import utils @@ -186,4 +188,6 @@ def test_overlap_multi_column(graph_file): df_exp = cugraph.overlap(G2, vertex_pair[["src_0", "dst_0"]]) # Calculating mismatch - assert df_res["overlap_coeff"].equals(df_exp["overlap_coeff"]) + actual = df_res.sort_values("0_source").reset_index() + expected = df_exp.sort_values("source").reset_index() + assert_series_equal(actual["overlap_coeff"], expected["overlap_coeff"]) diff --git a/python/cugraph/cugraph/tests/test_sorensen.py b/python/cugraph/cugraph/tests/test_sorensen.py index d18c51a5e59..e736b8da0b5 100644 --- a/python/cugraph/cugraph/tests/test_sorensen.py +++ b/python/cugraph/cugraph/tests/test_sorensen.py @@ -15,6 +15,8 @@ import pytest import cudf +from cudf.testing import assert_series_equal + import cugraph from cugraph.tests import utils @@ -241,4 +243,6 @@ def test_sorensen_multi_column(read_csv): df_exp = cugraph.sorensen(G2, vertex_pair[["src_0", "dst_0"]]) # Calculating mismatch - assert df_res["sorensen_coeff"].equals(df_exp["sorensen_coeff"]) + actual = df_res.sort_values("0_source").reset_index() + expected = df_exp.sort_values("source").reset_index() + assert_series_equal(actual["sorensen_coeff"], expected["sorensen_coeff"]) diff --git a/python/cugraph/cugraph/tests/test_wjaccard.py b/python/cugraph/cugraph/tests/test_wjaccard.py index 73bcfa4de35..f0e1283a0fb 100644 --- a/python/cugraph/cugraph/tests/test_wjaccard.py +++ b/python/cugraph/cugraph/tests/test_wjaccard.py @@ -17,6 +17,8 @@ import pytest import cudf +from cudf.testing import assert_series_equal + import cugraph from cugraph.tests import utils @@ -168,4 +170,6 @@ def test_wjaccard_multi_column(read_csv): df_exp = cugraph.jaccard_w(G2, weights, vertex_pair[["src_0", "dst_0"]]) # Calculating mismatch - assert df_res["jaccard_coeff"].equals(df_exp["jaccard_coeff"]) + actual = df_res.sort_values("0_source").reset_index() + expected = df_exp.sort_values("source").reset_index() + assert_series_equal(actual["jaccard_coeff"], expected["jaccard_coeff"]) diff --git a/python/cugraph/cugraph/tests/test_woverlap.py b/python/cugraph/cugraph/tests/test_woverlap.py index 05422b8c6a8..2a4be372517 100644 --- a/python/cugraph/cugraph/tests/test_woverlap.py +++ b/python/cugraph/cugraph/tests/test_woverlap.py @@ -17,6 +17,7 @@ import scipy import numpy as np import cudf +from cudf.testing import assert_series_equal import cugraph from cugraph.tests import utils @@ -157,4 +158,6 @@ def test_woverlap_multi_column(graph_file): df_exp = cugraph.overlap_w(G2, weights, vertex_pair[["src_0", "dst_0"]]) # Calculating mismatch - assert df_res["overlap_coeff"].equals(df_exp["overlap_coeff"]) + actual = df_res.sort_values("0_source").reset_index() + expected = df_exp.sort_values("source").reset_index() + assert_series_equal(actual["overlap_coeff"], expected["overlap_coeff"]) diff --git a/python/cugraph/cugraph/tests/test_wsorensen.py b/python/cugraph/cugraph/tests/test_wsorensen.py index 6dcb0065f78..57d277d4173 100644 --- a/python/cugraph/cugraph/tests/test_wsorensen.py +++ b/python/cugraph/cugraph/tests/test_wsorensen.py @@ -17,6 +17,8 @@ import pytest import cudf +from cudf.testing import assert_series_equal + import cugraph from cugraph.tests import utils @@ -172,4 +174,6 @@ def test_wsorensen_multi_column(read_csv): df_exp = cugraph.sorensen_w(G2, weights, vertex_pair[["src_0", "dst_0"]]) # Calculating mismatch - assert df_res["sorensen_coeff"].equals(df_exp["sorensen_coeff"]) + actual = df_res.sort_values("0_source").reset_index() + expected = df_exp.sort_values("source").reset_index() + assert_series_equal(actual["sorensen_coeff"], expected["sorensen_coeff"])