From efc05b3314aa8d558256c89bc49bd30d6ef0c5ab Mon Sep 17 00:00:00 2001 From: Rick Ratzel <3039903+rlratzel@users.noreply.github.com> Date: Mon, 25 Jul 2022 12:24:11 -0400 Subject: [PATCH] Add options to `extract_subgraph()` to bypass renumbering and adding edge_data, exclude internal `_WEIGHT_` column from `edge_property_names`, added `num_vertices_with_properties` attr (#2419) Add options to `extract_subgraph()` to bypass renumbering and adding edge_data, exclude internal `_WEIGHT_` column from `edge_property_names`. Also added a new attribute `num_vertices_with_properties` which returns the number of vertices with properties, which is different than the number of vertices, since vertices can be added via `add_edge_data()`. This is needed for GNN use cases which need to know how many verts have properties which can be accessed (this corresponds to the number of rows in the internal vertex prop data table). Added unit tests to verify that the new `extract_subgraph()` options work, that the new `num_vertices_with_properties` attribute is correct, and that `_WEIGHT_` column names aren't included, for both SG and MG versions. 
closes #2418 closes #2410 Authors: - Rick Ratzel (https://github.com/rlratzel) Approvers: - Alex Barghi (https://github.com/alexbarghi-nv) - Erik Welch (https://github.com/eriknw) - Brad Rees (https://github.com/BradReesWork) URL: https://github.com/rapidsai/cugraph/pull/2419 --- .../dask/structure/mg_property_graph.py | 110 +++++++---- .../cugraph/structure/property_graph.py | 74 +++++--- python/cugraph/cugraph/tests/conftest.py | 5 +- .../tests/mg/test_mg_property_graph.py | 155 ++++++++++++--- .../cugraph/tests/test_property_graph.py | 179 +++++++++++------- 5 files changed, 363 insertions(+), 160 deletions(-) diff --git a/python/cugraph/cugraph/dask/structure/mg_property_graph.py b/python/cugraph/cugraph/dask/structure/mg_property_graph.py index 83069a2b1a6..9a9438a018e 100644 --- a/python/cugraph/cugraph/dask/structure/mg_property_graph.py +++ b/python/cugraph/cugraph/dask/structure/mg_property_graph.py @@ -126,6 +126,7 @@ def __init__(self, num_workers=None): # Cached property values self.__num_vertices = None + self.__num_vertices_with_properties = None # number of gpu's to use if num_workers is None: @@ -145,6 +146,18 @@ def num_vertices(self): self.__num_vertices = vert_count.compute() return self.__num_vertices + @property + def num_vertices_with_properties(self): + if self.__num_vertices_with_properties is not None: + return self.__num_vertices_with_properties + + if self.__vertex_prop_dataframe is not None: + self.__num_vertices_with_properties = \ + len(self.__vertex_prop_dataframe) + return self.__num_vertices_with_properties + + return 0 + @property def num_edges(self): if self.__edge_prop_dataframe is not None: @@ -156,7 +169,8 @@ def num_edges(self): def edges(self): if self.__edge_prop_dataframe is not None: return self.__edge_prop_dataframe[[self.src_col_name, - self.dst_col_name]] + self.dst_col_name, + self.edge_id_col_name]] return None @property @@ -176,6 +190,8 @@ def edge_property_names(self): props.remove(self.dst_col_name) 
props.remove(self.edge_id_col_name) props.remove(self.type_col_name) # should "type" be removed? + if self.weight_col_name in props: + props.remove(self.weight_col_name) return props return [] @@ -260,9 +276,10 @@ def add_vertex_data(self, "found in dataframe: " f"{list(invalid_columns)}") - # Clear the cached value for num_vertices since more could be added in - # this method. + # Clear the cached values related to the number of vertices since more + # could be added in this method. self.__num_vertices = None + self.__num_vertices_with_properties = None # Initialize the __vertex_prop_dataframe if necessary using the same # type as the incoming dataframe. @@ -373,7 +390,7 @@ def add_edge_data(self, f"{list(invalid_columns)}") # Clear the cached value for num_vertices since more could be added in - # this method. + # this method. This method cannot affect num_vertices_with_properties self.__num_vertices = None default_edge_columns = [self.src_col_name, @@ -467,7 +484,9 @@ def extract_subgraph(self, selection=None, edge_weight_property=None, default_edge_weight=None, - allow_multi_edges=False + allow_multi_edges=False, + renumber_graph=True, + add_edge_data=True ): """ Return a subgraph of the overall PropertyGraph containing vertices @@ -495,7 +514,13 @@ def extract_subgraph(self, allow_multi_edges : bool If True, multiple edges should be used to create the return Graph, otherwise multiple edges will be detected and an exception raised. - + renumber_graph : bool (default is True) + If True, return a Graph that has been renumbered for use by graph + algorithms. If False, the returned graph will need to be manually + renumbered prior to calling graph algos. + add_edge_data : bool (default is True) + If True, add meta data about the edges contained in the extracted + graph which are required for future calls to annotate_dataframe(). 
Returns ------- A Graph instance of the same type as create_using containing only the @@ -556,7 +581,9 @@ def extract_subgraph(self, create_using=create_using, edge_weight_property=edge_weight_property, default_edge_weight=default_edge_weight, - allow_multi_edges=allow_multi_edges) + allow_multi_edges=allow_multi_edges, + renumber_graph=renumber_graph, + add_edge_data=add_edge_data) def annotate_dataframe(self, df, G, edge_vertex_col_names): raise NotImplementedError() @@ -566,7 +593,9 @@ def edge_props_to_graph(self, create_using, edge_weight_property=None, default_edge_weight=None, - allow_multi_edges=False): + allow_multi_edges=False, + renumber_graph=True, + add_edge_data=True): """ Create and return a Graph from the edges in edge_prop_df. """ @@ -594,10 +623,8 @@ def edge_props_to_graph(self, # If a default_edge_weight was specified but an edge_weight_property # was not, a new edge weight column must be added. elif default_edge_weight: - edge_attr = self.__gen_unique_name(edge_prop_df.columns, - prefix=self.weight_col_name) + edge_attr = self.weight_col_name edge_prop_df[edge_attr] = default_edge_weight - else: edge_attr = None @@ -630,18 +657,43 @@ def edge_props_to_graph(self, msg = "default Graph graph type" raise RuntimeError("query resulted in duplicate edges which " f"cannot be represented with the {msg}") - G.from_dask_cudf_edgelist( - edge_prop_df, - source=self.src_col_name, - destination=self.dst_col_name, - edge_attr=edge_attr, renumber=True) - # Set the edge_data on the resulting Graph to a DataFrame containing - # the edges and the edge ID for each. Edge IDs are needed for future - # calls to annotate_dataframe() in order to associate edges with their - # properties, since the PG can contain multiple edges between vertrices - # with different properties. 
- G.edge_data = self.__create_property_lookup_table(edge_prop_df) - # FIXME: also add vertex_data + + # FIXME: MNMG Graphs required renumber to be True due to requirements + # on legacy code that needed segment offsets, partition offsets, + # etc. which were previously computed during the "legacy" C + # renumbering. The workaround is to pass renumber=True, then manually + # call G.compute_renumber_edge_list(legacy_renum_only=True) to compute + # the required meta-data without changing vertex IDs. + if renumber_graph is False: + renumber = True + else: + renumber = renumber_graph + + col_names = [self.src_col_name, self.dst_col_name] + if edge_attr is not None: + col_names.append(edge_attr) + + G.from_dask_cudf_edgelist(edge_prop_df[col_names], + source=self.src_col_name, + destination=self.dst_col_name, + edge_attr=edge_attr, + renumber=renumber) + # FIXME: see FIXME above - to generate the edgelist, + # compute_renumber_edge_list() must be called, but legacy mode needs to + # be used based on if renumbering was to be done or not. + if renumber_graph is False: + G.compute_renumber_edge_list(legacy_renum_only=True) + else: + G.compute_renumber_edge_list(legacy_renum_only=False) + + if add_edge_data: + # Set the edge_data on the resulting Graph to a DataFrame + # containing the edges and the edge ID for each. Edge IDs are + # needed for future calls to annotate_dataframe() in order to + # associate edges with their properties, since the PG can contain + # multiple edges between vertrices with different properties. + # FIXME: also add vertex_data + G.edge_data = self.__create_property_lookup_table(edge_prop_df) return G @@ -684,18 +736,6 @@ def __get_all_vertices_series(self): vert_sers.append(epd[self.dst_col_name]) return vert_sers - @staticmethod - def __gen_unique_name(current_names, prefix="col"): - """ - Helper function to generate a currently unused name. 
- """ - name = prefix - counter = 2 - while name in current_names: - name = f"{prefix}{counter}" - counter += 1 - return name - @staticmethod def __get_new_column_dtypes(from_df, to_df): """ diff --git a/python/cugraph/cugraph/structure/property_graph.py b/python/cugraph/cugraph/structure/property_graph.py index ca796aa4c4f..815192ef7b4 100644 --- a/python/cugraph/cugraph/structure/property_graph.py +++ b/python/cugraph/cugraph/structure/property_graph.py @@ -135,6 +135,7 @@ def __init__(self): # Cached property values self.__num_vertices = None + self.__num_vertices_with_properties = None # PropertyGraph read-only attributes @property @@ -152,6 +153,18 @@ def num_vertices(self): return self.__num_vertices + @property + def num_vertices_with_properties(self): + if self.__num_vertices_with_properties is not None: + return self.__num_vertices_with_properties + + if self.__vertex_prop_dataframe is not None: + self.__num_vertices_with_properties = \ + len(self.__vertex_prop_dataframe) + return self.__num_vertices_with_properties + + return 0 + @property def num_edges(self): if self.__edge_prop_dataframe is not None: @@ -183,6 +196,8 @@ def edge_property_names(self): props.remove(self.dst_col_name) props.remove(self.edge_id_col_name) props.remove(self.type_col_name) # should "type" be removed? + if self.weight_col_name in props: + props.remove(self.weight_col_name) return props return [] @@ -278,9 +293,10 @@ def add_vertex_data(self, "the PropertyGraph was already initialized " f"using type {self.__dataframe_type}") - # Clear the cached value for num_vertices since more could be added in - # this method. + # Clear the cached values related to the number of vertices since more + # could be added in this method. self.__num_vertices = None + self.__num_vertices_with_properties = None # Initialize the __vertex_prop_dataframe if necessary using the same # type as the incoming dataframe. 
@@ -400,7 +416,7 @@ def add_edge_data(self, f"using type {self.__dataframe_type}") # Clear the cached value for num_vertices since more could be added in - # this method. + # this method. This method cannot affect num_vertices_with_properties self.__num_vertices = None default_edge_columns = [self.src_col_name, @@ -551,7 +567,9 @@ def extract_subgraph(self, selection=None, edge_weight_property=None, default_edge_weight=None, - allow_multi_edges=False + allow_multi_edges=False, + renumber_graph=True, + add_edge_data=True ): """ Return a subgraph of the overall PropertyGraph containing vertices @@ -579,6 +597,13 @@ def extract_subgraph(self, allow_multi_edges : bool If True, multiple edges should be used to create the return Graph, otherwise multiple edges will be detected and an exception raised. + renumber_graph : bool (default is True) + If True, return a Graph that has been renumbered for use by graph + algorithms. If False, the returned graph will need to be manually + renumbered prior to calling graph algos. + add_edge_data : bool (default is True) + If True, add meta data about the edges contained in the extracted + graph which are required for future calls to annotate_dataframe(). Returns ------- @@ -641,7 +666,9 @@ def extract_subgraph(self, create_using=create_using, edge_weight_property=edge_weight_property, default_edge_weight=default_edge_weight, - allow_multi_edges=allow_multi_edges) + allow_multi_edges=allow_multi_edges, + renumber_graph=renumber_graph, + add_edge_data=add_edge_data) def annotate_dataframe(self, df, G, edge_vertex_col_names): """ @@ -713,7 +740,9 @@ def edge_props_to_graph(self, create_using, edge_weight_property=None, default_edge_weight=None, - allow_multi_edges=False): + allow_multi_edges=False, + renumber_graph=True, + add_edge_data=True): """ Create and return a Graph from the edges in edge_prop_df. 
""" @@ -742,10 +771,8 @@ def edge_props_to_graph(self, # If a default_edge_weight was specified but an edge_weight_property # was not, a new edge weight column must be added. elif default_edge_weight: - edge_attr = self.__gen_unique_name(edge_prop_df.columns, - prefix=self.weight_col_name) + edge_attr = self.weight_col_name edge_prop_df[edge_attr] = default_edge_weight - else: edge_attr = None @@ -782,20 +809,21 @@ def edge_props_to_graph(self, create_args = {"source": self.src_col_name, "destination": self.dst_col_name, "edge_attr": edge_attr, - "renumber": True, + "renumber": renumber_graph, } if type(edge_prop_df) is cudf.DataFrame: G.from_cudf_edgelist(edge_prop_df, **create_args) else: G.from_pandas_edgelist(edge_prop_df, **create_args) - # Set the edge_data on the resulting Graph to a DataFrame containing - # the edges and the edge ID for each. Edge IDs are needed for future - # calls to annotate_dataframe() in order to associate edges with their - # properties, since the PG can contain multiple edges between vertrices - # with different properties. - G.edge_data = self.__create_property_lookup_table(edge_prop_df) - # FIXME: also add vertex_data + if add_edge_data: + # Set the edge_data on the resulting Graph to a DataFrame + # containing the edges and the edge ID for each. Edge IDs are + # needed for future calls to annotate_dataframe() in order to + # associate edges with their properties, since the PG can contain + # multiple edges between vertrices with different properties. + # FIXME: also add vertex_data + G.edge_data = self.__create_property_lookup_table(edge_prop_df) return G @@ -862,18 +890,6 @@ def __get_all_vertices_series(self): vert_sers.append(epd[self.dst_col_name]) return vert_sers - @staticmethod - def __gen_unique_name(current_names, prefix="col"): - """ - Helper function to generate a currently unused name. 
- """ - name = prefix - counter = 2 - while name in current_names: - name = f"{prefix}{counter}" - counter += 1 - return name - @staticmethod def __get_new_column_dtypes(from_df, to_df): """ diff --git a/python/cugraph/cugraph/tests/conftest.py b/python/cugraph/cugraph/tests/conftest.py index f5bcb35995e..775f365042b 100644 --- a/python/cugraph/cugraph/tests/conftest.py +++ b/python/cugraph/cugraph/tests/conftest.py @@ -58,10 +58,7 @@ def dask_client(): yield client Comms.destroy() - # Shut down the connected scheduler and workers - # therefore we will no longer rely on killing the dask cluster ID - # for MNMG runs - client.shutdown() + client.close() if cluster: cluster.close() print("\ndask_client fixture: client.close() called") diff --git a/python/cugraph/cugraph/tests/mg/test_mg_property_graph.py b/python/cugraph/cugraph/tests/mg/test_mg_property_graph.py index f1ff0a3184e..d69cb600873 100644 --- a/python/cugraph/cugraph/tests/mg/test_mg_property_graph.py +++ b/python/cugraph/cugraph/tests/mg/test_mg_property_graph.py @@ -12,11 +12,14 @@ # limitations under the License. import gc -import cugraph.dask as dcg + import dask_cudf import pytest import pandas as pd import cudf +from cudf.testing import assert_frame_equal + +import cugraph.dask as dcg from cugraph.testing.utils import RAPIDS_DATASET_ROOT_DIR_PATH from cugraph.testing import utils @@ -51,17 +54,6 @@ (78634, 47906, 0), ] ], - "taxpayers": [ - ["payer_id", "amount"], - [(11, 1123.98), - (4, 3243.7), - (21, 8932.3), - (16, 3241.77), - (86, 789.2), - (89021, 23.98), - (78634, 41.77), - ] - ], "transactions": [ ["user_id", "merchant_id", "volume", "time", "card_num", "card_type"], [(89021, 11, 33.2, 1639084966.5513437, 123456, "MC"), @@ -91,6 +83,17 @@ } +dataset2 = { + "simple": [ + ["src", "dst", "some_property"], + [(99, 22, "a"), + (98, 34, "b"), + (97, 56, "c"), + (96, 88, "d"), + ] + ], +} + # Placeholder for a directed Graph instance. 
This is not constructed here in # order to prevent cuGraph code from running on import, which would prevent # proper pytest collection if an exception is raised. See setup_function(). @@ -170,7 +173,7 @@ def dataset1_PropertyGraph(request): dataframe_type = request.param[0] from cugraph.experimental import PropertyGraph - (merchants, users, taxpayers, + (merchants, users, transactions, relationships, referrals) = dataset1.values() pG = PropertyGraph() @@ -195,11 +198,6 @@ def dataset1_PropertyGraph(request): type_name="users", vertex_col_name="user_id", property_columns=None) - pG.add_vertex_data(dataframe_type(columns=taxpayers[0], - data=taxpayers[1]), - type_name="taxpayers", - vertex_col_name="payer_id", - property_columns=None) pG.add_edge_data(dataframe_type(columns=transactions[0], data=transactions[1]), @@ -227,7 +225,7 @@ def dataset1_MGPropertyGraph(dask_client): data added from dataset1, parameterized for different DataFrame types. """ dataframe_type = cudf.DataFrame - (merchants, users, taxpayers, + (merchants, users, transactions, relationships, referrals) = dataset1.values() from cugraph.experimental import MGPropertyGraph mpG = MGPropertyGraph() @@ -256,13 +254,6 @@ def dataset1_MGPropertyGraph(dask_client): vertex_col_name="user_id", property_columns=None) - sg_df = dataframe_type(columns=taxpayers[0], data=taxpayers[1]) - mg_df = dask_cudf.from_cudf(sg_df, npartitions=2) - mpG.add_vertex_data(mg_df, - type_name="taxpayers", - vertex_col_name="payer_id", - property_columns=None) - sg_df = dataframe_type(columns=transactions[0], data=transactions[1]) mg_df = dask_cudf.from_cudf(sg_df, npartitions=2) mpG.add_edge_data(mg_df, @@ -287,6 +278,23 @@ def dataset1_MGPropertyGraph(dask_client): return mpG +@pytest.fixture(scope="module") +def dataset2_MGPropertyGraph(dask_client): + from cugraph.experimental import MGPropertyGraph + + dataframe_type = cudf.DataFrame + simple = dataset2["simple"] + mpG = MGPropertyGraph() + + sg_df = 
dataframe_type(columns=simple[0], data=simple[1]) + mgdf = dask_cudf.from_cudf(sg_df, npartitions=2) + + mpG.add_edge_data(mgdf, + vertex_col_names=("src", "dst")) + + return (mpG, simple) + + @pytest.fixture(scope="module", params=df_types_fixture_params) def net_MGPropertyGraph(dask_client): """ @@ -377,3 +385,98 @@ def test_frame_data(dataset1_PropertyGraph, dataset1_MGPropertyGraph): mg_ep_df = mgpG._edge_prop_dataframe\ .compute().sort_values(by=edge_sort_col).reset_index(drop=True) assert (sg_ep_df['_SRC_'].equals(mg_ep_df['_SRC_'])) + + +def test_property_names_attrs(dataset1_MGPropertyGraph): + """ + Ensure the correct number of user-visible properties for vertices and edges + are returned. This should exclude the internal bookkeeping properties. + """ + pG = dataset1_MGPropertyGraph + + expected_vert_prop_names = ["merchant_id", "merchant_location", + "merchant_size", "merchant_sales", + "merchant_num_employees", "merchant_name", + "user_id", "user_location", "vertical"] + expected_edge_prop_names = ["user_id", "merchant_id", "volume", "time", + "card_num", "card_type", "user_id_1", + "user_id_2", "relationship_type", "stars"] + + # Extracting a subgraph with weights has/had a side-effect of adding a + # weight column, so call extract_subgraph() to ensure the internal weight + # column name is not present. + pG.extract_subgraph(default_edge_weight=1.0, allow_multi_edges=True) + + actual_vert_prop_names = pG.vertex_property_names + actual_edge_prop_names = pG.edge_property_names + + assert sorted(actual_vert_prop_names) == sorted(expected_vert_prop_names) + assert sorted(actual_edge_prop_names) == sorted(expected_edge_prop_names) + + +def test_extract_subgraph_nonrenumbered_noedgedata(dataset2_MGPropertyGraph): + """ + Ensure a subgraph can be extracted that is not renumbered and contains no + edge_data. 
+ """ + from cugraph import Graph + + (pG, data) = dataset2_MGPropertyGraph + G = pG.extract_subgraph(create_using=Graph(directed=True), + renumber_graph=False, + add_edge_data=False) + + actual_edgelist = G.edgelist.edgelist_df.compute() + + # create a DF without the properties (ie. the last column) + expected_edgelist = cudf.DataFrame(columns=[pG.src_col_name, + pG.dst_col_name], + data=[(i, j) for (i, j, k) in data[1]]) + + assert_frame_equal(expected_edgelist.sort_values(by=pG.src_col_name, + ignore_index=True), + actual_edgelist.sort_values(by=pG.src_col_name, + ignore_index=True)) + assert hasattr(G, "edge_data") is False + + +def test_num_vertices_with_properties(dataset2_MGPropertyGraph): + """ + Checks that the num_vertices_with_properties attr is set to the number of + vertices that have properties, as opposed to just num_vertices which also + includes all verts in the graph edgelist. + """ + (pG, data) = dataset2_MGPropertyGraph + + assert pG.num_vertices == len(data[1]) * 2 # assume no repeated vertices + assert pG.num_vertices_with_properties == 0 + + df = cudf.DataFrame({"vertex": [98, 97], + "some_property": ["a", "b"], + }) + mgdf = dask_cudf.from_cudf(df, npartitions=2) + pG.add_vertex_data(mgdf, vertex_col_name="vertex") + + assert pG.num_vertices == len(data[1]) * 2 # assume no repeated vertices + assert pG.num_vertices_with_properties == 2 + + +def test_edges_attr(dataset2_MGPropertyGraph): + """ + Ensure the edges attr returns the src, dst, edge_id columns properly. + """ + (pG, data) = dataset2_MGPropertyGraph + + # create a DF without the properties (ie. 
the last column) + expected_edges = cudf.DataFrame(columns=[pG.src_col_name, pG.dst_col_name], + data=[(i, j) for (i, j, k) in data[1]]) + actual_edges = pG.edges[[pG.src_col_name, pG.dst_col_name]].compute() + + assert_frame_equal(expected_edges.sort_values(by=pG.src_col_name, + ignore_index=True), + actual_edges.sort_values(by=pG.src_col_name, + ignore_index=True)) + edge_ids = pG.edges[pG.edge_id_col_name].compute() + expected_num_edges = len(data[1]) + assert len(edge_ids) == expected_num_edges + assert edge_ids.nunique() == expected_num_edges diff --git a/python/cugraph/cugraph/tests/test_property_graph.py b/python/cugraph/cugraph/tests/test_property_graph.py index 56d0c43bd04..d4d185dac18 100644 --- a/python/cugraph/cugraph/tests/test_property_graph.py +++ b/python/cugraph/cugraph/tests/test_property_graph.py @@ -170,12 +170,19 @@ def dataset1_PropertyGraph(request): type_name="users", vertex_col_name="user_id", property_columns=None) + # Do not add taxpayers since that may now be considered invalid input (it + # adds the same vertices under different types, which leads to the same + # vertex ID appearing in the internal vertex prop table. + # + # FIXME: determine if this should be allowed or not then either remove + # "taxpayers" or uncomment it. + """ pG.add_vertex_data(dataframe_type(columns=taxpayers[0], data=taxpayers[1]), type_name="taxpayers", vertex_col_name="payer_id", property_columns=None) - + """ pG.add_edge_data(dataframe_type(columns=transactions[0], data=transactions[1]), type_name="transactions", @@ -326,8 +333,9 @@ def test_num_vertices(df_type): assert pG.num_vertices == 9 assert pG.num_edges == 0 - # The taxpayers table does not add new vertices, it only adds properties to - # vertices already present in the merchants and users tables. + # The taxpayers table does not add new unique vertices, it only adds + # properties to vertices already present in the merchants and users + # tables. 
taxpayers = dataset1["taxpayers"] taxpayers_df = df_type(columns=taxpayers[0], data=taxpayers[1]) @@ -341,6 +349,34 @@ def test_num_vertices(df_type): assert pG.num_edges == 0 +@pytest.mark.parametrize("df_type", df_types, ids=df_type_id) +def test_num_vertices_with_properties(df_type): + """ + Checks that the num_vertices_with_properties attr is set to the number of + vertices that have properties, as opposed to just num_vertices which also + includes all verts in the graph edgelist. + """ + from cugraph.experimental import PropertyGraph + + pG = PropertyGraph() + df = df_type({"src": [99, 98, 97], + "dst": [22, 34, 56], + "some_property": ["a", "b", "c"], + }) + pG.add_edge_data(df, vertex_col_names=("src", "dst")) + + assert pG.num_vertices == 6 + assert pG.num_vertices_with_properties == 0 + + df = df_type({"vertex": [98, 97], + "some_property": ["a", "b"], + }) + pG.add_vertex_data(df, vertex_col_name="vertex") + + assert pG.num_vertices == 6 + assert pG.num_vertices_with_properties == 2 + + @pytest.mark.parametrize("df_type", df_types, ids=df_type_id) def test_null_data(df_type): """ @@ -508,13 +544,18 @@ def test_extract_subgraph_vertex_prop_condition_only(dataset1_PropertyGraph): pG = dataset1_PropertyGraph - selection = pG.select_vertices("(_TYPE_=='taxpayers') & (amount<100)") + # This should result in two users: 78634 and 89216 + selection = pG.select_vertices( + f"({pG.type_col_name}=='users') " + "& ((user_location<78750) | ((user_location==78757) & (vertical==1)))") G = pG.extract_subgraph(selection=selection, create_using=DiGraph_inst, - edge_weight_property="stars") - - expected_edgelist = cudf.DataFrame({"src": [89021], "dst": [78634], - "weights": [4]}) + edge_weight_property="relationship_type", + default_edge_weight=99) + # Should result in two edges, one a "relationship", the other a "referral" + expected_edgelist = cudf.DataFrame({"src": [89216, 78634], + "dst": [78634, 89216], + "weights": [99, 8]}) actual_edgelist = 
G.unrenumber(G.edgelist.edgelist_df, "src", preserve_order=True) actual_edgelist = G.unrenumber(actual_edgelist, "dst", @@ -620,51 +661,6 @@ def test_extract_subgraph_specific_query(dataset1_PropertyGraph): assert_frame_equal(expected_edgelist, actual_edgelist, check_like=True) -def test_edge_props_to_graph(dataset1_PropertyGraph): - """ - Access the property DataFrames directly and use them to perform a more - complex query, then call edge_props_to_graph() to create the corresponding - graph. - """ - from cugraph.experimental import PropertyGraph - - pG = dataset1_PropertyGraph - vcn = PropertyGraph.vertex_col_name - tcn = PropertyGraph.type_col_name - scn = PropertyGraph.src_col_name - dcn = PropertyGraph.dst_col_name - - # Select referrals from only taxpayers who are users (should be 1) - - # Find the list of vertices that are both users and taxpayers - def contains_both(df): - return (df[tcn] == "taxpayers").any() and \ - (df[tcn] == "users").any() - verts = pG._vertex_prop_dataframe.groupby(vcn)\ - .apply(contains_both) - verts = verts[verts].keys() # get an array of only verts that have both - - # Find the "referral" edge_props containing only those verts - referrals = pG._edge_prop_dataframe[tcn] == "referrals" - srcs = pG._edge_prop_dataframe[referrals][scn].isin(verts) - dsts = pG._edge_prop_dataframe[referrals][dcn].isin(verts) - matching_edges = (srcs & dsts) - indices = matching_edges.index[matching_edges] - edge_props = pG._edge_prop_dataframe.loc[indices] - - G = pG.edge_props_to_graph(edge_props, - create_using=DiGraph_inst) - - expected_edgelist = cudf.DataFrame({"src": [89021], "dst": [78634]}) - actual_edgelist = G.unrenumber(G.edgelist.edgelist_df, "src", - preserve_order=True) - actual_edgelist = G.unrenumber(actual_edgelist, "dst", - preserve_order=True) - - assert G.is_directed() - assert_frame_equal(expected_edgelist, actual_edgelist, check_like=True) - - def test_select_vertices_from_previous_selection(dataset1_PropertyGraph): """ Ensures 
that the intersection of vertices of multiple types (only vertices @@ -675,14 +671,17 @@ def test_select_vertices_from_previous_selection(dataset1_PropertyGraph): pG = dataset1_PropertyGraph tcn = PropertyGraph.type_col_name - # Select referrals from only taxpayers who are users (should be 1) - selection = pG.select_vertices(f"{tcn} == 'taxpayers'") - selection = pG.select_vertices(f"{tcn} == 'users'", - from_previous_selection=selection) + # Select referrals from only users 89216 and 78634 using an intentionally + # awkward query with separate select calls to test from_previous_selection + selection = pG.select_vertices(f"{tcn} == 'users'") + selection = pG.select_vertices( + "((user_location == 78757) & (vertical == 1)) " + "| (user_location == 47906)", + from_previous_selection=selection) selection += pG.select_edges(f"{tcn} == 'referrals'") G = pG.extract_subgraph(create_using=DiGraph_inst, selection=selection) - expected_edgelist = cudf.DataFrame({"src": [89021], "dst": [78634]}) + expected_edgelist = cudf.DataFrame({"src": [89216], "dst": [78634]}) actual_edgelist = G.unrenumber(G.edgelist.edgelist_df, "src", preserve_order=True) actual_edgelist = G.unrenumber(actual_edgelist, "dst", @@ -869,6 +868,35 @@ def test_extract_subgraph_default_edge_weight_no_property( assert (G.edgelist.edgelist_df["weights"] == edge_weight).all() +def test_extract_subgraph_nonrenumbered_noedgedata(): + """ + Ensure a subgraph can be extracted that is not renumbered and contains no + edge_data. 
+ """ + from cugraph.experimental import PropertyGraph + from cugraph import Graph + + pG = PropertyGraph() + df = cudf.DataFrame({"src": [99, 98, 97], + "dst": [22, 34, 56], + "some_property": ["a", "b", "c"], + }) + pG.add_edge_data(df, vertex_col_names=("src", "dst")) + + G = pG.extract_subgraph(create_using=Graph(directed=True), + renumber_graph=False, + add_edge_data=False) + + expected_edgelist = cudf.DataFrame({"src": [99, 98, 97], + "dst": [22, 34, 56], + }) + assert_frame_equal(expected_edgelist.sort_values(by="src", + ignore_index=True), + G.edgelist.edgelist_df.sort_values(by="src", + ignore_index=True)) + assert hasattr(G, "edge_data") is False + + def test_graph_edge_data_added(dataset1_PropertyGraph): """ Ensures the subgraph returned from extract_subgraph() has the edge_data @@ -1031,6 +1059,33 @@ def test_get_edges(dataset1_PropertyGraph): assert (src, dst) in expected_edges +def test_property_names_attrs(dataset1_PropertyGraph): + """ + Ensure the correct number of user-visible properties for vertices and edges + are returned. This should exclude the internal bookkeeping properties. + """ + pG = dataset1_PropertyGraph + + expected_vert_prop_names = ["merchant_id", "merchant_location", + "merchant_size", "merchant_sales", + "merchant_num_employees", "merchant_name", + "user_id", "user_location", "vertical"] + expected_edge_prop_names = ["user_id", "merchant_id", "volume", "time", + "card_num", "card_type", "user_id_1", + "user_id_2", "relationship_type", "stars"] + + # Extracting a subgraph with weights has/had a side-effect of adding a + # weight column, so call extract_subgraph() to ensure the internal weight + # column name is not present. 
+ pG.extract_subgraph(default_edge_weight=1.0, allow_multi_edges=True) + + actual_vert_prop_names = pG.vertex_property_names + actual_edge_prop_names = pG.edge_property_names + + assert sorted(actual_vert_prop_names) == sorted(expected_vert_prop_names) + assert sorted(actual_edge_prop_names) == sorted(expected_edge_prop_names) + + @pytest.mark.skip(reason="unfinished") def test_extract_subgraph_with_vertex_ids(): """ @@ -1042,14 +1097,6 @@ def test_extract_subgraph_with_vertex_ids(): raise NotImplementedError -@pytest.mark.skip(reason="unfinished") -def test_dgl_use_case(): - """ - FIXME: add a test demonstrating typical DGL use cases - """ - raise NotImplementedError - - # ============================================================================= # Benchmarks # =============================================================================