diff --git a/python/cugraph/cugraph/dask/structure/mg_property_graph.py b/python/cugraph/cugraph/dask/structure/mg_property_graph.py index 83069a2b1a6..9a9438a018e 100644 --- a/python/cugraph/cugraph/dask/structure/mg_property_graph.py +++ b/python/cugraph/cugraph/dask/structure/mg_property_graph.py @@ -126,6 +126,7 @@ def __init__(self, num_workers=None): # Cached property values self.__num_vertices = None + self.__num_vertices_with_properties = None # number of gpu's to use if num_workers is None: @@ -145,6 +146,18 @@ def num_vertices(self): self.__num_vertices = vert_count.compute() return self.__num_vertices + @property + def num_vertices_with_properties(self): + if self.__num_vertices_with_properties is not None: + return self.__num_vertices_with_properties + + if self.__vertex_prop_dataframe is not None: + self.__num_vertices_with_properties = \ + len(self.__vertex_prop_dataframe) + return self.__num_vertices_with_properties + + return 0 + @property def num_edges(self): if self.__edge_prop_dataframe is not None: @@ -156,7 +169,8 @@ def num_edges(self): def edges(self): if self.__edge_prop_dataframe is not None: return self.__edge_prop_dataframe[[self.src_col_name, - self.dst_col_name]] + self.dst_col_name, + self.edge_id_col_name]] return None @property @@ -176,6 +190,8 @@ def edge_property_names(self): props.remove(self.dst_col_name) props.remove(self.edge_id_col_name) props.remove(self.type_col_name) # should "type" be removed? + if self.weight_col_name in props: + props.remove(self.weight_col_name) return props return [] @@ -260,9 +276,10 @@ def add_vertex_data(self, "found in dataframe: " f"{list(invalid_columns)}") - # Clear the cached value for num_vertices since more could be added in - # this method. + # Clear the cached values related to the number of vertices since more + # could be added in this method. 
self.__num_vertices = None + self.__num_vertices_with_properties = None # Initialize the __vertex_prop_dataframe if necessary using the same # type as the incoming dataframe. @@ -373,7 +390,7 @@ def add_edge_data(self, f"{list(invalid_columns)}") # Clear the cached value for num_vertices since more could be added in - # this method. + # this method. This method cannot affect num_vertices_with_properties self.__num_vertices = None default_edge_columns = [self.src_col_name, @@ -467,7 +484,9 @@ def extract_subgraph(self, selection=None, edge_weight_property=None, default_edge_weight=None, - allow_multi_edges=False + allow_multi_edges=False, + renumber_graph=True, + add_edge_data=True ): """ Return a subgraph of the overall PropertyGraph containing vertices @@ -495,7 +514,13 @@ def extract_subgraph(self, allow_multi_edges : bool If True, multiple edges should be used to create the return Graph, otherwise multiple edges will be detected and an exception raised. - + renumber_graph : bool (default is True) + If True, return a Graph that has been renumbered for use by graph + algorithms. If False, the returned graph will need to be manually + renumbered prior to calling graph algos. + add_edge_data : bool (default is True) + If True, add meta data about the edges contained in the extracted + graph which are required for future calls to annotate_dataframe(). 
Returns ------- A Graph instance of the same type as create_using containing only the @@ -556,7 +581,9 @@ def extract_subgraph(self, create_using=create_using, edge_weight_property=edge_weight_property, default_edge_weight=default_edge_weight, - allow_multi_edges=allow_multi_edges) + allow_multi_edges=allow_multi_edges, + renumber_graph=renumber_graph, + add_edge_data=add_edge_data) def annotate_dataframe(self, df, G, edge_vertex_col_names): raise NotImplementedError() @@ -566,7 +593,9 @@ def edge_props_to_graph(self, create_using, edge_weight_property=None, default_edge_weight=None, - allow_multi_edges=False): + allow_multi_edges=False, + renumber_graph=True, + add_edge_data=True): """ Create and return a Graph from the edges in edge_prop_df. """ @@ -594,10 +623,8 @@ def edge_props_to_graph(self, # If a default_edge_weight was specified but an edge_weight_property # was not, a new edge weight column must be added. elif default_edge_weight: - edge_attr = self.__gen_unique_name(edge_prop_df.columns, - prefix=self.weight_col_name) + edge_attr = self.weight_col_name edge_prop_df[edge_attr] = default_edge_weight - else: edge_attr = None @@ -630,18 +657,43 @@ def edge_props_to_graph(self, msg = "default Graph graph type" raise RuntimeError("query resulted in duplicate edges which " f"cannot be represented with the {msg}") - G.from_dask_cudf_edgelist( - edge_prop_df, - source=self.src_col_name, - destination=self.dst_col_name, - edge_attr=edge_attr, renumber=True) - # Set the edge_data on the resulting Graph to a DataFrame containing - # the edges and the edge ID for each. Edge IDs are needed for future - # calls to annotate_dataframe() in order to associate edges with their - # properties, since the PG can contain multiple edges between vertrices - # with different properties. 
- G.edge_data = self.__create_property_lookup_table(edge_prop_df) - # FIXME: also add vertex_data + + # FIXME: MNMG Graphs required renumber to be True due to requirements + # on legacy code that needed segment offsets, partition offsets, + # etc. which were previously computed during the "legacy" C + # renumbering. The workaround is to pass renumber=True, then manually + # call G.compute_renumber_edge_list(legacy_renum_only=True) to compute + # the required meta-data without changing vertex IDs. + if renumber_graph is False: + renumber = True + else: + renumber = renumber_graph + + col_names = [self.src_col_name, self.dst_col_name] + if edge_attr is not None: + col_names.append(edge_attr) + + G.from_dask_cudf_edgelist(edge_prop_df[col_names], + source=self.src_col_name, + destination=self.dst_col_name, + edge_attr=edge_attr, + renumber=renumber) + # FIXME: see FIXME above - to generate the edgelist, + # compute_renumber_edge_list() must be called, but legacy mode needs to + # be used based on if renumbering was to be done or not. + if renumber_graph is False: + G.compute_renumber_edge_list(legacy_renum_only=True) + else: + G.compute_renumber_edge_list(legacy_renum_only=False) + + if add_edge_data: + # Set the edge_data on the resulting Graph to a DataFrame + # containing the edges and the edge ID for each. Edge IDs are + # needed for future calls to annotate_dataframe() in order to + # associate edges with their properties, since the PG can contain + # multiple edges between vertices with different properties. + # FIXME: also add vertex_data + G.edge_data = self.__create_property_lookup_table(edge_prop_df) return G @@ -684,18 +736,6 @@ def __get_all_vertices_series(self): vert_sers.append(epd[self.dst_col_name]) return vert_sers - @staticmethod - def __gen_unique_name(current_names, prefix="col"): - """ - Helper function to generate a currently unused name. 
- """ - name = prefix - counter = 2 - while name in current_names: - name = f"{prefix}{counter}" - counter += 1 - return name - @staticmethod def __get_new_column_dtypes(from_df, to_df): """ diff --git a/python/cugraph/cugraph/structure/property_graph.py b/python/cugraph/cugraph/structure/property_graph.py index ca796aa4c4f..815192ef7b4 100644 --- a/python/cugraph/cugraph/structure/property_graph.py +++ b/python/cugraph/cugraph/structure/property_graph.py @@ -135,6 +135,7 @@ def __init__(self): # Cached property values self.__num_vertices = None + self.__num_vertices_with_properties = None # PropertyGraph read-only attributes @property @@ -152,6 +153,18 @@ def num_vertices(self): return self.__num_vertices + @property + def num_vertices_with_properties(self): + if self.__num_vertices_with_properties is not None: + return self.__num_vertices_with_properties + + if self.__vertex_prop_dataframe is not None: + self.__num_vertices_with_properties = \ + len(self.__vertex_prop_dataframe) + return self.__num_vertices_with_properties + + return 0 + @property def num_edges(self): if self.__edge_prop_dataframe is not None: @@ -183,6 +196,8 @@ def edge_property_names(self): props.remove(self.dst_col_name) props.remove(self.edge_id_col_name) props.remove(self.type_col_name) # should "type" be removed? + if self.weight_col_name in props: + props.remove(self.weight_col_name) return props return [] @@ -278,9 +293,10 @@ def add_vertex_data(self, "the PropertyGraph was already initialized " f"using type {self.__dataframe_type}") - # Clear the cached value for num_vertices since more could be added in - # this method. + # Clear the cached values related to the number of vertices since more + # could be added in this method. self.__num_vertices = None + self.__num_vertices_with_properties = None # Initialize the __vertex_prop_dataframe if necessary using the same # type as the incoming dataframe. 
@@ -400,7 +416,7 @@ def add_edge_data(self, f"using type {self.__dataframe_type}") # Clear the cached value for num_vertices since more could be added in - # this method. + # this method. This method cannot affect num_vertices_with_properties self.__num_vertices = None default_edge_columns = [self.src_col_name, @@ -551,7 +567,9 @@ def extract_subgraph(self, selection=None, edge_weight_property=None, default_edge_weight=None, - allow_multi_edges=False + allow_multi_edges=False, + renumber_graph=True, + add_edge_data=True ): """ Return a subgraph of the overall PropertyGraph containing vertices @@ -579,6 +597,13 @@ def extract_subgraph(self, allow_multi_edges : bool If True, multiple edges should be used to create the return Graph, otherwise multiple edges will be detected and an exception raised. + renumber_graph : bool (default is True) + If True, return a Graph that has been renumbered for use by graph + algorithms. If False, the returned graph will need to be manually + renumbered prior to calling graph algos. + add_edge_data : bool (default is True) + If True, add meta data about the edges contained in the extracted + graph which are required for future calls to annotate_dataframe(). Returns ------- @@ -641,7 +666,9 @@ def extract_subgraph(self, create_using=create_using, edge_weight_property=edge_weight_property, default_edge_weight=default_edge_weight, - allow_multi_edges=allow_multi_edges) + allow_multi_edges=allow_multi_edges, + renumber_graph=renumber_graph, + add_edge_data=add_edge_data) def annotate_dataframe(self, df, G, edge_vertex_col_names): """ @@ -713,7 +740,9 @@ def edge_props_to_graph(self, create_using, edge_weight_property=None, default_edge_weight=None, - allow_multi_edges=False): + allow_multi_edges=False, + renumber_graph=True, + add_edge_data=True): """ Create and return a Graph from the edges in edge_prop_df. 
""" @@ -742,10 +771,8 @@ def edge_props_to_graph(self, # If a default_edge_weight was specified but an edge_weight_property # was not, a new edge weight column must be added. elif default_edge_weight: - edge_attr = self.__gen_unique_name(edge_prop_df.columns, - prefix=self.weight_col_name) + edge_attr = self.weight_col_name edge_prop_df[edge_attr] = default_edge_weight - else: edge_attr = None @@ -782,20 +809,21 @@ def edge_props_to_graph(self, create_args = {"source": self.src_col_name, "destination": self.dst_col_name, "edge_attr": edge_attr, - "renumber": True, + "renumber": renumber_graph, } if type(edge_prop_df) is cudf.DataFrame: G.from_cudf_edgelist(edge_prop_df, **create_args) else: G.from_pandas_edgelist(edge_prop_df, **create_args) - # Set the edge_data on the resulting Graph to a DataFrame containing - # the edges and the edge ID for each. Edge IDs are needed for future - # calls to annotate_dataframe() in order to associate edges with their - # properties, since the PG can contain multiple edges between vertrices - # with different properties. - G.edge_data = self.__create_property_lookup_table(edge_prop_df) - # FIXME: also add vertex_data + if add_edge_data: + # Set the edge_data on the resulting Graph to a DataFrame + # containing the edges and the edge ID for each. Edge IDs are + # needed for future calls to annotate_dataframe() in order to + # associate edges with their properties, since the PG can contain + # multiple edges between vertices with different properties. + # FIXME: also add vertex_data + G.edge_data = self.__create_property_lookup_table(edge_prop_df) return G @@ -862,18 +890,6 @@ def __get_all_vertices_series(self): vert_sers.append(epd[self.dst_col_name]) return vert_sers - @staticmethod - def __gen_unique_name(current_names, prefix="col"): - """ - Helper function to generate a currently unused name. 
- """ - name = prefix - counter = 2 - while name in current_names: - name = f"{prefix}{counter}" - counter += 1 - return name - @staticmethod def __get_new_column_dtypes(from_df, to_df): """ diff --git a/python/cugraph/cugraph/tests/conftest.py b/python/cugraph/cugraph/tests/conftest.py index f5bcb35995e..775f365042b 100644 --- a/python/cugraph/cugraph/tests/conftest.py +++ b/python/cugraph/cugraph/tests/conftest.py @@ -58,10 +58,7 @@ def dask_client(): yield client Comms.destroy() - # Shut down the connected scheduler and workers - # therefore we will no longer rely on killing the dask cluster ID - # for MNMG runs - client.shutdown() + client.close() if cluster: cluster.close() print("\ndask_client fixture: client.close() called") diff --git a/python/cugraph/cugraph/tests/mg/test_mg_property_graph.py b/python/cugraph/cugraph/tests/mg/test_mg_property_graph.py index f1ff0a3184e..d69cb600873 100644 --- a/python/cugraph/cugraph/tests/mg/test_mg_property_graph.py +++ b/python/cugraph/cugraph/tests/mg/test_mg_property_graph.py @@ -12,11 +12,14 @@ # limitations under the License. import gc -import cugraph.dask as dcg + import dask_cudf import pytest import pandas as pd import cudf +from cudf.testing import assert_frame_equal + +import cugraph.dask as dcg from cugraph.testing.utils import RAPIDS_DATASET_ROOT_DIR_PATH from cugraph.testing import utils @@ -51,17 +54,6 @@ (78634, 47906, 0), ] ], - "taxpayers": [ - ["payer_id", "amount"], - [(11, 1123.98), - (4, 3243.7), - (21, 8932.3), - (16, 3241.77), - (86, 789.2), - (89021, 23.98), - (78634, 41.77), - ] - ], "transactions": [ ["user_id", "merchant_id", "volume", "time", "card_num", "card_type"], [(89021, 11, 33.2, 1639084966.5513437, 123456, "MC"), @@ -91,6 +83,17 @@ } +dataset2 = { + "simple": [ + ["src", "dst", "some_property"], + [(99, 22, "a"), + (98, 34, "b"), + (97, 56, "c"), + (96, 88, "d"), + ] + ], +} + # Placeholder for a directed Graph instance. 
This is not constructed here in # order to prevent cuGraph code from running on import, which would prevent # proper pytest collection if an exception is raised. See setup_function(). @@ -170,7 +173,7 @@ def dataset1_PropertyGraph(request): dataframe_type = request.param[0] from cugraph.experimental import PropertyGraph - (merchants, users, taxpayers, + (merchants, users, transactions, relationships, referrals) = dataset1.values() pG = PropertyGraph() @@ -195,11 +198,6 @@ def dataset1_PropertyGraph(request): type_name="users", vertex_col_name="user_id", property_columns=None) - pG.add_vertex_data(dataframe_type(columns=taxpayers[0], - data=taxpayers[1]), - type_name="taxpayers", - vertex_col_name="payer_id", - property_columns=None) pG.add_edge_data(dataframe_type(columns=transactions[0], data=transactions[1]), @@ -227,7 +225,7 @@ def dataset1_MGPropertyGraph(dask_client): data added from dataset1, parameterized for different DataFrame types. """ dataframe_type = cudf.DataFrame - (merchants, users, taxpayers, + (merchants, users, transactions, relationships, referrals) = dataset1.values() from cugraph.experimental import MGPropertyGraph mpG = MGPropertyGraph() @@ -256,13 +254,6 @@ def dataset1_MGPropertyGraph(dask_client): vertex_col_name="user_id", property_columns=None) - sg_df = dataframe_type(columns=taxpayers[0], data=taxpayers[1]) - mg_df = dask_cudf.from_cudf(sg_df, npartitions=2) - mpG.add_vertex_data(mg_df, - type_name="taxpayers", - vertex_col_name="payer_id", - property_columns=None) - sg_df = dataframe_type(columns=transactions[0], data=transactions[1]) mg_df = dask_cudf.from_cudf(sg_df, npartitions=2) mpG.add_edge_data(mg_df, @@ -287,6 +278,23 @@ def dataset1_MGPropertyGraph(dask_client): return mpG +@pytest.fixture(scope="module") +def dataset2_MGPropertyGraph(dask_client): + from cugraph.experimental import MGPropertyGraph + + dataframe_type = cudf.DataFrame + simple = dataset2["simple"] + mpG = MGPropertyGraph() + + sg_df = 
dataframe_type(columns=simple[0], data=simple[1]) + mgdf = dask_cudf.from_cudf(sg_df, npartitions=2) + + mpG.add_edge_data(mgdf, + vertex_col_names=("src", "dst")) + + return (mpG, simple) + + @pytest.fixture(scope="module", params=df_types_fixture_params) def net_MGPropertyGraph(dask_client): """ @@ -377,3 +385,98 @@ def test_frame_data(dataset1_PropertyGraph, dataset1_MGPropertyGraph): mg_ep_df = mgpG._edge_prop_dataframe\ .compute().sort_values(by=edge_sort_col).reset_index(drop=True) assert (sg_ep_df['_SRC_'].equals(mg_ep_df['_SRC_'])) + + +def test_property_names_attrs(dataset1_MGPropertyGraph): + """ + Ensure the correct number of user-visible properties for vertices and edges + are returned. This should exclude the internal bookkeeping properties. + """ + pG = dataset1_MGPropertyGraph + + expected_vert_prop_names = ["merchant_id", "merchant_location", + "merchant_size", "merchant_sales", + "merchant_num_employees", "merchant_name", + "user_id", "user_location", "vertical"] + expected_edge_prop_names = ["user_id", "merchant_id", "volume", "time", + "card_num", "card_type", "user_id_1", + "user_id_2", "relationship_type", "stars"] + + # Extracting a subgraph with weights has/had a side-effect of adding a + # weight column, so call extract_subgraph() to ensure the internal weight + # column name is not present. + pG.extract_subgraph(default_edge_weight=1.0, allow_multi_edges=True) + + actual_vert_prop_names = pG.vertex_property_names + actual_edge_prop_names = pG.edge_property_names + + assert sorted(actual_vert_prop_names) == sorted(expected_vert_prop_names) + assert sorted(actual_edge_prop_names) == sorted(expected_edge_prop_names) + + +def test_extract_subgraph_nonrenumbered_noedgedata(dataset2_MGPropertyGraph): + """ + Ensure a subgraph can be extracted that is not renumbered and contains no + edge_data. 
+ """ + from cugraph import Graph + + (pG, data) = dataset2_MGPropertyGraph + G = pG.extract_subgraph(create_using=Graph(directed=True), + renumber_graph=False, + add_edge_data=False) + + actual_edgelist = G.edgelist.edgelist_df.compute() + + # create a DF without the properties (ie. the last column) + expected_edgelist = cudf.DataFrame(columns=[pG.src_col_name, + pG.dst_col_name], + data=[(i, j) for (i, j, k) in data[1]]) + + assert_frame_equal(expected_edgelist.sort_values(by=pG.src_col_name, + ignore_index=True), + actual_edgelist.sort_values(by=pG.src_col_name, + ignore_index=True)) + assert hasattr(G, "edge_data") is False + + +def test_num_vertices_with_properties(dataset2_MGPropertyGraph): + """ + Checks that the num_vertices_with_properties attr is set to the number of + vertices that have properties, as opposed to just num_vertices which also + includes all verts in the graph edgelist. + """ + (pG, data) = dataset2_MGPropertyGraph + + assert pG.num_vertices == len(data[1]) * 2 # assume no repeated vertices + assert pG.num_vertices_with_properties == 0 + + df = cudf.DataFrame({"vertex": [98, 97], + "some_property": ["a", "b"], + }) + mgdf = dask_cudf.from_cudf(df, npartitions=2) + pG.add_vertex_data(mgdf, vertex_col_name="vertex") + + assert pG.num_vertices == len(data[1]) * 2 # assume no repeated vertices + assert pG.num_vertices_with_properties == 2 + + +def test_edges_attr(dataset2_MGPropertyGraph): + """ + Ensure the edges attr returns the src, dst, edge_id columns properly. + """ + (pG, data) = dataset2_MGPropertyGraph + + # create a DF without the properties (ie. 
the last column) + expected_edges = cudf.DataFrame(columns=[pG.src_col_name, pG.dst_col_name], + data=[(i, j) for (i, j, k) in data[1]]) + actual_edges = pG.edges[[pG.src_col_name, pG.dst_col_name]].compute() + + assert_frame_equal(expected_edges.sort_values(by=pG.src_col_name, + ignore_index=True), + actual_edges.sort_values(by=pG.src_col_name, + ignore_index=True)) + edge_ids = pG.edges[pG.edge_id_col_name].compute() + expected_num_edges = len(data[1]) + assert len(edge_ids) == expected_num_edges + assert edge_ids.nunique() == expected_num_edges diff --git a/python/cugraph/cugraph/tests/test_property_graph.py b/python/cugraph/cugraph/tests/test_property_graph.py index 56d0c43bd04..d4d185dac18 100644 --- a/python/cugraph/cugraph/tests/test_property_graph.py +++ b/python/cugraph/cugraph/tests/test_property_graph.py @@ -170,12 +170,19 @@ def dataset1_PropertyGraph(request): type_name="users", vertex_col_name="user_id", property_columns=None) + # Do not add taxpayers since that may now be considered invalid input (it + # adds the same vertices under different types, which leads to the same + # vertex ID appearing in the internal vertex prop table). + # + # FIXME: determine if this should be allowed or not then either remove + # "taxpayers" or uncomment it. + """ pG.add_vertex_data(dataframe_type(columns=taxpayers[0], data=taxpayers[1]), type_name="taxpayers", vertex_col_name="payer_id", property_columns=None) - + """ pG.add_edge_data(dataframe_type(columns=transactions[0], data=transactions[1]), type_name="transactions", @@ -326,8 +333,9 @@ def test_num_vertices(df_type): assert pG.num_vertices == 9 assert pG.num_edges == 0 - # The taxpayers table does not add new vertices, it only adds properties to - # vertices already present in the merchants and users tables. + # The taxpayers table does not add new unique vertices, it only adds + # properties to vertices already present in the merchants and users + # tables. 
taxpayers = dataset1["taxpayers"] taxpayers_df = df_type(columns=taxpayers[0], data=taxpayers[1]) @@ -341,6 +349,34 @@ def test_num_vertices(df_type): assert pG.num_edges == 0 +@pytest.mark.parametrize("df_type", df_types, ids=df_type_id) +def test_num_vertices_with_properties(df_type): + """ + Checks that the num_vertices_with_properties attr is set to the number of + vertices that have properties, as opposed to just num_vertices which also + includes all verts in the graph edgelist. + """ + from cugraph.experimental import PropertyGraph + + pG = PropertyGraph() + df = df_type({"src": [99, 98, 97], + "dst": [22, 34, 56], + "some_property": ["a", "b", "c"], + }) + pG.add_edge_data(df, vertex_col_names=("src", "dst")) + + assert pG.num_vertices == 6 + assert pG.num_vertices_with_properties == 0 + + df = df_type({"vertex": [98, 97], + "some_property": ["a", "b"], + }) + pG.add_vertex_data(df, vertex_col_name="vertex") + + assert pG.num_vertices == 6 + assert pG.num_vertices_with_properties == 2 + + @pytest.mark.parametrize("df_type", df_types, ids=df_type_id) def test_null_data(df_type): """ @@ -508,13 +544,18 @@ def test_extract_subgraph_vertex_prop_condition_only(dataset1_PropertyGraph): pG = dataset1_PropertyGraph - selection = pG.select_vertices("(_TYPE_=='taxpayers') & (amount<100)") + # This should result in two users: 78634 and 89216 + selection = pG.select_vertices( + f"({pG.type_col_name}=='users') " + "& ((user_location<78750) | ((user_location==78757) & (vertical==1)))") G = pG.extract_subgraph(selection=selection, create_using=DiGraph_inst, - edge_weight_property="stars") - - expected_edgelist = cudf.DataFrame({"src": [89021], "dst": [78634], - "weights": [4]}) + edge_weight_property="relationship_type", + default_edge_weight=99) + # Should result in two edges, one a "relationship", the other a "referral" + expected_edgelist = cudf.DataFrame({"src": [89216, 78634], + "dst": [78634, 89216], + "weights": [99, 8]}) actual_edgelist = 
G.unrenumber(G.edgelist.edgelist_df, "src", preserve_order=True) actual_edgelist = G.unrenumber(actual_edgelist, "dst", @@ -620,51 +661,6 @@ def test_extract_subgraph_specific_query(dataset1_PropertyGraph): assert_frame_equal(expected_edgelist, actual_edgelist, check_like=True) -def test_edge_props_to_graph(dataset1_PropertyGraph): - """ - Access the property DataFrames directly and use them to perform a more - complex query, then call edge_props_to_graph() to create the corresponding - graph. - """ - from cugraph.experimental import PropertyGraph - - pG = dataset1_PropertyGraph - vcn = PropertyGraph.vertex_col_name - tcn = PropertyGraph.type_col_name - scn = PropertyGraph.src_col_name - dcn = PropertyGraph.dst_col_name - - # Select referrals from only taxpayers who are users (should be 1) - - # Find the list of vertices that are both users and taxpayers - def contains_both(df): - return (df[tcn] == "taxpayers").any() and \ - (df[tcn] == "users").any() - verts = pG._vertex_prop_dataframe.groupby(vcn)\ - .apply(contains_both) - verts = verts[verts].keys() # get an array of only verts that have both - - # Find the "referral" edge_props containing only those verts - referrals = pG._edge_prop_dataframe[tcn] == "referrals" - srcs = pG._edge_prop_dataframe[referrals][scn].isin(verts) - dsts = pG._edge_prop_dataframe[referrals][dcn].isin(verts) - matching_edges = (srcs & dsts) - indices = matching_edges.index[matching_edges] - edge_props = pG._edge_prop_dataframe.loc[indices] - - G = pG.edge_props_to_graph(edge_props, - create_using=DiGraph_inst) - - expected_edgelist = cudf.DataFrame({"src": [89021], "dst": [78634]}) - actual_edgelist = G.unrenumber(G.edgelist.edgelist_df, "src", - preserve_order=True) - actual_edgelist = G.unrenumber(actual_edgelist, "dst", - preserve_order=True) - - assert G.is_directed() - assert_frame_equal(expected_edgelist, actual_edgelist, check_like=True) - - def test_select_vertices_from_previous_selection(dataset1_PropertyGraph): """ Ensures 
that the intersection of vertices of multiple types (only vertices @@ -675,14 +671,17 @@ def test_select_vertices_from_previous_selection(dataset1_PropertyGraph): pG = dataset1_PropertyGraph tcn = PropertyGraph.type_col_name - # Select referrals from only taxpayers who are users (should be 1) - selection = pG.select_vertices(f"{tcn} == 'taxpayers'") - selection = pG.select_vertices(f"{tcn} == 'users'", - from_previous_selection=selection) + # Select referrals from only users 89216 and 78634 using an intentionally + # awkward query with separate select calls to test from_previous_selection + selection = pG.select_vertices(f"{tcn} == 'users'") + selection = pG.select_vertices( + "((user_location == 78757) & (vertical == 1)) " + "| (user_location == 47906)", + from_previous_selection=selection) selection += pG.select_edges(f"{tcn} == 'referrals'") G = pG.extract_subgraph(create_using=DiGraph_inst, selection=selection) - expected_edgelist = cudf.DataFrame({"src": [89021], "dst": [78634]}) + expected_edgelist = cudf.DataFrame({"src": [89216], "dst": [78634]}) actual_edgelist = G.unrenumber(G.edgelist.edgelist_df, "src", preserve_order=True) actual_edgelist = G.unrenumber(actual_edgelist, "dst", @@ -869,6 +868,35 @@ def test_extract_subgraph_default_edge_weight_no_property( assert (G.edgelist.edgelist_df["weights"] == edge_weight).all() +def test_extract_subgraph_nonrenumbered_noedgedata(): + """ + Ensure a subgraph can be extracted that is not renumbered and contains no + edge_data. 
+ """ + from cugraph.experimental import PropertyGraph + from cugraph import Graph + + pG = PropertyGraph() + df = cudf.DataFrame({"src": [99, 98, 97], + "dst": [22, 34, 56], + "some_property": ["a", "b", "c"], + }) + pG.add_edge_data(df, vertex_col_names=("src", "dst")) + + G = pG.extract_subgraph(create_using=Graph(directed=True), + renumber_graph=False, + add_edge_data=False) + + expected_edgelist = cudf.DataFrame({"src": [99, 98, 97], + "dst": [22, 34, 56], + }) + assert_frame_equal(expected_edgelist.sort_values(by="src", + ignore_index=True), + G.edgelist.edgelist_df.sort_values(by="src", + ignore_index=True)) + assert hasattr(G, "edge_data") is False + + def test_graph_edge_data_added(dataset1_PropertyGraph): """ Ensures the subgraph returned from extract_subgraph() has the edge_data @@ -1031,6 +1059,33 @@ def test_get_edges(dataset1_PropertyGraph): assert (src, dst) in expected_edges +def test_property_names_attrs(dataset1_PropertyGraph): + """ + Ensure the correct number of user-visible properties for vertices and edges + are returned. This should exclude the internal bookkeeping properties. + """ + pG = dataset1_PropertyGraph + + expected_vert_prop_names = ["merchant_id", "merchant_location", + "merchant_size", "merchant_sales", + "merchant_num_employees", "merchant_name", + "user_id", "user_location", "vertical"] + expected_edge_prop_names = ["user_id", "merchant_id", "volume", "time", + "card_num", "card_type", "user_id_1", + "user_id_2", "relationship_type", "stars"] + + # Extracting a subgraph with weights has/had a side-effect of adding a + # weight column, so call extract_subgraph() to ensure the internal weight + # column name is not present. 
+ pG.extract_subgraph(default_edge_weight=1.0, allow_multi_edges=True) + + actual_vert_prop_names = pG.vertex_property_names + actual_edge_prop_names = pG.edge_property_names + + assert sorted(actual_vert_prop_names) == sorted(expected_vert_prop_names) + assert sorted(actual_edge_prop_names) == sorted(expected_edge_prop_names) + + @pytest.mark.skip(reason="unfinished") def test_extract_subgraph_with_vertex_ids(): """ @@ -1042,14 +1097,6 @@ def test_extract_subgraph_with_vertex_ids(): raise NotImplementedError -@pytest.mark.skip(reason="unfinished") -def test_dgl_use_case(): - """ - FIXME: add a test demonstrating typical DGL use cases - """ - raise NotImplementedError - - # ============================================================================= # Benchmarks # =============================================================================