From 6b62002dabdbe4176858cf33fcbe14eec33b5e39 Mon Sep 17 00:00:00 2001 From: Erik Welch Date: Fri, 29 Jul 2022 08:20:37 -0500 Subject: [PATCH] Add get_num_vertices and get_num_edges methods to PropertyGraph. (#2434) Closes #2422. I'll add this to MGPG when we finalize the API and behavior. Authors: - Erik Welch (https://github.com/eriknw) Approvers: - Alex Barghi (https://github.com/alexbarghi-nv) - Brad Rees (https://github.com/BradReesWork) - Rick Ratzel (https://github.com/rlratzel) URL: https://github.com/rapidsai/cugraph/pull/2434 --- .../dask/structure/mg_property_graph.py | 176 +++++++++++++---- python/cugraph/cugraph/gnn/graph_store.py | 14 +- .../cugraph/structure/property_graph.py | 177 ++++++++++++++---- .../tests/mg/test_mg_property_graph.py | 14 +- .../cugraph/cugraph/tests/test_graph_store.py | 4 +- .../cugraph/tests/test_property_graph.py | 155 ++++++++++++--- 6 files changed, 415 insertions(+), 125 deletions(-) diff --git a/python/cugraph/cugraph/dask/structure/mg_property_graph.py b/python/cugraph/cugraph/dask/structure/mg_property_graph.py index 5b064a49c04..7399d818d23 100644 --- a/python/cugraph/cugraph/dask/structure/mg_property_graph.py +++ b/python/cugraph/cugraph/dask/structure/mg_property_graph.py @@ -52,7 +52,6 @@ class EXPERIMENTAL__MGPropertyGraph: Graphs from individual property selections and used later to annotate graph algorithm results with corresponding properties. """ - # column name constants used in internal DataFrames vertex_col_name = "_VERTEX_" src_col_name = "_SRC_" @@ -61,6 +60,7 @@ class EXPERIMENTAL__MGPropertyGraph: edge_id_col_name = "_EDGE_ID_" vertex_id_col_name = "_VERTEX_ID_" weight_col_name = "_WEIGHT_" + _default_type_name = "" def __init__(self, num_workers=None): # The dataframe containing the properties for each vertex. 
@@ -126,7 +126,8 @@ def __init__(self, num_workers=None): # Cached property values self.__num_vertices = None - self.__num_vertices_with_properties = None + self.__vertex_type_value_counts = None + self.__edge_type_value_counts = None # number of gpu's to use if num_workers is None: @@ -134,37 +135,7 @@ def __init__(self, num_workers=None): else: self.__num_workers = num_workers - @property - def num_vertices(self): - if self.__num_vertices is not None: - return self.__num_vertices - self.__num_vertices = 0 - vert_sers = self.__get_all_vertices_series() - if vert_sers: - if self.__series_type is dask_cudf.Series: - vert_count = dask_cudf.concat(vert_sers).nunique() - self.__num_vertices = vert_count.compute() - return self.__num_vertices - - @property - def num_vertices_with_properties(self): - if self.__num_vertices_with_properties is not None: - return self.__num_vertices_with_properties - - if self.__vertex_prop_dataframe is not None: - self.__num_vertices_with_properties = \ - len(self.__vertex_prop_dataframe) - return self.__num_vertices_with_properties - - return 0 - - @property - def num_edges(self): - if self.__edge_prop_dataframe is not None: - return len(self.__edge_prop_dataframe) - else: - return 0 - + # PropertyGraph read-only attributes @property def edges(self): if self.__edge_prop_dataframe is not None: @@ -195,6 +166,33 @@ def edge_property_names(self): return props return [] + @property + def vertex_types(self): + """The set of vertex type names""" + value_counts = self._vertex_type_value_counts + if value_counts is None: + names = set() + elif self.__series_type is dask_cudf.Series: + names = set(value_counts.index.to_arrow().to_pylist()) + else: + names = set(value_counts.index) + default = self._default_type_name + if default not in names and self.get_num_vertices(default) > 0: + # include "" from vertices that only exist in edge data + names.add(default) + return names + + @property + def edge_types(self): + """The set of edge type names""" + 
value_counts = self._edge_type_value_counts + if value_counts is None: + return set() + elif self.__series_type is dask_cudf.Series: + return set(value_counts.index.to_arrow().to_pylist()) + else: + return set(value_counts.index) + # PropertyGraph read-only attributes for debugging @property def _vertex_prop_dataframe(self): @@ -204,6 +202,104 @@ def _vertex_prop_dataframe(self): def _edge_prop_dataframe(self): return self.__edge_prop_dataframe + @property + def _vertex_type_value_counts(self): + """A Series of the counts of types in __vertex_prop_dataframe""" + if self.__vertex_prop_dataframe is None: + return + if self.__vertex_type_value_counts is None: + # Types should all be strings; what should we do if we see NaN? + self.__vertex_type_value_counts = ( + self.__vertex_prop_dataframe[self.type_col_name] + .value_counts(sort=False, dropna=False) + .compute() + ) + return self.__vertex_type_value_counts + + @property + def _edge_type_value_counts(self): + """A Series of the counts of types in __edge_prop_dataframe""" + if self.__edge_prop_dataframe is None: + return + if self.__edge_type_value_counts is None: + # Types should all be strings; what should we do if we see NaN? + self.__edge_type_value_counts = ( + self.__edge_prop_dataframe[self.type_col_name] + .value_counts(sort=False, dropna=False) + .compute() + ) + return self.__edge_type_value_counts + + def get_num_vertices(self, type=None, *, include_edge_data=True): + """Return the number of all vertices or vertices of a given type. + + Parameters + ---------- + type : string, optional + If type is None (the default), return the total number of vertices, + otherwise return the number of vertices of the specified type. + include_edge_data : bool (default True) + If True, include vertices that were added in vertex and edge data. + If False, only include vertices that were added in vertex data. + Note that vertices that only exist in edge data are assumed to have + the default type. 
+ + See Also + -------- + PropertyGraph.get_num_edges + """ + if type is None: + if not include_edge_data: + if self.__vertex_prop_dataframe is None: + return 0 + return len(self.__vertex_prop_dataframe) + if self.__num_vertices is not None: + return self.__num_vertices + self.__num_vertices = 0 + vert_sers = self.__get_all_vertices_series() + if vert_sers: + if self.__series_type is dask_cudf.Series: + vert_count = dask_cudf.concat(vert_sers).nunique() + self.__num_vertices = vert_count.compute() + return self.__num_vertices + + value_counts = self._vertex_type_value_counts + if type == self._default_type_name and include_edge_data: + # The default type, "", can refer to both vertex and edge data + if self.__vertex_prop_dataframe is None: + return self.get_num_vertices() + return ( + self.get_num_vertices() + - len(self.__vertex_prop_dataframe) + + (value_counts[type] if type in value_counts else 0) + ) + if self.__vertex_prop_dataframe is None: + return 0 + return value_counts[type] if type in value_counts else 0 + + def get_num_edges(self, type=None): + """Return the number of all edges or edges of a given type. + + Parameters + ---------- + type : string, optional + If type is None (the default), return the total number of edges, + otherwise return the number of edges of the specified type. + + See Also + -------- + PropertyGraph.get_num_vertices + """ + if type is None: + if self.__edge_prop_dataframe is not None: + return len(self.__edge_prop_dataframe) + else: + return 0 + if self.__edge_prop_dataframe is None: + return 0 + value_counts = self._edge_type_value_counts + return value_counts[type] if type in value_counts else 0 + def get_vertices(self, selection=None): """ Return a Series containing the unique vertex IDs contained in both @@ -243,7 +339,7 @@ def add_vertex_data(self, The name to be assigned to the type of property being added. For example, if dataframe contains data about users, type_name might be "users". 
If not specified, the type of properties will be added as - None or NA + the empty string, "". property_columns : list of strings List of column names in dataframe to be added as properties. All other columns in dataframe will be ignored. If not specified, all @@ -265,6 +361,8 @@ def add_vertex_data(self, if (type_name is not None) and not(isinstance(type_name, str)): raise TypeError("type_name must be a string, got: " f"{type(type_name)}") + if type_name is None: + type_name = self._default_type_name if property_columns: if type(property_columns) is not list: raise TypeError("property_columns must be a list, got: " @@ -279,7 +377,7 @@ def add_vertex_data(self, # Clear the cached values related to the number of vertices since more # could be added in this method. self.__num_vertices = None - self.__num_vertices_with_properties = None + self.__vertex_type_value_counts = None # Could update instead # Initialize the __vertex_prop_dataframe if necessary using the same # type as the incoming dataframe. @@ -352,7 +450,7 @@ def add_edge_data(self, The name to be assigned to the type of property being added. For example, if dataframe contains data about transactions, type_name might be "transactions". If not specified, the type of properties - will be added as None or NA + will be added as the empty string "". property_columns : list of strings List of column names in dataframe to be added as properties. All other columns in dataframe will be ignored. 
If not specified, all @@ -378,6 +476,8 @@ def add_edge_data(self, if (type_name is not None) and not(isinstance(type_name, str)): raise TypeError("type_name must be a string, got: " f"{type(type_name)}") + if type_name is None: + type_name = self._default_type_name if property_columns: if type(property_columns) is not list: raise TypeError("property_columns must be a list, got: " @@ -390,8 +490,9 @@ def add_edge_data(self, f"{list(invalid_columns)}") # Clear the cached value for num_vertices since more could be added in - # this method. This method cannot affect num_vertices_with_properties + # this method. This method cannot affect __vertex_type_value_counts self.__num_vertices = None + self.__edge_type_value_counts = None # Could update instead default_edge_columns = [self.src_col_name, self.dst_col_name, @@ -521,6 +622,7 @@ def extract_subgraph(self, add_edge_data : bool (default is True) If True, add meta data about the edges contained in the extracted graph which are required for future calls to annotate_dataframe(). 
+ Returns ------- A Graph instance of the same type as create_using containing only the diff --git a/python/cugraph/cugraph/gnn/graph_store.py b/python/cugraph/cugraph/gnn/graph_store.py index 7e77ffcf594..ed78e81d204 100644 --- a/python/cugraph/cugraph/gnn/graph_store.py +++ b/python/cugraph/cugraph/gnn/graph_store.py @@ -117,18 +117,10 @@ def get_edge_storage(self, key, etype=None): ) def num_nodes(self, ntype=None): - if ntype is not None: - s = self.gdata._vertex_prop_dataframe[type_n] == ntype - return s.sum() - else: - return self.gdata.num_vertices + return self.gdata.get_num_vertices(ntype) def num_edges(self, etype=None): - if etype is not None: - s = self.gdata._edge_prop_dataframe[type_n] == etype - return s.sum() - else: - return self.gdata.num_edges + return self.gdata.get_num_edges(etype) @property def ntypes(self): @@ -165,7 +157,7 @@ def gdata(self): ###################################### @property def num_vertices(self): - return self.gdata.num_vertices + return self.gdata.get_num_vertices() def get_vertex_ids(self): return self.gdata.vertices_ids() diff --git a/python/cugraph/cugraph/structure/property_graph.py b/python/cugraph/cugraph/structure/property_graph.py index f5d2cac8823..6137b6952a0 100644 --- a/python/cugraph/cugraph/structure/property_graph.py +++ b/python/cugraph/cugraph/structure/property_graph.py @@ -27,7 +27,7 @@ class EXPERIMENTAL__PropertySelection: """ Instances of this class are returned from the PropertyGraph.select_*() methods and can be used by the PropertyGraph.extract_subgraph() method to - extrac a Graph containing vertices and edges with only the selected + extract a Graph containing vertices and edges with only the selected properties. """ def __init__(self, @@ -65,6 +65,7 @@ class EXPERIMENTAL__PropertyGraph: edge_id_col_name = "_EDGE_ID_" vertex_id_col_name = "_VERTEX_ID_" weight_col_name = "_WEIGHT_" + _default_type_name = "" def __init__(self): # The dataframe containing the properties for each vertex. 
@@ -135,43 +136,10 @@ def __init__(self): # Cached property values self.__num_vertices = None - self.__num_vertices_with_properties = None + self.__vertex_type_value_counts = None + self.__edge_type_value_counts = None # PropertyGraph read-only attributes - @property - def num_vertices(self): - if self.__num_vertices is not None: - return self.__num_vertices - - self.__num_vertices = 0 - vert_sers = self.__get_all_vertices_series() - if vert_sers: - if self.__series_type is cudf.Series: - self.__num_vertices = cudf.concat(vert_sers).nunique() - else: - self.__num_vertices = pd.concat(vert_sers).nunique() - - return self.__num_vertices - - @property - def num_vertices_with_properties(self): - if self.__num_vertices_with_properties is not None: - return self.__num_vertices_with_properties - - if self.__vertex_prop_dataframe is not None: - self.__num_vertices_with_properties = \ - len(self.__vertex_prop_dataframe) - return self.__num_vertices_with_properties - - return 0 - - @property - def num_edges(self): - if self.__edge_prop_dataframe is not None: - return len(self.__edge_prop_dataframe) - else: - return 0 - @property def edges(self): if self.__edge_prop_dataframe is not None: @@ -201,6 +169,33 @@ def edge_property_names(self): return props return [] + @property + def vertex_types(self): + """The set of vertex type names""" + value_counts = self._vertex_type_value_counts + if value_counts is None: + names = set() + elif self.__series_type is cudf.Series: + names = set(value_counts.index.to_arrow().to_pylist()) + else: + names = set(value_counts.index) + default = self._default_type_name + if default not in names and self.get_num_vertices(default) > 0: + # include "" from vertices that only exist in edge data + names.add(default) + return names + + @property + def edge_types(self): + """The set of edge type names""" + value_counts = self._edge_type_value_counts + if value_counts is None: + return set() + elif self.__series_type is cudf.Series: + return 
set(value_counts.index.to_arrow().to_pylist()) + else: + return set(value_counts.index) + # PropertyGraph read-only attributes for debugging @property def _vertex_prop_dataframe(self): @@ -210,6 +205,102 @@ def _vertex_prop_dataframe(self): def _edge_prop_dataframe(self): return self.__edge_prop_dataframe + @property + def _vertex_type_value_counts(self): + """A Series of the counts of types in __vertex_prop_dataframe""" + if self.__vertex_prop_dataframe is None: + return + if self.__vertex_type_value_counts is None: + # Types should all be strings; what should we do if we see NaN? + self.__vertex_type_value_counts = ( + self.__vertex_prop_dataframe[self.type_col_name] + .value_counts(sort=False, dropna=False) + ) + return self.__vertex_type_value_counts + + @property + def _edge_type_value_counts(self): + """A Series of the counts of types in __edge_prop_dataframe""" + if self.__edge_prop_dataframe is None: + return + if self.__edge_type_value_counts is None: + # Types should all be strings; what should we do if we see NaN? + self.__edge_type_value_counts = ( + self.__edge_prop_dataframe[self.type_col_name] + .value_counts(sort=False, dropna=False) + ) + return self.__edge_type_value_counts + + def get_num_vertices(self, type=None, *, include_edge_data=True): + """Return the number of all vertices or vertices of a given type. + + Parameters + ---------- + type : string, optional + If type is None (the default), return the total number of vertices, + otherwise return the number of vertices of the specified type. + include_edge_data : bool (default True) + If True, include vertices that were added in vertex and edge data. + If False, only include vertices that were added in vertex data. + Note that vertices that only exist in edge data are assumed to have + the default type. 
+ + See Also + -------- + PropertyGraph.get_num_edges + """ + if type is None: + if not include_edge_data: + if self.__vertex_prop_dataframe is None: + return 0 + return len(self.__vertex_prop_dataframe) + if self.__num_vertices is not None: + return self.__num_vertices + self.__num_vertices = 0 + vert_sers = self.__get_all_vertices_series() + if vert_sers: + if self.__series_type is cudf.Series: + self.__num_vertices = cudf.concat(vert_sers).nunique() + else: + self.__num_vertices = pd.concat(vert_sers).nunique() + return self.__num_vertices + value_counts = self._vertex_type_value_counts + if type == self._default_type_name and include_edge_data: + # The default type, "", can refer to both vertex and edge data + if self.__vertex_prop_dataframe is None: + return self.get_num_vertices() + return ( + self.get_num_vertices() + - len(self.__vertex_prop_dataframe) + + (value_counts[type] if type in value_counts else 0) + ) + if self.__vertex_prop_dataframe is None: + return 0 + return value_counts[type] if type in value_counts else 0 + + def get_num_edges(self, type=None): + """Return the number of all edges or edges of a given type. + + Parameters + ---------- + type : string, optional + If type is None (the default), return the total number of edges, + otherwise return the number of edges of the specified type. + + See Also + -------- + PropertyGraph.get_num_vertices + """ + if type is None: + if self.__edge_prop_dataframe is not None: + return len(self.__edge_prop_dataframe) + else: + return 0 + if self.__edge_prop_dataframe is None: + return 0 + value_counts = self._edge_type_value_counts + return value_counts[type] if type in value_counts else 0 + def get_vertices(self, selection=None): """ Return a Series containing the unique vertex IDs contained in both @@ -249,7 +340,7 @@ def add_vertex_data(self, The name to be assigned to the type of property being added. For example, if dataframe contains data about users, type_name might be "users". 
If not specified, the type of properties will be added as - None or NA + the empty string, "". property_columns : list of strings List of column names in dataframe to be added as properties. All other columns in dataframe will be ignored. If not specified, all @@ -272,6 +363,8 @@ def add_vertex_data(self, if (type_name is not None) and not(isinstance(type_name, str)): raise TypeError("type_name must be a string, got: " f"{type(type_name)}") + if type_name is None: + type_name = self._default_type_name if property_columns: if type(property_columns) is not list: raise TypeError("property_columns must be a list, got: " @@ -296,7 +389,7 @@ def add_vertex_data(self, # Clear the cached values related to the number of vertices since more # could be added in this method. self.__num_vertices = None - self.__num_vertices_with_properties = None + self.__vertex_type_value_counts = None # Could update instead # Initialize the __vertex_prop_dataframe if necessary using the same # type as the incoming dataframe. @@ -367,7 +460,7 @@ def add_edge_data(self, The name to be assigned to the type of property being added. For example, if dataframe contains data about transactions, type_name might be "transactions". If not specified, the type of properties - will be added as None or NA + will be added as the empty string "". property_columns : list of strings List of column names in dataframe to be added as properties. All other columns in dataframe will be ignored. 
If not specified, all @@ -394,6 +487,8 @@ def add_edge_data(self, if (type_name is not None) and not(isinstance(type_name, str)): raise TypeError("type_name must be a string, got: " f"{type(type_name)}") + if type_name is None: + type_name = self._default_type_name if property_columns: if type(property_columns) is not list: raise TypeError("property_columns must be a list, got: " @@ -416,8 +511,9 @@ def add_edge_data(self, f"using type {self.__dataframe_type}") # Clear the cached value for num_vertices since more could be added in - # this method. This method cannot affect num_vertices_with_properties + # this method. This method cannot affect __vertex_type_value_counts self.__num_vertices = None + self.__edge_type_value_counts = None # Could update instead default_edge_columns = [self.src_col_name, self.dst_col_name, @@ -754,7 +850,6 @@ def edge_props_to_graph(self, Create and return a Graph from the edges in edge_prop_df. """ # FIXME: check default_edge_weight is valid - if edge_weight_property: if edge_weight_property not in edge_prop_df.columns: raise ValueError("edge_weight_property " diff --git a/python/cugraph/cugraph/tests/mg/test_mg_property_graph.py b/python/cugraph/cugraph/tests/mg/test_mg_property_graph.py index d69cb600873..bae807d5e3a 100644 --- a/python/cugraph/cugraph/tests/mg/test_mg_property_graph.py +++ b/python/cugraph/cugraph/tests/mg/test_mg_property_graph.py @@ -327,8 +327,8 @@ def test_extract_subgraph_no_query(net_MGPropertyGraph, net_PropertyGraph): """ dpG = net_MGPropertyGraph pG = net_PropertyGraph - assert pG.num_edges == dpG.num_edges - assert pG.num_vertices == dpG.num_vertices + assert pG.get_num_edges() == dpG.get_num_edges() + assert pG.get_num_vertices() == dpG.get_num_vertices() # tests that the edges are the same in the sg and mg property graph sg_df = \ pG.edges.sort_values(by=['_SRC_', '_DST_']).reset_index(drop=True) @@ -448,8 +448,9 @@ def test_num_vertices_with_properties(dataset2_MGPropertyGraph): """ (pG, data) = 
dataset2_MGPropertyGraph - assert pG.num_vertices == len(data[1]) * 2 # assume no repeated vertices - assert pG.num_vertices_with_properties == 0 + # assume no repeated vertices + assert pG.get_num_vertices() == len(data[1]) * 2 + assert pG.get_num_vertices(include_edge_data=False) == 0 df = cudf.DataFrame({"vertex": [98, 97], "some_property": ["a", "b"], @@ -457,8 +458,9 @@ def test_num_vertices_with_properties(dataset2_MGPropertyGraph): mgdf = dask_cudf.from_cudf(df, npartitions=2) pG.add_vertex_data(mgdf, vertex_col_name="vertex") - assert pG.num_vertices == len(data[1]) * 2 # assume no repeated vertices - assert pG.num_vertices_with_properties == 2 + # assume no repeated vertices + assert pG.get_num_vertices() == len(data[1]) * 2 + assert pG.get_num_vertices(include_edge_data=False) == 2 def test_edges_attr(dataset2_MGPropertyGraph): diff --git a/python/cugraph/cugraph/tests/test_graph_store.py b/python/cugraph/cugraph/tests/test_graph_store.py index 7cb535da5da..12c825dbb3a 100644 --- a/python/cugraph/cugraph/tests/test_graph_store.py +++ b/python/cugraph/cugraph/tests/test_graph_store.py @@ -60,9 +60,9 @@ def test_using_pgraph(graph_file): gstore = cugraph.gnn.CuGraphStore(graph=pG) - assert g.number_of_edges() == pG.num_edges + assert g.number_of_edges() == pG.get_num_edges() assert g.number_of_edges() == gstore.num_edges() - assert g.number_of_vertices() == pG.num_vertices + assert g.number_of_vertices() == pG.get_num_vertices() assert g.number_of_vertices() == gstore.num_vertices diff --git a/python/cugraph/cugraph/tests/test_property_graph.py b/python/cugraph/cugraph/tests/test_property_graph.py index a85f8df25fe..c0fb2299224 100644 --- a/python/cugraph/cugraph/tests/test_property_graph.py +++ b/python/cugraph/cugraph/tests/test_property_graph.py @@ -214,7 +214,6 @@ def dataset1_PropertyGraph(request): vertex_col_names=("user_id_1", "user_id_2"), property_columns=None) - return pG @@ -303,8 +302,9 @@ def test_add_vertex_data(df_type): 
vertex_col_name="merchant_id", property_columns=None) - assert pG.num_vertices == 5 - assert pG.num_edges == 0 + assert pG.get_num_vertices() == 5 + assert pG.get_num_vertices('merchants') == 5 + assert pG.get_num_edges() == 0 expected_props = merchants[0].copy() assert sorted(pG.vertex_property_names) == sorted(expected_props) @@ -312,7 +312,7 @@ def test_add_vertex_data(df_type): @pytest.mark.parametrize("df_type", df_types, ids=df_type_id) def test_num_vertices(df_type): """ - Ensures num_vertices is correct after various additions of specific data. + Ensures get_num_vertices is correct after various additions of data. """ from cugraph.experimental import PropertyGraph @@ -321,6 +321,9 @@ def test_num_vertices(df_type): data=merchants[1]) pG = PropertyGraph() + assert pG.get_num_vertices() == 0 + assert pG.get_num_vertices('unknown_type') == 0 + assert pG.get_num_edges('unknown_type') == 0 pG.add_vertex_data(merchants_df, type_name="merchants", vertex_col_name="merchant_id", @@ -328,12 +331,12 @@ def test_num_vertices(df_type): # Test caching - the second retrieval should always be faster st = time.time() - assert pG.num_vertices == 5 + assert pG.get_num_vertices() == 5 compute_time = time.time() - st - assert pG.num_edges == 0 + assert pG.get_num_edges() == 0 st = time.time() - assert pG.num_vertices == 5 + assert pG.get_num_vertices() == 5 cache_retrieval_time = time.time() - st assert cache_retrieval_time < compute_time @@ -345,8 +348,10 @@ def test_num_vertices(df_type): vertex_col_name="user_id", property_columns=None) - assert pG.num_vertices == 9 - assert pG.num_edges == 0 + assert pG.get_num_vertices() == 9 + assert pG.get_num_vertices('merchants') == 5 + assert pG.get_num_vertices('users') == 4 + assert pG.get_num_edges() == 0 # The taxpayers table does not add new unique vertices, it only adds # properties to vertices already present in the merchants and users @@ -360,8 +365,90 @@ def test_num_vertices(df_type): vertex_col_name="payer_id", 
property_columns=None) - assert pG.num_vertices == 9 - assert pG.num_edges == 0 + assert pG.get_num_vertices() == 9 + assert pG.get_num_vertices('merchants') == 5 + assert pG.get_num_vertices('users') == 4 + assert pG.get_num_vertices('unknown_type') == 0 + assert pG.get_num_edges() == 0 + + +@pytest.mark.parametrize("df_type", df_types, ids=df_type_id) +def test_type_names(df_type): + from cugraph.experimental import PropertyGraph + + pG = PropertyGraph() + assert pG.edge_types == set() + assert pG.vertex_types == set() + + df = df_type({"src": [99, 98, 97], + "dst": [22, 34, 56], + "some_property": ["a", "b", "c"], + }) + pG.add_edge_data(df, vertex_col_names=("src", "dst")) + assert pG.edge_types == set([""]) + assert pG.vertex_types == set([""]) + + df = df_type({"vertex": [98, 97], + "some_property": ["a", "b"], + }) + pG.add_vertex_data(df, type_name="vtype", vertex_col_name="vertex") + assert pG.edge_types == set([""]) + assert pG.vertex_types == set(["", "vtype"]) + + df = df_type({"src": [199, 98, 197], + "dst": [22, 134, 56], + "some_property": ["a", "b", "c"], + }) + pG.add_edge_data(df, type_name="etype", vertex_col_names=("src", "dst")) + assert pG.edge_types == set(["", "etype"]) + assert pG.vertex_types == set(["", "vtype"]) + + +@pytest.mark.parametrize("df_type", df_types, ids=df_type_id) +def test_num_vertices_include_edge_data(df_type): + """ + Ensures get_num_vertices is correct after various additions of data. 
+ """ + from cugraph.experimental import PropertyGraph + + (merchants, users, taxpayers, + transactions, relationships, referrals) = dataset1.values() + + pG = PropertyGraph() + assert pG.get_num_vertices(include_edge_data=False) == 0 + assert pG.get_num_vertices("", include_edge_data=False) == 0 + + pG.add_edge_data(df_type(columns=transactions[0], + data=transactions[1]), + type_name="transactions", + vertex_col_names=("user_id", "merchant_id"), + property_columns=None) + + assert pG.get_num_vertices(include_edge_data=False) == 0 + assert pG.get_num_vertices("", include_edge_data=False) == 0 + assert pG.get_num_vertices(include_edge_data=True) == 7 + assert pG.get_num_vertices("", include_edge_data=True) == 7 + pG.add_vertex_data(df_type(columns=merchants[0], + data=merchants[1]), + # type_name="merchants", # Use default! + vertex_col_name="merchant_id", + property_columns=None) + assert pG.get_num_vertices(include_edge_data=False) == 5 + assert pG.get_num_vertices("", include_edge_data=False) == 5 + assert pG.get_num_vertices(include_edge_data=True) == 9 + assert pG.get_num_vertices("", include_edge_data=True) == 9 + pG.add_vertex_data(df_type(columns=users[0], + data=users[1]), + type_name="users", + vertex_col_name="user_id", + property_columns=None) + assert pG.get_num_vertices(include_edge_data=False) == 9 + assert pG.get_num_vertices("", include_edge_data=False) == 5 + assert pG.get_num_vertices("users", include_edge_data=False) == 4 + # All vertices now have vertex data, so this should match + assert pG.get_num_vertices(include_edge_data=True) == 9 + assert pG.get_num_vertices("", include_edge_data=True) == 5 + assert pG.get_num_vertices("users", include_edge_data=True) == 4 @pytest.mark.parametrize("df_type", df_types, ids=df_type_id) @@ -380,16 +467,16 @@ def test_num_vertices_with_properties(df_type): }) pG.add_edge_data(df, vertex_col_names=("src", "dst")) - assert pG.num_vertices == 6 - assert pG.num_vertices_with_properties == 0 + assert 
pG.get_num_vertices() == 6 + assert pG.get_num_vertices(include_edge_data=False) == 0 df = df_type({"vertex": [98, 97], "some_property": ["a", "b"], }) pG.add_vertex_data(df, vertex_col_name="vertex") - assert pG.num_vertices == 6 - assert pG.num_vertices_with_properties == 2 + assert pG.get_num_vertices() == 6 + assert pG.get_num_vertices(include_edge_data=False) == 2 @pytest.mark.parametrize("df_type", df_types, ids=df_type_id) @@ -401,8 +488,8 @@ def test_null_data(df_type): pG = PropertyGraph() - assert pG.num_vertices == 0 - assert pG.num_edges == 0 + assert pG.get_num_vertices() == 0 + assert pG.get_num_edges() == 0 assert sorted(pG.vertex_property_names) == sorted([]) @@ -424,8 +511,9 @@ def test_add_vertex_data_prop_columns(df_type): vertex_col_name="merchant_id", property_columns=expected_props) - assert pG.num_vertices == 5 - assert pG.num_edges == 0 + assert pG.get_num_vertices() == 5 + assert pG.get_num_vertices('merchants') == 5 + assert pG.get_num_edges() == 0 assert sorted(pG.vertex_property_names) == sorted(expected_props) @@ -486,8 +574,11 @@ def test_add_edge_data(df_type): vertex_col_names=("user_id", "merchant_id"), property_columns=None) - assert pG.num_vertices == 7 - assert pG.num_edges == 4 + assert pG.get_num_vertices() == 7 + # 'transactions' is edge type, not vertex type + assert pG.get_num_vertices('transactions') == 0 + assert pG.get_num_edges() == 4 + assert pG.get_num_edges('transactions') == 4 expected_props = ["merchant_id", "user_id", "volume", "time", "card_num", "card_type"] assert sorted(pG.edge_property_names) == sorted(expected_props) @@ -511,8 +602,11 @@ def test_add_edge_data_prop_columns(df_type): vertex_col_names=("user_id", "merchant_id"), property_columns=expected_props) - assert pG.num_vertices == 7 - assert pG.num_edges == 4 + assert pG.get_num_vertices() == 7 + # 'transactions' is edge type, not vertex type + assert pG.get_num_vertices('transactions') == 0 + assert pG.get_num_edges() == 4 + assert 
pG.get_num_edges('transactions') == 4 assert sorted(pG.edge_property_names) == sorted(expected_props) @@ -928,7 +1022,15 @@ def test_graph_edge_data_added(dataset1_PropertyGraph): len(dataset1["relationships"][-1]) + \ len(dataset1["referrals"][-1]) - assert pG.num_edges == expected_num_edges + assert pG.get_num_edges() == expected_num_edges + assert ( + pG.get_num_edges("transactions") == len(dataset1["transactions"][-1]) + ) + assert ( + pG.get_num_edges("relationships") == len(dataset1["relationships"][-1]) + ) + assert pG.get_num_edges("referrals") == len(dataset1["referrals"][-1]) + assert pG.get_num_edges("unknown_type") == 0 # extract_subgraph() should return a directed Graph object with additional # meta-data, which includes edge IDs. @@ -1119,10 +1221,7 @@ def test_extract_subgraph_with_vertex_ids(): def bench_num_vertices(gpubenchmark, dataset1_PropertyGraph): pG = dataset1_PropertyGraph - def get_num_vertices(): - return pG.num_vertices - - assert gpubenchmark(get_num_vertices) == 9 + assert gpubenchmark(pG.get_num_vertices) == 9 def bench_get_vertices(gpubenchmark, dataset1_PropertyGraph):