Optimize has_duplicate_edges (rapidsai#2409)
This PR fixes the scalability of `has_duplicate_edges` by removing the `groupby().apply()` call, which processes groups serially, and using a vectorized `drop_duplicates` instead.

**Benchmark Data**
```python
import cudf
import cupy as cp

n_nodes = 100_000
n_rows = 1_500_000
df = cudf.DataFrame({'src': cp.random.randint(0, n_nodes, n_rows),
                     'dst': cp.random.randint(0, n_nodes, n_rows)})
```

**After PR:**
```
17.8 ms ± 536 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
```

**Before PR:**
```
26.3 s ± 78.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
```
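
For context, the sketch below reconstructs both implementations as standalone functions on the benchmark frame. It is an approximation rather than the exact benchmarked code path: the column names `'src'`/`'dst'` stand in for the class-level `cls.src_col_name`/`cls.dst_col_name` used in the library, and it assumes a GPU environment with cudf and cupy installed.

```python
import cudf
import cupy as cp

n_nodes = 100_000
n_rows = 1_500_000
df = cudf.DataFrame({'src': cp.random.randint(0, n_nodes, n_rows),
                     'dst': cp.random.randint(0, n_nodes, n_rows)})

def has_duplicate_edges_before(df):
    # Old approach: a Python callback runs once per src group,
    # serializing the work into many small per-group operations.
    def has_duplicate_dst(group):
        return group['dst'].nunique() != group['dst'].size
    return bool(df.groupby('src').apply(has_duplicate_dst).any())

def has_duplicate_edges_after(df):
    # New approach: one vectorized deduplication pass over the whole frame.
    return len(df.drop_duplicates(ignore_index=True)) != len(df)

# Both predicates answer the same question: does any (src, dst)
# pair occur more than once?
assert has_duplicate_edges_before(df) == has_duplicate_edges_after(df)
```

The gain comes from replacing a per-group Python callback with a single vectorized deduplication pass over the whole frame.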

Authors:
  - Vibhu Jawa (https://github.com/VibhuJawa)

Approvers:
  - Brad Rees (https://github.com/BradReesWork)
  - Rick Ratzel (https://github.com/rlratzel)

URL: rapidsai#2409
VibhuJawa authored Jul 15, 2022
1 parent 8ddc7d4 commit 049d441
Showing 2 changed files with 11 additions and 11 deletions.
**python/cugraph/cugraph/dask/structure/mg_property_graph.py** (5 additions, 6 deletions)
```diff
@@ -650,16 +650,15 @@ def has_duplicate_edges(cls, df):
         """
         Return True if df has >1 of the same src, dst pair
         """
-        # FIXME: this can be very expensive for large DataFrames
         # empty not supported by dask
         if len(df.columns) == 0:
             return False
 
-        def has_duplicate_dst(df):
-            return df[cls.dst_col_name].nunique() != \
-                df[cls.dst_col_name].size
-
-        return df.groupby(cls.src_col_name).apply(has_duplicate_dst).any()
+        unique_pair_len = df.drop_duplicates(split_out=df.npartitions,
+                                             ignore_index=True).shape[0]
+        # if unique_pairs == len(df)
+        # then no duplicate edges
+        return unique_pair_len != df.shape[0]
 
     def __create_property_lookup_table(self, edge_prop_df):
         """
```
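
A note on the `split_out` argument above: Dask's `drop_duplicates` defaults to `split_out=1`, which gathers the deduplicated result into a single partition; passing `split_out=df.npartitions` keeps it distributed across workers. A minimal sketch of that call pattern, assuming a dask-cuda environment (the cluster setup here is illustrative):

```python
import cupy as cp
import cudf
import dask_cudf
from dask.distributed import Client
from dask_cuda import LocalCUDACluster

client = Client(LocalCUDACluster())  # illustrative single-node setup

df = cudf.DataFrame({'src': cp.random.randint(0, 100_000, 1_500_000),
                     'dst': cp.random.randint(0, 100_000, 1_500_000)})
ddf = dask_cudf.from_cudf(df, npartitions=4)

# Keep the deduplicated result spread across all partitions instead of
# gathering it onto a single worker.
unique_pair_len = ddf.drop_duplicates(split_out=ddf.npartitions,
                                      ignore_index=True).shape[0]
print(bool((unique_pair_len != ddf.shape[0]).compute()))
```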
**python/cugraph/cugraph/structure/property_graph.py** (6 additions, 5 deletions)
```diff
@@ -804,15 +804,16 @@ def has_duplicate_edges(cls, df):
         """
         Return True if df has >1 of the same src, dst pair
         """
-        # FIXME: this can be very expensive for large DataFrames
         if df.empty:
             return False
 
-        def has_duplicate_dst(df):
-            return df[cls.dst_col_name].nunique() != \
-                df[cls.dst_col_name].size
+        unique_pair_len = len(df[[cls.src_col_name,
+                                  cls.dst_col_name]].drop_duplicates(
+                                      ignore_index=True))
 
-        return df.groupby(cls.src_col_name).apply(has_duplicate_dst).any()
+        # if unique_pairs == len(df)
+        # then no duplicate edges
+        return unique_pair_len != len(df)
 
     def __create_property_lookup_table(self, edge_prop_df):
         """
```
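
In this single-GPU variant the frame is first narrowed to the src/dst columns, so extra property columns cannot hide duplicate edges (two rows with the same endpoints but different property values still count as a duplicate edge). The same intent can be expressed with `drop_duplicates(subset=...)`; a small illustration follows, with `'_SRC_'`/`'_DST_'` used only as stand-ins for the internal column names:

```python
import cudf

# Three edges with a weight property; (0, 1) appears twice with
# different weights, so the graph has duplicate edges.
edges = cudf.DataFrame({'_SRC_': [0, 0, 1],
                        '_DST_': [1, 1, 2],
                        'weight': [0.5, 0.7, 0.9]})

# Deduplicate on the endpoint pair only, ignoring property columns.
unique_pair_len = len(edges.drop_duplicates(subset=['_SRC_', '_DST_'],
                                            ignore_index=True))
print(unique_pair_len != len(edges))  # True
```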
