Remove quadratic runtime due to accessing Frame._dtypes in loop

Frame._dtypes maps column names to dtypes; however, it is a property that is computed on demand, so each access rebuilds the whole mapping. Consequently, a seemingly innocuous dict lookup such as df._dtypes[name] is actually O(N), where N is the number of columns. When used in a loop over columns, this turns an O(N) loop into an O(N^2) one. This mostly bites during IO, when reading data with many thousands of columns. To fix this, manually move the access of Frame._dtypes outside of any loop over columns. A more systematic fix might be to make this a cached property, but the cache invalidation is rather hard to reason about. - Closes rapidsai#14005
wence- · Sep 1, 2023 · 20c35e5 · 20c35e5
1 parent 27e433a
commit 20c35e5
Show file tree

Hide file tree

Showing 3 changed files with 10 additions and 7 deletions.
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
@@ -313,9 +313,10 @@ def dtypes(self):
         3  object  int64
         """
         index = self.grouping.keys.unique().sort_values().to_pandas()
+        obj_dtypes = self.obj._dtypes
         return pd.DataFrame(
             {
-                name: [self.obj._dtypes[name]] * len(index)
+                name: [obj_dtypes[name]] * len(index)
                 for name in self.grouping.values._column_names
             },
             index=index,

diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py
@@ -123,11 +123,12 @@ def read_csv(
     if dtype is None or isinstance(dtype, abc.Mapping):
         # There exists some dtypes in the result columns that is inferred.
         # Find them and map them to the default dtypes.
-        dtype = {} if dtype is None else dtype
+        specified_dtypes = {} if dtype is None else dtype
+        df_dtypes = df._dtypes
         unspecified_dtypes = {
-            name: df._dtypes[name]
+            name: df_dtypes[name]
             for name in df._column_names
-            if name not in dtype
+            if name not in specified_dtypes
         }
         default_dtypes = {}
 

diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py
@@ -158,11 +158,12 @@ def read_json(
     if dtype is True or isinstance(dtype, abc.Mapping):
         # There exists some dtypes in the result columns that is inferred.
         # Find them and map them to the default dtypes.
-        dtype = {} if dtype is True else dtype
+        specified_dtypes = {} if dtype is True else dtype
+        df_dtypes = df._dtypes
         unspecified_dtypes = {
-            name: df._dtypes[name]
+            name: df_dtypes[name]
             for name in df._column_names
-            if name not in dtype
+            if name not in specified_dtypes
         }
         default_dtypes = {}