From 20c35e524b8c168643bfc61bf8a96c21b6577055 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Fri, 1 Sep 2023 12:16:05 +0100
Subject: [PATCH] Remove quadratic runtime due to accessing Frame._dtypes in
 loop

Frame._dtypes maps column names to dtypes; however, it is a property
that is computed on demand. Consequently, a seemingly innocuous dict
lookup such as frame._dtypes[name] is actually O(N) in the number of
columns. When used in a loop over columns, this turns an O(N) loop
into an O(N^2) one.

This mostly bites in IO, when reading data with many thousands of
columns. To fix this, hoist the access of Frame._dtypes out of every
loop over columns so that the mapping is built only once.

A more systematic way might be to make this a cached property, but the
cache invalidation is rather hard to reason about.

- Closes #14005
---
 python/cudf/cudf/core/groupby/groupby.py | 3 ++-
 python/cudf/cudf/io/csv.py               | 7 ++++---
 python/cudf/cudf/io/json.py              | 7 ++++---
 3 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 38b07eca330..b300c55b537 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -313,9 +313,10 @@ def dtypes(self):
         3  object  int64
         """
         index = self.grouping.keys.unique().sort_values().to_pandas()
+        obj_dtypes = self.obj._dtypes
         return pd.DataFrame(
             {
-                name: [self.obj._dtypes[name]] * len(index)
+                name: [obj_dtypes[name]] * len(index)
                 for name in self.grouping.values._column_names
             },
             index=index,
diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py
index 95e0aa18070..bacc0641639 100644
--- a/python/cudf/cudf/io/csv.py
+++ b/python/cudf/cudf/io/csv.py
@@ -123,11 +123,12 @@ def read_csv(
     if dtype is None or isinstance(dtype, abc.Mapping):
         # There exists some dtypes in the result columns that is inferred.
         # Find them and map them to the default dtypes.
-        dtype = {} if dtype is None else dtype
+        specified_dtypes = {} if dtype is None else dtype
+        df_dtypes = df._dtypes
         unspecified_dtypes = {
-            name: df._dtypes[name]
+            name: df_dtypes[name]
             for name in df._column_names
-            if name not in dtype
+            if name not in specified_dtypes
         }
         default_dtypes = {}
 
diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py
index 4de9a92a068..efac24aee17 100644
--- a/python/cudf/cudf/io/json.py
+++ b/python/cudf/cudf/io/json.py
@@ -158,11 +158,12 @@ def read_json(
     if dtype is True or isinstance(dtype, abc.Mapping):
         # There exists some dtypes in the result columns that is inferred.
         # Find them and map them to the default dtypes.
-        dtype = {} if dtype is True else dtype
+        specified_dtypes = {} if dtype is True else dtype
+        df_dtypes = df._dtypes
         unspecified_dtypes = {
-            name: df._dtypes[name]
+            name: df_dtypes[name]
             for name in df._column_names
-            if name not in dtype
+            if name not in specified_dtypes
         }
         default_dtypes = {}