From 20c35e524b8c168643bfc61bf8a96c21b6577055 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 1 Sep 2023 12:16:05 +0100 Subject: [PATCH] Remove quadratic runtime due to accessing Frame._dtypes in loop Frame._dtypes maps column names to dtypes, however, it is a property that is computed on-demand. Consequently, a seemingly innocuous dict lookup is actually O(N). When used in a loop over columns, this makes an O(N) loop into an O(N^2) one. This mostly bites on IO when reading data with many thousands of columns. To fix this, manually move access of Frame._dtypes outside of any loop over columns. A more systematic way might be to make this a cached property, but the cache invalidation is rather hard to reason about. - Closes #14005 --- python/cudf/cudf/core/groupby/groupby.py | 3 ++- python/cudf/cudf/io/csv.py | 7 ++++--- python/cudf/cudf/io/json.py | 7 ++++--- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 38b07eca330..b300c55b537 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -313,9 +313,10 @@ def dtypes(self): 3 object int64 """ index = self.grouping.keys.unique().sort_values().to_pandas() + obj_dtypes = self.obj._dtypes return pd.DataFrame( { - name: [self.obj._dtypes[name]] * len(index) + name: [obj_dtypes[name]] * len(index) for name in self.grouping.values._column_names }, index=index, diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index 95e0aa18070..bacc0641639 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -123,11 +123,12 @@ def read_csv( if dtype is None or isinstance(dtype, abc.Mapping): # There exists some dtypes in the result columns that is inferred. # Find them and map them to the default dtypes. - dtype = {} if dtype is None else dtype + specified_dtypes = {} if dtype is None else dtype + df_dtypes = df._dtypes unspecified_dtypes = { - name: df._dtypes[name] + name: df_dtypes[name] for name in df._column_names - if name not in dtype + if name not in specified_dtypes } default_dtypes = {} diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index 4de9a92a068..efac24aee17 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -158,11 +158,12 @@ def read_json( if dtype is True or isinstance(dtype, abc.Mapping): # There exists some dtypes in the result columns that is inferred. # Find them and map them to the default dtypes. - dtype = {} if dtype is True else dtype + specified_dtypes = {} if dtype is True else dtype + df_dtypes = df._dtypes unspecified_dtypes = { - name: df._dtypes[name] + name: df_dtypes[name] for name in df._column_names - if name not in dtype + if name not in specified_dtypes } default_dtypes = {}