From c811353ee80a29cc219a9f72bf0d9cf1c02f04b4 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 30 Jan 2024 16:45:31 -1000 Subject: [PATCH] CLN: `to_dict` (#57159) * Make to_dict lazier * Remove some extra looping and indexing * Add erroneous ignore --- pandas/core/frame.py | 23 ----------- pandas/core/methods/to_dict.py | 73 +++++++++++++++++++++------------- 2 files changed, 45 insertions(+), 51 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 97f4eaa7c208a..d3f1c2970429e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -89,7 +89,6 @@ find_common_type, infer_dtype_from_scalar, invalidate_string_dtypes, - maybe_box_native, maybe_downcast_to_dtype, ) from pandas.core.dtypes.common import ( @@ -1983,28 +1982,6 @@ def to_numpy( return result - def _create_data_for_split_and_tight_to_dict( - self, are_all_object_dtype_cols: bool, object_dtype_indices: list[int] - ) -> list: - """ - Simple helper method to create data for to ``to_dict(orient="split")`` and - ``to_dict(orient="tight")`` to create the main output data - """ - if are_all_object_dtype_cols: - data = [ - list(map(maybe_box_native, t)) - for t in self.itertuples(index=False, name=None) - ] - else: - data = [list(t) for t in self.itertuples(index=False, name=None)] - if object_dtype_indices: - # If we have object_dtype_cols, apply maybe_box_naive after list - # comprehension for perf - for row in data: - for i in object_dtype_indices: - row[i] = maybe_box_native(row[i]) - return data - @overload def to_dict( self, diff --git a/pandas/core/methods/to_dict.py b/pandas/core/methods/to_dict.py index accbd92a91ed6..a88cf88ead66e 100644 --- a/pandas/core/methods/to_dict.py +++ b/pandas/core/methods/to_dict.py @@ -24,11 +24,34 @@ from pandas.core import common as com if TYPE_CHECKING: + from collections.abc import Generator + from pandas._typing import MutableMappingT from pandas import DataFrame +def create_data_for_split( + df: DataFrame, are_all_object_dtype_cols: bool, object_dtype_indices: list[int] +) -> Generator[list, None, None]: + """ + Simple helper method to create data for to ``to_dict(orient="split")`` + to create the main output data + """ + if are_all_object_dtype_cols: + for tup in df.itertuples(index=False, name=None): + yield list(map(maybe_box_native, tup)) + else: + for tup in df.itertuples(index=False, name=None): + data = list(tup) + if object_dtype_indices: + # If we have object_dtype_cols, apply maybe_box_naive after + # for perf + for i in object_dtype_indices: + data[i] = maybe_box_native(data[i]) + yield data + + @overload def to_dict( df: DataFrame, @@ -152,35 +175,38 @@ def to_dict( # GH46470 Return quickly if orient series to avoid creating dtype objects return into_c((k, v) for k, v in df.items()) + if orient == "dict": + return into_c((k, v.to_dict(into=into)) for k, v in df.items()) + box_native_indices = [ i for i, col_dtype in enumerate(df.dtypes.values) if col_dtype == np.dtype(object) or isinstance(col_dtype, ExtensionDtype) ] - box_na_values = [ - lib.no_default if not isinstance(col_dtype, BaseMaskedDtype) else libmissing.NA - for i, col_dtype in enumerate(df.dtypes.values) - ] - are_all_object_dtype_cols = len(box_native_indices) == len(df.dtypes) - if orient == "dict": - return into_c((k, v.to_dict(into=into)) for k, v in df.items()) + are_all_object_dtype_cols = len(box_native_indices) == len(df.dtypes) - elif orient == "list": + if orient == "list": object_dtype_indices_as_set: set[int] = set(box_native_indices) + box_na_values = ( + lib.no_default + if not isinstance(col_dtype, BaseMaskedDtype) + else libmissing.NA + for col_dtype in df.dtypes.values + ) return into_c( ( k, - list(map(maybe_box_native, v.to_numpy(na_value=box_na_values[i]))) + list(map(maybe_box_native, v.to_numpy(na_value=box_na_value))) if i in object_dtype_indices_as_set else list(map(maybe_box_native, v.to_numpy())), ) - for i, (k, v) in enumerate(df.items()) + for i, (box_na_value, (k, v)) in enumerate(zip(box_na_values, df.items())) ) elif orient == "split": - data = df._create_data_for_split_and_tight_to_dict( - are_all_object_dtype_cols, box_native_indices + data = list( + create_data_for_split(df, are_all_object_dtype_cols, box_native_indices) ) return into_c( @@ -192,10 +218,6 @@ def to_dict( ) elif orient == "tight": - data = df._create_data_for_split_and_tight_to_dict( - are_all_object_dtype_cols, box_native_indices - ) - return into_c( ((("index", df.index.tolist()),) if index else ()) + ( @@ -215,11 +237,9 @@ def to_dict( elif orient == "records": columns = df.columns.tolist() if are_all_object_dtype_cols: - rows = ( - dict(zip(columns, row)) for row in df.itertuples(index=False, name=None) - ) return [ - into_c((k, maybe_box_native(v)) for k, v in row.items()) for row in rows + into_c(zip(columns, map(maybe_box_native, row))) + for row in df.itertuples(index=False, name=None) ] else: data = [ @@ -235,7 +255,7 @@ def to_dict( for row in data: for col in object_dtype_cols: row[col] = maybe_box_native(row[col]) - return data + return data # type: ignore[return-value] elif orient == "index": if not df.index.is_unique: @@ -248,24 +268,21 @@ def to_dict( ) elif box_native_indices: object_dtype_indices_as_set = set(box_native_indices) - is_object_dtype_by_index = [ - i in object_dtype_indices_as_set for i in range(len(df.columns)) - ] return into_c( ( t[0], { - columns[i]: maybe_box_native(v) - if is_object_dtype_by_index[i] + column: maybe_box_native(v) + if i in object_dtype_indices_as_set else v - for i, v in enumerate(t[1:]) + for i, (column, v) in enumerate(zip(columns, t[1:])) }, ) for t in df.itertuples(name=None) ) else: return into_c( - (t[0], dict(zip(df.columns, t[1:]))) for t in df.itertuples(name=None) + (t[0], dict(zip(columns, t[1:]))) for t in df.itertuples(name=None) ) else: