CLN: to_dict (#57159)

* Make to_dict lazier
* Remove some extra looping and indexing
* Add erroneous ignore
pandas-dev · Jan 31, 2024 · c811353 · c811353
1 parent b41ea09
commit c811353
Show file tree

Hide file tree

Showing 2 changed files with 45 additions and 51 deletions.
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -89,7 +89,6 @@
     find_common_type,
     infer_dtype_from_scalar,
     invalidate_string_dtypes,
-    maybe_box_native,
     maybe_downcast_to_dtype,
 )
 from pandas.core.dtypes.common import (
@@ -1983,28 +1982,6 @@ def to_numpy(
 
         return result
 
-    def _create_data_for_split_and_tight_to_dict(
-        self, are_all_object_dtype_cols: bool, object_dtype_indices: list[int]
-    ) -> list:
-        """
-        Simple helper method to create data for to ``to_dict(orient="split")`` and
-        ``to_dict(orient="tight")`` to create the main output data
-        """
-        if are_all_object_dtype_cols:
-            data = [
-                list(map(maybe_box_native, t))
-                for t in self.itertuples(index=False, name=None)
-            ]
-        else:
-            data = [list(t) for t in self.itertuples(index=False, name=None)]
-            if object_dtype_indices:
-                # If we have object_dtype_cols, apply maybe_box_naive after list
-                # comprehension for perf
-                for row in data:
-                    for i in object_dtype_indices:
-                        row[i] = maybe_box_native(row[i])
-        return data
-
     @overload
     def to_dict(
         self,

diff --git a/pandas/core/methods/to_dict.py b/pandas/core/methods/to_dict.py
@@ -24,11 +24,34 @@
 from pandas.core import common as com
 
 if TYPE_CHECKING:
+    from collections.abc import Generator
+
     from pandas._typing import MutableMappingT
 
     from pandas import DataFrame
 
 
+def create_data_for_split(
+    df: DataFrame, are_all_object_dtype_cols: bool, object_dtype_indices: list[int]
+) -> Generator[list, None, None]:
+    """
+    Simple helper method to create the main output data for
+    ``to_dict(orient="split")``, yielding one list per row
+    """
+    if are_all_object_dtype_cols:
+        for tup in df.itertuples(index=False, name=None):
+            yield list(map(maybe_box_native, tup))
+    else:
+        for tup in df.itertuples(index=False, name=None):
+            data = list(tup)
+            if object_dtype_indices:
+                # If we have object_dtype_cols, apply maybe_box_native after
+                # building the row list, for perf
+                for i in object_dtype_indices:
+                    data[i] = maybe_box_native(data[i])
+            yield data
+
+
 @overload
 def to_dict(
     df: DataFrame,
@@ -152,35 +175,38 @@ def to_dict(
         # GH46470 Return quickly if orient series to avoid creating dtype objects
         return into_c((k, v) for k, v in df.items())
 
+    if orient == "dict":
+        return into_c((k, v.to_dict(into=into)) for k, v in df.items())
+
     box_native_indices = [
         i
         for i, col_dtype in enumerate(df.dtypes.values)
         if col_dtype == np.dtype(object) or isinstance(col_dtype, ExtensionDtype)
     ]
-    box_na_values = [
-        lib.no_default if not isinstance(col_dtype, BaseMaskedDtype) else libmissing.NA
-        for i, col_dtype in enumerate(df.dtypes.values)
-    ]
-    are_all_object_dtype_cols = len(box_native_indices) == len(df.dtypes)
 
-    if orient == "dict":
-        return into_c((k, v.to_dict(into=into)) for k, v in df.items())
+    are_all_object_dtype_cols = len(box_native_indices) == len(df.dtypes)
 
-    elif orient == "list":
+    if orient == "list":
         object_dtype_indices_as_set: set[int] = set(box_native_indices)
+        box_na_values = (
+            lib.no_default
+            if not isinstance(col_dtype, BaseMaskedDtype)
+            else libmissing.NA
+            for col_dtype in df.dtypes.values
+        )
         return into_c(
             (
                 k,
-                list(map(maybe_box_native, v.to_numpy(na_value=box_na_values[i])))
+                list(map(maybe_box_native, v.to_numpy(na_value=box_na_value)))
                 if i in object_dtype_indices_as_set
                 else list(map(maybe_box_native, v.to_numpy())),
             )
-            for i, (k, v) in enumerate(df.items())
+            for i, (box_na_value, (k, v)) in enumerate(zip(box_na_values, df.items()))
         )
 
     elif orient == "split":
-        data = df._create_data_for_split_and_tight_to_dict(
-            are_all_object_dtype_cols, box_native_indices
+        data = list(
+            create_data_for_split(df, are_all_object_dtype_cols, box_native_indices)
         )
 
         return into_c(
@@ -192,10 +218,6 @@ def to_dict(
         )
 
     elif orient == "tight":
-        data = df._create_data_for_split_and_tight_to_dict(
-            are_all_object_dtype_cols, box_native_indices
-        )
-
         return into_c(
             ((("index", df.index.tolist()),) if index else ())
             + (
@@ -215,11 +237,9 @@ def to_dict(
     elif orient == "records":
         columns = df.columns.tolist()
         if are_all_object_dtype_cols:
-            rows = (
-                dict(zip(columns, row)) for row in df.itertuples(index=False, name=None)
-            )
             return [
-                into_c((k, maybe_box_native(v)) for k, v in row.items()) for row in rows
+                into_c(zip(columns, map(maybe_box_native, row)))
+                for row in df.itertuples(index=False, name=None)
             ]
         else:
             data = [
@@ -235,7 +255,7 @@ def to_dict(
                 for row in data:
                     for col in object_dtype_cols:
                         row[col] = maybe_box_native(row[col])
-            return data
+            return data  # type: ignore[return-value]
 
     elif orient == "index":
         if not df.index.is_unique:
@@ -248,24 +268,21 @@ def to_dict(
             )
         elif box_native_indices:
             object_dtype_indices_as_set = set(box_native_indices)
-            is_object_dtype_by_index = [
-                i in object_dtype_indices_as_set for i in range(len(df.columns))
-            ]
             return into_c(
                 (
                     t[0],
                     {
-                        columns[i]: maybe_box_native(v)
-                        if is_object_dtype_by_index[i]
+                        column: maybe_box_native(v)
+                        if i in object_dtype_indices_as_set
                         else v
-                        for i, v in enumerate(t[1:])
+                        for i, (column, v) in enumerate(zip(columns, t[1:]))
                     },
                 )
                 for t in df.itertuples(name=None)
             )
         else:
             return into_c(
-                (t[0], dict(zip(df.columns, t[1:]))) for t in df.itertuples(name=None)
+                (t[0], dict(zip(columns, t[1:]))) for t in df.itertuples(name=None)
             )
 
     else: