Skip to content

Commit

Permalink
CLN: to_dict (#57159)
Browse files Browse the repository at this point in the history
* Make to_dict lazier

* Remove some extra looping and indexing

* Add erroneous ignore
  • Loading branch information
mroeschke authored Jan 31, 2024
1 parent b41ea09 commit c811353
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 51 deletions.
23 changes: 0 additions & 23 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,6 @@
find_common_type,
infer_dtype_from_scalar,
invalidate_string_dtypes,
maybe_box_native,
maybe_downcast_to_dtype,
)
from pandas.core.dtypes.common import (
Expand Down Expand Up @@ -1983,28 +1982,6 @@ def to_numpy(

return result

def _create_data_for_split_and_tight_to_dict(
    self, are_all_object_dtype_cols: bool, object_dtype_indices: list[int]
) -> list:
    """
    Build the row-wise ``data`` payload used by ``to_dict(orient="split")``
    and ``to_dict(orient="tight")``.

    Parameters
    ----------
    are_all_object_dtype_cols : bool
        When True, every value of every row is passed through
        ``maybe_box_native``.
    object_dtype_indices : list[int]
        Positional column indices whose values are passed through
        ``maybe_box_native`` when ``are_all_object_dtype_cols`` is False.

    Returns
    -------
    list
        One list per row, in ``itertuples(index=False, name=None)`` order.
    """
    row_tuples = self.itertuples(index=False, name=None)

    if are_all_object_dtype_cols:
        return [[maybe_box_native(value) for value in values] for values in row_tuples]

    rows = [list(values) for values in row_tuples]
    if not object_dtype_indices:
        return rows

    # Box only the flagged positions, after the rows have been built (the
    # original implementation notes this ordering is for performance).
    for row in rows:
        for position in object_dtype_indices:
            row[position] = maybe_box_native(row[position])
    return rows

@overload
def to_dict(
self,
Expand Down
73 changes: 45 additions & 28 deletions pandas/core/methods/to_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,34 @@
from pandas.core import common as com

if TYPE_CHECKING:
from collections.abc import Generator

from pandas._typing import MutableMappingT

from pandas import DataFrame


def create_data_for_split(
    df: DataFrame, are_all_object_dtype_cols: bool, object_dtype_indices: list[int]
) -> Generator[list, None, None]:
    """
    Lazily yield the row-wise ``data`` payload for ``to_dict(orient="split")``.

    Parameters
    ----------
    df : DataFrame
        Frame whose rows are iterated with
        ``itertuples(index=False, name=None)``.
    are_all_object_dtype_cols : bool
        When True, every value of every row is passed through
        ``maybe_box_native``.
    object_dtype_indices : list[int]
        Positional column indices whose values are passed through
        ``maybe_box_native`` when ``are_all_object_dtype_cols`` is False.

    Yields
    ------
    list
        One freshly built list per row of ``df``.
    """
    row_tuples = df.itertuples(index=False, name=None)

    if are_all_object_dtype_cols:
        for values in row_tuples:
            yield [maybe_box_native(value) for value in values]
        return

    # An empty/falsy index list means no position needs boxing; resolve
    # that once instead of re-checking it for every row.
    positions_to_box = object_dtype_indices or ()
    for values in row_tuples:
        row = list(values)
        # Box only the flagged positions after the list is built (the
        # original implementation notes this ordering is for performance).
        for position in positions_to_box:
            row[position] = maybe_box_native(row[position])
        yield row


@overload
def to_dict(
df: DataFrame,
Expand Down Expand Up @@ -152,35 +175,38 @@ def to_dict(
# GH46470 Return quickly if orient series to avoid creating dtype objects
return into_c((k, v) for k, v in df.items())

if orient == "dict":
return into_c((k, v.to_dict(into=into)) for k, v in df.items())

box_native_indices = [
i
for i, col_dtype in enumerate(df.dtypes.values)
if col_dtype == np.dtype(object) or isinstance(col_dtype, ExtensionDtype)
]
box_na_values = [
lib.no_default if not isinstance(col_dtype, BaseMaskedDtype) else libmissing.NA
for i, col_dtype in enumerate(df.dtypes.values)
]
are_all_object_dtype_cols = len(box_native_indices) == len(df.dtypes)

if orient == "dict":
return into_c((k, v.to_dict(into=into)) for k, v in df.items())
are_all_object_dtype_cols = len(box_native_indices) == len(df.dtypes)

elif orient == "list":
if orient == "list":
object_dtype_indices_as_set: set[int] = set(box_native_indices)
box_na_values = (
lib.no_default
if not isinstance(col_dtype, BaseMaskedDtype)
else libmissing.NA
for col_dtype in df.dtypes.values
)
return into_c(
(
k,
list(map(maybe_box_native, v.to_numpy(na_value=box_na_values[i])))
list(map(maybe_box_native, v.to_numpy(na_value=box_na_value)))
if i in object_dtype_indices_as_set
else list(map(maybe_box_native, v.to_numpy())),
)
for i, (k, v) in enumerate(df.items())
for i, (box_na_value, (k, v)) in enumerate(zip(box_na_values, df.items()))
)

elif orient == "split":
data = df._create_data_for_split_and_tight_to_dict(
are_all_object_dtype_cols, box_native_indices
data = list(
create_data_for_split(df, are_all_object_dtype_cols, box_native_indices)
)

return into_c(
Expand All @@ -192,10 +218,6 @@ def to_dict(
)

elif orient == "tight":
data = df._create_data_for_split_and_tight_to_dict(
are_all_object_dtype_cols, box_native_indices
)

return into_c(
((("index", df.index.tolist()),) if index else ())
+ (
Expand All @@ -215,11 +237,9 @@ def to_dict(
elif orient == "records":
columns = df.columns.tolist()
if are_all_object_dtype_cols:
rows = (
dict(zip(columns, row)) for row in df.itertuples(index=False, name=None)
)
return [
into_c((k, maybe_box_native(v)) for k, v in row.items()) for row in rows
into_c(zip(columns, map(maybe_box_native, row)))
for row in df.itertuples(index=False, name=None)
]
else:
data = [
Expand All @@ -235,7 +255,7 @@ def to_dict(
for row in data:
for col in object_dtype_cols:
row[col] = maybe_box_native(row[col])
return data
return data # type: ignore[return-value]

elif orient == "index":
if not df.index.is_unique:
Expand All @@ -248,24 +268,21 @@ def to_dict(
)
elif box_native_indices:
object_dtype_indices_as_set = set(box_native_indices)
is_object_dtype_by_index = [
i in object_dtype_indices_as_set for i in range(len(df.columns))
]
return into_c(
(
t[0],
{
columns[i]: maybe_box_native(v)
if is_object_dtype_by_index[i]
column: maybe_box_native(v)
if i in object_dtype_indices_as_set
else v
for i, v in enumerate(t[1:])
for i, (column, v) in enumerate(zip(columns, t[1:]))
},
)
for t in df.itertuples(name=None)
)
else:
return into_c(
(t[0], dict(zip(df.columns, t[1:]))) for t in df.itertuples(name=None)
(t[0], dict(zip(columns, t[1:]))) for t in df.itertuples(name=None)
)

else:
Expand Down

0 comments on commit c811353

Please sign in to comment.