added pyarrow/numpy dtype literals and allowed str | DtypeObj as input for Series.astype #756

Merged
merged 14 commits on Jul 25, 2023
Changes from 2 commits
112 changes: 91 additions & 21 deletions pandas-stubs/_typing.pyi
@@ -86,7 +86,11 @@ BooleanDtypeArg: TypeAlias = (
| pd.BooleanDtype
| Literal["boolean"]
# Numpy bool type
# https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.bool_
| type[np.bool_]
| Literal["?", "bool8", "bool_"]
# PyArrow boolean type and its string alias
| Literal["bool[pyarrow]", "boolean[pyarrow]"]
)
IntDtypeArg: TypeAlias = (
# Builtin integer type and its string alias
@@ -98,22 +102,48 @@ IntDtypeArg: TypeAlias = (
| pd.Int32Dtype
| pd.Int64Dtype
| Literal["Int8", "Int16", "Int32", "Int64"]
# Pandas nullable unsigned integer types and their string aliases
| pd.UInt8Dtype
| pd.UInt16Dtype
| pd.UInt32Dtype
| pd.UInt64Dtype
| Literal["UInt8", "UInt16", "UInt32", "UInt64"]
# Numpy signed integer types and their string aliases
# https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.byte
| type[np.byte]
| type[np.int8]
| Literal["b", "int8", "byte"]
# https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.short
| type[np.int16]
| type[np.int32]
| type[np.int64]
| type[np.intp]
| Literal["byte", "int8", "int16", "int32", "int64", "intp"]
| Literal["h", "int16", "short"]
# https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.intc
| type[np.intc]
| Literal["i", "int32", "intc"]
# https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.int_
| type[np.int_]
| Literal["l", "int64", "int_", "intp", "long"]
# https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.longlong
| type[np.longlong]
| Literal["q", "longlong"] # NOTE: int128 not assigned
# Numpy unsigned integer types and their string aliases
# https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.ubyte
| type[np.ubyte]
| type[np.uint8]
| type[np.uint16]
| type[np.uint32]
| type[np.uint64]
| type[np.uintp]
| Literal["ubyte", "uint8", "uint16", "uint32", "uint64", "uintp"]
| Literal["B", "uint8", "ubyte"]
# https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.ushort
| type[np.ushort]
| Literal["H", "uint16", "ushort"]
# https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.uintc
| type[np.uintc]
| Literal["I", "uint32", "uintc"]
# https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.uint
| type[np.uint]
| Literal["L", "uint64", "uint", "uintp"]
# https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.ulonglong
| type[np.ulonglong]
| Literal["Q", "ulonglong"] # NOTE: uint128 not assigned
# PyArrow integer types and their string aliases
| Literal["int8[pyarrow]", "int16[pyarrow]", "int32[pyarrow]", "int64[pyarrow]"]
# PyArrow unsigned integer types and their string aliases
| Literal["uint8[pyarrow]", "uint16[pyarrow]", "uint32[pyarrow]", "uint64[pyarrow]"]
)
StrDtypeArg: TypeAlias = (
# Builtin str type and its string alias
@@ -122,6 +152,8 @@ StrDtypeArg: TypeAlias = (
# Pandas nullable string type and its string alias
| pd.StringDtype
| Literal["string"]
# PyArrow string type and its string alias
| Literal["string[pyarrow]"]
)
BytesDtypeArg: TypeAlias = type[bytes]
FloatDtypeArg: TypeAlias = (
@@ -133,19 +165,43 @@ FloatDtypeArg: TypeAlias = (
| pd.Float64Dtype
| Literal["Float32", "Float64"]
# Numpy float types and their string aliases
| type[np.float16]
| type[np.float32]
| type[np.float64]
| Literal["float16", "float32", "float64"]
# NOTE: Alias np.float16 only on Linux x86_64, use np.half instead
# https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.half
| type[np.half]
| Literal["e", "float16", "half"]
# https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.single
| type[np.single]
| Literal["f", "float32", "single"]
# https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.double
| type[np.double]
| Literal["d", "float64", "double", "float_"]
# https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.longdouble
| type[np.longdouble]
| Literal["g", "float128", "longdouble", "longfloat"]
# PyArrow floating point types and their string aliases
| Literal[
"float[pyarrow]",
"double[pyarrow]",
"float16[pyarrow]",
"float32[pyarrow]",
"float64[pyarrow]",
]
)
ComplexDtypeArg: TypeAlias = (
# Builtin complex type and its string alias
type[complex] # noqa: Y030
| Literal["complex"]
# Numpy complex types and their aliases
| type[np.complex64]
| type[np.complex128]
| Literal["complex64", "complex128"]
# https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.csingle
| type[np.csingle]
| Literal["F", "complex64", "singlecomplex"]
# https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.cdouble
| type[np.cdouble]
| Literal["D", "complex128", "cdouble", "cfloat", "complex_"]
# https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.clongdouble
# NOTE: Alias np.complex256 only on Linux x86_64, use np.clongdouble instead
| type[np.clongdouble]
| Literal["G", "complex256", "clongdouble", "clongfloat", "longcomplex"]
)
# Refer to https://numpy.org/doc/stable/reference/arrays.datetime.html#datetime-units
TimedeltaDtypeArg: TypeAlias = Literal[
@@ -163,6 +219,11 @@ TimedeltaDtypeArg: TypeAlias = Literal[
"timedelta64[ps]",
"timedelta64[fs]",
"timedelta64[as]",
# PyArrow duration type and its string alias
"duration[s][pyarrow]",
"duration[ms][pyarrow]",
"duration[us][pyarrow]",
"duration[ns][pyarrow]",
]
TimestampDtypeArg: TypeAlias = Literal[
"datetime64[Y]",
@@ -179,9 +240,19 @@ TimestampDtypeArg: TypeAlias = Literal[
"datetime64[ps]",
"datetime64[fs]",
"datetime64[as]",
# PyArrow timestamp type and its string alias
"date32[pyarrow]",
"date64[pyarrow]",
"timestamp[s][pyarrow]",
"timestamp[ms][pyarrow]",
"timestamp[us][pyarrow]",
"timestamp[ns][pyarrow]",
]
CategoryDtypeArg: TypeAlias = CategoricalDtype | Literal["category"]

# DtypeArg specifies all allowable dtypes in a functions its dtype argument
DtypeObj: TypeAlias = np.dtype[np.generic] | ExtensionDtype

AstypeArg: TypeAlias = (
BooleanDtypeArg
| IntDtypeArg
@@ -192,11 +263,10 @@ AstypeArg: TypeAlias = (
| TimedeltaDtypeArg
| TimestampDtypeArg
| CategoryDtypeArg
| ExtensionDtype
| DtypeObj
| type[object]
| str

Collaborator
I don't think we should accept a general string here. The whole idea of the other aliases is to constrain which strings are acceptable.

Contributor Author
str must be acceptable, as e.g. a string might be provided dynamically. The same applies to accepting a generic dtype object, which one must if s.astype(s.dtype) is to work without raising a warning.

The correct way to handle this is to be careful with the overload order, going from specific to generic.
All of the existing tests still work with this change.
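
A minimal sketch of that ordering (hypothetical, simplified signatures; not the actual stub layout): the specific Literal overloads come first and the generic str fallback comes last.

from typing import Literal, overload

import numpy as np
import pandas as pd

class _Series:
    @overload
    def astype(self, dtype: Literal["int8", "int16", "int32", "int64"]) -> "pd.Series[int]": ...
    @overload
    def astype(self, dtype: Literal["float32", "float64"]) -> "pd.Series[float]": ...
    @overload
    def astype(self, dtype: str | np.dtype) -> "pd.Series": ...  # generic fallback, matched last
    def astype(self, dtype):
        ...  # runtime implementation, irrelevant for the stubs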

Collaborator
It makes the type check too wide: it allows pd.DataFrame().astype("foobar") to pass. I'd rather have the stubs catch the most common cases (not dynamic strings). Having said that, if we want s.astype(s.dtype) to work, maybe we should just say that s.dtype returns AstypeArg.
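
A sketch of that alternative, assuming dtype were annotated to return the same objects astype accepts (hypothetical; not what the stubs declare today):

from typing import TypeAlias

import numpy as np
from pandas.api.extensions import ExtensionDtype

DtypeObj: TypeAlias = np.dtype | ExtensionDtype

class _Series:
    @property
    def dtype(self) -> DtypeObj: ...  # narrowed so that s.astype(s.dtype) type-checks
    def astype(self, dtype: DtypeObj) -> "_Series": ...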

Contributor
If there were a way to specify a non-literal string, that would be ideal: if the checker sees a literal, the literal should be verified. I am not aware of a way to achieve that with current typing features.

Contributor
Another note - I think in subjective cases like this, you have to ask what the most common use case is. I would contend that dynamic/runtime-based astype calls are atypical. The majority of the time it's a static transformation, and the developer is aware of the source and target dtypes. In that case, they would hard-code the destination type.

Contributor Author
@gandhis1 As an example for a dynamic astype(string): I have some data pipelines for reading CSV files where I store config files with the target schema (e.g. column dtypes).
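
For concreteness, a hedged illustration of that kind of pipeline (file names and schema contents are made up):

import json

import pandas as pd

# hypothetical config file, e.g. {"price": "float64", "qty": "int64[pyarrow]"}
with open("schema.json") as f:
    schema: dict[str, str] = json.load(f)

df = pd.read_csv("data.csv")  # hypothetical input file
for column, dtype in schema.items():
    # dtype is a plain runtime str here, never a Literal
    df[column] = df[column].astype(dtype)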

Contributor Author
@gandhis1 @Dr-Irv In Python 3.11 LiteralString was added; I wonder if this can be used to raise a typing error if none of the defined literals is matched? That would give the best of both worlds.

Contributor Author
I asked this here: python/typing#1434

Contributor Author
Hm, doesn't seem possible currently. Possibly in the future if both Intersection and Not are supported: python/typing#801 (comment)
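
A sketch of the limitation (illustrative only; the desired spelling mentioned above is not valid typing syntax today):

from typing import Literal, overload

@overload
def astype(dtype: Literal["int8", "int64"]) -> int: ...
@overload
def astype(dtype: str) -> object: ...  # a typo like "foobar" also matches here, so no error is raised
def astype(dtype):
    ...

# Desired: a fallback that accepts only non-literal strings, which would need
# Intersection/Not support, e.g. something like str & ~LiteralString.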

Collaborator
> @gandhis1 As an example for a dynamic astype(string): I have some data pipelines for reading CSV files where I store config files with the target schema (e.g. column dtypes).

That's an interesting example, but that's a case where the argument you are using is dynamic, and you're trying to check it in a static typing context. If your config files had an incorrect string "foobar" not covered by the stubs, your code would fail at runtime.

IMHO, the goal of the static type checks is to catch things before runtime. So we have two alternatives:

  1. Do not include str as a valid argument (as it is right now in the stubs). Disadvantage: you can't use a dynamic string as an argument, as in your use case. Advantage: only the statically declared valid strings are accepted.
  2. Include str as a valid argument. Disadvantage: we don't help people who use .astype("ing") instead of .astype("int") before runtime. Advantage: your use case works.

I would argue that people use .astype("some-string") far more often than they pass a dynamic string as the argument. So that's why I would prefer (1) above.
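
The trade-off in code (illustrative, not part of the PR):

import pandas as pd

s = pd.Series([1, 2, 3])
dynamic: str = "".join(["int", "64"])  # a dtype string only known at runtime

# Alternative 1 - no plain str in AstypeArg:
#   s.astype("ing")    -> rejected by the type checker (typo caught before runtime)
#   s.astype(dynamic)  -> also rejected, even though it is valid at runtime

# Alternative 2 - plain str accepted (this PR):
#   s.astype("ing")    -> passes type checking and only fails at runtime
s.astype(dynamic)      # accepted; covers the config-file use case above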

Collaborator
This str still has to be removed.

)
# DtypeArg specifies all allowable dtypes in a functions its dtype argument
DtypeObj: TypeAlias = np.dtype[np.generic] | ExtensionDtype

# filenames and file-like-objects
AnyStr_cov = TypeVar("AnyStr_cov", str, bytes, covariant=True)
4 changes: 2 additions & 2 deletions pandas-stubs/core/series.pyi
@@ -329,7 +329,7 @@ class Series(IndexOpsMixin, NDFrame, Generic[S1]):
is_copy: _bool | None = ...,
**kwargs,
) -> Series[S1]: ...
def __getattr__(self, name: str) -> S1: ...
def __getattr__(self, name: _str) -> S1: ...
@overload
def __getitem__(
self,
@@ -1208,7 +1208,7 @@ class Series(IndexOpsMixin, NDFrame, Generic[S1]):
@overload
def astype(
self,
dtype: type[object] | ExtensionDtype,
dtype: type[object] | ExtensionDtype | DtypeObj | _str,
copy: _bool = ...,
errors: IgnoreRaise = ...,
) -> Series: ...
60 changes: 59 additions & 1 deletion tests/test_series.py
@@ -1697,8 +1697,12 @@ def test_updated_astype() -> None:
s = pd.Series([3, 4, 5])
s1 = pd.Series(True)

# Boolean types
# dynamically typed
string: str = "int" # not Literal!
check(assert_type(s.astype(string), "pd.Series[Any]"), pd.Series, np.integer)
check(assert_type(s.astype(s.dtype), "pd.Series[Any]"), pd.Series, np.integer)

# Boolean types
# Builtin bool types
check(assert_type(s.astype(bool), "pd.Series[bool]"), pd.Series, np.bool_)
check(assert_type(s.astype("bool"), "pd.Series[bool]"), pd.Series, np.bool_)
@@ -1791,6 +1795,20 @@ def test_updated_astype() -> None:
check(assert_type(s.astype("float16"), "pd.Series[float]"), pd.Series, np.float16)
check(assert_type(s.astype("float32"), "pd.Series[float]"), pd.Series, np.float32)
check(assert_type(s.astype("float64"), "pd.Series[float]"), pd.Series, np.float64)
check(assert_type(s.astype("float64"), "pd.Series[float]"), pd.Series, np.float64)
check(assert_type(s.astype("float64"), "pd.Series[float]"), pd.Series, np.float64)

# pyarrow
check(
assert_type(s.astype("int64[pyarrow]"), "pd.Series[int]"),
pd.Series,
int,
)
check(
assert_type(s.astype("float[pyarrow]"), "pd.Series[float]"),
pd.Series,
float,
)

# Complex types

@@ -1889,6 +1907,26 @@ def test_updated_astype() -> None:
pd.Series,
Timedelta,
)
check(
assert_type(s.astype("duration[s][pyarrow]"), TimedeltaSeries),
pd.Series,
datetime.timedelta,
)
check(
assert_type(s.astype("duration[ms][pyarrow]"), TimedeltaSeries),
pd.Series,
datetime.timedelta,
)
check(
assert_type(s.astype("duration[us][pyarrow]"), TimedeltaSeries),
pd.Series,
datetime.timedelta,
)
check(
assert_type(s.astype("duration[ns][pyarrow]"), TimedeltaSeries),
pd.Series,
datetime.timedelta,
)

check(
assert_type(s.astype("datetime64[Y]"), TimestampSeries),
@@ -1960,6 +1998,26 @@ def test_updated_astype() -> None:
pd.Series,
Timestamp,
)
check(
assert_type(s.astype("timestamp[s][pyarrow]"), TimestampSeries),
pd.Series,
datetime.datetime,
)
check(
assert_type(s.astype("timestamp[ms][pyarrow]"), TimestampSeries),
pd.Series,
datetime.datetime,
)
check(
assert_type(s.astype("timestamp[us][pyarrow]"), TimestampSeries),
pd.Series,
datetime.datetime,
)
check(
assert_type(s.astype("timestamp[ns][pyarrow]"), TimestampSeries),
pd.Series,
datetime.datetime,
)

orseries = pd.Series([Decimal(x) for x in [1, 2, 3]])
newtype = DecimalDtype()