Skip to content

Commit

Permalink
Prefer pandas nullable integers for int fields (#159)
Browse files Browse the repository at this point in the history
  • Loading branch information
cwegrzyn committed Apr 2, 2021
1 parent d91925e commit 3fc894c
Show file tree
Hide file tree
Showing 16 changed files with 304 additions and 181 deletions.
3 changes: 1 addition & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,7 @@ pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.coverage*
.cache
nosetests.xml
coverage.xml
Expand Down
2 changes: 1 addition & 1 deletion metrics/coverage_high_water_mark
Original file line number Diff line number Diff line change
@@ -1 +1 @@
93.6100
93.6700
2 changes: 1 addition & 1 deletion metrics/mypy_high_water_mark
Original file line number Diff line number Diff line change
@@ -1 +1 @@
92.5000
92.1900
53 changes: 22 additions & 31 deletions records_mover/db/mysql/mysql_db_driver.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,7 @@
import sqlalchemy
import sqlalchemy.dialects.mysql
import logging
from ...utils.limits import (INT8_MIN, INT8_MAX,
UINT8_MIN, UINT8_MAX,
INT16_MIN, INT16_MAX,
UINT16_MIN, UINT16_MAX,
INT24_MIN, INT24_MAX,
UINT24_MIN, UINT24_MAX,
INT32_MIN, INT32_MAX,
UINT32_MIN, UINT32_MAX,
INT64_MIN, INT64_MAX,
UINT64_MIN, UINT64_MAX,
from ...utils.limits import (IntegerType,
FLOAT32_SIGNIFICAND_BITS,
FLOAT64_SIGNIFICAND_BITS,
num_digits)
Expand Down Expand Up @@ -48,29 +39,29 @@ def integer_limits(self,
Optional[Tuple[int, int]]:
if isinstance(type_, sqlalchemy.dialects.mysql.TINYINT):
if type_.unsigned:
return (UINT8_MIN, UINT8_MAX)
return IntegerType.UINT8.range
else:
return (INT8_MIN, INT8_MAX)
return IntegerType.INT8.range
elif isinstance(type_, sqlalchemy.dialects.mysql.SMALLINT):
if type_.unsigned:
return (UINT16_MIN, UINT16_MAX)
return IntegerType.UINT16.range
else:
return (INT16_MIN, INT16_MAX)
return IntegerType.INT16.range
elif isinstance(type_, sqlalchemy.dialects.mysql.MEDIUMINT):
if type_.unsigned:
return (UINT24_MIN, UINT24_MAX)
return IntegerType.UINT24.range
else:
return (INT24_MIN, INT24_MAX)
return IntegerType.INT24.range
elif isinstance(type_, sqlalchemy.dialects.mysql.INTEGER):
if type_.unsigned:
return (UINT32_MIN, UINT32_MAX)
return IntegerType.UINT32.range
else:
return (INT32_MIN, INT32_MAX)
return IntegerType.INT32.range
elif isinstance(type_, sqlalchemy.dialects.mysql.BIGINT):
if type_.unsigned:
return (UINT64_MIN, UINT64_MAX)
return IntegerType.UINT64.range
else:
return (INT64_MIN, INT64_MAX)
return IntegerType.INT64.range
return super().integer_limits(type_)

def fp_constraints(self,
Expand All @@ -88,26 +79,26 @@ def type_for_integer(self,
"""Find correct integral column type to fit the given min and max integer values"""

if min_value is not None and max_value is not None:
pass
if min_value >= INT8_MIN and max_value <= INT8_MAX:
int_type = IntegerType.smallest_cover_for(min_value, max_value)
if int_type == IntegerType.INT8:
return sqlalchemy.dialects.mysql.TINYINT()
elif min_value >= UINT8_MIN and max_value <= UINT8_MAX:
elif int_type == IntegerType.UINT8:
return sqlalchemy.dialects.mysql.TINYINT(unsigned=True)
elif min_value >= INT16_MIN and max_value <= INT16_MAX:
elif int_type == IntegerType.INT16:
return sqlalchemy.sql.sqltypes.SMALLINT()
elif min_value >= UINT16_MIN and max_value <= UINT16_MAX:
elif int_type == IntegerType.UINT16:
return sqlalchemy.dialects.mysql.SMALLINT(unsigned=True)
elif min_value >= INT24_MIN and max_value <= INT24_MAX:
elif int_type == IntegerType.INT24:
return sqlalchemy.dialects.mysql.MEDIUMINT()
elif min_value >= UINT24_MIN and max_value <= UINT24_MAX:
elif int_type == IntegerType.UINT24:
return sqlalchemy.dialects.mysql.MEDIUMINT(unsigned=True)
elif min_value >= INT32_MIN and max_value <= INT32_MAX:
elif int_type == IntegerType.INT32:
return sqlalchemy.sql.sqltypes.INTEGER()
elif min_value >= UINT32_MIN and max_value <= UINT32_MAX:
elif int_type == IntegerType.UINT32:
return sqlalchemy.dialects.mysql.INTEGER(unsigned=True)
elif min_value >= INT64_MIN and max_value <= INT64_MAX:
elif int_type == IntegerType.INT64:
return sqlalchemy.sql.sqltypes.BIGINT()
elif min_value >= UINT64_MIN and max_value <= UINT64_MAX:
elif int_type == IntegerType.UINT64:
return sqlalchemy.dialects.mysql.BIGINT(unsigned=True)
else:
num_digits_min = num_digits(min_value)
Expand Down
4 changes: 4 additions & 0 deletions records_mover/records/pandas/prep_for_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ def _convert_series_or_index(series_or_index: T,
isinstance(series_or_index[0], datetime.date)):
logger.info(f"Converting {series_or_index.name} from np.datetime64 to "
"string in CSV's format")
logger.debug("Dtype is %s, first element type %s", series_or_index.dtype,
type(series_or_index[0]))
hint_date_format = records_format.hints['dateformat']
assert isinstance(hint_date_format, str)
pandas_date_format = python_date_format_from_hints.get(hint_date_format)
Expand All @@ -49,6 +51,8 @@ def _convert_series_or_index(series_or_index: T,
else:
logger.info(f"Converting {series_or_index.name} from np.datetime64 to string "
"in CSV's format")
logger.debug("Dtype is %s, first element type %s", series_or_index.dtype,
type(series_or_index[0]))
hint_time_format = records_format.hints['timeonlyformat']
assert isinstance(hint_time_format, str)
pandas_time_format = python_time_format_from_hints.get(hint_time_format)
Expand Down
56 changes: 26 additions & 30 deletions records_mover/records/schema/field/__init__.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,8 @@
import datetime
from ...processing_instructions import ProcessingInstructions
import logging
from typing import Optional, Dict, Any, Type, cast, Union, TYPE_CHECKING
from ....utils.limits import (INT8_MIN, INT8_MAX,
UINT8_MIN, UINT8_MAX,
INT16_MIN, INT16_MAX,
UINT16_MIN, UINT16_MAX,
INT32_MIN, INT32_MAX,
UINT32_MIN, UINT32_MAX,
INT64_MIN, INT64_MAX,
UINT64_MIN, UINT64_MAX,
FLOAT16_SIGNIFICAND_BITS,
from typing import Optional, Dict, Any, Type, cast, TYPE_CHECKING
from ....utils.limits import (FLOAT16_SIGNIFICAND_BITS,
FLOAT32_SIGNIFICAND_BITS,
FLOAT64_SIGNIFICAND_BITS,
FLOAT80_SIGNIFICAND_BITS)
Expand All @@ -26,6 +18,7 @@
from sqlalchemy.types import TypeEngine
from records_mover.db import DBDriver # noqa
from .field_types import FieldType
from .pandas import Dtype

from mypy_extensions import TypedDict

Expand Down Expand Up @@ -192,40 +185,41 @@ def components_to_time_str(df: pd.DataFrame) -> datetime.time:
return datetime.time(hour=df['hours'],
minute=df['minutes'],
second=df['seconds'])
logger.debug("Applying pd.Timedelta logic on series for %s", self.name)
out = series.dt.components.apply(axis=1, func=components_to_time_str)
return out

return series.astype(self.to_numpy_dtype())
target_type = self.to_pandas_dtype()
logger.debug("Casting field %s from type %r to type %s", self.name, series.dtype,
target_type)
return series.astype(target_type)

def to_numpy_dtype(self) -> Union[Type[Any], str]:
def to_pandas_dtype(self) -> 'Dtype':
import numpy as np
import pandas as pd
from .pandas import supports_nullable_ints, integer_type_for_range

has_extension_types = supports_nullable_ints()

if self.field_type == 'integer':
int_constraints =\
cast(Optional[RecordsSchemaFieldIntegerConstraints], self.constraints)
min_: Optional[int] = None
max_: Optional[int] = None
required = False
if int_constraints:
min_ = int_constraints.min_
max_ = int_constraints.max_
required = int_constraints.required

if not required and not has_extension_types:
logger.warning(f"Dataframe field {self.name} is nullable, but using pandas "
f"{pd.__version__} which does not support nullable integer type")

if min_ is not None and max_ is not None:
if min_ >= INT8_MIN and max_ <= INT8_MAX:
return np.int8
elif min_ >= UINT8_MIN and max_ <= UINT8_MAX:
return np.uint8
elif min_ >= INT16_MIN and max_ <= INT16_MAX:
return np.int16
elif min_ >= UINT16_MIN and max_ <= UINT16_MAX:
return np.uint16
elif min_ >= INT32_MIN and max_ <= INT32_MAX:
return np.int32
elif min_ >= UINT32_MIN and max_ <= UINT32_MAX:
return np.uint32
elif min_ >= INT64_MIN and max_ <= INT64_MAX:
return np.int64
elif min_ >= UINT64_MIN and max_ <= UINT64_MAX:
return np.uint64
dtype = integer_type_for_range(min_, max_, has_extension_types)
if dtype:
return dtype
else:
logger.warning("Asked for a type larger than int64 in dataframe "
f"field '{self.name}' - providing float128, but "
Expand All @@ -235,8 +229,10 @@ def to_numpy_dtype(self) -> Union[Type[Any], str]:
else:
logger.warning(f"No integer constraints provided for field '{self.name}'; "
"using int64")
return np.int64
# return driver.type_for_integer(min_=min_, max_=max_)
if has_extension_types:
return pd.Int64Dtype()
else:
return np.int64
elif self.field_type == 'decimal':
decimal_constraints =\
cast(Optional[RecordsSchemaFieldDecimalConstraints], self.constraints)
Expand Down
3 changes: 3 additions & 0 deletions records_mover/records/schema/field/constraints/constraints.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,3 +204,6 @@ def from_numpy_dtype(dtype: 'np.dtype',

def __str__(self) -> str:
    """Render as ``ClassName(<data>)``, where <data> is what to_data() returns."""
    return f"{type(self).__name__}({self.to_data()})"

def __repr__(self) -> str:
    """Use the same text as __str__ so debug/log output shows the constraint data."""
    return str(self)
53 changes: 52 additions & 1 deletion records_mover/records/schema/field/pandas.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,64 @@
import pandas as pd
from pandas import Series, Index
from typing import Any, Type, TYPE_CHECKING
from typing import Any, Type, TYPE_CHECKING, Optional, Mapping, Union
from .statistics import RecordsSchemaFieldStringStatistics
from ...processing_instructions import ProcessingInstructions
from .representation import RecordsSchemaFieldRepresentation
from ....utils.limits import IntegerType
from .numpy import details_from_numpy_dtype
import numpy as np
if TYPE_CHECKING:
from ..field import RecordsSchemaField # noqa
from ..schema import RecordsSchema # noqa
from pandas.core.dtypes.dtypes import ExtensionDtype # noqa

# Cribbed from non-public https://github.com/pandas-dev/pandas/blob/v1.2.1/pandas/_typing.py
Dtype = Union[
"ExtensionDtype", str, np.dtype, Type[Union[str, float, int, complex, bool, object]]
]
DtypeObj = Union[np.dtype, "ExtensionDtype"]


def supports_nullable_ints() -> bool:
    """Detects if this version of pandas supports nullable int extension types.

    This is a feature probe for ``pd.Int64Dtype`` rather than a version
    comparison.  ``hasattr`` is used instead of searching ``dir(pd)`` so each
    call is one attribute lookup rather than building the whole list of
    pandas' top-level names.
    """
    return hasattr(pd, 'Int64Dtype')


def integer_type_mapping(use_extension_types: bool) -> Mapping[IntegerType, DtypeObj]:
    """Return the dataframe dtype used to hold values of each IntegerType.

    When use_extension_types is true, the values are instances of pandas'
    integer extension dtypes (pd.Int8Dtype() and friends); otherwise they
    are numpy integer scalar types.  Neither table has a 24-bit entry, so
    INT24 and UINT24 are widened to a 32-bit type.

    Only the requested table is constructed, so the pandas extension dtype
    constructors are never touched when use_extension_types is false.
    """
    if not use_extension_types:
        # Plain numpy integer types.
        return {
            IntegerType.INT8: np.int8,
            IntegerType.UINT8: np.uint8,
            IntegerType.INT16: np.int16,
            IntegerType.UINT16: np.uint16,
            IntegerType.INT24: np.int32,
            IntegerType.UINT24: np.uint32,
            IntegerType.INT32: np.int32,
            IntegerType.UINT32: np.uint32,
            IntegerType.INT64: np.int64,
            IntegerType.UINT64: np.uint64,
        }

    # pandas integer extension dtypes.
    return {
        IntegerType.INT8: pd.Int8Dtype(),
        IntegerType.UINT8: pd.UInt8Dtype(),
        IntegerType.INT16: pd.Int16Dtype(),
        IntegerType.UINT16: pd.UInt16Dtype(),
        IntegerType.INT24: pd.Int32Dtype(),
        # NOTE(review): UINT24 maps to *signed* Int32 here but to unsigned
        # np.uint32 in the numpy table above.  Both are wide enough for a
        # 24-bit unsigned value; confirm which signedness is intended.
        IntegerType.UINT24: pd.Int32Dtype(),
        IntegerType.INT32: pd.Int32Dtype(),
        IntegerType.UINT32: pd.UInt32Dtype(),
        IntegerType.INT64: pd.Int64Dtype(),
        IntegerType.UINT64: pd.UInt64Dtype(),
    }


def integer_type_for_range(min_: int, max_: int, has_extension_types: bool) -> Optional[DtypeObj]:
    """Pick the dataframe dtype for integers lying between min_ and max_.

    The range is first matched to an IntegerType with
    IntegerType.smallest_cover_for(), then translated through
    integer_type_mapping(has_extension_types).  Returns None when no
    IntegerType covers the range (or the mapping has no entry for it).
    """
    smallest = IntegerType.smallest_cover_for(min_, max_)
    if not smallest:
        # Nothing covers the requested range; let the caller choose a fallback.
        return None
    return integer_type_mapping(has_extension_types).get(smallest)


def field_from_index(index: Index,
Expand Down
33 changes: 33 additions & 0 deletions records_mover/utils/limits.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
from enum import Enum
import math
from typing import Optional

INT8_MAX = 127
INT8_MIN = -128
Expand Down Expand Up @@ -26,6 +28,37 @@
FLOAT80_SIGNIFICAND_BITS = 64


class IntegerType(Enum):
    """Fixed-width integer types, each valued by a (min, max) pair of limits.

    Members are declared in the order INT8, UINT8, INT16, UINT16, INT24,
    UINT24, INT32, UINT32, INT64, UINT64.  smallest_cover_for() walks the
    members in that declaration order and returns the first match, so the
    order is significant and should not be rearranged.
    """
    INT8 = (INT8_MIN, INT8_MAX)
    UINT8 = (UINT8_MIN, UINT8_MAX)
    INT16 = (INT16_MIN, INT16_MAX)
    UINT16 = (UINT16_MIN, UINT16_MAX)
    INT24 = (INT24_MIN, INT24_MAX)
    UINT24 = (UINT24_MIN, UINT24_MAX)
    INT32 = (INT32_MIN, INT32_MAX)
    UINT32 = (UINT32_MIN, UINT32_MAX)
    INT64 = (INT64_MIN, INT64_MAX)
    UINT64 = (UINT64_MIN, UINT64_MAX)

    def __init__(self, min_: int, max_: int):
        # Enum unpacks each member's tuple value into these two arguments.
        self.min_, self.max_ = min_, max_

    def is_cover_for(self, low_value: int, high_value: int) -> bool:
        """Return True if both low_value and high_value fit within this type's limits."""
        return self.min_ <= low_value and high_value <= self.max_

    @property
    def range(self):
        """The (min_, max_) limits of this type as a tuple."""
        return self.min_, self.max_

    @classmethod
    def smallest_cover_for(cls, low_value: int, high_value: int) -> Optional['IntegerType']:
        """Return the first member, in declaration order, covering the given span.

        Returns None when no member covers [low_value, high_value].
        """
        covering = (candidate for candidate in cls
                    if candidate.is_cover_for(low_value, high_value))
        return next(covering, None)


# https://stackoverflow.com/questions/2189800/length-of-an-integer-in-python
def num_digits(n: int) -> int:
if n > 0:
Expand Down
3 changes: 3 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -76,3 +76,6 @@ ignore_missing_imports = True

[mypy-pyarrow.*]
ignore_missing_imports = True

[mypy-nose.*]
ignore_missing_imports = True
Loading

0 comments on commit 3fc894c

Please sign in to comment.