Prefer pandas nullable integers for int fields (#159)

bluelabsio · Apr 2, 2021 · 3fc894c · 3fc894c
1 parent d91925e
commit 3fc894c
Show file tree

Hide file tree

Showing 16 changed files with 304 additions and 181 deletions.
diff --git a/.gitignore b/.gitignore
@@ -37,8 +37,7 @@ pip-delete-this-directory.txt
 # Unit test / coverage reports
 htmlcov/
 .tox/
-.coverage
-.coverage.*
+.coverage*
 .cache
 nosetests.xml
 coverage.xml

diff --git a/metrics/coverage_high_water_mark b/metrics/coverage_high_water_mark
@@ -1 +1 @@
-93.6100
+93.6700
diff --git a/metrics/mypy_high_water_mark b/metrics/mypy_high_water_mark
@@ -1 +1 @@
-92.5000
+92.1900
diff --git a/records_mover/db/mysql/mysql_db_driver.py b/records_mover/db/mysql/mysql_db_driver.py
@@ -1,16 +1,7 @@
 import sqlalchemy
 import sqlalchemy.dialects.mysql
 import logging
-from ...utils.limits import (INT8_MIN, INT8_MAX,
-                             UINT8_MIN, UINT8_MAX,
-                             INT16_MIN, INT16_MAX,
-                             UINT16_MIN, UINT16_MAX,
-                             INT24_MIN, INT24_MAX,
-                             UINT24_MIN, UINT24_MAX,
-                             INT32_MIN, INT32_MAX,
-                             UINT32_MIN, UINT32_MAX,
-                             INT64_MIN, INT64_MAX,
-                             UINT64_MIN, UINT64_MAX,
+from ...utils.limits import (IntegerType,
                              FLOAT32_SIGNIFICAND_BITS,
                              FLOAT64_SIGNIFICAND_BITS,
                              num_digits)
@@ -48,29 +39,29 @@ def integer_limits(self,
             Optional[Tuple[int, int]]:
         if isinstance(type_, sqlalchemy.dialects.mysql.TINYINT):
             if type_.unsigned:
-                return (UINT8_MIN, UINT8_MAX)
+                return IntegerType.UINT8.range
             else:
-                return (INT8_MIN, INT8_MAX)
+                return IntegerType.INT8.range
         elif isinstance(type_, sqlalchemy.dialects.mysql.SMALLINT):
             if type_.unsigned:
-                return (UINT16_MIN, UINT16_MAX)
+                return IntegerType.UINT16.range
             else:
-                return (INT16_MIN, INT16_MAX)
+                return IntegerType.INT16.range
         elif isinstance(type_, sqlalchemy.dialects.mysql.MEDIUMINT):
             if type_.unsigned:
-                return (UINT24_MIN, UINT24_MAX)
+                return IntegerType.UINT24.range
             else:
-                return (INT24_MIN, INT24_MAX)
+                return IntegerType.INT24.range
         elif isinstance(type_, sqlalchemy.dialects.mysql.INTEGER):
             if type_.unsigned:
-                return (UINT32_MIN, UINT32_MAX)
+                return IntegerType.UINT32.range
             else:
-                return (INT32_MIN, INT32_MAX)
+                return IntegerType.INT32.range
         elif isinstance(type_, sqlalchemy.dialects.mysql.BIGINT):
             if type_.unsigned:
-                return (UINT64_MIN, UINT64_MAX)
+                return IntegerType.UINT64.range
             else:
-                return (INT64_MIN, INT64_MAX)
+                return IntegerType.INT64.range
         return super().integer_limits(type_)
 
     def fp_constraints(self,
@@ -88,26 +79,26 @@ def type_for_integer(self,
         """Find correct integral column type to fit the given min and max integer values"""
 
         if min_value is not None and max_value is not None:
-            pass
-            if min_value >= INT8_MIN and max_value <= INT8_MAX:
+            int_type = IntegerType.smallest_cover_for(min_value, max_value)
+            if int_type == IntegerType.INT8:
                 return sqlalchemy.dialects.mysql.TINYINT()
-            elif min_value >= UINT8_MIN and max_value <= UINT8_MAX:
+            elif int_type == IntegerType.UINT8:
                 return sqlalchemy.dialects.mysql.TINYINT(unsigned=True)
-            elif min_value >= INT16_MIN and max_value <= INT16_MAX:
+            elif int_type == IntegerType.INT16:
                 return sqlalchemy.sql.sqltypes.SMALLINT()
-            elif min_value >= UINT16_MIN and max_value <= UINT16_MAX:
+            elif int_type == IntegerType.UINT16:
                 return sqlalchemy.dialects.mysql.SMALLINT(unsigned=True)
-            elif min_value >= INT24_MIN and max_value <= INT24_MAX:
+            elif int_type == IntegerType.INT24:
                 return sqlalchemy.dialects.mysql.MEDIUMINT()
-            elif min_value >= UINT24_MIN and max_value <= UINT24_MAX:
+            elif int_type == IntegerType.UINT24:
                 return sqlalchemy.dialects.mysql.MEDIUMINT(unsigned=True)
-            elif min_value >= INT32_MIN and max_value <= INT32_MAX:
+            elif int_type == IntegerType.INT32:
                 return sqlalchemy.sql.sqltypes.INTEGER()
-            elif min_value >= UINT32_MIN and max_value <= UINT32_MAX:
+            elif int_type == IntegerType.UINT32:
                 return sqlalchemy.dialects.mysql.INTEGER(unsigned=True)
-            elif min_value >= INT64_MIN and max_value <= INT64_MAX:
+            elif int_type == IntegerType.INT64:
                 return sqlalchemy.sql.sqltypes.BIGINT()
-            elif min_value >= UINT64_MIN and max_value <= UINT64_MAX:
+            elif int_type == IntegerType.UINT64:
                 return sqlalchemy.dialects.mysql.BIGINT(unsigned=True)
             else:
                 num_digits_min = num_digits(min_value)

diff --git a/records_mover/records/pandas/prep_for_csv.py b/records_mover/records/pandas/prep_for_csv.py
@@ -26,6 +26,8 @@ def _convert_series_or_index(series_or_index: T,
            isinstance(series_or_index[0], datetime.date)):
             logger.info(f"Converting {series_or_index.name} from np.datetime64 to "
                         "string in CSV's format")
+            logger.debug("Dtype is %s, first element type %s", series_or_index.dtype,
+                         type(series_or_index[0]))
             hint_date_format = records_format.hints['dateformat']
             assert isinstance(hint_date_format, str)
             pandas_date_format = python_date_format_from_hints.get(hint_date_format)
@@ -49,6 +51,8 @@ def _convert_series_or_index(series_or_index: T,
         else:
             logger.info(f"Converting {series_or_index.name} from np.datetime64 to string "
                         "in CSV's format")
+            logger.debug("Dtype is %s, first element type %s", series_or_index.dtype,
+                         type(series_or_index[0]))
             hint_time_format = records_format.hints['timeonlyformat']
             assert isinstance(hint_time_format, str)
             pandas_time_format = python_time_format_from_hints.get(hint_time_format)

diff --git a/records_mover/records/schema/field/__init__.py b/records_mover/records/schema/field/__init__.py
@@ -1,16 +1,8 @@
 import datetime
 from ...processing_instructions import ProcessingInstructions
 import logging
-from typing import Optional, Dict, Any, Type, cast, Union, TYPE_CHECKING
-from ....utils.limits import (INT8_MIN, INT8_MAX,
-                              UINT8_MIN, UINT8_MAX,
-                              INT16_MIN, INT16_MAX,
-                              UINT16_MIN, UINT16_MAX,
-                              INT32_MIN, INT32_MAX,
-                              UINT32_MIN, UINT32_MAX,
-                              INT64_MIN, INT64_MAX,
-                              UINT64_MIN, UINT64_MAX,
-                              FLOAT16_SIGNIFICAND_BITS,
+from typing import Optional, Dict, Any, Type, cast, TYPE_CHECKING
+from ....utils.limits import (FLOAT16_SIGNIFICAND_BITS,
                               FLOAT32_SIGNIFICAND_BITS,
                               FLOAT64_SIGNIFICAND_BITS,
                               FLOAT80_SIGNIFICAND_BITS)
@@ -26,6 +18,7 @@
     from sqlalchemy.types import TypeEngine
     from records_mover.db import DBDriver  # noqa
     from .field_types import FieldType
+    from .pandas import Dtype
 
     from mypy_extensions import TypedDict
 
@@ -192,40 +185,41 @@ def components_to_time_str(df: pd.DataFrame) -> datetime.time:
                         return datetime.time(hour=df['hours'],
                                              minute=df['minutes'],
                                              second=df['seconds'])
+                    logger.debug("Applying pd.Timedelta logic on series for %s", self.name)
                     out = series.dt.components.apply(axis=1, func=components_to_time_str)
                     return out
 
-        return series.astype(self.to_numpy_dtype())
+        target_type = self.to_pandas_dtype()
+        logger.debug("Casting field %s from type %r to type %s", self.name, series.dtype,
+                     target_type)
+        return series.astype(target_type)
 
-    def to_numpy_dtype(self) -> Union[Type[Any], str]:
+    def to_pandas_dtype(self) -> 'Dtype':
         import numpy as np
+        import pandas as pd
+        from .pandas import supports_nullable_ints, integer_type_for_range
+
+        has_extension_types = supports_nullable_ints()
 
         if self.field_type == 'integer':
             int_constraints =\
                 cast(Optional[RecordsSchemaFieldIntegerConstraints], self.constraints)
             min_: Optional[int] = None
             max_: Optional[int] = None
+            required = False
             if int_constraints:
                 min_ = int_constraints.min_
                 max_ = int_constraints.max_
+                required = int_constraints.required
+
+            if not required and not has_extension_types:
+                logger.warning(f"Dataframe field {self.name} is nullable, but using pandas "
+                               f"{pd.__version__} which does not support nullable integer type")
 
             if min_ is not None and max_ is not None:
-                if min_ >= INT8_MIN and max_ <= INT8_MAX:
-                    return np.int8
-                elif min_ >= UINT8_MIN and max_ <= UINT8_MAX:
-                    return np.uint8
-                elif min_ >= INT16_MIN and max_ <= INT16_MAX:
-                    return np.int16
-                elif min_ >= UINT16_MIN and max_ <= UINT16_MAX:
-                    return np.uint16
-                elif min_ >= INT32_MIN and max_ <= INT32_MAX:
-                    return np.int32
-                elif min_ >= UINT32_MIN and max_ <= UINT32_MAX:
-                    return np.uint32
-                elif min_ >= INT64_MIN and max_ <= INT64_MAX:
-                    return np.int64
-                elif min_ >= UINT64_MIN and max_ <= UINT64_MAX:
-                    return np.uint64
+                dtype = integer_type_for_range(min_, max_, has_extension_types)
+                if dtype:
+                    return dtype
                 else:
                     logger.warning("Asked for a type larger than int64 in dataframe "
                                    f"field '{self.name}' - providing float128, but "
@@ -235,8 +229,10 @@ def to_numpy_dtype(self) -> Union[Type[Any], str]:
             else:
                 logger.warning(f"No integer constraints provided for field '{self.name}'; "
                                "using int64")
-                return np.int64
-            # return driver.type_for_integer(min_=min_, max_=max_)
+                if has_extension_types:
+                    return pd.Int64Dtype()
+                else:
+                    return np.int64
         elif self.field_type == 'decimal':
             decimal_constraints =\
                 cast(Optional[RecordsSchemaFieldDecimalConstraints], self.constraints)

diff --git a/records_mover/records/schema/field/constraints/constraints.py b/records_mover/records/schema/field/constraints/constraints.py
@@ -204,3 +204,6 @@ def from_numpy_dtype(dtype: 'np.dtype',
 
     def __str__(self) -> str:
         return f"{type(self).__name__}({self.to_data()})"
+
+    def __repr__(self) -> str:
+        return self.__str__()
diff --git a/records_mover/records/schema/field/pandas.py b/records_mover/records/schema/field/pandas.py
@@ -1,13 +1,64 @@
+import pandas as pd
 from pandas import Series, Index
-from typing import Any, Type, TYPE_CHECKING
+from typing import Any, Type, TYPE_CHECKING, Optional, Mapping, Union
 from .statistics import RecordsSchemaFieldStringStatistics
 from ...processing_instructions import ProcessingInstructions
 from .representation import RecordsSchemaFieldRepresentation
+from ....utils.limits import IntegerType
 from .numpy import details_from_numpy_dtype
 import numpy as np
 if TYPE_CHECKING:
     from ..field import RecordsSchemaField  # noqa
     from ..schema import RecordsSchema  # noqa
+    from pandas.core.dtypes.dtypes import ExtensionDtype # noqa
+
+# Cribbed from non-public https://github.com/pandas-dev/pandas/blob/v1.2.1/pandas/_typing.py
+Dtype = Union[
+    "ExtensionDtype", str, np.dtype, Type[Union[str, float, int, complex, bool, object]]
+]
+DtypeObj = Union[np.dtype, "ExtensionDtype"]
+
+
+def supports_nullable_ints() -> bool:
+    """Detects if this version of pandas supports nullable int extension types."""
+    return 'Int64Dtype' in dir(pd)
+
+
+def integer_type_mapping(use_extension_types: bool) -> Mapping[IntegerType, DtypeObj]:
+    if use_extension_types:
+        return {
+            IntegerType.INT8: pd.Int8Dtype(),
+            IntegerType.UINT8: pd.UInt8Dtype(),
+            IntegerType.INT16: pd.Int16Dtype(),
+            IntegerType.UINT16: pd.UInt16Dtype(),
+            IntegerType.INT24: pd.Int32Dtype(),
+            IntegerType.UINT24: pd.Int32Dtype(),
+            IntegerType.INT32: pd.Int32Dtype(),
+            IntegerType.UINT32: pd.UInt32Dtype(),
+            IntegerType.INT64: pd.Int64Dtype(),
+            IntegerType.UINT64: pd.UInt64Dtype(),
+        }
+    else:
+        return {
+            IntegerType.INT8: np.int8,
+            IntegerType.UINT8: np.uint8,
+            IntegerType.INT16: np.int16,
+            IntegerType.UINT16: np.uint16,
+            IntegerType.INT24: np.int32,
+            IntegerType.UINT24: np.uint32,
+            IntegerType.INT32: np.int32,
+            IntegerType.UINT32: np.uint32,
+            IntegerType.INT64: np.int64,
+            IntegerType.UINT64: np.uint64,
+        }
+
+
+def integer_type_for_range(min_: int, max_: int, has_extension_types: bool) -> Optional[DtypeObj]:
+    int_type = IntegerType.smallest_cover_for(min_, max_)
+    if int_type:
+        return integer_type_mapping(has_extension_types).get(int_type)
+    else:
+        return None
 
 
 def field_from_index(index: Index,

diff --git a/records_mover/utils/limits.py b/records_mover/utils/limits.py
@@ -1,4 +1,6 @@
+from enum import Enum
 import math
+from typing import Optional
 
 INT8_MAX = 127
 INT8_MIN = -128
@@ -26,6 +28,37 @@
 FLOAT80_SIGNIFICAND_BITS = 64
 
 
+class IntegerType(Enum):
+    INT8 = (INT8_MIN, INT8_MAX)
+    UINT8 = (UINT8_MIN, UINT8_MAX)
+    INT16 = (INT16_MIN, INT16_MAX)
+    UINT16 = (UINT16_MIN, UINT16_MAX)
+    INT24 = (INT24_MIN, INT24_MAX)
+    UINT24 = (UINT24_MIN, UINT24_MAX)
+    INT32 = (INT32_MIN, INT32_MAX)
+    UINT32 = (UINT32_MIN, UINT32_MAX)
+    INT64 = (INT64_MIN, INT64_MAX)
+    UINT64 = (UINT64_MIN, UINT64_MAX)
+
+    def __init__(self, min_: int, max_: int):
+        self.min_ = min_
+        self.max_ = max_
+
+    def is_cover_for(self, low_value: int, high_value: int) -> bool:
+        return low_value >= self.min_ and high_value <= self.max_
+
+    @property
+    def range(self):
+        return (self.min_, self.max_)
+
+    @classmethod
+    def smallest_cover_for(cls, low_value: int, high_value: int) -> Optional['IntegerType']:
+        for int_type in cls:
+            if int_type.is_cover_for(low_value, high_value):
+                return int_type
+        return None
+
+
 # https://stackoverflow.com/questions/2189800/length-of-an-integer-in-python
 def num_digits(n: int) -> int:
     if n > 0:

diff --git a/setup.cfg b/setup.cfg
@@ -76,3 +76,6 @@ ignore_missing_imports = True
 
 [mypy-pyarrow.*]
 ignore_missing_imports = True
+
+[mypy-nose.*]
+ignore_missing_imports = True