RM-48 factor to_pandas_dtype into dictionary #233

Merged
2 changes: 1 addition & 1 deletion metrics/flake8_high_water_mark
@@ -1 +1 @@
-5
+4
170 changes: 89 additions & 81 deletions records_mover/records/schema/field/__init__.py
@@ -199,96 +199,104 @@ def components_to_time_str(df: pd.DataFrame) -> datetime.time:
                 target_type)
         return series.astype(target_type)
 
-    def to_pandas_dtype(self) -> 'Dtype':
+    def process_integer_field_type_to_pd_dtype(self) -> 'Dtype':
         import numpy as np
         import pandas as pd
-        from .pandas import supports_nullable_ints, integer_type_for_range
+        from .pandas import integer_type_for_range, supports_nullable_ints
 
         has_extension_types = supports_nullable_ints()
-
-        if self.field_type == 'integer':
-            int_constraints =\
-                cast(Optional[RecordsSchemaFieldIntegerConstraints], self.constraints)
-            min_: Optional[int] = None
-            max_: Optional[int] = None
-            required = False
-            if int_constraints:
-                min_ = int_constraints.min_
-                max_ = int_constraints.max_
-                required = int_constraints.required
-
-            if not required and not has_extension_types:
-                logger.warning(f"Dataframe field {self.name} is nullable, but using pandas "
-                               f"{pd.__version__} which does not support nullable integer type")
-
-            if min_ is not None and max_ is not None:
-                dtype = integer_type_for_range(min_, max_, has_extension_types)
-                if dtype:
-                    return dtype
-                else:
-                    logger.warning("Asked for a type larger than int64 in dataframe "
-                                   f"field '{self.name}' - providing float128, but "
-                                   "loss of precision will occur! "
-                                   f"Requested min/max values: {min_}/{max_}")
-                    return np.float128
-            else:
-                logger.warning(f"No integer constraints provided for field '{self.name}'; "
-                               "using int64")
-                if has_extension_types:
-                    return pd.Int64Dtype()
-                else:
-                    return np.int64
-        elif self.field_type == 'decimal':
-            decimal_constraints =\
-                cast(Optional[RecordsSchemaFieldDecimalConstraints], self.constraints)
-            if decimal_constraints:
-                if (decimal_constraints.fixed_precision is not None and
-                        decimal_constraints.fixed_scale is not None):
-                    logger.warning("Pandas doesn't support a fixed precision type - "
-                                   "using np.float64")
-                    return np.float64
-                elif (decimal_constraints.fp_total_bits is not None and
-                      decimal_constraints.fp_significand_bits is not None):
-                    if (decimal_constraints.fp_total_bits <= 16 and
-                            decimal_constraints.fp_significand_bits <= FLOAT16_SIGNIFICAND_BITS):
-                        return np.float16
-                    elif (decimal_constraints.fp_total_bits <= 32 and
-                          decimal_constraints.fp_significand_bits <= FLOAT32_SIGNIFICAND_BITS):
-                        return np.float32
-                    elif (decimal_constraints.fp_total_bits <= 64 and
-                          decimal_constraints.fp_significand_bits <= FLOAT64_SIGNIFICAND_BITS):
-                        return np.float64
-                    elif (decimal_constraints.fp_total_bits <= 80 and
-                          decimal_constraints.fp_significand_bits <= FLOAT80_SIGNIFICAND_BITS):
-                        return np.float128
-                    else:
-                        logger.warning("Downgrading float type to np.float128. "
-                                       "Requested total bits: "
-                                       f"{decimal_constraints.fp_total_bits}. "
-                                       "Requested significand bits: "
-                                       f"{decimal_constraints.fp_significand_bits}")
-                        return np.float128
-
-            logger.warning(f"No decimal constraints provided for field '{self.name}'; "
-                           "using float64")
-            return np.float64
-        elif self.field_type == 'boolean':
-            return np.bool_
-        elif self.field_type == 'string':
-            return np.object_
-        elif self.field_type == 'date':
-            return np.object_
-        elif self.field_type == 'datetime':
-            return 'datetime64[ns]'
-        elif self.field_type == 'datetimetz':
-            return 'datetime64[ns, UTC]'
-        elif self.field_type == 'time':
-            return np.object_
-        elif self.field_type == 'timetz':
-            return np.object_
-        else:
-            raise NotImplementedError("Teach me how to handle records schema "
-                                      f"type {self.field_type}")
+        int_constraints =\
+            cast(Optional[RecordsSchemaFieldIntegerConstraints], self.constraints)
+        min_: Optional[int] = None
+        max_: Optional[int] = None
+        required = False
+        if int_constraints:
+            min_ = int_constraints.min_
+            max_ = int_constraints.max_
+            required = int_constraints.required
+
+        if not required and not has_extension_types:
+            logger.warning(f"Dataframe field {self.name} is nullable, but using pandas "
+                           f"{pd.__version__} which does not support nullable integer type")
+
+        if min_ is not None and max_ is not None:
+            dtype = integer_type_for_range(min_, max_, has_extension_types)
+            if dtype:
+                return dtype
+            else:
+                logger.warning("Asked for a type larger than int64 in dataframe "
+                               f"field '{self.name}' - providing float128, but "
+                               "loss of precision will occur! "
+                               f"Requested min/max values: {min_}/{max_}")
+                return np.float128
+        else:
+            logger.warning(f"No integer constraints provided for field '{self.name}'; "
+                           "using int64")
+            if has_extension_types:
+                return pd.Int64Dtype()
+            else:
+                return np.int64
+
+    def process_decimal_field_type_to_pd_dtype(self) -> 'Dtype':
+        import numpy as np
+
+        decimal_constraints =\
+            cast(Optional[RecordsSchemaFieldDecimalConstraints], self.constraints)
+        if decimal_constraints:
+            if (decimal_constraints.fixed_precision is not None and
+                    decimal_constraints.fixed_scale is not None):
+                logger.warning("Pandas doesn't support a fixed precision type - "
+                               "using np.float64")
+                return np.float64
+            elif (decimal_constraints.fp_total_bits is not None and
+                  decimal_constraints.fp_significand_bits is not None):
+                if (decimal_constraints.fp_total_bits <= 16 and
+                        decimal_constraints.fp_significand_bits <= FLOAT16_SIGNIFICAND_BITS):
+                    return np.float16
+                elif (decimal_constraints.fp_total_bits <= 32 and
+                      decimal_constraints.fp_significand_bits <= FLOAT32_SIGNIFICAND_BITS):
+                    return np.float32
+                elif (decimal_constraints.fp_total_bits <= 64 and
+                      decimal_constraints.fp_significand_bits <= FLOAT64_SIGNIFICAND_BITS):
+                    return np.float64
+                elif (decimal_constraints.fp_total_bits <= 80 and
+                      decimal_constraints.fp_significand_bits <= FLOAT80_SIGNIFICAND_BITS):
+                    return np.float128
+                else:
+                    logger.warning("Downgrading float type to np.float128. "
+                                   "Requested total bits: "
+                                   f"{decimal_constraints.fp_total_bits}. "
+                                   "Requested significand bits: "
+                                   f"{decimal_constraints.fp_significand_bits}")
+                    return np.float128
+
+        logger.warning(f"No decimal constraints provided for field '{self.name}'; "
+                       "using float64")
+        return np.float64
+
+    def to_pandas_dtype(self) -> 'Dtype':
+        import numpy as np
+
+        field_type_to_pd_dtype_map = {
+            'integer': self.process_integer_field_type_to_pd_dtype,
+            'decimal': self.process_decimal_field_type_to_pd_dtype,
+            'boolean': np.bool_,
+            'string': np.object_,
+            'date': np.object_,
+            'datetime': 'datetime64[ns]',
+            'datetimetz': 'datetime64[ns, UTC]',
+            'time': np.object_,
+            'timetz': np.object_,
+        }
+
+        pd_dtype = field_type_to_pd_dtype_map.get(self.field_type)
+        if pd_dtype is None:
+            raise NotImplementedError("Teach me how to handle records schema "
+                                      f"type {self.field_type}")
+        if self.field_type in ('integer', 'decimal') and callable(pd_dtype):
Collaborator

kind of weird to make the map but then have to special case certain types again anyway.

Contributor Author

Yeah... the other option was to have everything be a lambda again which was kind of a drag. Open to talking through ideas here on Monday!

+            pd_dtype = pd_dtype()
+
+        return pd_dtype
 
     def to_data(self) -> 'FieldDict':
         out: 'FieldDict' = {'type': self.field_type}
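For reference, here is a minimal sketch of the all-lambda alternative mentioned in the review thread above (hypothetical, not what was merged): every value in the map is a zero-argument callable, so the lookup needs no callable() special case, at the cost of wrapping each constant dtype in a lambda.

    def to_pandas_dtype(self) -> 'Dtype':
        import numpy as np

        # Hypothetical all-lambda variant (not part of this PR): every entry
        # is a zero-arg callable, so no field type needs to be special-cased
        # at lookup time.
        field_type_to_pd_dtype_map = {
            'integer': self.process_integer_field_type_to_pd_dtype,
            'decimal': self.process_decimal_field_type_to_pd_dtype,
            'boolean': lambda: np.bool_,
            'string': lambda: np.object_,
            'date': lambda: np.object_,
            'datetime': lambda: 'datetime64[ns]',
            'datetimetz': lambda: 'datetime64[ns, UTC]',
            'time': lambda: np.object_,
            'timetz': lambda: np.object_,
        }

        make_dtype = field_type_to_pd_dtype_map.get(self.field_type)
        if make_dtype is None:
            raise NotImplementedError("Teach me how to handle records schema "
                                      f"type {self.field_type}")
        return make_dtype()

The constant entries each pay a lambda wrapper just to keep the lookup uniform, which is the "kind of a drag" trade-off the author describes.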
2 changes: 1 addition & 1 deletion setup.py
@@ -196,7 +196,7 @@ def initialize_options(self) -> None:

 redshift_dependencies_base = [
     # sqlalchemy-redshift 0.7.7 introduced support for Parquet in UNLOAD
-    'sqlalchemy-redshift>=0.7.7',
+    'sqlalchemy-redshift>=0.7.7,<0.8.13',
 ] + aws_dependencies + db_dependencies

 redshift_dependencies_binary = [