-
Notifications
You must be signed in to change notification settings - Fork 4
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Also downcast constraints and statistics when downcasting field types #103
Changes from all commits
bb772a5
894c053
d835ad1
5a9b723
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
1028 | ||
1049 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
93.4700 | ||
93.5400 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
169 | ||
168 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
92.3100 | ||
92.3300 |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,13 +19,13 @@ | |
RecordsSchemaFieldIntegerConstraints, | ||
RecordsSchemaFieldDecimalConstraints) | ||
from .statistics import RecordsSchemaFieldStatistics | ||
from .types import RECORDS_FIELD_TYPES | ||
from .field_types import RECORDS_FIELD_TYPES | ||
if TYPE_CHECKING: | ||
from pandas import Series, Index | ||
from sqlalchemy import Column | ||
from sqlalchemy.types import TypeEngine | ||
from records_mover.db import DBDriver # noqa | ||
from .types import FieldType | ||
from .field_types import FieldType | ||
|
||
from mypy_extensions import TypedDict | ||
|
||
|
@@ -64,9 +64,9 @@ def __init__(self, | |
def refine_from_series(self, | ||
series: 'Series', | ||
total_rows: int, | ||
rows_sampled: int) -> None: | ||
rows_sampled: int) -> 'RecordsSchemaField': | ||
from .pandas import refine_field_from_series | ||
refine_field_from_series(self, series, total_rows, rows_sampled) | ||
return refine_field_from_series(self, series, total_rows, rows_sampled) | ||
|
||
@staticmethod | ||
def is_more_specific_type(a: 'FieldType', b: 'FieldType') -> bool: | ||
|
@@ -77,6 +77,7 @@ def is_more_specific_type(a: 'FieldType', b: 'FieldType') -> bool: | |
@staticmethod | ||
def python_type_to_field_type(specific_type: Type[Any]) -> Optional['FieldType']: | ||
import numpy as np | ||
import pandas as pd | ||
|
||
# Note: records spec doesn't cover complex number types, so | ||
# np.complex_, complex64 and complex128 are not supported | ||
|
@@ -114,6 +115,10 @@ def python_type_to_field_type(specific_type: Type[Any]) -> Optional['FieldType'] | |
datetime.date: 'date', | ||
|
||
datetime.time: 'time', | ||
|
||
datetime.datetime: 'datetime', | ||
|
||
pd.Timestamp: 'datetime', | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The component test I added revealed a couple of types we could handle a little better when downcasting dataframes. |
||
} | ||
if specific_type not in type_mapping: | ||
logger.warning(f"Please teach me how to map {specific_type} into records " | ||
|
@@ -318,3 +323,19 @@ def convert_datetime_to_datetimetz(self) -> 'RecordsSchemaField': | |
constraints=self.constraints, | ||
statistics=self.statistics, | ||
representations=self.representations) | ||
|
||
def cast(self, field_type: 'FieldType') -> 'RecordsSchemaField': | ||
if self.constraints is None: | ||
constraints = None | ||
else: | ||
constraints = self.constraints.cast(field_type) | ||
if self.statistics is None: | ||
statistics = None | ||
else: | ||
statistics = self.statistics.cast(field_type) | ||
field = RecordsSchemaField(name=self.name, | ||
field_type=field_type, | ||
constraints=constraints, | ||
statistics=statistics, | ||
representations=self.representations) | ||
return field |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
from typing_extensions import Literal | ||
from typing_inspect import get_args | ||
from typing import List | ||
|
||
FieldType = Literal['integer', | ||
'decimal', | ||
'string', | ||
'boolean', | ||
'date', | ||
'time', | ||
'timetz', | ||
'datetime', | ||
'datetimetz'] | ||
|
||
# Be sure to add new things below in FieldType, too | ||
RECORDS_FIELD_TYPES: List[str] = list(get_args(FieldType)) # type: ignore | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I renamed this from I also dropped the guards, as we now include both typing_inspect and typing_extensions as dependencies. |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -44,7 +44,7 @@ def field_from_series(series: Series, | |
def refine_field_from_series(field: 'RecordsSchemaField', | ||
series: Series, | ||
total_rows: int, | ||
rows_sampled: int) -> None: | ||
rows_sampled: int) -> 'RecordsSchemaField': | ||
from ..field import RecordsSchemaField # noqa | ||
# | ||
# if the series is full of object types that aren't numpy | ||
|
@@ -57,7 +57,7 @@ def refine_field_from_series(field: 'RecordsSchemaField', | |
field_type = field.python_type_to_field_type(unique_python_type) | ||
if field_type is not None: | ||
if RecordsSchemaField.is_more_specific_type(field_type, field.field_type): | ||
field.field_type = field_type | ||
field = field.cast(field_type) | ||
|
||
if field.field_type == 'string': | ||
max_column_length = series.astype('str').map(len).max() | ||
|
@@ -70,7 +70,8 @@ def refine_field_from_series(field: 'RecordsSchemaField', | |
if field.statistics is None: | ||
field.statistics = statistics | ||
elif not isinstance(field.statistics, RecordsSchemaFieldStringStatistics): | ||
raise SyntaxError("Did not expect to see existing statistics " | ||
f"for string type: {field.statistics}") | ||
raise ValueError("Did not expect to see existing statistics " | ||
f"for string type: {field.statistics}") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. SyntaxError has some special rules around it; it's documented that the 'filename' field must be set. As a result, when it's thrown out of code, the unit tests blow up when they try to read that field ValueError works fine here, though. |
||
else: | ||
field.statistics.merge(statistics) | ||
return field |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,7 +13,7 @@ | |
from ....db import DBDriver # noqa | ||
from ..field import RecordsSchemaField # noqa | ||
from ..schema import RecordsSchema # noqa | ||
from .types import FieldType # noqa | ||
from .field_types import FieldType # noqa | ||
|
||
|
||
logger = logging.getLogger(__name__) | ||
|
@@ -127,6 +127,10 @@ def field_from_sqlalchemy_column(column: Column, | |
def field_to_sqlalchemy_type(field: 'RecordsSchemaField', | ||
driver: 'DBDriver') -> sqlalchemy.types.TypeEngine: | ||
if field.field_type == 'integer': | ||
if field.constraints and\ | ||
not isinstance(field.constraints, RecordsSchemaFieldIntegerConstraints): | ||
raise ValueError(f"Incorrect constraint type in {field.name}: {field.constraints}") | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We were missing the runtime check before doing a cast() below, which is why folks received a type error in our internal use. |
||
int_constraints =\ | ||
cast(Optional[RecordsSchemaFieldIntegerConstraints], field.constraints) | ||
min_: Optional[int] = None | ||
|
@@ -162,11 +166,11 @@ def field_to_sqlalchemy_type(field: 'RecordsSchemaField', | |
elif field.field_type == 'string': | ||
if field.constraints and\ | ||
not isinstance(field.constraints, RecordsSchemaFieldStringConstraints): | ||
raise SyntaxError(f"Incorrect constraint type: {field.constraints}") | ||
raise ValueError(f"Incorrect constraint type in {field.name}: {field.constraints}") | ||
|
||
if field.statistics and\ | ||
not isinstance(field.statistics, RecordsSchemaFieldStringStatistics): | ||
raise SyntaxError(f"Incorrect statistics type: {field.statistics}") | ||
raise ValueError(f"Incorrect statistics type in {field.name}: {field.statistics}") | ||
|
||
string_constraints =\ | ||
cast(Optional[RecordsSchemaFieldStringConstraints], field.constraints) | ||
|
This file was deleted.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -35,7 +35,10 @@ def schema_from_dataframe(df: DataFrame, | |
def refine_schema_from_dataframe(records_schema: 'RecordsSchema', | ||
df: DataFrame, | ||
processing_instructions: | ||
ProcessingInstructions = ProcessingInstructions()) -> None: | ||
ProcessingInstructions = ProcessingInstructions()) ->\ | ||
'RecordsSchema': | ||
from records_mover.records.schema import RecordsSchema | ||
|
||
max_sample_size = processing_instructions.max_inference_rows | ||
total_rows = len(df.index) | ||
if max_sample_size is not None and max_sample_size < total_rows: | ||
|
@@ -44,8 +47,11 @@ def refine_schema_from_dataframe(records_schema: 'RecordsSchema', | |
sampled_df = df | ||
rows_sampled = len(sampled_df.index) | ||
|
||
for field in records_schema.fields: | ||
series = sampled_df[field.name] | ||
field.refine_from_series(series, | ||
fields = [ | ||
field.refine_from_series(sampled_df[field.name], | ||
total_rows=total_rows, | ||
rows_sampled=rows_sampled) | ||
for field in records_schema.fields | ||
] | ||
return RecordsSchema(fields=fields, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So much immutability. This PR has made my evening. |
||
known_representations=records_schema.known_representations) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This previously mutated the series passed in; it now either returns the original or a new object. Other methods in this chain of calls are changed similarly.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I love immutability.