diff --git a/metrics/coverage_high_water_mark b/metrics/coverage_high_water_mark index 562ef5f2a..81b831cb5 100644 --- a/metrics/coverage_high_water_mark +++ b/metrics/coverage_high_water_mark @@ -1 +1 @@ -94.0400 \ No newline at end of file +93.6800 \ No newline at end of file diff --git a/metrics/mypy_high_water_mark b/metrics/mypy_high_water_mark index 2c4488210..7e2455721 100644 --- a/metrics/mypy_high_water_mark +++ b/metrics/mypy_high_water_mark @@ -1 +1 @@ -92.2700 \ No newline at end of file +92.1800 \ No newline at end of file diff --git a/records_mover/cli/job_config_schema_as_args_parser.py b/records_mover/cli/job_config_schema_as_args_parser.py index 0e997206c..3bd7e1ae3 100644 --- a/records_mover/cli/job_config_schema_as_args_parser.py +++ b/records_mover/cli/job_config_schema_as_args_parser.py @@ -146,12 +146,34 @@ def configure_from_single_property(self, return # no immediate argument to add elif 'type' in value and value['type'] == 'boolean': # https://stackoverflow.com/questions/9183936/boolean-argument-for-script - if 'default' in value and value['default'] is True: - # if --no_foo is specified, set foo variable to False - arg_name = "--no_" + formatted_key_name - kwargs['action'] = 'store_false' + if 'default' in value: + if value['default'] is True: + # if --no_foo is specified, set foo variable to False + arg_name = "--no_" + formatted_key_name + kwargs['action'] = 'store_false' + else: + kwargs['action'] = 'store_true' else: - kwargs['action'] = 'store_true' + if key in required_keys: + raise NotImplementedError("Teach me how to handle a required boolean " + "with no default") + else: + # the regular --foo will set this to True: + kwargs['action'] = 'store_const' + # store_true sets a default, so we use store_const + kwargs['const'] = True + + # and we'll add an arg that maps to False for this key: + false_arg_kwargs: ArgParseArgument = {} + # if --no_foo is specified, set foo variable to False + split_key_name = formatted_key_name.split('.') + false_arg_arg_name = '--' + '.'.join([*split_key_name[:-1], + 'no_' + split_key_name[-1]]) + false_arg_kwargs['action'] = 'store_const' + false_arg_kwargs['const'] = False + false_arg_kwargs['dest'] = key + false_arg_kwargs['required'] = False + self.arg_parser.add_argument(false_arg_arg_name, **false_arg_kwargs) else: raise Exception("Did not know how to handle key " + key + " and value " + str(value)) diff --git a/records_mover/db/bigquery/load_job_config_options.py b/records_mover/db/bigquery/load_job_config_options.py index 3f796fa1a..1d77b5915 100644 --- a/records_mover/db/bigquery/load_job_config_options.py +++ b/records_mover/db/bigquery/load_job_config_options.py @@ -3,7 +3,8 @@ from typing import Set from ...records.load_plan import RecordsLoadPlan from ...records.records_format import DelimitedRecordsFormat, ParquetRecordsFormat -from records_mover.records import RecordsHints +from records_mover.records.delimited import ValidatedRecordsHints +from records_mover.mover_types import _assert_never from google.cloud.bigquery.job import CreateDisposition, WriteDisposition from google.cloud import bigquery import logging @@ -104,10 +105,11 @@ def load_job_config(unhandled_hints: Set[str], # schema. ALLOW_FIELD_RELAXATION: allow relaxing a required # field in the original schema to nullable. config.schema_update_options = None - + fail_if_cant_handle_hint = load_plan.processing_instructions.fail_if_cant_handle_hint if isinstance(load_plan.records_format, DelimitedRecordsFormat): + hints = load_plan.records_format.validate(fail_if_cant_handle_hint=fail_if_cant_handle_hint) add_load_job_csv_config(unhandled_hints, - load_plan.records_format.hints, + hints, fail_if_cant_handle_hint, config) return config @@ -121,7 +123,7 @@ def load_job_config(unhandled_hints: Set[str], def add_load_job_csv_config(unhandled_hints: Set[str], - hints: RecordsHints, + hints: ValidatedRecordsHints, fail_if_cant_handle_hint: bool, config: bigquery.LoadJobConfig) -> None: # source_format: File format of the data. @@ -131,7 +133,7 @@ def add_load_job_csv_config(unhandled_hints: Set[str], # The supported values are UTF-8 or ISO-8859-1. # "UTF-8 or ISO-8859-1" # - if hints['encoding'] == 'UTF8': + if hints.encoding == 'UTF8': config.encoding = 'UTF-8' else: # Currently records hints don't support ISO-8859-1 @@ -139,8 +141,8 @@ def add_load_job_csv_config(unhandled_hints: Set[str], quiet_remove(unhandled_hints, 'encoding') # field_delimiter: The separator for fields in a CSV file. - assert isinstance(hints['field-delimiter'], str) - config.field_delimiter = hints['field-delimiter'] + assert isinstance(hints.field_delimiter, str) + config.field_delimiter = hints.field_delimiter quiet_remove(unhandled_hints, 'field-delimiter') # allow_jagged_rows: Allow missing trailing optional columns (CSV only). @@ -174,37 +176,37 @@ def add_load_job_csv_config(unhandled_hints: Set[str], # * nonnumeric quoting works fine # * full quoting works fine - if hints['quoting'] is None: + if hints.quoting is None: config.quote_character = '' - elif hints['quoting'] in ['all', 'minimal', 'nonnumeric']: + elif hints.quoting == 'all' or hints.quoting == 'minimal' or hints.quoting == 'nonnumeric': # allow_quoted_newlines: Allow quoted data containing newline # characters (CSV only). config.allow_quoted_newlines = True - assert isinstance(hints['quotechar'], str) - config.quote_character = hints['quotechar'] - if hints['doublequote']: + assert isinstance(hints.quotechar, str) + config.quote_character = hints.quotechar + if hints.doublequote: pass else: cant_handle_hint(fail_if_cant_handle_hint, 'doublequote', hints) else: - cant_handle_hint(fail_if_cant_handle_hint, 'quoting', hints) + _assert_never(hints.quoting) quiet_remove(unhandled_hints, 'quoting') quiet_remove(unhandled_hints, 'quotechar') quiet_remove(unhandled_hints, 'doublequote') # No mention of escaping in BigQuery documentation, and in # practice backslashes come through without being interpreted. - if hints['escape'] is None: + if hints.escape is None: pass else: cant_handle_hint(fail_if_cant_handle_hint, 'escape', hints) quiet_remove(unhandled_hints, 'escape') # skip_leading_rows: Number of rows to skip when reading data (CSV only). - if hints['header-row']: + if hints.header_row: config.skip_leading_rows = 1 else: config.skip_leading_rows = 0 @@ -213,7 +215,7 @@ def add_load_job_csv_config(unhandled_hints: Set[str], # "When you load CSV or JSON data, values in DATE columns must # use the dash (-) separator and the date must be in the # following format: YYYY-MM-DD (year-month-day)." - if hints['dateformat'] == 'YYYY-MM-DD': + if hints.dateformat == 'YYYY-MM-DD': pass else: cant_handle_hint(fail_if_cant_handle_hint, 'dateformat', hints) @@ -307,20 +309,20 @@ def add_load_job_csv_config(unhandled_hints: Set[str], # https://stackoverflow.com/questions/44836581/does-python-time-strftime-process-timezone-options-correctly-for-rfc-3339 # https://stackoverflow.com/questions/28729212/pandas-save-date-in-iso-format # - if hints['datetimeformat'] in ['YYYY-MM-DD HH24:MI:SS', 'YYYY-MM-DD HH:MI:SS']: + if hints.datetimeformat in ['YYYY-MM-DD HH24:MI:SS', 'YYYY-MM-DD HH:MI:SS']: pass else: cant_handle_hint(fail_if_cant_handle_hint, 'datetimeformat', hints) quiet_remove(unhandled_hints, 'datetimeformat') - if hints['datetimeformattz'] in ['YYYY-MM-DD HH:MI:SSOF', - 'YYYY-MM-DD HH:MI:SS']: + if hints.datetimeformattz in ['YYYY-MM-DD HH:MI:SSOF', + 'YYYY-MM-DD HH:MI:SS']: pass else: cant_handle_hint(fail_if_cant_handle_hint, 'datetimeformattz', hints) quiet_remove(unhandled_hints, 'datetimeformattz') - if hints['timeonlyformat'] == 'HH24:MI:SS': + if hints.timeonlyformat == 'HH24:MI:SS': pass else: cant_handle_hint(fail_if_cant_handle_hint, 'timeonlyformat', hints) @@ -328,7 +330,7 @@ def add_load_job_csv_config(unhandled_hints: Set[str], # No options to change this. Tested with unix newlines, dos # newlines and mac newlines and all were understood.: - if hints['record-terminator'] in ['\n', '\r\n', '\r', None]: + if hints.record_terminator in ['\n', '\r\n', '\r', None]: pass else: cant_handle_hint(fail_if_cant_handle_hint, 'record-terminator', hints) @@ -338,7 +340,7 @@ def add_load_job_csv_config(unhandled_hints: Set[str], # gzip and works great. .bz2 gives "400 Unsupported # compression type". Not sure about .lzo, but pandas can't # handle it regardless, so doubt it's handled. - if hints['compression'] is None or hints['compression'] == 'GZIP': + if hints.compression is None or hints.compression == 'GZIP': pass else: cant_handle_hint(fail_if_cant_handle_hint, 'compression', hints) diff --git a/records_mover/db/postgres/copy_options/__init__.py b/records_mover/db/postgres/copy_options/__init__.py index 47611d44c..e187a791d 100644 --- a/records_mover/db/postgres/copy_options/__init__.py +++ b/records_mover/db/postgres/copy_options/__init__.py @@ -1,5 +1,5 @@ from records_mover.records.load_plan import RecordsLoadPlan -from records_mover.records.delimited import RecordsHints +from records_mover.records.delimited import ValidatedRecordsHints from records_mover.records.records_format import DelimitedRecordsFormat import logging from typing import Set, Tuple, Optional @@ -16,7 +16,7 @@ # https://www.postgresql.org/docs/9.2/sql-copy.html -def needs_csv_format(hints: RecordsHints) -> bool: +def needs_csv_format(hints: ValidatedRecordsHints) -> bool: # This format option is used for importing and exporting the Comma # Separated Value (CSV) file format used by many other programs, # such as spreadsheets. Instead of the escaping rules used by @@ -31,7 +31,7 @@ def needs_csv_format(hints: RecordsHints) -> bool: # QUOTE character or the ESCAPE character is preceded by the # escape character. You can also use FORCE_QUOTE to force quotes # when outputting non-NULL values in specific columns. - if hints['header-row'] or (hints['quoting'] is not None): + if hints.header_row or (hints.quoting is not None): return True return False @@ -44,7 +44,7 @@ def postgres_copy_to_options(unhandled_hints: Set[str], Tuple[DateOutputStyle, Optional[DateOrderStyle], PostgresCopyOptions]: - hints = delimited_records_format.hints + hints = delimited_records_format.validate(fail_if_cant_handle_hint=fail_if_cant_handle_hint) if needs_csv_format(hints): copy_options = postgres_copy_options_csv(unhandled_hints, @@ -74,7 +74,7 @@ def postgres_copy_from_options(unhandled_hints: Set[str], if not isinstance(load_plan.records_format, DelimitedRecordsFormat): raise NotImplementedError("Not currently able to import " f"{load_plan.records_format.format_type}") - hints = load_plan.records_format.hints + hints = load_plan.records_format.validate(fail_if_cant_handle_hint=fail_if_cant_handle_hint) if needs_csv_format(hints): postgres_copy_options = postgres_copy_options_csv(unhandled_hints, diff --git a/records_mover/db/postgres/copy_options/common.py b/records_mover/db/postgres/copy_options/common.py index c92adef1f..5274d0099 100644 --- a/records_mover/db/postgres/copy_options/common.py +++ b/records_mover/db/postgres/copy_options/common.py @@ -1,5 +1,5 @@ from records_mover.utils import quiet_remove -from records_mover.records.delimited import cant_handle_hint, RecordsHints +from records_mover.records.delimited import cant_handle_hint, ValidatedRecordsHints from typing import Set from .types import PostgresCopyOptions @@ -15,7 +15,7 @@ def postgres_copy_options_common(unhandled_hints: Set[str], - hints: RecordsHints, + hints: ValidatedRecordsHints, fail_if_cant_handle_hint: bool, original_postgres_options: PostgresCopyOptions) ->\ PostgresCopyOptions: @@ -27,7 +27,7 @@ def postgres_copy_options_common(unhandled_hints: Set[str], # this option is omitted, the current client encoding is # used. See the Notes below for more details. - encoding_hint: str = hints['encoding'] # type: ignore + encoding_hint = hints.encoding if encoding_hint in postgres_encoding_names: postgres_options['encoding'] = postgres_encoding_names[encoding_hint] quiet_remove(unhandled_hints, 'encoding') @@ -51,7 +51,7 @@ def postgres_copy_options_common(unhandled_hints: Set[str], # is ignored. This option is allowed only when using CSV format. # quiet_remove(unhandled_hints, 'header-row') - postgres_options['header'] = hints['header-row'] + postgres_options['header'] = hints.header_row # OIDS # @@ -67,7 +67,7 @@ def postgres_copy_options_common(unhandled_hints: Set[str], # format, a comma in CSV format. This must be a single one-byte # character. This option is not allowed when using binary format. # - postgres_options['delimiter'] = hints['field-delimiter'] + postgres_options['delimiter'] = hints.field_delimiter quiet_remove(unhandled_hints, 'field-delimiter') return postgres_options diff --git a/records_mover/db/postgres/copy_options/csv.py b/records_mover/db/postgres/copy_options/csv.py index 1ec5f3a61..efbc19c1a 100644 --- a/records_mover/db/postgres/copy_options/csv.py +++ b/records_mover/db/postgres/copy_options/csv.py @@ -1,5 +1,5 @@ from records_mover.utils import quiet_remove -from records_mover.records.delimited import cant_handle_hint, RecordsHints +from records_mover.records.delimited import cant_handle_hint, ValidatedRecordsHints from typing import Set from .mode import CopyOptionsMode from .common import postgres_copy_options_common @@ -7,7 +7,7 @@ def postgres_copy_options_csv(unhandled_hints: Set[str], - hints: RecordsHints, + hints: ValidatedRecordsHints, fail_if_cant_handle_hint: bool, mode: CopyOptionsModeType) ->\ PostgresCopyOptions: @@ -41,7 +41,7 @@ def postgres_copy_options_csv(unhandled_hints: Set[str], # format. # - postgres_options['quote'] = hints['quotechar'] + postgres_options['quote'] = hints.quotechar quiet_remove(unhandled_hints, 'quotechar') # ESCAPE @@ -53,12 +53,12 @@ def postgres_copy_options_csv(unhandled_hints: Set[str], # character. This option is allowed only when using CSV format. # - if not hints['doublequote']: + if not hints.doublequote: cant_handle_hint(fail_if_cant_handle_hint, 'doublequote', hints) else: quiet_remove(unhandled_hints, 'doublequote') - if hints['escape'] is not None: + if hints.escape is not None: cant_handle_hint(fail_if_cant_handle_hint, 'escape', hints) else: quiet_remove(unhandled_hints, 'escape') @@ -71,7 +71,7 @@ def postgres_copy_options_csv(unhandled_hints: Set[str], # option is allowed only in COPY TO, and only when using CSV # format. if mode is CopyOptionsMode.LOADING: - if hints['quoting'] != 'minimal': + if hints.quoting != 'minimal': cant_handle_hint(fail_if_cant_handle_hint, 'quoting', hints) else: quiet_remove(unhandled_hints, 'quoting') @@ -86,9 +86,9 @@ def postgres_copy_options_csv(unhandled_hints: Set[str], # FORCE_QUOTE to force quotes when outputting non-NULL values # in specific columns. - if hints['quoting'] == 'minimal': + if hints.quoting == 'minimal': pass # default - elif hints['quoting'] == 'all': + elif hints.quoting == 'all': postgres_options['force_quote'] = '*' else: cant_handle_hint(fail_if_cant_handle_hint, 'quoting', hints) @@ -126,21 +126,21 @@ def postgres_copy_options_csv(unhandled_hints: Set[str], # dockerized-postgres public bigqueryformat # loads fine if mode is CopyOptionsMode.LOADING: - if hints['record-terminator'] in ("\n", "\r\n", "\r", None): + if hints.record_terminator in ("\n", "\r\n", "\r", None): quiet_remove(unhandled_hints, 'record-terminator') else: cant_handle_hint(fail_if_cant_handle_hint, 'records-terminator', hints) elif mode is CopyOptionsMode.UNLOADING: # No control for this is given - exports appear with unix # newlines. - if hints['record-terminator'] == "\n": + if hints.record_terminator == "\n": quiet_remove(unhandled_hints, 'record-terminator') else: cant_handle_hint(fail_if_cant_handle_hint, 'records-terminator', hints) else: _assert_never(mode) - if hints['compression'] is not None: + if hints.compression is not None: cant_handle_hint(fail_if_cant_handle_hint, 'compression', hints) else: quiet_remove(unhandled_hints, 'compression') diff --git a/records_mover/db/postgres/copy_options/date_input_style.py b/records_mover/db/postgres/copy_options/date_input_style.py index 7c16eeaec..b798670bd 100644 --- a/records_mover/db/postgres/copy_options/date_input_style.py +++ b/records_mover/db/postgres/copy_options/date_input_style.py @@ -1,11 +1,11 @@ from records_mover.utils import quiet_remove -from records_mover.records.delimited import cant_handle_hint, RecordsHints +from records_mover.records.delimited import cant_handle_hint, ValidatedRecordsHints from typing import Optional, Set from .types import DateOrderStyle def determine_input_date_order_style(unhandled_hints: Set[str], - hints: RecordsHints, + hints: ValidatedRecordsHints, fail_if_cant_handle_hint: bool) ->\ Optional[DateOrderStyle]: date_order_style: Optional[DateOrderStyle] = None @@ -39,7 +39,7 @@ def upgrade_date_order_style(style: DateOrderStyle, hint_name: str) -> None: # # postgres=# - datetimeformattz = hints['datetimeformattz'] + datetimeformattz = hints.datetimeformattz # datetimeformattz: Valid values: "YYYY-MM-DD HH:MI:SSOF", # "YYYY-MM-DD HH:MI:SS", "YYYY-MM-DD HH24:MI:SSOF", "YYYY-MM-DD @@ -94,7 +94,7 @@ def upgrade_date_order_style(style: DateOrderStyle, hint_name: str) -> None: # "YYYY-MM-DD HH12:MI AM", "MM/DD/YY HH24:MI". See Redshift docs # for more information. - datetimeformat = hints['datetimeformat'] + datetimeformat = hints.datetimeformat if datetimeformat in ("YYYY-MM-DD HH24:MI:SS", "YYYY-MM-DD HH:MI:SS"): @@ -135,7 +135,7 @@ def upgrade_date_order_style(style: DateOrderStyle, hint_name: str) -> None: else: cant_handle_hint(fail_if_cant_handle_hint, 'datetimeformat', hints) - timeonlyformat = hints['timeonlyformat'] + timeonlyformat = hints.timeonlyformat # timeonlyformat: Valid values: "HH12:MI AM" (e.g., "1:00 PM"), # "HH24:MI:SS" (e.g., "13:00:00") @@ -171,7 +171,7 @@ def upgrade_date_order_style(style: DateOrderStyle, hint_name: str) -> None: cant_handle_hint(fail_if_cant_handle_hint, 'datetimeformat', hints) # dateformat: Valid values: null, "YYYY-MM-DD", "MM-DD-YYYY", "DD-MM-YYYY", "MM/DD/YY". - dateformat = hints['dateformat'] + dateformat = hints.dateformat if dateformat == "YYYY-MM-DD": # postgres=# select date '1999-01-02'; diff --git a/records_mover/db/postgres/copy_options/date_output_style.py b/records_mover/db/postgres/copy_options/date_output_style.py index fd74b3a6f..b85917455 100644 --- a/records_mover/db/postgres/copy_options/date_output_style.py +++ b/records_mover/db/postgres/copy_options/date_output_style.py @@ -1,20 +1,20 @@ from records_mover.utils import quiet_remove -from records_mover.records.delimited import cant_handle_hint, RecordsHints +from records_mover.records.delimited import cant_handle_hint, ValidatedRecordsHints from typing import Set, Tuple, Optional from .types import DateOrderStyle, DateOutputStyle def determine_date_output_style(unhandled_hints: Set[str], - hints: RecordsHints, + hints: ValidatedRecordsHints, fail_if_cant_handle_hint: bool) -> \ Tuple[DateOutputStyle, Optional[DateOrderStyle]]: # see docs in the types module - dateformat = hints['dateformat'] - timeonlyformat = hints['timeonlyformat'] - datetimeformattz = hints['datetimeformattz'] - datetimeformat = hints['datetimeformat'] + dateformat = hints.dateformat + timeonlyformat = hints.timeonlyformat + datetimeformattz = hints.datetimeformattz + datetimeformat = hints.datetimeformat date_order_style: Optional[DateOrderStyle] = None diff --git a/records_mover/db/postgres/copy_options/text.py b/records_mover/db/postgres/copy_options/text.py index 57f282bdc..8ff273784 100644 --- a/records_mover/db/postgres/copy_options/text.py +++ b/records_mover/db/postgres/copy_options/text.py @@ -1,5 +1,5 @@ from records_mover.utils import quiet_remove -from records_mover.records.delimited import cant_handle_hint, RecordsHints +from records_mover.records.delimited import cant_handle_hint, ValidatedRecordsHints from typing import Set from .mode import CopyOptionsMode from .types import PostgresCopyOptions, CopyOptionsModeType, _assert_never @@ -7,7 +7,7 @@ def postgres_copy_options_text(unhandled_hints: Set[str], - hints: RecordsHints, + hints: ValidatedRecordsHints, fail_if_cant_handle_hint: bool, mode: CopyOptionsModeType) ->\ PostgresCopyOptions: @@ -26,7 +26,7 @@ def postgres_copy_options_text(unhandled_hints: Set[str], # preceded by a backslash if they appear as part of a column # value: backslash itself, newline, carriage return, and the # current delimiter character. - if hints['escape'] is None: + if hints.escape is None: cant_handle_hint(fail_if_cant_handle_hint, 'escape', hints) else: quiet_remove(unhandled_hints, 'escape') @@ -64,7 +64,7 @@ def postgres_copy_options_text(unhandled_hints: Set[str], # character. This option is allowed only when using CSV format. # - if hints['doublequote']: + if hints.doublequote: cant_handle_hint(fail_if_cant_handle_hint, 'doublequote', hints) else: quiet_remove(unhandled_hints, 'doublequote') @@ -78,7 +78,7 @@ def postgres_copy_options_text(unhandled_hints: Set[str], # format. # - if hints['quoting'] is not None: + if hints.quoting is not None: cant_handle_hint(fail_if_cant_handle_hint, 'quoting', hints) else: quiet_remove(unhandled_hints, 'quoting') @@ -105,21 +105,21 @@ def postgres_copy_options_text(unhandled_hints: Set[str], # are not all alike. if mode is CopyOptionsMode.LOADING: - if hints['record-terminator'] in ["\n", "\r", "\r\n"]: + if hints.record_terminator in ["\n", "\r", "\r\n"]: quiet_remove(unhandled_hints, 'record-terminator') else: cant_handle_hint(fail_if_cant_handle_hint, 'record-terminator', hints) elif mode is CopyOptionsMode.UNLOADING: # No control for this is given - exports appear with unix # newlines. - if hints['record-terminator'] == "\n": + if hints.record_terminator == "\n": quiet_remove(unhandled_hints, 'record-terminator') else: cant_handle_hint(fail_if_cant_handle_hint, 'record-terminator', hints) else: _assert_never(mode) - if hints['compression'] is not None: + if hints.compression is not None: cant_handle_hint(fail_if_cant_handle_hint, 'compression', hints) else: quiet_remove(unhandled_hints, 'compression') diff --git a/records_mover/db/redshift/loader.py b/records_mover/db/redshift/loader.py index 9e06920ef..81a846a67 100644 --- a/records_mover/db/redshift/loader.py +++ b/records_mover/db/redshift/loader.py @@ -49,7 +49,10 @@ def load(self, to = Table(table, self.meta, schema=schema) # no autoload unhandled_hints = set(load_plan.records_format.hints.keys()) processing_instructions = load_plan.processing_instructions - redshift_options = redshift_copy_options(unhandled_hints, load_plan.records_format.hints, + validated_hints = load_plan.records_format.\ + validate(fail_if_cant_handle_hint=processing_instructions.fail_if_cant_handle_hint) + redshift_options = redshift_copy_options(unhandled_hints, + validated_hints, processing_instructions.fail_if_cant_handle_hint, processing_instructions.fail_if_row_invalid, processing_instructions.max_failure_rows) @@ -106,7 +109,10 @@ def can_load_this_format(self, source_records_format: BaseRecordsFormat) -> bool return False unhandled_hints = set(load_plan.records_format.hints.keys()) processing_instructions = load_plan.processing_instructions - redshift_copy_options(unhandled_hints, load_plan.records_format.hints, + hints = load_plan.records_format.\ + validate(fail_if_cant_handle_hint=processing_instructions.fail_if_cant_handle_hint) + redshift_copy_options(unhandled_hints, + hints, processing_instructions.fail_if_cant_handle_hint, processing_instructions.fail_if_row_invalid, processing_instructions.max_failure_rows) diff --git a/records_mover/db/redshift/records_copy.py b/records_mover/db/redshift/records_copy.py index 8572da8d7..3e5cbe307 100644 --- a/records_mover/db/redshift/records_copy.py +++ b/records_mover/db/redshift/records_copy.py @@ -1,71 +1,71 @@ from ...utils import quiet_remove -from ...records.delimited import cant_handle_hint +from ...records.delimited import cant_handle_hint, ValidatedRecordsHints +from records_mover.mover_types import _assert_never from sqlalchemy_redshift.commands import Format, Encoding, Compression from typing import Dict, Optional, Set -from ...records import RecordsHints RedshiftCopyOptions = Dict[str, Optional[object]] def redshift_copy_options(unhandled_hints: Set[str], - hints: RecordsHints, + hints: ValidatedRecordsHints, fail_if_cant_handle_hint: bool, fail_if_row_invalid: bool, max_failure_rows: Optional[int]) -> RedshiftCopyOptions: redshift_options: RedshiftCopyOptions = {} - if hints['compression'] == 'GZIP': + if hints.compression == 'GZIP': redshift_options['compression'] = Compression.gzip - elif hints['compression'] == 'LZO': + elif hints.compression == 'LZO': redshift_options['compression'] = Compression.lzop - elif hints['compression'] == 'BZIP': + elif hints.compression == 'BZIP': redshift_options['compression'] = Compression.bzip2 + elif hints.compression is None: + redshift_options['compression'] = None else: - cant_handle_hint(fail_if_cant_handle_hint, 'compression', hints) - redshift_options['compression'] = Compression(hints['compression']) + _assert_never(hints.compression) quiet_remove(unhandled_hints, 'compression') - if hints['dateformat'] is None: + if hints.dateformat is None: redshift_options['date_format'] = 'auto' else: - redshift_options['date_format'] = hints['dateformat'] + redshift_options['date_format'] = hints.dateformat quiet_remove(unhandled_hints, 'dateformat') - if hints['encoding'] == 'UTF8': + if hints.encoding == 'UTF8': redshift_options['encoding'] = Encoding.utf8 - elif hints['encoding'] == 'UTF16': + elif hints.encoding == 'UTF16': redshift_options['encoding'] = Encoding.utf16 - elif hints['encoding'] == 'UTF16LE': + elif hints.encoding == 'UTF16LE': redshift_options['encoding'] = Encoding.utf16le - elif hints['encoding'] == 'UTF16BE': + elif hints.encoding == 'UTF16BE': redshift_options['encoding'] = Encoding.utf16be else: cant_handle_hint(fail_if_cant_handle_hint, 'encoding', hints) - redshift_options['encoding'] = Encoding(hints['encoding']) + redshift_options['encoding'] = Encoding(hints.encoding) quiet_remove(unhandled_hints, 'encoding') - redshift_options['quote'] = hints['quotechar'] + redshift_options['quote'] = hints.quotechar quiet_remove(unhandled_hints, 'quotechar') - if hints['quoting'] == 'minimal': - if hints['escape'] is not None: + if hints.quoting == 'minimal': + if hints.escape is not None: cant_handle_hint(fail_if_cant_handle_hint, 'escape', hints) - if hints['field-delimiter'] != ',': + if hints.field_delimiter != ',': cant_handle_hint(fail_if_cant_handle_hint, 'field-delimiter', hints) - if hints['doublequote'] is not True: + if hints.doublequote is not True: cant_handle_hint(fail_if_cant_handle_hint, 'doublequote', hints) redshift_options['format'] = Format.csv else: - redshift_options['delimiter'] = hints['field-delimiter'] - if hints['escape'] == '\\': + redshift_options['delimiter'] = hints.field_delimiter + if hints.escape == '\\': redshift_options['escape'] = True - elif hints['escape'] is None: + elif hints.escape is None: redshift_options['escape'] = False else: - cant_handle_hint(fail_if_cant_handle_hint, 'escape', hints) - redshift_options['escape'] = False - if hints['quoting'] == 'all': + _assert_never(hints.escape) + if hints.quoting == 'all': redshift_options['remove_quotes'] = True - if hints['doublequote'] is not False: + if hints.doublequote is not False: cant_handle_hint(fail_if_cant_handle_hint, 'doublequote', hints) - elif hints['quoting'] is None: + elif hints.quoting is None: redshift_options['remove_quotes'] = False else: cant_handle_hint(fail_if_cant_handle_hint, 'quoting', hints) @@ -73,7 +73,7 @@ def redshift_copy_options(unhandled_hints: Set[str], quiet_remove(unhandled_hints, 'escape') quiet_remove(unhandled_hints, 'field-delimiter') quiet_remove(unhandled_hints, 'doublequote') - if hints['datetimeformat'] is None: + if hints.datetimeformat is None: redshift_options['time_format'] = 'auto' else: # After testing, Redshift's date/time parsing doesn't actually @@ -87,7 +87,7 @@ def redshift_copy_options(unhandled_hints: Set[str], # likely to handle a variety of formats: # # https://docs.aws.amazon.com/redshift/latest/dg/automatic-recognition.html - if hints['datetimeformat'] != hints['datetimeformattz']: + if hints.datetimeformat != hints.datetimeformattz: # The Redshift auto parser seems to take a good handling # at our various supported formats, so let's give it a # shot if we're not able to specify a specific format due @@ -117,7 +117,7 @@ def redshift_copy_options(unhandled_hints: Set[str], # analytics=> redshift_options['time_format'] = 'auto' else: - redshift_options['time_format'] = hints['datetimeformat'] + redshift_options['time_format'] = hints.datetimeformat quiet_remove(unhandled_hints, 'datetimeformat') quiet_remove(unhandled_hints, 'datetimeformattz') # Redshift doesn't support time-only fields, so these will @@ -130,13 +130,13 @@ def redshift_copy_options(unhandled_hints: Set[str], else: # max allowed value redshift_options['max_error'] = 100000 - if hints['record-terminator'] is not None and \ - hints['record-terminator'] != "\n": + if hints.record_terminator is not None and \ + hints.record_terminator != "\n": cant_handle_hint(fail_if_cant_handle_hint, 'record-terminator', hints) quiet_remove(unhandled_hints, 'record-terminator') - if hints['header-row']: + if hints.header_row: redshift_options['ignore_header'] = 1 else: redshift_options['ignore_header'] = 0 diff --git a/records_mover/db/redshift/records_unload.py b/records_mover/db/redshift/records_unload.py index f3521694a..7b5088a78 100644 --- a/records_mover/db/redshift/records_unload.py +++ b/records_mover/db/redshift/records_unload.py @@ -2,6 +2,7 @@ from ...records.delimited import cant_handle_hint from typing import Dict, Any, Set from sqlalchemy_redshift.commands import Format +from records_mover.mover_types import _assert_never from ...records.records_format import ( DelimitedRecordsFormat, ParquetRecordsFormat, BaseRecordsFormat ) @@ -22,58 +23,61 @@ def redshift_unload_options(unhandled_hints: Set[str], if not isinstance(records_format, DelimitedRecordsFormat): raise NotImplementedError("Redshift export only supported via Parquet and " "delimited currently") - hints = records_format.hints - if hints['escape'] == '\\': + hints = records_format.\ + validate(fail_if_cant_handle_hint=fail_if_cant_handle_hint) + if hints.escape == '\\': redshift_options['escape'] = True - elif hints['escape'] is not None: - cant_handle_hint(fail_if_cant_handle_hint, 'escape', hints) + elif hints.escape is None: + pass + else: + _assert_never(hints.escape) quiet_remove(unhandled_hints, 'escape') - redshift_options['delimiter'] = hints['field-delimiter'] + redshift_options['delimiter'] = hints.field_delimiter quiet_remove(unhandled_hints, 'field-delimiter') - if hints['record-terminator'] == "\n": + if hints.record_terminator == "\n": # This is Redshift's one and only export format pass else: cant_handle_hint(fail_if_cant_handle_hint, 'record-terminator', hints) quiet_remove(unhandled_hints, 'record-terminator') - if hints['quoting'] == 'all': - if hints['doublequote'] is not False: + if hints.quoting == 'all': + if hints.doublequote is not False: cant_handle_hint(fail_if_cant_handle_hint, 'doublequote', hints) - if hints['quotechar'] != '"': + if hints.quotechar != '"': cant_handle_hint(fail_if_cant_handle_hint, 'quotechar', hints) redshift_options['add_quotes'] = True - elif hints['quoting'] is None: + elif hints.quoting is None: redshift_options['add_quotes'] = False else: cant_handle_hint(fail_if_cant_handle_hint, 'quoting', hints) quiet_remove(unhandled_hints, 'quoting') quiet_remove(unhandled_hints, 'doublequote') quiet_remove(unhandled_hints, 'quotechar') - if hints['compression'] == 'GZIP': + if hints.compression == 'GZIP': redshift_options['gzip'] = True - elif hints['compression'] is None: + elif hints.compression is None: # good to go pass else: cant_handle_hint(fail_if_cant_handle_hint, 'compression', hints) quiet_remove(unhandled_hints, 'compression') - if hints['encoding'] != 'UTF8': + if hints.encoding != 'UTF8': cant_handle_hint(fail_if_cant_handle_hint, 'encoding', hints) quiet_remove(unhandled_hints, 'encoding') - if hints['datetimeformattz'] != 'YYYY-MM-DD HH:MI:SSOF': + if hints.datetimeformattz != 'YYYY-MM-DD HH:MI:SSOF': cant_handle_hint(fail_if_cant_handle_hint, 'datetimeformattz', hints) quiet_remove(unhandled_hints, 'datetimeformattz') - if hints['datetimeformat'] != 'YYYY-MM-DD HH24:MI:SS': + if hints.datetimeformat != 'YYYY-MM-DD HH24:MI:SS': cant_handle_hint(fail_if_cant_handle_hint, 'datetimeformat', hints) quiet_remove(unhandled_hints, 'datetimeformat') - if hints['dateformat'] != 'YYYY-MM-DD': + if hints.dateformat != 'YYYY-MM-DD': cant_handle_hint(fail_if_cant_handle_hint, 'dateformat', hints) quiet_remove(unhandled_hints, 'dateformat') # Redshift doesn't have a time without date type, so there's # nothing that could be exported there. quiet_remove(unhandled_hints, 'timeonlyformat') - if hints['header-row']: + if hints.header_row: # Redshift unload doesn't know how to add headers on an # unload. Bleh. # https://stackoverflow.com/questions/24681214/unloading-from-redshift-to-s3-ith-headers diff --git a/records_mover/mover_types.py b/records_mover/mover_types.py index 23534cf68..39e6cfd5b 100644 --- a/records_mover/mover_types.py +++ b/records_mover/mover_types.py @@ -1,5 +1,5 @@ from enum import Enum -from typing import Dict, Any, Optional, Union, Mapping, List +from typing import Dict, Any, Optional, Union, Mapping, List, NoReturn JsonSchema = Dict[str, Any] @@ -8,7 +8,14 @@ JsonValue = Optional[Union[bool, str, float, int, Mapping[str, Any], List[Any]]] -# This is a mypy-friendly way of doing a singleton object: +# mypy way of validating we're covering all cases of an enum +# +# https://github.com/python/mypy/issues/6366#issuecomment-560369716 +def _assert_never(x: NoReturn) -> NoReturn: + assert False, "Unhandled type: {}".format(type(x).__name__) + + +# mypy-friendly way of doing a singleton object: # # https://github.com/python/typing/issues/236 class NotYetFetched(Enum): diff --git a/records_mover/records/__init__.py b/records_mover/records/__init__.py index 5b6024a43..ddacb7905 100644 --- a/records_mover/records/__init__.py +++ b/records_mover/records/__init__.py @@ -1,6 +1,6 @@ __all__ = [ - 'RecordsHints', - 'BootstrappingRecordsHints', + 'PartialRecordsHints', + 'UntypedRecordsHints', 'RecordsFormatType', 'RecordsSchema', 'RecordsFormat', @@ -13,7 +13,7 @@ 'move' ] -from .delimited import BootstrappingRecordsHints, RecordsHints +from .delimited import UntypedRecordsHints, PartialRecordsHints from .types import RecordsFormatType, DelimitedVariant from .schema import RecordsSchema from .mover import move diff --git a/records_mover/records/delimited/__init__.py b/records_mover/records/delimited/__init__.py index dd81dd38d..33eddd80e 100644 --- a/records_mover/records/delimited/__init__.py +++ b/records_mover/records/delimited/__init__.py @@ -1,9 +1,9 @@ __all__ = [ - 'BootstrappingRecordsHints', + 'UntypedRecordsHints', + 'validate_partial_hints', 'cant_handle_hint', 'complain_on_unhandled_hints', - 'RecordsHints', - 'MutableRecordsHints', + 'PartialRecordsHints', 'ValidatedRecordsHints', 'sniff_compression_from_url', 'HintEncoding', @@ -34,8 +34,9 @@ hints. """ -from .types import BootstrappingRecordsHints, RecordsHints, MutableRecordsHints +from .types import PartialRecordsHints, UntypedRecordsHints from .validated_records_hints import ValidatedRecordsHints +from .hints import validate_partial_hints from .utils import cant_handle_hint, complain_on_unhandled_hints from .compression import sniff_compression_from_url from .types import ( diff --git a/records_mover/records/delimited/csv_streamer.py b/records_mover/records/delimited/csv_streamer.py index def371909..800eb6a6a 100644 --- a/records_mover/records/delimited/csv_streamer.py +++ b/records_mover/records/delimited/csv_streamer.py @@ -1,6 +1,6 @@ import io from contextlib import contextmanager -from records_mover.records.delimited import BootstrappingRecordsHints +from records_mover.records.delimited import PartialRecordsHints from typing import Union, IO, Iterator, TYPE_CHECKING from .conversions import ( python_encoding_from_hint, @@ -13,7 +13,7 @@ @contextmanager def stream_csv(filepath_or_buffer: Union[str, IO[bytes]], - hints: BootstrappingRecordsHints)\ + hints: PartialRecordsHints)\ -> Iterator['TextFileReader']: """Returns a context manager that can be used to generate a full or partial dataframe from a CSV. If partial, it will not read the diff --git a/records_mover/records/delimited/hint.py b/records_mover/records/delimited/hint.py index 28fbadd5e..bafccb571 100644 --- a/records_mover/records/delimited/hint.py +++ b/records_mover/records/delimited/hint.py @@ -1,29 +1,43 @@ from typing_inspect import is_literal_type, get_args from abc import ABCMeta, abstractmethod -from .types import HintName, RecordsHints -from typing import TypeVar, Generic, Type, List +from .types import HintName, UntypedRecordsHints +from typing import TypeVar, Generic, Type, List, TYPE_CHECKING +if TYPE_CHECKING: + from records_mover.utils.json_schema import JsonSchemaDocument HintT = TypeVar('HintT') class Hint(Generic[HintT], metaclass=ABCMeta): + def __init__(self, + hint_name: HintName, + default: HintT, + description: str) -> None: + self.default = default + self.hint_name = hint_name + self.description = description + @abstractmethod def validate(self, - hints: RecordsHints, + hints: UntypedRecordsHints, fail_if_cant_handle_hint: bool) -> HintT: ... + @abstractmethod + def json_schema_document(self) -> 'JsonSchemaDocument': + ... + class StringHint(Hint[str]): - def __init__(self, - hint_name: HintName, - default: str) -> None: - self.default = default - self.hint_name = hint_name + def json_schema_document(self) -> 'JsonSchemaDocument': + from records_mover.utils.json_schema import JsonSchemaDocument + + return JsonSchemaDocument('string', + description=self.description) def validate(self, - hints: RecordsHints, + hints: UntypedRecordsHints, fail_if_cant_handle_hint: bool) -> str: from .utils import cant_handle_hint @@ -44,15 +58,51 @@ class LiteralHint(Hint[LiteralHintT]): def __init__(self, type_: Type[LiteralHintT], hint_name: HintName, - default: LiteralHintT) -> None: + default: LiteralHintT, + description: str) -> None: assert is_literal_type(type_), f"{hint_name} is not a Literal[]" - self.default = default self.type_ = type_ - self.hint_name = hint_name self.valid_values: List[LiteralHintT] = list(get_args(type_)) + super().__init__(hint_name=hint_name, + default=default, + description=description) + + def json_schema_document(self) -> 'JsonSchemaDocument': + from records_mover.utils.json_schema import JsonSchemaDocument + + json_schema_types = { + bool: 'boolean', + str: 'string', + # Even though Python prints the word NoneType in many + # error messages, NoneType is not an identifier in + # Python. It’s not in builtins. You can only reach it with + # type(None). + # + # https://realpython.com/null-in-python/ + type(None): 'null', + } + + types_set = { + json_schema_types[type(valid_value)] + for valid_value in self.valid_values + } + + if all([t == 'boolean' for t in types_set]): + # We use Literal[True, False] as a type instead of bool as + # mypy's exhaustive type matching doesn't work with 'bool'. + # + # Let's translate that back to boolean here. + return JsonSchemaDocument('boolean', + description=self.description) + + pass + else: + return JsonSchemaDocument(list(types_set), + enum=self.valid_values, + description=self.description) def validate(self, - hints: RecordsHints, + hints: UntypedRecordsHints, fail_if_cant_handle_hint: bool) -> LiteralHintT: from .utils import cant_handle_hint diff --git a/records_mover/records/delimited/hints.py b/records_mover/records/delimited/hints.py index 09851c831..ef8cbf60c 100644 --- a/records_mover/records/delimited/hints.py +++ b/records_mover/records/delimited/hints.py @@ -1,16 +1,19 @@ -from .hint import LiteralHint, StringHint +from .hint import LiteralHint, StringHint, Hint from .types import ( HintHeaderRow, HintCompression, HintQuoting, HintDoublequote, HintEscape, HintEncoding, HintDateFormat, HintTimeOnlyFormat, - HintDateTimeFormatTz, HintDateTimeFormat + HintDateTimeFormatTz, HintDateTimeFormat, + UntypedRecordsHints, PartialRecordsHints ) +from enum import Enum import logging logger = logging.getLogger(__name__) -class Hints: +class Hints(Enum): + value: Hint # mypy gives this when we pass the HintBlahBlah aliases in as an # argument here: # @@ -19,34 +22,81 @@ class Hints: # Nonetheless, the validation works. datetimeformattz = LiteralHint[HintDateTimeFormatTz](HintDateTimeFormatTz, # type: ignore "datetimeformattz", - "YYYY-MM-DD HH24:MI:SSOF") + "YYYY-MM-DD HH24:MI:SSOF", + description=("Format used to write " + "'datetimetz' values")) datetimeformat = LiteralHint[HintDateTimeFormat](HintDateTimeFormat, # type: ignore "datetimeformat", - default="YYYY-MM-DD HH24:MI:SS") + default="YYYY-MM-DD HH24:MI:SS", + description=("Format used to write " + "'datetime' values")) compression = LiteralHint[HintCompression](HintCompression, # type: ignore 'compression', - default=None) + default=None, + description='Compression type of the file.') quoting = LiteralHint[HintQuoting](HintQuoting, # type: ignore 'quoting', - default='minimal') + default='minimal', + description=('How quotes are applied to individual fields. ' + 'all: quote all fields. ' + 'minimal: quote only fields that contain ' + 'ambiguous characters (the ' + 'delimiter, the escape character, or a line ' + 'terminator). ' + 'default: never quote fields.')) escape = LiteralHint[HintEscape](HintEscape, # type: ignore 'escape', - default='\\') + default='\\', + description="Character used to escape strings") encoding = LiteralHint[HintEncoding](HintEncoding, # type: ignore 'encoding', - default='UTF8') + default='UTF8', + description="Text encoding of file") dateformat = LiteralHint[HintDateFormat](HintDateFormat, # type: ignore 'dateformat', - default='YYYY-MM-DD') + default='YYYY-MM-DD', + description=("Format used to write " + "'date' values")) timeonlyformat = LiteralHint[HintTimeOnlyFormat](HintTimeOnlyFormat, # type: ignore 'timeonlyformat', - default="HH24:MI:SS") + default="HH24:MI:SS", + description=("Format used to write " + "'time' values")) + # https://docs.python.org/3/library/csv.html#csv.Dialect.doublequote doublequote = LiteralHint[HintDoublequote](HintDoublequote, # type: ignore 'doublequote', - default=False) + default=False, + description=('Controls how instances of quotechar ' + 'appearing inside a field should ' + 'themselves be quoted. When True, the ' + 'character is doubled. When False, the ' + 'escapechar is used as a prefix to the ' + 'quotechar.')) header_row = LiteralHint[HintHeaderRow](HintHeaderRow, # type: ignore 'header-row', - default=True) - quotechar = StringHint('quotechar', default='"') - record_terminator = StringHint('record-terminator', default='\n') - field_delimiter = StringHint('field-delimiter', default=',') + default=True, + description=('True if a header row is provided in ' + 'the delimited files.')) + # https://docs.python.org/3/library/csv.html#csv.Dialect.quotechar + quotechar = StringHint('quotechar', + default='"', + description=('A one-character string used to quote fields containing ' + 'special characters, such as the delimiter or quotechar, ' + 'or which contain new-line characters.')) + record_terminator = StringHint('record-terminator', + default='\n', + description='String used to close out individual rows of data.') + field_delimiter = StringHint('field-delimiter', + default=',', + description='Character used between fields.') + + +def validate_partial_hints(untyped_hints: UntypedRecordsHints, + fail_if_cant_handle_hint: bool) -> PartialRecordsHints: + typed_records_hints: PartialRecordsHints = {} + for key in untyped_hints: + hint_obj = Hints[key.replace('-', '_')].value + value = hint_obj.validate(untyped_hints, + fail_if_cant_handle_hint=fail_if_cant_handle_hint) + typed_records_hints[key] = value # type: ignore + return typed_records_hints diff --git a/records_mover/records/delimited/sniff.py b/records_mover/records/delimited/sniff.py index 7199f1f47..030dcf047 100644 --- a/records_mover/records/delimited/sniff.py +++ b/records_mover/records/delimited/sniff.py @@ -1,6 +1,6 @@ import chardet from contextlib import contextmanager -from . import RecordsHints, BootstrappingRecordsHints +from . import PartialRecordsHints from .csv_streamer import stream_csv, python_encoding_from_hint import io import csv @@ -8,8 +8,9 @@ import bz2 from .types import HintEncoding, HintRecordTerminator, HintQuoting, HintCompression from .conversions import hint_encoding_from_chardet -from typing import List, IO, Optional, Iterator, NoReturn, Dict +from typing import List, IO, Optional, Iterator, Dict from records_mover.utils.rewound_fileobj import rewound_fileobj +from records_mover.mover_types import _assert_never import logging @@ -18,13 +19,6 @@ HINT_INFERENCE_SAMPLING_SIZE_BYTES = 1024 -# mypy way of validating we're covering all cases of an enum -# -# https://github.com/python/mypy/issues/6366#issuecomment-560369716 -def _assert_never(x: NoReturn) -> NoReturn: - assert False, "Unhandled type: {}".format(type(x).__name__) - - @contextmanager def rewound_decompressed_fileobj(fileobj: IO[bytes], compression: HintCompression) -> Iterator[IO[bytes]]: @@ -91,7 +85,7 @@ def sniff_encoding_hint(fileobj: IO[bytes]) -> Optional[HintEncoding]: def csv_hints_from_python(fileobj: IO[bytes], record_terminator_hint: Optional[HintRecordTerminator], encoding_hint: HintEncoding, - compression: HintCompression) -> RecordsHints: + compression: HintCompression) -> PartialRecordsHints: # https://docs.python.org/3/library/csv.html#csv.Sniffer with rewound_decompressed_fileobj(fileobj, compression) as fileobj: @@ -127,12 +121,13 @@ def csv_hints_from_python(fileobj: IO[bytes], sample_with_unix_newlines = sample dialect = sniffer.sniff(sample_with_unix_newlines) header_row = sniffer.has_header(sample_with_unix_newlines) - out: RecordsHints = { - 'doublequote': dialect.doublequote, + out: PartialRecordsHints = { + 'doublequote': True if dialect.doublequote else False, 'field-delimiter': dialect.delimiter, - 'quotechar': dialect.quotechar, - 'header-row': header_row, + 'header-row': True if header_row else False, } + if dialect.quotechar is not None: + out['quotechar'] = dialect.quotechar logger.info(f"Python csv.Dialect sniffed: {out}") return out except csv.Error as e: @@ -147,10 +142,10 @@ def csv_hints_from_python(fileobj: IO[bytes], def csv_hints_from_pandas(fileobj: IO[bytes], - streaming_hints: BootstrappingRecordsHints) -> RecordsHints: + streaming_hints: PartialRecordsHints) -> PartialRecordsHints: import pandas - def attempt_parse(quoting: HintQuoting) -> RecordsHints: + def attempt_parse(quoting: HintQuoting) -> PartialRecordsHints: with rewound_fileobj(fileobj) as fresh_fileobj: current_hints = streaming_hints.copy() current_hints['quoting'] = quoting @@ -193,7 +188,7 @@ def sniff_compression_hint(fileobj: IO[bytes]) -> HintCompression: def sniff_hints_from_fileobjs(fileobjs: List[IO[bytes]], - initial_hints: BootstrappingRecordsHints) -> RecordsHints: + initial_hints: PartialRecordsHints) -> PartialRecordsHints: if len(fileobjs) != 1: # https://app.asana.com/0/53283930106309/1131698268455054 raise NotImplementedError('Cannot currently sniff hints from mulitple ' @@ -204,7 +199,7 @@ def sniff_hints_from_fileobjs(fileobjs: List[IO[bytes]], def sniff_hints(fileobj: IO[bytes], - initial_hints: BootstrappingRecordsHints) -> RecordsHints: + initial_hints: PartialRecordsHints) -> PartialRecordsHints: # Major limitations: # # * If fileobj isn't rewindable, we can't sniff or we'd keep you @@ -264,20 +259,20 @@ def sniff_hints(fileobj: IO[bytes], streaming_hints['compression'] = compression_hint if encoding_hint is not None: streaming_hints['encoding'] = encoding_hint - streaming_hints.update(python_inferred_hints) # type: ignore + streaming_hints.update(python_inferred_hints) pandas_inferred_hints = csv_hints_from_pandas(fileobj, streaming_hints) # # Let's combine these together and present back a refined # version of the initial hints: # - out = { + out: PartialRecordsHints = { 'compression': compression_hint, **pandas_inferred_hints, # type: ignore - **python_inferred_hints, # type: ignore + **python_inferred_hints, 'encoding': final_encoding_hint, - **other_inferred_csv_hints, # type: ignore - **initial_hints # type: ignore + **other_inferred_csv_hints, + **initial_hints } logger.info(f"Inferred hints from combined sources: {out}") return out diff --git a/records_mover/records/delimited/types.py b/records_mover/records/delimited/types.py index d3da60ec4..1987a1029 100644 --- a/records_mover/records/delimited/types.py +++ b/records_mover/records/delimited/types.py @@ -1,11 +1,11 @@ -from typing import Optional, Union, Mapping, Dict -from typing_extensions import Literal, TypedDict -from records_mover.mover_types import JsonValue - - -RecordsValue = Optional[Union[bool, str]] -RecordsHints = Mapping[str, JsonValue] -MutableRecordsHints = Dict[str, JsonValue] +from typing_inspect import get_args +from typing import Mapping, List +from typing_extensions import Literal +# TypedDict isn't mypy specific, but typing_inspect currently doesn't +# support typing_extensions.TypedDict. +# +# https://github.com/ilevkivskyi/typing_inspect/issues/50 +from mypy_extensions import TypedDict HintEncoding = Literal["UTF8", "UTF16", "UTF16LE", "UTF16BE", "UTF16BOM", "UTF8BOM", "LATIN1", "CP1252"] @@ -37,27 +37,12 @@ "YYYY-MM-DD HH12:MI AM", "MM/DD/YY HH24:MI"] - HintFieldDelimiter = str HintRecordTerminator = str HintQuoteChar = str - -BootstrappingRecordsHints = TypedDict('BootstrappingRecordsHints', - { - 'quoting': HintQuoting, - 'header-row': HintHeaderRow, - 'field-delimiter': HintFieldDelimiter, - 'encoding': HintEncoding, - 'escape': HintEscape, - 'compression': HintCompression, - 'record-terminator': HintRecordTerminator, - }, - total=False) - - HintName = Literal["header-row", "field-delimiter", "compression", @@ -71,3 +56,24 @@ "timeonlyformat", "datetimeformattz", "datetimeformat"] + +HINT_NAMES: List[HintName] = list(get_args(HintName)) # type: ignore + +PartialRecordsHints = TypedDict('PartialRecordsHints', + { + 'quoting': HintQuoting, + 'header-row': HintHeaderRow, + 'field-delimiter': HintFieldDelimiter, + 'encoding': HintEncoding, + 'escape': HintEscape, + 'compression': HintCompression, + 'record-terminator': HintRecordTerminator, + 'quotechar': HintQuoteChar, + 'doublequote': HintDoublequote, + 'dateformat': HintDateFormat, + 'timeonlyformat': HintTimeOnlyFormat, + 'datetimeformattz': HintDateTimeFormatTz, + 'datetimeformat': HintDateTimeFormat, + }, + total=False) +UntypedRecordsHints = Mapping[str, object] diff --git a/records_mover/records/delimited/utils.py b/records_mover/records/delimited/utils.py index 7655d82bc..a11721c66 100644 --- a/records_mover/records/delimited/utils.py +++ b/records_mover/records/delimited/utils.py @@ -1,15 +1,29 @@ -from .types import RecordsHints +from .types import PartialRecordsHints, UntypedRecordsHints +from .validated_records_hints import ValidatedRecordsHints import logging -from typing import Iterable +from typing import Iterable, Union logger = logging.getLogger(__name__) +def _hint_value(hints: Union[PartialRecordsHints, + UntypedRecordsHints, + ValidatedRecordsHints], + hint_name: str) -> object: + if isinstance(hints, ValidatedRecordsHints): + value = getattr(hints, hint_name.replace('-', '_')) + else: + value = hints[hint_name] + return value + + def complain_on_unhandled_hints(fail_if_dont_understand: bool, unhandled_hints: Iterable[str], - hints: RecordsHints) -> None: - unhandled_bindings = [f"{k}={hints[k]}" for k in unhandled_hints] + hints: Union[PartialRecordsHints, + UntypedRecordsHints, + ValidatedRecordsHints]) -> None: + unhandled_bindings = [f"{k}={_hint_value(hints, k)}" for k in unhandled_hints] unhandled_bindings_str = ", ".join(unhandled_bindings) if len(unhandled_bindings) > 0: if fail_if_dont_understand: @@ -20,11 +34,17 @@ def complain_on_unhandled_hints(fail_if_dont_understand: bool, logger.warning(f"Did not understand these hints: {unhandled_bindings_str}") -def cant_handle_hint(fail_if_cant_handle_hint: bool, hint_name: str, hints: RecordsHints) -> None: +def cant_handle_hint(fail_if_cant_handle_hint: bool, + hint_name: str, + hints: Union[PartialRecordsHints, + UntypedRecordsHints, + ValidatedRecordsHints]) -> None: + value = _hint_value(hints, hint_name) if not fail_if_cant_handle_hint: logger.warning("Ignoring hint {hint_name} = {hint_value}" .format(hint_name=hint_name, - hint_value=repr(hints[hint_name]))) + hint_value=repr(value))) else: - raise NotImplementedError(f"Implement hint {hint_name}={repr(hints[hint_name])} " + + raise NotImplementedError(f"Implement hint {hint_name}=" + f"{repr(value)} " + "or try again with fail_if_cant_handle_hint=False") diff --git a/records_mover/records/delimited/validated_records_hints.py b/records_mover/records/delimited/validated_records_hints.py index bc5e369c1..46bd09cc3 100644 --- a/records_mover/records/delimited/validated_records_hints.py +++ b/records_mover/records/delimited/validated_records_hints.py @@ -1,5 +1,5 @@ from typing import NamedTuple, TypeVar -from .types import (RecordsHints, HintHeaderRow, HintFieldDelimiter, +from .types import (UntypedRecordsHints, HintHeaderRow, HintFieldDelimiter, HintCompression, HintRecordTerminator, HintQuoting, HintQuoteChar, HintDoublequote, HintEscape, HintEncoding, HintDateFormat, @@ -25,7 +25,7 @@ class ValidatedRecordsHints(NamedTuple): datetimeformat: HintDateTimeFormat @staticmethod - def validate(hints: RecordsHints, + def validate(hints: UntypedRecordsHints, fail_if_cant_handle_hint: bool) -> 'ValidatedRecordsHints': T = TypeVar('T') @@ -33,17 +33,17 @@ def v(hint: Hint[T]) -> T: return hint.validate(hints, fail_if_cant_handle_hint) return ValidatedRecordsHints( - header_row=v(Hints.header_row), - field_delimiter=v(Hints.field_delimiter), - compression=v(Hints.compression), - record_terminator=v(Hints.record_terminator), - quoting=v(Hints.quoting), - quotechar=v(Hints.quotechar), - doublequote=v(Hints.doublequote), - escape=v(Hints.escape), - encoding=v(Hints.encoding), - dateformat=v(Hints.dateformat), - timeonlyformat=v(Hints.timeonlyformat), - datetimeformattz=v(Hints.datetimeformattz), - datetimeformat=v(Hints.datetimeformat), + header_row=v(Hints.header_row.value), + field_delimiter=v(Hints.field_delimiter.value), + compression=v(Hints.compression.value), + record_terminator=v(Hints.record_terminator.value), + quoting=v(Hints.quoting.value), + quotechar=v(Hints.quotechar.value), + doublequote=v(Hints.doublequote.value), + escape=v(Hints.escape.value), + encoding=v(Hints.encoding.value), + dateformat=v(Hints.dateformat.value), + timeonlyformat=v(Hints.timeonlyformat.value), + datetimeformattz=v(Hints.datetimeformattz.value), + datetimeformat=v(Hints.datetimeformat.value), ) diff --git a/records_mover/records/job/config.py b/records_mover/records/job/config.py index bb5adaff8..652e1ce3f 100644 --- a/records_mover/records/job/config.py +++ b/records_mover/records/job/config.py @@ -4,8 +4,8 @@ from records_mover import Session from ..records_format import DelimitedRecordsFormat from ..existing_table_handling import ExistingTableHandling -from .hints import SUPPORTED_HINT_LOOKUP -from ..delimited import RecordsHints +from ..delimited import PartialRecordsHints +from records_mover.records.delimited.types import HINT_NAMES from ...mover_types import JobConfig @@ -35,8 +35,7 @@ def add_hint_parameter(self, kwargs: Dict[str, Any], param_name: str) -> None: elif 'initial_hints' in self.possible_args: if 'initial_hints' not in kwargs: kwargs['initial_hints'] = {} - hint_definition = SUPPORTED_HINT_LOOKUP[param_name] - kwargs['initial_hints'][hint_definition.target_hint_name] = kwargs[param_name] + kwargs['initial_hints'][param_name] = kwargs[param_name] elif 'records_format' in self.possible_args: # user must want a delimited records format - e.g., on # output where initial hints are not a thing as we're not @@ -44,7 +43,7 @@ def add_hint_parameter(self, kwargs: Dict[str, Any], param_name: str) -> None: # variant yet. Let's assume the default variant to start - # maybe a future arg will reveal a more specific variant. kwargs['records_format'] =\ - DelimitedRecordsFormat(hints={param_name: kwargs[param_name]}) + DelimitedRecordsFormat(hints={param_name: kwargs[param_name]}) # type: ignore else: raise NotImplementedError(f"Could not find place for {param_name} in " f"{self.possible_args}") @@ -80,7 +79,7 @@ def fill_in_records_format(self, kwargs: Dict[str, Any]) -> None: # but now we know which variant they want kwargs['records_format'] = kwargs['records_format'].alter_variant(kwargs['variant']) else: - hints: RecordsHints = {} + hints: PartialRecordsHints = {} if 'initial_hints' in kwargs: # Given the user gave us a variant, we won't be # using "initial hints" to sniff with - the hint @@ -128,7 +127,7 @@ def to_args(self) -> Dict[str, Any]: self.fill_in_existing_table_handling(kwargs) elif arg == 'variant': self.fill_in_records_format(kwargs) - elif arg in SUPPORTED_HINT_LOOKUP: + elif arg in HINT_NAMES: self.add_hint_parameter(kwargs, arg) else: raise NotImplementedError(f"Teach me how to pass in {arg}") diff --git a/records_mover/records/job/hints.py b/records_mover/records/job/hints.py deleted file mode 100644 index ef45aff96..000000000 --- a/records_mover/records/job/hints.py +++ /dev/null @@ -1,66 +0,0 @@ -"""Defines hints supported by the job config parser.""" -from ...utils.json_schema import JsonParameter, JsonSchemaDocument -from typing import Optional - - -class SupportedHint: - """Definition for supported hints""" - - def __init__(self, - schema: JsonParameter, - target_hint_name: Optional[str] = None): - self.schema = schema - self.target_hint_name = target_hint_name or schema.name - - @property - def config_name(self) -> str: - return self.schema.name - - -QUOTING_DESCRIPTION =\ - ('How quotes are applied to individual fields. ' - 'all: quote all fields. ' - 'minimal: quote only fields that contain ambiguous characters (the ' - 'delimiter, the escape character, or a line terminator). ' - 'default: never quote fields.') - -# -# Note: Any expansion of these types should also be done in -# records.types -# -SUPPORTED_HINTS = [ - SupportedHint( - JsonParameter('field-delimiter', - JsonSchemaDocument('string', - description=('Character used between fields ' - '(default is comma)')), - optional=True)), - SupportedHint( - JsonParameter('compression', - JsonSchemaDocument(['string', 'null'], - enum=['BZIP', 'GZIP', 'LZO', None], - description='Compression type of the file.'), - optional=True)), - SupportedHint( - JsonParameter('escape', - JsonSchemaDocument(['string', 'null'], - enum=['\\', None], - description="Character used to escape strings"), - optional=True)), - SupportedHint( - JsonParameter('quoting', - JsonSchemaDocument(['string', 'null'], - enum=['all', 'minimal', 'nonnumeric', None], - description=QUOTING_DESCRIPTION), - optional=True)), - SupportedHint( - JsonParameter('encoding', - JsonSchemaDocument(['string'], - enum=[ - 'UTF8', 'UTF16', 'UTF16LE', 'UTF16BE', - 'LATIN1', 'CP1252' - ], - description="Text encoding of file"), - optional=True)), -] -SUPPORTED_HINT_LOOKUP = {hint.config_name: hint for hint in SUPPORTED_HINTS} diff --git a/records_mover/records/job/schema.py b/records_mover/records/job/schema.py index 3f9644b78..683de38a3 100644 --- a/records_mover/records/job/schema.py +++ b/records_mover/records/job/schema.py @@ -1,8 +1,16 @@ from ...utils.json_schema import method_signature_to_json_schema, JsonParameter, JsonSchemaDocument from ..existing_table_handling import ExistingTableHandling +from records_mover.records.delimited.hints import Hints from typing import Any, Dict, List, Callable from ...mover_types import JsonSchema -from .hints import SUPPORTED_HINTS + + +HINT_PARAMETERS = [ + JsonParameter(hint_enum.value.hint_name, + hint_enum.value.json_schema_document(), + optional=True) + for hint_enum in list(Hints) +] def method_to_json_schema(method: Callable[..., Any]) -> JsonSchema: @@ -10,8 +18,8 @@ def method_to_json_schema(method: Callable[..., Any]) -> JsonSchema: 'google_cloud_creds': [JsonParameter('gcp_creds_name', JsonSchemaDocument('string'))], 'db_engine': [JsonParameter('db_name', JsonSchemaDocument('string'))], 'records_format': ([JsonParameter('variant', JsonSchemaDocument('string'), optional=True)] + - [hint.schema for hint in SUPPORTED_HINTS]), - 'initial_hints': [hint.schema for hint in SUPPORTED_HINTS], + HINT_PARAMETERS), + 'initial_hints': HINT_PARAMETERS, 'existing_table_handling': [JsonParameter('existing_table', JsonSchemaDocument('string', diff --git a/records_mover/records/pandas/read_csv_options.py b/records_mover/records/pandas/read_csv_options.py index 4c38e5fdb..323d312bc 100644 --- a/records_mover/records/pandas/read_csv_options.py +++ b/records_mover/records/pandas/read_csv_options.py @@ -18,7 +18,8 @@ def pandas_read_csv_options(records_format: DelimitedRecordsFormat, ... # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html#pandas.read_csv - hints = records_format.hints + hints = records_format.\ + validate(fail_if_cant_handle_hint=processing_instructions.fail_if_cant_handle_hint) fail_if_cant_handle_hint = processing_instructions.fail_if_cant_handle_hint @@ -60,7 +61,7 @@ def pandas_read_csv_options(records_format: DelimitedRecordsFormat, # # Alias for sep. # - pandas_options['delimiter'] = hints['field-delimiter'] + pandas_options['delimiter'] = hints.field_delimiter quiet_remove(unhandled_hints, 'field-delimiter') # @@ -80,7 +81,7 @@ def pandas_read_csv_options(records_format: DelimitedRecordsFormat, # skip_blank_lines=True, so header=0 denotes the first line of # data rather than the first line of the file. # - if hints['header-row']: + if hints.header_row: pandas_options['header'] = 0 else: pandas_options['header'] = None @@ -194,7 +195,7 @@ def pandas_read_csv_options(records_format: DelimitedRecordsFormat, # engine is currently more feature-complete. # - non_standard_record_terminator = hints['record-terminator'] not in ['\n', '\r\n', '\r'] + non_standard_record_terminator = hints.record_terminator not in ['\n', '\r\n', '\r'] if non_standard_record_terminator: # the 'lineterminator' option below is only valid for c parser @@ -430,17 +431,17 @@ def day_first(dateish_format: str) -> bool: return (dateish_format.startswith('DD-MM-') or dateish_format.startswith('DD/MM/')) - assert isinstance(hints['dateformat'], str) - assert isinstance(hints['datetimeformat'], str) - assert isinstance(hints['datetimeformattz'], str) - consistent_formats = (day_first(hints['dateformat']) == - day_first(hints['datetimeformat']) == - day_first(hints['datetimeformattz'])) + assert isinstance(hints.dateformat, str) + assert isinstance(hints.datetimeformat, str) + assert isinstance(hints.datetimeformattz, str) + consistent_formats = (day_first(hints.dateformat) == + day_first(hints.datetimeformat) == + day_first(hints.datetimeformattz)) if not consistent_formats: cant_handle_hint(fail_if_cant_handle_hint, 'dateformat', hints) - pandas_options['dayfirst'] = day_first(hints['dateformat']) + pandas_options['dayfirst'] = day_first(hints.dateformat) quiet_remove(unhandled_hints, 'dateformat') quiet_remove(unhandled_hints, 'timeonlyformat') @@ -480,13 +481,13 @@ def day_first(dateish_format: str) -> bool: # # New in version 0.18.1: support for ‘zip’ and ‘xz’ compression. # - if hints['compression'] is None: - # hints['compression']=None will output an uncompressed csv, + if hints.compression is None: + # hints.compression=None will output an uncompressed csv, # which is the pandas default. pandas_options['compression'] = None - elif hints['compression'] == 'GZIP': + elif hints.compression == 'GZIP': pandas_options['compression'] = 'gzip' - elif hints['compression'] == 'BZIP': + elif hints.compression == 'BZIP': pandas_options['compression'] = 'bz2' else: cant_handle_hint(fail_if_cant_handle_hint, 'compression', hints) @@ -515,7 +516,7 @@ def day_first(dateish_format: str) -> bool: # Character to break file into lines. Only valid with C parser. # if non_standard_record_terminator: - pandas_options['lineterminator'] = hints['record-terminator'] + pandas_options['lineterminator'] = hints.record_terminator quiet_remove(unhandled_hints, 'record-terminator') # @@ -525,7 +526,7 @@ def day_first(dateish_format: str) -> bool: # item. Quoted items can include the delimiter and it will be # ignored. # - pandas_options['quotechar'] = hints['quotechar'] + pandas_options['quotechar'] = hints.quotechar quiet_remove(unhandled_hints, 'quotechar') # @@ -536,13 +537,13 @@ def day_first(dateish_format: str) -> bool: # QUOTE_NONE (3). # - if hints['quoting'] is None: + if hints.quoting is None: pandas_options['quoting'] = csv.QUOTE_NONE - elif hints['quoting'] == 'all': + elif hints.quoting == 'all': pandas_options['quoting'] = csv.QUOTE_ALL - elif hints['quoting'] == 'minimal': + elif hints.quoting == 'minimal': pandas_options['quoting'] = csv.QUOTE_MINIMAL - elif hints['quoting'] == 'nonnumeric': + elif hints.quoting == 'nonnumeric': pandas_options['quoting'] = csv.QUOTE_NONNUMERIC else: cant_handle_hint(fail_if_cant_handle_hint, 'quoting', hints) @@ -555,7 +556,7 @@ def day_first(dateish_format: str) -> bool: # indicate whether or not to interpret two consecutive quotechar # elements INSIDE a field as a single quotechar element. # - pandas_options['doublequote'] = hints['doublequote'] + pandas_options['doublequote'] = hints.doublequote quiet_remove(unhandled_hints, 'doublequote') # @@ -563,10 +564,10 @@ def day_first(dateish_format: str) -> bool: # # One-character string used to escape other characters. # - if hints['escape'] is None: + if hints.escape is None: pass else: - pandas_options['escapechar'] = hints['escape'] + pandas_options['escapechar'] = hints.escape quiet_remove(unhandled_hints, 'escape') # @@ -593,8 +594,8 @@ def day_first(dateish_format: str) -> bool: # should only be specified when reading from a filepath # - if hints['compression'] is not None: - pandas_options['encoding'] = hints['encoding'] + if hints.compression is not None: + pandas_options['encoding'] = hints.encoding quiet_remove(unhandled_hints, 'encoding') # diff --git a/records_mover/records/pandas/to_csv_options.py b/records_mover/records/pandas/to_csv_options.py index a6d931838..1b9b070ad 100644 --- a/records_mover/records/pandas/to_csv_options.py +++ b/records_mover/records/pandas/to_csv_options.py @@ -3,6 +3,7 @@ from ..delimited import cant_handle_hint from ..processing_instructions import ProcessingInstructions from ..records_format import DelimitedRecordsFormat +from records_mover.mover_types import _assert_never import logging from typing import Set, Dict @@ -14,55 +15,56 @@ def pandas_to_csv_options(records_format: DelimitedRecordsFormat, unhandled_hints: Set[str], processing_instructions: ProcessingInstructions) -> Dict[str, object]: # https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_csv.html - hints = records_format.hints + hints = records_format.\ + validate(fail_if_cant_handle_hint=processing_instructions.fail_if_cant_handle_hint) fail_if_cant_handle_hint = processing_instructions.fail_if_cant_handle_hint pandas_options: Dict[str, object] = {} - pandas_options['encoding'] = hints['encoding'] + pandas_options['encoding'] = hints.encoding quiet_remove(unhandled_hints, 'encoding') - if hints['compression'] is None: + if hints.compression is None: # hints['compression']=None will output an uncompressed csv, # which is the pandas default. pass - elif hints['compression'] == 'GZIP': + elif hints.compression == 'GZIP': pandas_options['compression'] = 'gzip' - elif hints['compression'] == 'BZIP': + elif hints.compression == 'BZIP': pandas_options['compression'] = 'bz2' else: cant_handle_hint(fail_if_cant_handle_hint, 'compression', hints) quiet_remove(unhandled_hints, 'compression') - if hints['quoting'] is None: + if hints.quoting is None: pandas_options['quoting'] = csv.QUOTE_NONE - elif hints['quoting'] == 'all': + elif hints.quoting == 'all': pandas_options['quoting'] = csv.QUOTE_ALL - elif hints['quoting'] == 'minimal': + elif hints.quoting == 'minimal': pandas_options['quoting'] = csv.QUOTE_MINIMAL - elif hints['quoting'] == 'nonnumeric': + elif hints.quoting == 'nonnumeric': pandas_options['quoting'] = csv.QUOTE_NONNUMERIC else: - cant_handle_hint(fail_if_cant_handle_hint, 'quoting', hints) + _assert_never(hints.quoting) quiet_remove(unhandled_hints, 'quoting') - pandas_options['doublequote'] = hints['doublequote'] + pandas_options['doublequote'] = hints.doublequote quiet_remove(unhandled_hints, 'doublequote') - pandas_options['quotechar'] = hints['quotechar'] + pandas_options['quotechar'] = hints.quotechar quiet_remove(unhandled_hints, 'quotechar') - if hints['escape'] is None: + if hints.escape is None: pass else: - pandas_options['escapechar'] = hints['escape'] + pandas_options['escapechar'] = hints.escape quiet_remove(unhandled_hints, 'escape') - pandas_options['header'] = hints['header-row'] + pandas_options['header'] = hints.header_row quiet_remove(unhandled_hints, 'header-row') - if hints['dateformat'] is None: - if hints['datetimeformattz'] == hints['datetimeformat']: + if hints.dateformat is None: + if hints.datetimeformattz == hints.datetimeformat: # BigQuery requires that timezone offsets have a colon; # Python (and thus Pandas) doesn't support adding the # colon with strftime. However, we can specify things @@ -86,23 +88,23 @@ def pandas_to_csv_options(records_format: DelimitedRecordsFormat, pandas_options['date_format'] = '%Y-%m-%d %H:%M:%S.%f' else: pandas_options['date_format'] = '%Y-%m-%d %H:%M:%S.%f%z' - elif hints['dateformat'] == 'YYYY-MM-DD': - if hints['datetimeformattz'] == hints['datetimeformat']: + elif hints.dateformat == 'YYYY-MM-DD': + if hints.datetimeformattz == hints.datetimeformat: pandas_options['date_format'] = '%Y-%m-%d %H:%M:%S.%f' else: pandas_options['date_format'] = '%Y-%m-%d %H:%M:%S.%f%z' - elif hints['dateformat'] == 'MM-DD-YYYY': - if hints['datetimeformattz'] == hints['datetimeformat']: + elif hints.dateformat == 'MM-DD-YYYY': + if hints.datetimeformattz == hints.datetimeformat: pandas_options['date_format'] = '%m-%d-%Y %H:%M:%S.%f' else: pandas_options['date_format'] = '%m-%d-%Y %H:%M:%S.%f%z' - elif hints['dateformat'] == 'DD-MM-YYYY': - if hints['datetimeformattz'] == hints['datetimeformat']: + elif hints.dateformat == 'DD-MM-YYYY': + if hints.datetimeformattz == hints.datetimeformat: pandas_options['date_format'] = '%d-%m-%Y %H:%M:%S.%f' else: pandas_options['date_format'] = '%d-%m-%Y %H:%M:%S.%f%z' - elif hints['dateformat'] == 'MM/DD/YY': - if hints['datetimeformattz'] == hints['datetimeformat']: + elif hints.dateformat == 'MM/DD/YY': + if hints.datetimeformattz == hints.datetimeformat: pandas_options['date_format'] = '%m/%d/%y %H:%M:%S.%f' else: pandas_options['date_format'] = '%m/%d/%y %H:%M:%S.%f%z' @@ -114,35 +116,35 @@ def pandas_to_csv_options(records_format: DelimitedRecordsFormat, # # might be nice someday to only emit the errors if the actual data # being moved is affected by whatever limitation... - if (hints['datetimeformattz'] not in(f"{hints.get('dateformat', 'YYYY-MM-DD')} HH24:MI:SSOF", - f"{hints.get('dateformat', 'YYYY-MM-DD')} HH:MI:SSOF", - f"{hints.get('dateformat', 'YYYY-MM-DD')} HH24:MI:SS", - f"{hints.get('dateformat', 'YYYY-MM-DD')} HH:MI:SS", - f"{hints.get('dateformat', 'YYYY-MM-DD')} HH:MIOF", - f"{hints.get('dateformat', 'YYYY-MM-DD')} HH:MI", - f"{hints.get('dateformat', 'YYYY-MM-DD')} HH24:MIOF", - f"{hints.get('dateformat', 'YYYY-MM-DD')} HH24:MI")): + if (hints.datetimeformattz not in (f"{hints.dateformat} HH24:MI:SSOF", + f"{hints.dateformat} HH:MI:SSOF", + f"{hints.dateformat} HH24:MI:SS", + f"{hints.dateformat} HH:MI:SS", + f"{hints.dateformat} HH:MIOF", + f"{hints.dateformat} HH:MI", + f"{hints.dateformat} HH24:MIOF", + f"{hints.dateformat} HH24:MI")): cant_handle_hint(fail_if_cant_handle_hint, 'datetimeformattz', hints) quiet_remove(unhandled_hints, 'datetimeformattz') valid_datetimeformat = [ - f"{hints.get('dateformat', 'YYYY-MM-DD')} HH24:MI:SS", - f"{hints.get('dateformat', 'YYYY-MM-DD')} HH:MI:SS", - f"{hints.get('dateformat', 'YYYY-MM-DD')} HH24:MI", - f"{hints.get('dateformat', 'YYYY-MM-DD')} HH:MI", + f"{hints.dateformat} HH24:MI:SS", + f"{hints.dateformat} HH:MI:SS", + f"{hints.dateformat} HH24:MI", + f"{hints.dateformat} HH:MI", ] - if (hints['datetimeformat'] not in valid_datetimeformat): + if (hints.datetimeformat not in valid_datetimeformat): cant_handle_hint(fail_if_cant_handle_hint, 'datetimeformat', hints) quiet_remove(unhandled_hints, 'datetimeformat') - if hints['timeonlyformat'] != 'HH24:MI:SS': + if hints.timeonlyformat != 'HH24:MI:SS': cant_handle_hint(fail_if_cant_handle_hint, 'timeonlyformat', hints) quiet_remove(unhandled_hints, 'timeonlyformat') - pandas_options['sep'] = hints['field-delimiter'] + pandas_options['sep'] = hints.field_delimiter quiet_remove(unhandled_hints, 'field-delimiter') - pandas_options['line_terminator'] = hints['record-terminator'] + pandas_options['line_terminator'] = hints.record_terminator quiet_remove(unhandled_hints, 'record-terminator') return pandas_options diff --git a/records_mover/records/records_format.py b/records_mover/records/records_format.py index e7c3e0378..cd5f22991 100644 --- a/records_mover/records/records_format.py +++ b/records_mover/records/records_format.py @@ -1,9 +1,9 @@ import logging from .processing_instructions import ProcessingInstructions -from . import RecordsHints +from . import PartialRecordsHints, UntypedRecordsHints from .base_records_format import BaseRecordsFormat from typing import Mapping, Optional, TYPE_CHECKING -from .delimited import MutableRecordsHints, ValidatedRecordsHints +from .delimited import ValidatedRecordsHints if TYPE_CHECKING: from . import RecordsFormatType # noqa @@ -15,7 +15,7 @@ def __init__(self) -> None: self.format_type = 'parquet' def __str__(self) -> str: - return f"ParquetRecordsFormat" + return "ParquetRecordsFormat" def __repr__(self) -> str: return str(self) @@ -26,14 +26,17 @@ def generate_filename(self, basename: str) -> str: class DelimitedRecordsFormat(BaseRecordsFormat): variant: str - hints: RecordsHints + # The strong type for 'hints' in the constructor is for IDE + # support--we don't actually trust input to this class to be well-typed, + # as it may come from public interface consumers. + hints: UntypedRecordsHints """Stores the full set of hints describing the format, combining both the default hints for the variant and any hint overrides provided in the constructor""" def __init__(self, variant: str='bluelabs', - hints: RecordsHints={}, + hints: PartialRecordsHints={}, processing_instructions: ProcessingInstructions=ProcessingInstructions()) -> None: """See the `records format documentation `_ @@ -60,20 +63,20 @@ def __eq__(self, other: object) -> bool: self.hints == other.hints) return False - def alter_hints(self, new_hints: RecordsHints) ->\ + def alter_hints(self, new_hints: UntypedRecordsHints) ->\ 'DelimitedRecordsFormat': input_hints = dict(self.hints) # make copy input_hints.update(new_hints) return DelimitedRecordsFormat(variant=self.variant, - hints=input_hints) + hints=input_hints) # type: ignore def alter_variant(self, variant: str) -> 'DelimitedRecordsFormat': return DelimitedRecordsFormat(variant=variant, - hints=self.hints) + hints=self.hints) # type: ignore def base_hints_from_variant(self, - fail_if_dont_understand: bool = True) -> MutableRecordsHints: - hint_defaults: RecordsHints = { + fail_if_dont_understand: bool = True) -> PartialRecordsHints: + hint_defaults: PartialRecordsHints = { 'header-row': False, 'field-delimiter': ',', 'record-terminator': "\n", @@ -88,8 +91,8 @@ def base_hints_from_variant(self, 'datetimeformat': 'YYYY-MM-DD HH:MI:SS', 'datetimeformattz': 'YYYY-MM-DD HH:MI:SSOF', } - combined_hints = dict(hint_defaults) - format_driven_hints: MutableRecordsHints = {} # noqa + combined_hints: PartialRecordsHints = dict(hint_defaults) # type: ignore + format_driven_hints: PartialRecordsHints = {} # noqa if self.variant == 'dumb': format_driven_hints['field-delimiter'] = ',' format_driven_hints['record-terminator'] = "\n" @@ -140,7 +143,7 @@ def base_hints_from_variant(self, return combined_hints def add_hints_from_variant(self, - provided_hints: RecordsHints, + provided_hints: PartialRecordsHints, processing_instructions: ProcessingInstructions) -> None: combined_hints =\ self.base_hints_from_variant(processing_instructions.fail_if_dont_understand) @@ -161,7 +164,7 @@ def __str__(self) -> str: hints_from_variant = self.base_hints_from_variant() hint_overrides = { hint: v for hint, v in self.hints.items() - if hint not in hints_from_variant or v != hints_from_variant[hint] + if hint not in hints_from_variant or v != hints_from_variant[hint] # type: ignore } if hint_overrides != {}: return f"DelimitedRecordsFormat({self.variant} - {hint_overrides})" @@ -191,7 +194,7 @@ def __repr__(self) -> str: def RecordsFormat(format_type: 'RecordsFormatType' = 'delimited', variant: str='bluelabs', - hints: RecordsHints={}, + hints: PartialRecordsHints={}, processing_instructions: ProcessingInstructions=ProcessingInstructions()) ->\ 'BaseRecordsFormat': diff --git a/records_mover/records/records_format_file.py b/records_mover/records/records_format_file.py index d692eb278..4dbe963f6 100644 --- a/records_mover/records/records_format_file.py +++ b/records_mover/records/records_format_file.py @@ -1,7 +1,7 @@ from .records_format import BaseRecordsFormat, DelimitedRecordsFormat, ParquetRecordsFormat from ..url.base import BaseDirectoryUrl, BaseFileUrl from .processing_instructions import ProcessingInstructions -from .delimited import RecordsHints +from .delimited import PartialRecordsHints import logging logger = logging.getLogger(__name__) @@ -29,7 +29,7 @@ def load_delimited_format(self, format_loc: BaseFileUrl, fail_if_dont_understand: bool) -> BaseRecordsFormat: data = format_loc.json_contents() variant = None - hints: RecordsHints = {} + hints: PartialRecordsHints = {} # For simple form (such as with avro or parquet), this file MUST be empty. if data is None: # For detailed form (such as delimited), it MUST contain diff --git a/records_mover/records/schema/field/numpy.py b/records_mover/records/schema/field/numpy.py index 8ba60e573..767ec38ef 100644 --- a/records_mover/records/schema/field/numpy.py +++ b/records_mover/records/schema/field/numpy.py @@ -26,7 +26,7 @@ def details_from_numpy_dtype(dtype: numpy.dtype, else: field_type = RecordsSchemaField.python_type_to_field_type(dtype.type) if field_type is None: - raise NotImplementedError(f"Teach me how to handle Pandas/numpy dtype {dtype}" + raise NotImplementedError(f"Teach me how to handle Pandas/numpy dtype {dtype} " f"which is dtype.type {dtype.type}") constraints = RecordsSchemaFieldConstraints.from_numpy_dtype(dtype, unique=unique) diff --git a/records_mover/records/sources/data_url.py b/records_mover/records/sources/data_url.py index 82cf39c23..4fa023486 100644 --- a/records_mover/records/sources/data_url.py +++ b/records_mover/records/sources/data_url.py @@ -7,7 +7,7 @@ from ...url.resolver import UrlResolver from ..records_format import BaseRecordsFormat from ..schema import RecordsSchema -from .. import BootstrappingRecordsHints +from .. import PartialRecordsHints import logging from typing import Optional, Iterator @@ -20,7 +20,7 @@ def __init__(self, url_resolver: UrlResolver, records_format: Optional[BaseRecordsFormat]=None, records_schema: Optional[RecordsSchema]=None, - initial_hints: Optional[BootstrappingRecordsHints]=None) -> None: + initial_hints: Optional[PartialRecordsHints]=None) -> None: self.input_url = input_url self.url_resolver = url_resolver self.records_format = records_format diff --git a/records_mover/records/sources/directory.py b/records_mover/records/sources/directory.py index 021181322..45022b517 100644 --- a/records_mover/records/sources/directory.py +++ b/records_mover/records/sources/directory.py @@ -3,7 +3,7 @@ from contextlib import ExitStack from ..records_directory import RecordsDirectory from ..records_format import DelimitedRecordsFormat -from .. import RecordsHints +from .. import PartialRecordsHints from contextlib import contextmanager from typing import Iterator, Optional from ..processing_instructions import ProcessingInstructions @@ -17,7 +17,7 @@ def __init__(self, directory: RecordsDirectory, fail_if_dont_understand: bool, url_resolver: UrlResolver, - override_hints: RecordsHints={}) -> None: + override_hints: PartialRecordsHints={}) -> None: self.records_format = directory.load_format(fail_if_dont_understand=fail_if_dont_understand) if isinstance(self.records_format, DelimitedRecordsFormat): self.records_format = self.records_format.alter_hints(override_hints) diff --git a/records_mover/records/sources/factory.py b/records_mover/records/sources/factory.py index a91620dfa..cd34e12a8 100644 --- a/records_mover/records/sources/factory.py +++ b/records_mover/records/sources/factory.py @@ -8,7 +8,7 @@ from .uninferred_fileobjs import UninferredFileobjsRecordsSource from .data_url import DataUrlRecordsSource from .directory import RecordsDirectoryRecordsSource -from .. import RecordsHints, BootstrappingRecordsHints +from .. import PartialRecordsHints from .base import (SupportsRecordsDirectory, SupportsMoveToRecordsDirectory, # noqa SupportsToFileobjsSource, RecordsSource) from typing import Mapping, IO, Callable, Optional, Union, Iterable, TYPE_CHECKING @@ -91,7 +91,7 @@ def dataframes(self, def fileobjs(self, target_names_to_input_fileobjs: Mapping[str, IO[bytes]], records_format: Optional[BaseRecordsFormat]=None, - initial_hints: Optional[BootstrappingRecordsHints]=None, + initial_hints: Optional[PartialRecordsHints]=None, records_schema: Optional[RecordsSchema]=None)\ -> Union[UninferredFileobjsRecordsSource, FileobjsSource]: """ @@ -118,7 +118,7 @@ def fileobjs(self, def data_url(self, input_url: str, records_format: Optional[BaseRecordsFormat]=None, - initial_hints: Optional[BootstrappingRecordsHints]=None, + initial_hints: Optional[PartialRecordsHints]=None, records_schema: Optional[RecordsSchema]=None)\ -> DataUrlRecordsSource: """ @@ -154,7 +154,7 @@ def table(self, def directory_from_url(self, url: str, - hints: RecordsHints={}, + hints: PartialRecordsHints={}, fail_if_dont_understand: bool=True)\ -> RecordsDirectoryRecordsSource: """ @@ -179,7 +179,7 @@ def directory_from_url(self, def local_file(self, filename: str, records_format: Optional[BaseRecordsFormat]=None, - initial_hints: Optional[BootstrappingRecordsHints]=None, + initial_hints: Optional[PartialRecordsHints]=None, records_schema: Optional[RecordsSchema]=None)\ -> DataUrlRecordsSource: """ diff --git a/records_mover/records/sources/fileobjs.py b/records_mover/records/sources/fileobjs.py index 4ae25605e..0c6255aca 100644 --- a/records_mover/records/sources/fileobjs.py +++ b/records_mover/records/sources/fileobjs.py @@ -7,7 +7,7 @@ from ..results import MoveResult from ..records_format import BaseRecordsFormat, DelimitedRecordsFormat from ..delimited import sniff_hints_from_fileobjs -from .. import BootstrappingRecordsHints +from .. import PartialRecordsHints from ..processing_instructions import ProcessingInstructions from ...records.delimited import complain_on_unhandled_hints from ..delimited import python_encoding_from_hint @@ -44,7 +44,7 @@ def infer_if_needed(target_names_to_input_fileobjs: Mapping[str, IO[bytes]], processing_instructions: ProcessingInstructions, records_format: Optional[BaseRecordsFormat], records_schema: Optional[RecordsSchema], - initial_hints: Optional[BootstrappingRecordsHints]) ->\ + initial_hints: Optional[PartialRecordsHints]) ->\ Iterator['FileobjsSource']: try: if records_format is None: diff --git a/records_mover/records/sources/uninferred_fileobjs.py b/records_mover/records/sources/uninferred_fileobjs.py index 49ff2a8a1..a3f72e995 100644 --- a/records_mover/records/sources/uninferred_fileobjs.py +++ b/records_mover/records/sources/uninferred_fileobjs.py @@ -4,7 +4,9 @@ from ..schema import RecordsSchema from ..processing_instructions import ProcessingInstructions from ..records_format import BaseRecordsFormat -from .. import BootstrappingRecordsHints +from records_mover.records.delimited import ( + UntypedRecordsHints, validate_partial_hints +) import logging from typing import Optional, Iterator, Mapping, IO @@ -16,7 +18,7 @@ def __init__(self, target_names_to_input_fileobjs: Mapping[str, IO[bytes]], records_format: Optional[BaseRecordsFormat]=None, records_schema: Optional[RecordsSchema]=None, - initial_hints: Optional[BootstrappingRecordsHints]=None) -> None: + initial_hints: Optional[UntypedRecordsHints]=None) -> None: self.target_names_to_input_fileobjs = target_names_to_input_fileobjs self.records_format = records_format self.records_schema = records_schema @@ -30,10 +32,16 @@ def to_fileobjs_source(self, records_format_if_possible: Optional[BaseRecordsFormat]=None)\ -> Iterator['FileobjsSource']: """Convert current source to a FileObjsSource and present it in a context manager""" + typed_hints = None + if self.initial_hints is not None: + typed_hints =\ + validate_partial_hints(self.initial_hints, + fail_if_cant_handle_hint=processing_instructions. + fail_if_cant_handle_hint) with FileobjsSource.\ infer_if_needed(target_names_to_input_fileobjs=self.target_names_to_input_fileobjs, records_format=self.records_format, records_schema=self.records_schema, processing_instructions=processing_instructions, - initial_hints=self.initial_hints) as fileobjs_source: + initial_hints=typed_hints) as fileobjs_source: yield fileobjs_source diff --git a/tests/unit/cli/test_job_config_schema_as_args_parser.py b/tests/unit/cli/test_job_config_schema_as_args_parser.py index cea328d6e..80960f0dc 100644 --- a/tests/unit/cli/test_job_config_schema_as_args_parser.py +++ b/tests/unit/cli/test_job_config_schema_as_args_parser.py @@ -35,6 +35,9 @@ def setUp(self): 'type': 'boolean', 'default': False, }, + 'boolwithnodefault': { + 'type': 'boolean', + }, 'enumwithnulloption': { 'type': ['string', 'null'], 'enum': ['a', 'b', None] @@ -109,6 +112,26 @@ def test_enum_cannot_be_invalid(self) -> None: "argument --enumwithnulloption: invalid choice: 'x' " "(choose from 'a', 'b')") + def test_boolean_no_default_true(self) -> None: + parser = JobConfigSchemaAsArgsParser.from_description(self.job_config_schema, 'testjob') + out = parser.parse(['--boolwithnodefault']) + expected = { + 'divisions': ['candidates', 'counties', 'townships', 'precincts'], + 'boolwithdefaultfalse': False, 'boolwithdefaulttrue': True, + 'boolwithnodefault': True, + } + self.assertEqual(expected, out) + + def test_boolean_no_default_false(self) -> None: + parser = JobConfigSchemaAsArgsParser.from_description(self.job_config_schema, 'testjob') + out = parser.parse(['--no_boolwithnodefault']) + expected = { + 'divisions': ['candidates', 'counties', 'townships', 'precincts'], + 'boolwithdefaultfalse': False, 'boolwithdefaulttrue': True, + 'boolwithnodefault': False, + } + self.assertEqual(expected, out) + def test_bad_syntax_1(self): bad_job_config_schema = { "type": "object", diff --git a/tests/unit/db/mysql/test_load_options.py b/tests/unit/db/mysql/test_load_options.py index e46ceaab5..50878aa0b 100644 --- a/tests/unit/db/mysql/test_load_options.py +++ b/tests/unit/db/mysql/test_load_options.py @@ -58,7 +58,7 @@ def test_mysql_load_options_nonnumeric_quoting(self) -> None: def test_mysql_load_options_bogus_quoting(self) -> None: records_format = DelimitedRecordsFormat(variant='bluelabs', hints={ - 'quoting': 'no thanks', + 'quoting': 'no thanks', # type: ignore 'doublequote': True, 'compression': None, }) @@ -89,7 +89,7 @@ def test_mysql_load_options_valid_quoting_bogus_doublequote(self) -> None: records_format = DelimitedRecordsFormat(variant='bluelabs', hints={ 'quoting': 'all', - 'doublequote': 'mumble', + 'doublequote': 'mumble', # type: ignore 'compression': None, }) unhandled_hints = set(records_format.hints.keys()) diff --git a/tests/unit/db/postgres/test_copy_options_csv_unload.py b/tests/unit/db/postgres/test_copy_options_csv_unload.py index de3fd28b1..e8fdb5d5e 100644 --- a/tests/unit/db/postgres/test_copy_options_csv_unload.py +++ b/tests/unit/db/postgres/test_copy_options_csv_unload.py @@ -15,9 +15,10 @@ def test_postgres_copy_options_csv_minimal_quoting(self): unhandled_hints = set(records_format.hints) fail_if_cant_handle_hint = True mode = CopyOptionsMode.UNLOADING + hints = records_format.validate(fail_if_cant_handle_hint=True) out = postgres_copy_options_csv(unhandled_hints, - records_format.hints, + hints, fail_if_cant_handle_hint, mode) self.assertEqual(out, { @@ -40,7 +41,7 @@ def test_postgres_copy_options_csv_all_quoting(self): mode = CopyOptionsMode.UNLOADING out = postgres_copy_options_csv(unhandled_hints, - records_format.hints, + records_format.validate(fail_if_cant_handle_hint=True), fail_if_cant_handle_hint, mode) self.assertEqual(out, { @@ -64,6 +65,6 @@ def test_postgres_copy_options_csv_no_quoting(self): with self.assertRaises(NotImplementedError): postgres_copy_options_csv(unhandled_hints, - records_format.hints, + records_format.validate(fail_if_cant_handle_hint=True), fail_if_cant_handle_hint, CopyOptionsMode.UNLOADING) diff --git a/tests/unit/db/postgres/test_copy_options_date_output_style.py b/tests/unit/db/postgres/test_copy_options_date_output_style.py index d8ecec007..2d7addd16 100644 --- a/tests/unit/db/postgres/test_copy_options_date_output_style.py +++ b/tests/unit/db/postgres/test_copy_options_date_output_style.py @@ -1,4 +1,5 @@ import unittest +from records_mover.records import DelimitedRecordsFormat from records_mover.db.postgres.copy_options.date_output_style import\ determine_date_output_style @@ -6,15 +7,16 @@ class TestPostgresCopyOptionsDateOutputStyle(unittest.TestCase): def test_determine_output_date_order_style_iso(self): unhandled_hints = set() - hints = { + records_format = DelimitedRecordsFormat(hints={ 'dateformat': 'YYYY-MM-DD', 'timeonlyformat': 'HH24:MI:SS', 'datetimeformattz': 'YYYY-MM-DD HH:MI:SSOF', 'datetimeformat': 'YYYY-MM-DD HH24:MI:SS' - } + }) fail_if_cant_handle_hint = True + validated_hints = records_format.validate(fail_if_cant_handle_hint=fail_if_cant_handle_hint) out = determine_date_output_style(unhandled_hints, - hints, + validated_hints, fail_if_cant_handle_hint) self.assertEqual(out, ('ISO', None)) diff --git a/tests/unit/db/postgres/test_date_order_style.py b/tests/unit/db/postgres/test_date_order_style.py index b88ef22e8..1cbb3d805 100644 --- a/tests/unit/db/postgres/test_date_order_style.py +++ b/tests/unit/db/postgres/test_date_order_style.py @@ -1,4 +1,5 @@ import unittest +from records_mover.records import DelimitedRecordsFormat from records_mover.db.postgres.copy_options.date_input_style import determine_input_date_order_style @@ -48,14 +49,19 @@ def test_determine_date_order_style_(self): ), ] fail_if_cant_handle_hint = True - for hints, expected_result in tests: + for raw_hints, expected_result in tests: + records_format = DelimitedRecordsFormat(hints=raw_hints) if expected_result == NotImplementedError: with self.assertRaises(NotImplementedError): + validated_hints = records_format.\ + validate(fail_if_cant_handle_hint=fail_if_cant_handle_hint) determine_input_date_order_style(unhandled_hints, - hints, + validated_hints, fail_if_cant_handle_hint) else: + validated_hints = records_format.\ + validate(fail_if_cant_handle_hint=fail_if_cant_handle_hint) out = determine_input_date_order_style(unhandled_hints, - hints, + validated_hints, fail_if_cant_handle_hint) self.assertEqual(out, expected_result) diff --git a/tests/unit/db/redshift/base_test_redshift_db_driver.py b/tests/unit/db/redshift/base_test_redshift_db_driver.py index e6c491588..156e72395 100644 --- a/tests/unit/db/redshift/base_test_redshift_db_driver.py +++ b/tests/unit/db/redshift/base_test_redshift_db_driver.py @@ -51,7 +51,7 @@ def load(self, hints, fail_if): processing_instructions.fail_if_cant_handle_hint = fail_if processing_instructions.fail_if_dont_understand = fail_if processing_instructions.fail_if_row_invalid = fail_if - self.mock_records_load_plan.records_format.hints = hints + self.mock_records_load_plan.records_format = DelimitedRecordsFormat(hints=hints) self.mock_records_load_plan.processing_instructions = processing_instructions return self.redshift_db_driver.loader().\ load(schema='myschema', diff --git a/tests/unit/db/redshift/test_loader.py b/tests/unit/db/redshift/test_loader.py index eaacccd18..1bae1fed1 100644 --- a/tests/unit/db/redshift/test_loader.py +++ b/tests/unit/db/redshift/test_loader.py @@ -41,7 +41,7 @@ def test_can_load_this_format_true(self, processing_instructions=mock_processing_instructions) mock_redshift_copy_options.\ assert_called_with(set(), - mock_load_plan.records_format.hints, + mock_load_plan.records_format.validate.return_value, mock_processing_instructions.fail_if_cant_handle_hint, mock_processing_instructions.fail_if_row_invalid, mock_processing_instructions.max_failure_rows) diff --git a/tests/unit/db/redshift/test_records_copy.py b/tests/unit/db/redshift/test_records_copy.py index 466360985..49e97e19f 100644 --- a/tests/unit/db/redshift/test_records_copy.py +++ b/tests/unit/db/redshift/test_records_copy.py @@ -5,21 +5,6 @@ class TestRecordsCopy(unittest.TestCase): - def test_redshift_copy_options_bad_compression_roll_with_it(self): - records_format =\ - DelimitedRecordsFormat(variant='bluelabs', - hints={ - 'compression': 'somethingnew' - }) - unhandled_hints = set(records_format.hints.keys()) - # This isn't in the Enum...for now. - with self.assertRaises(ValueError): - redshift_copy_options(unhandled_hints, - records_format.hints, - fail_if_cant_handle_hint=False, - fail_if_row_invalid=True, - max_failure_rows=0) - def test_redshift_copy_options_encodings(self): tests = { 'UTF16': Encoding.utf16, @@ -35,7 +20,7 @@ def test_redshift_copy_options_encodings(self): }) unhandled_hints = set(records_format.hints.keys()) out = redshift_copy_options(unhandled_hints, - records_format.hints, + records_format.validate(fail_if_cant_handle_hint=True), fail_if_cant_handle_hint=True, fail_if_row_invalid=True, max_failure_rows=0) diff --git a/tests/unit/db/redshift/test_redshift_db_driver_import_christmas_tree.py b/tests/unit/db/redshift/test_redshift_db_driver_import_christmas_tree.py index 85993e3c0..9a4039341 100644 --- a/tests/unit/db/redshift/test_redshift_db_driver_import_christmas_tree.py +++ b/tests/unit/db/redshift/test_redshift_db_driver_import_christmas_tree.py @@ -8,6 +8,8 @@ class TestRedshiftDBDriverImportBlueLabs(BaseTestRedshiftDBDriver): + maxDiff = None + @patch('records_mover.db.redshift.loader.CopyCommand') def test_load_vertica_christmas_tree_unsupported_options_with_fast_warns_1(self, mock_CopyCommand): @@ -15,15 +17,15 @@ def test_load_vertica_christmas_tree_unsupported_options_with_fast_warns_1(self, lines_scanned = self.load(christmas_tree_format_1_hints, fail_if=False) - self.assertCountEqual(mock_warning.mock_calls, - [call("Ignoring hint quoting = 'nonnumeric'"), - call("Ignoring hint record-terminator = '\\x02'")]) + self.assertListEqual(mock_warning.mock_calls, + [call("Ignoring hint quoting = 'nonnumeric'"), + call("Ignoring hint record-terminator = '\\x02'")]) expected_best_effort_args = { 'access_key_id': 'fake_aws_id', 'compression': Compression.lzop, 'data_location': 's3://mybucket/myparent/mychild/_manifest', - 'date_format': 'auto', + 'date_format': 'YYYY-MM-DD', 'encoding': Encoding.utf8, 'delimiter': '\x01', 'escape': True, @@ -48,10 +50,11 @@ def test_load_christmas_tree_unsupported_options_with_fast_warns_2(self, with patch.object(driver_logger, 'warning') as mock_warning: lines_scanned = self.load(christmas_tree_format_2_hints, fail_if=False) - self.assertEqual(mock_warning.mock_calls, - [call("Ignoring hint escape = '@'"), - call("Ignoring hint doublequote = True"), - call("Ignoring hint record-terminator = '\\x02'")]) + self.assertListEqual(mock_warning.mock_calls, + [call("Ignoring hint escape = '@'"), + call("Ignoring hint datetimeformattz = 'HH:MI:SSOF YYYY-MM-DD'"), + call("Ignoring hint doublequote = True"), + call("Ignoring hint record-terminator = '\\x02'")]) expected_best_effort_args = { 'access_key_id': 'fake_aws_id', @@ -60,7 +63,7 @@ def test_load_christmas_tree_unsupported_options_with_fast_warns_2(self, 'date_format': 'MM-DD-YYYY', 'delimiter': '\x01', 'encoding': Encoding.utf8, - 'escape': False, + 'escape': True, 'ignore_header': 0, 'manifest': True, 'max_error': 100000, diff --git a/tests/unit/db/redshift/test_redshift_db_driver_unload.py b/tests/unit/db/redshift/test_redshift_db_driver_unload.py index 43f5a6016..4f0d68475 100644 --- a/tests/unit/db/redshift/test_redshift_db_driver_unload.py +++ b/tests/unit/db/redshift/test_redshift_db_driver_unload.py @@ -3,6 +3,7 @@ christmas_tree_format_1_hints, christmas_tree_format_2_hints) from records_mover.records.delimited.utils import logger as driver_logger +from records_mover.records import DelimitedRecordsFormat from mock import call, patch @@ -21,7 +22,9 @@ def test_unload_to_non_s3(self, mock_text.side_effect = fake_text self.mock_records_unload_plan.processing_instructions.fail_if_dont_understand = True self.mock_records_unload_plan.processing_instructions.fail_if_cant_handle_hint = True - self.mock_records_unload_plan.records_format.hints = bluelabs_format_hints + self.mock_records_unload_plan.records_format =\ + DelimitedRecordsFormat(variant='bluelabs', + hints=bluelabs_format_hints) self.mock_directory.scheme = 'mumble' self.mock_db_engine.execute.return_value.scalar.return_value = 456 rows = self.redshift_db_driver.unloader().\ @@ -57,7 +60,9 @@ def test_unload(self, mock_text.side_effect = fake_text self.mock_records_unload_plan.processing_instructions.fail_if_dont_understand = True self.mock_records_unload_plan.processing_instructions.fail_if_cant_handle_hint = True - self.mock_records_unload_plan.records_format.hints = bluelabs_format_hints + self.mock_records_unload_plan.records_format =\ + DelimitedRecordsFormat(variant='bluelabs', + hints=bluelabs_format_hints) self.mock_directory.scheme = 's3' self.mock_db_engine.execute.return_value.scalar.return_value = 456 rows = self.redshift_db_driver.unloader().\ @@ -92,7 +97,9 @@ def test_unload_christmas_tree_unsupported_options_with_fast_warns_1(self, self.mock_records_unload_plan.processing_instructions.fail_if_dont_understand = False self.mock_records_unload_plan.processing_instructions.fail_if_cant_handle_hint = False - self.mock_records_unload_plan.records_format.hints = christmas_tree_format_1_hints + self.mock_records_unload_plan.records_format =\ + DelimitedRecordsFormat(variant='bluelabs', + hints=christmas_tree_format_1_hints) self.mock_db_engine.execute.return_value.scalar.return_value = 456 rows = self.redshift_db_driver.unloader().\ unload(schema='myschema', @@ -102,8 +109,6 @@ def test_unload_christmas_tree_unsupported_options_with_fast_warns_1(self, self.assertCountEqual(mock_warning.mock_calls, [call("Ignoring hint record-terminator = '\\x02'"), call("Ignoring hint quoting = 'nonnumeric'"), - call("Ignoring hint datetimeformat = None"), - call("Ignoring hint dateformat = None"), call("Ignoring hint header-row = True"), call("Ignoring hint compression = 'LZO'"), call("Did not understand these hints: header-row=True")]) @@ -131,25 +136,28 @@ def test_unload_christmas_tree_unsupported_options_with_fast_warns_2(self, with patch.object(driver_logger, 'warning') as mock_warning: self.mock_records_unload_plan.processing_instructions.fail_if_dont_understand = False self.mock_records_unload_plan.processing_instructions.fail_if_cant_handle_hint = False - - self.mock_records_unload_plan.records_format.hints = christmas_tree_format_2_hints + self.mock_records_unload_plan.records_format =\ + DelimitedRecordsFormat(variant='bluelabs', + hints=christmas_tree_format_2_hints) self.mock_db_engine.execute.return_value.scalar.return_value = 456 rows = self.redshift_db_driver.unloader().\ unload(schema='myschema', table='mytable', unload_plan=self.mock_records_unload_plan, directory=self.mock_directory) - self.assertCountEqual(mock_warning.mock_calls, - [call("Ignoring hint escape = '@'"), - call("Ignoring hint doublequote = True"), - call("Ignoring hint compression = 'BZIP'"), - call("Ignoring hint datetimeformattz = 'HH:MI:SSOF YYYY-MM-DD'"), - call("Ignoring hint record-terminator = '\\x02'"), - call("Ignoring hint datetimeformat = None"), - call("Ignoring hint dateformat = 'MM-DD-YYYY'")]) + self.assertListEqual(mock_warning.mock_calls, + [call("Ignoring hint escape = '@'"), + call("Ignoring hint datetimeformattz = 'HH:MI:SSOF YYYY-MM-DD'"), + call("Ignoring hint record-terminator = '\\x02'"), + call("Ignoring hint doublequote = True"), + call("Ignoring hint compression = 'BZIP'"), + call("Ignoring hint datetimeformattz = " + "'YYYY-MM-DD HH24:MI:SSOF'"), + call("Ignoring hint dateformat = 'MM-DD-YYYY'")]) expected_args = { 'access_key_id': 'fake_aws_id', + 'escape': True, 'add_quotes': True, 'delimiter': '\x01', 'manifest': True, diff --git a/tests/unit/db/vertica/test_vertica_import_options.py b/tests/unit/db/vertica/test_vertica_import_options.py index faa749eb1..56b5a1a62 100644 --- a/tests/unit/db/vertica/test_vertica_import_options.py +++ b/tests/unit/db/vertica/test_vertica_import_options.py @@ -110,11 +110,9 @@ def test_christmas_tree_format_1_permissive(self): 'trailing_nullcols': False, } self.assertDictEqual(options, expected_options) - self.assertCountEqual(mock_warning.mock_calls, - [call("Ignoring hint compression = 'LZO'"), - call("Ignoring hint dateformat = None"), - call("Ignoring hint datetimeformat = None"), - call("Ignoring hint quoting = 'nonnumeric'")]) + self.assertListEqual(mock_warning.mock_calls, + [call("Ignoring hint compression = 'LZO'"), + call("Ignoring hint quoting = 'nonnumeric'")]) self.assertEqual(unhandled_hints, set()) def test_weird_timeonlyformat(self): diff --git a/tests/unit/records/format_hints.py b/tests/unit/records/format_hints.py index a91acdfa9..6c8ac69b0 100644 --- a/tests/unit/records/format_hints.py +++ b/tests/unit/records/format_hints.py @@ -55,9 +55,9 @@ 'doublequote': False, 'escape': '\\', 'encoding': 'UTF8', - 'dateformat': None, + 'dateformat': 'YYYY-MM-DD', 'timeonlyformat': 'HH24:MI:SS', - 'datetimeformat': None, + 'datetimeformat': 'YYYY-MM-DD HH24:MI:SS', 'datetimeformattz': 'YYYY-MM-DD HH:MI:SSOF', 'header-row': True, } @@ -73,7 +73,7 @@ 'encoding': 'UTF8', 'dateformat': 'MM-DD-YYYY', 'timeonlyformat': 'HH24:MI:SS', - 'datetimeformat': None, + 'datetimeformat': 'YYYY-MM-DD HH24:MI:SS', 'datetimeformattz': 'HH:MI:SSOF YYYY-MM-DD', # also not allowed 'header-row': False, } @@ -89,7 +89,7 @@ 'encoding': 'UTF8', 'dateformat': 'DD-MM-YYYY', 'timeonlyformat': 'HH24:MI:SS', - 'datetimeformat': None, + 'datetimeformat': 'YYYY-MM-DD HH24:MI:SS', 'datetimeformattz': 'HH:MI:SSOF YYYY-MM-DD', # also not allowed 'header-row': False, } @@ -105,7 +105,7 @@ 'encoding': 'UTF8', 'dateformat': 'totally_bogus_just_made_this_up', 'timeonlyformat': 'HH24:MI:SS', - 'datetimeformat': None, + 'datetimeformat': 'YYYY-MM-DD HH24:MI:SS', 'datetimeformattz': 'HH:MI:SSOF YYYY-MM-DD', # also not allowed 'header-row': False, } diff --git a/tests/unit/records/targets/test_fileobj.py b/tests/unit/records/targets/test_fileobj.py index 1831f8874..71db867a4 100644 --- a/tests/unit/records/targets/test_fileobj.py +++ b/tests/unit/records/targets/test_fileobj.py @@ -15,7 +15,7 @@ def test_move_from_dataframe_uncompressed_no_header_row(self, mock_prep_df_for_csv_output): mock_fileobj = Mock(name='fileobj') mock_records_format = DelimitedRecordsFormat(hints={ - 'encoding': 'mumble', + 'encoding': 'UTF8', 'compression': None, 'header-row': False, 'quoting': 'all' @@ -39,7 +39,7 @@ def test_move_from_dataframe_uncompressed_no_header_row(self, mode="a", date_format='%Y-%m-%d %H:%M:%S.%f%z', doublequote=False, - encoding='mumble', + encoding='UTF8', escapechar='\\', header=False, line_terminator='\n', @@ -51,7 +51,7 @@ def test_move_from_dataframe_uncompressed_no_header_row(self, mode="a", date_format='%Y-%m-%d %H:%M:%S.%f%z', doublequote=False, - encoding='mumble', + encoding='UTF8', escapechar='\\', header=False, line_terminator='\n', @@ -69,7 +69,7 @@ def test_move_from_dataframe_uncompressed_with_header_row(self, mock_prep_df_for_csv_output): mock_fileobj = Mock(name='fileobj') mock_records_format = DelimitedRecordsFormat(hints={ - 'encoding': 'mumble', + 'encoding': 'UTF8', 'compression': None, 'header-row': True, 'quoting': 'all' @@ -93,7 +93,7 @@ def test_move_from_dataframe_uncompressed_with_header_row(self, mode="a", date_format='%Y-%m-%d %H:%M:%S.%f%z', doublequote=False, - encoding='mumble', + encoding='UTF8', escapechar='\\', header=True, line_terminator='\n', @@ -105,7 +105,7 @@ def test_move_from_dataframe_uncompressed_with_header_row(self, mode="a", date_format='%Y-%m-%d %H:%M:%S.%f%z', doublequote=False, - encoding='mumble', + encoding='UTF8', escapechar='\\', header=False, line_terminator='\n', @@ -123,7 +123,7 @@ def test_move_from_dataframe_compressed_no_header_row(self, mock_prep_df_for_csv_output): mock_fileobj = Mock(name='fileobj') mock_records_format = DelimitedRecordsFormat(hints={ - 'encoding': 'mumble', + 'encoding': 'UTF8', 'compression': 'GZIP', 'header-row': False, 'quoting': 'all' @@ -147,7 +147,7 @@ def test_move_from_dataframe_compressed_no_header_row(self, compression='gzip', date_format='%Y-%m-%d %H:%M:%S.%f%z', doublequote=False, - encoding='mumble', + encoding='UTF8', escapechar='\\', header=False, line_terminator='\n', @@ -160,7 +160,7 @@ def test_move_from_dataframe_compressed_no_header_row(self, compression='gzip', date_format='%Y-%m-%d %H:%M:%S.%f%z', doublequote=False, - encoding='mumble', + encoding='UTF8', escapechar='\\', header=False, line_terminator='\n', @@ -178,7 +178,7 @@ def test_move_from_dataframe_compressed_with_header_row(self, mock_prep_df_for_csv_output): mock_fileobj = Mock(name='fileobj') mock_records_format = DelimitedRecordsFormat(hints={ - 'encoding': 'mumble', + 'encoding': 'UTF8', 'compression': 'GZIP', 'header-row': True, 'quoting': 'all' @@ -202,7 +202,7 @@ def test_move_from_dataframe_compressed_with_header_row(self, compression='gzip', date_format='%Y-%m-%d %H:%M:%S.%f%z', doublequote=False, - encoding='mumble', + encoding='UTF8', escapechar='\\', header=True, line_terminator='\n', @@ -215,7 +215,7 @@ def test_move_from_dataframe_compressed_with_header_row(self, compression='gzip', date_format='%Y-%m-%d %H:%M:%S.%f%z', doublequote=False, - encoding='mumble', + encoding='UTF8', escapechar='\\', header=False, line_terminator='\n', diff --git a/tests/unit/records/test_hints.py b/tests/unit/records/test_hints.py index 2181bfd89..3f228f6cc 100644 --- a/tests/unit/records/test_hints.py +++ b/tests/unit/records/test_hints.py @@ -1,5 +1,5 @@ from records_mover.records.delimited.sniff import ( - sniff_hints, sniff_hints_from_fileobjs, sniff_encoding_hint, BootstrappingRecordsHints + sniff_hints, sniff_hints_from_fileobjs, sniff_encoding_hint, PartialRecordsHints ) from mock import MagicMock, patch import io @@ -144,23 +144,27 @@ def test_sniff_hints_from_fileobjs(self, mock_fileobj = MagicMock(name='fileobj') mock_fileobj.closed = False mock_fileobjs = [mock_fileobj] - mock_initial_hints: BootstrappingRecordsHints = { + mock_initial_hints: PartialRecordsHints = { 'field-delimiter': ',' } mock_streaming_engine = mock_stream_csv.return_value.__enter__.return_value._engine mock_io.TextIOWrapper.return_value.newlines = '\n' mock_streaming_engine.compression = 'gzip' mock_streaming_engine.encoding = 'utf-8' + mock_sniffer = mock_csv.Sniffer.return_value + mock_sniff_results = mock_sniffer.sniff.return_value + mock_sniff_results.doublequote = True + mock_sniffer.has_header.return_value = False out = sniff_hints_from_fileobjs(fileobjs=mock_fileobjs, initial_hints=mock_initial_hints) self.assertEqual(out, { 'compression': 'GZIP', - 'doublequote': mock_csv.Sniffer().sniff().doublequote, + 'doublequote': True, 'encoding': 'UTF8', 'quotechar': mock_csv.Sniffer().sniff().quotechar, 'quoting': 'minimal', 'field-delimiter': ',', - 'header-row': mock_csv.Sniffer().has_header(), + 'header-row': False, 'record-terminator': str(mock_io.TextIOWrapper.return_value.newlines), }) @@ -189,7 +193,7 @@ def test_sniff_hints_from_fileobjs_encodings(self): csv_bytes = csv.encode(python_encoding, errors='replace') with io.BytesIO(csv_bytes) as fileobj: fileobjs = [fileobj] - initial_hints: BootstrappingRecordsHints = { + initial_hints: PartialRecordsHints = { 'field-delimiter': ',' } if 'initial' in test_details: diff --git a/tests/unit/records/test_pandas_read_csv_options.py b/tests/unit/records/test_pandas_read_csv_options.py index ce9e25ba1..2e26b4786 100644 --- a/tests/unit/records/test_pandas_read_csv_options.py +++ b/tests/unit/records/test_pandas_read_csv_options.py @@ -57,38 +57,40 @@ def test_pandas_read_csv_options_bluelabs(self): self.assertEqual(expected, actual) self.assertFalse(unhandled_hints) - def test_pandas_read_csv_options_bleulabs(self): - expected = { - 'dayfirst': True, - 'compression': 'gzip', - 'delimiter': ',', - 'doublequote': False, - 'encoding': 'UTF8', - 'engine': 'python', - 'error_bad_lines': True, - 'escapechar': '\\', - 'header': None, - 'prefix': 'untitled_', - 'quotechar': '"', - 'quoting': 3, - 'warn_bad_lines': True, - 'parse_dates': [0, 1, 2, 3], - } - processing_instructions = ProcessingInstructions() - hints = bluelabs_format_hints.copy() - hints.update({ - 'dateformat': 'DD-MM-YYYY', - 'datetimeformattz': 'DD-MM-YYYY HH24:MIOF', - 'datetimeformat': 'DD-MM-YYYY HH24:MI', - }) - records_format = DelimitedRecordsFormat(hints=hints) - unhandled_hints = set(records_format.hints) - actual = pandas_read_csv_options(records_format, - self.records_schema, - unhandled_hints, - processing_instructions) - self.assertEqual(expected, actual) - self.assertFalse(unhandled_hints) + # MM-DD not yet fully supported - see https://app.asana.com/0/1128138765527694/1173779659264666 + # + # def test_pandas_read_csv_options_bleulabs(self): + # expected = { + # 'dayfirst': True, + # 'compression': 'gzip', + # 'delimiter': ',', + # 'doublequote': False, + # 'encoding': 'UTF8', + # 'engine': 'python', + # 'error_bad_lines': True, + # 'escapechar': '\\', + # 'header': None, + # 'prefix': 'untitled_', + # 'quotechar': '"', + # 'quoting': 3, + # 'warn_bad_lines': True, + # 'parse_dates': [0, 1, 2, 3], + # } + # processing_instructions = ProcessingInstructions() + # hints = bluelabs_format_hints.copy() + # hints.update({ + # 'dateformat': 'DD-MM-YYYY', + # 'datetimeformattz': 'DD-MM-YYYY HH24:MIOF', + # 'datetimeformat': 'DD-MM-YYYY HH24:MI', + # }) + # records_format = DelimitedRecordsFormat(hints=hints) + # unhandled_hints = set(records_format.hints) + # actual = pandas_read_csv_options(records_format, + # self.records_schema, + # unhandled_hints, + # processing_instructions) + # self.assertEqual(expected, actual) + # self.assertFalse(unhandled_hints) def test_pandas_read_csv_options_inconsistent_date_format(self): processing_instructions = ProcessingInstructions() diff --git a/tests/unit/records/test_pandas_to_csv_options_christmas_tree.py b/tests/unit/records/test_pandas_to_csv_options_christmas_tree.py index eb8ee24ba..770b11f03 100644 --- a/tests/unit/records/test_pandas_to_csv_options_christmas_tree.py +++ b/tests/unit/records/test_pandas_to_csv_options_christmas_tree.py @@ -28,10 +28,8 @@ def test_pandas_to_csv_options_christmas_tree_format_1(self): with patch.object(driver_logger, 'warning') as mock_warning: actual = pandas_to_csv_options(records_format, unhandled_hints, processing_instructions) self.assertEqual(expected, actual) - self.assertCountEqual(mock_warning.mock_calls, - [call("Ignoring hint datetimeformat = None"), - call("Ignoring hint datetimeformattz = 'YYYY-MM-DD HH:MI:SSOF'"), - call("Ignoring hint compression = 'LZO'")]) + self.assertListEqual(mock_warning.mock_calls, + [call("Ignoring hint compression = 'LZO'")]) self.assertFalse(unhandled_hints) def test_pandas_to_csv_options_christmas_tree_format_2(self): @@ -40,7 +38,7 @@ def test_pandas_to_csv_options_christmas_tree_format_2(self): 'date_format': '%m-%d-%Y %H:%M:%S.%f%z', 'doublequote': True, 'encoding': 'UTF8', - 'escapechar': '@', + 'escapechar': '\\', 'header': False, 'line_terminator': '\x02', 'quotechar': '"', @@ -55,9 +53,11 @@ def test_pandas_to_csv_options_christmas_tree_format_2(self): actual = pandas_to_csv_options(records_format, unhandled_hints, processing_instructions) self.assertEqual(expected, actual) self.\ - assertCountEqual(mock_warning.mock_calls, - [call("Ignoring hint datetimeformat = None"), - call("Ignoring hint datetimeformattz = 'HH:MI:SSOF YYYY-MM-DD'")]) + assertListEqual(mock_warning.mock_calls, + [call("Ignoring hint escape = '@'"), + call("Ignoring hint datetimeformattz = 'HH:MI:SSOF YYYY-MM-DD'"), + call("Ignoring hint datetimeformattz = 'YYYY-MM-DD HH24:MI:SSOF'"), + call("Ignoring hint datetimeformat = 'YYYY-MM-DD HH24:MI:SS'")]) self.assertFalse(unhandled_hints) def test_pandas_to_csv_options_christmas_tree_format_3(self): @@ -66,10 +66,11 @@ def test_pandas_to_csv_options_christmas_tree_format_3(self): 'date_format': '%d-%m-%Y %H:%M:%S.%f%z', 'doublequote': True, 'encoding': 'UTF8', - 'escapechar': '@', + 'escapechar': '\\', 'header': False, 'line_terminator': '\x02', 'quotechar': '"', + 'quoting': 0, 'sep': '\x01', } processing_instructions =\ @@ -79,19 +80,24 @@ def test_pandas_to_csv_options_christmas_tree_format_3(self): with patch.object(driver_logger, 'warning') as mock_warning: actual = pandas_to_csv_options(records_format, unhandled_hints, processing_instructions) self.assertEqual(expected, actual) - self.assertCountEqual(mock_warning.mock_calls, - [call("Ignoring hint datetimeformat = None"), - call("Ignoring hint datetimeformattz = 'HH:MI:SSOF YYYY-MM-DD'"), - call("Ignoring hint quoting = " - "'some_future_option_not_supported_now'")]) + self.assertListEqual(mock_warning.mock_calls, + [call("Ignoring hint quoting = " + "'some_future_option_not_supported_now'"), + call("Ignoring hint escape = '@'"), + call("Ignoring hint datetimeformattz = 'HH:MI:SSOF YYYY-MM-DD'"), + call("Ignoring hint datetimeformattz = " + "'YYYY-MM-DD HH24:MI:SSOF'"), + call("Ignoring hint datetimeformat = 'YYYY-MM-DD HH24:MI:SS'")]) self.assertFalse(unhandled_hints) def test_pandas_to_csv_options_christmas_tree_format_4(self): expected = { 'compression': 'bz2', + 'date_format': '%Y-%m-%d %H:%M:%S.%f%z', + 'quoting': 0, 'doublequote': True, 'encoding': 'UTF8', - 'escapechar': '@', + 'escapechar': '\\', 'header': False, 'line_terminator': '\x02', 'quotechar': '"', @@ -104,11 +110,11 @@ def test_pandas_to_csv_options_christmas_tree_format_4(self): with patch.object(driver_logger, 'warning') as mock_warning: actual = pandas_to_csv_options(records_format, unhandled_hints, processing_instructions) self.assertEqual(expected, actual) - self.assertCountEqual(mock_warning.mock_calls, - [call("Ignoring hint datetimeformat = None"), - call("Ignoring hint dateformat = " - "'totally_bogus_just_made_this_up'"), - call("Ignoring hint datetimeformattz = 'HH:MI:SSOF YYYY-MM-DD'"), - call("Ignoring hint quoting = " - "'some_future_option_not_supported_now'")]) + self.assertListEqual(mock_warning.mock_calls, + [call("Ignoring hint quoting = " + "'some_future_option_not_supported_now'"), + call("Ignoring hint escape = '@'"), + call("Ignoring hint dateformat = " + "'totally_bogus_just_made_this_up'"), + call("Ignoring hint datetimeformattz = 'HH:MI:SSOF YYYY-MM-DD'")]) self.assertFalse(unhandled_hints) diff --git a/tests/unit/records/test_pandas_to_csv_options_dateformats.py b/tests/unit/records/test_pandas_to_csv_options_dateformats.py index ae6386350..62a88de1c 100644 --- a/tests/unit/records/test_pandas_to_csv_options_dateformats.py +++ b/tests/unit/records/test_pandas_to_csv_options_dateformats.py @@ -1,110 +1,120 @@ import unittest -from records_mover.records.pandas import pandas_to_csv_options -from records_mover.records.processing_instructions import ProcessingInstructions -from records_mover.records.records_format import DelimitedRecordsFormat +# from records_mover.records.pandas import pandas_to_csv_options +# from records_mover.records.processing_instructions import ProcessingInstructions +# from records_mover.records.records_format import DelimitedRecordsFormat class TestPandasToCsvOptionsDateformats(unittest.TestCase): - def test_pandas_dateformat_YYYY_MM_DD_no_tz(self): - expected = { - 'compression': 'gzip', - 'date_format': '%Y-%m-%d %H:%M:%S.%f', - 'doublequote': False, - 'encoding': 'UTF8', - 'escapechar': '\\', - 'header': False, - 'line_terminator': '\n', - 'quotechar': '"', - 'quoting': 3, - 'sep': ',', - } - processing_instructions = ProcessingInstructions() - records_format =\ - DelimitedRecordsFormat(variant='bluelabs', - hints={ - 'dateformat': 'YYYY-MM-DD', - 'datetimeformattz': 'YYYY-MM-DD HH24:MI:SS', - 'datetimeformat': 'YYYY-MM-DD HH24:MI:SS', - }) - unhandled_hints = set(records_format.hints) - actual = pandas_to_csv_options(records_format, unhandled_hints, processing_instructions) - self.assertEqual(expected, actual) - self.assertFalse(unhandled_hints) + pass + # YYYY-MM-DD HH24:MI:SS is not valid for datetimeformattz + # + # def test_pandas_dateformat_YYYY_MM_DD_no_tz(self): + # expected = { + # 'compression': 'gzip', + # 'date_format': '%Y-%m-%d %H:%M:%S.%f', + # 'doublequote': False, + # 'encoding': 'UTF8', + # 'escapechar': '\\', + # 'header': False, + # 'line_terminator': '\n', + # 'quotechar': '"', + # 'quoting': 3, + # 'sep': ',', + # } + # processing_instructions = ProcessingInstructions() + # records_format =\ + # DelimitedRecordsFormat(variant='bluelabs', + # hints={ + # 'dateformat': 'YYYY-MM-DD', + # 'datetimeformattz': 'YYYY-MM-DD HH24:MI:SS', + # 'datetimeformat': 'YYYY-MM-DD HH24:MI:SS', + # }) + # unhandled_hints = set(records_format.hints) + # actual = pandas_to_csv_options(records_format, unhandled_hints, processing_instructions) + # self.assertEqual(expected, actual) + # self.assertFalse(unhandled_hints) - def test_pandas_dateformat_MM_DD_YYYY_no_tz(self): - expected = { - 'compression': 'gzip', - 'date_format': '%m-%d-%Y %H:%M:%S.%f', - 'doublequote': False, - 'encoding': 'UTF8', - 'escapechar': '\\', - 'header': False, - 'line_terminator': '\n', - 'quotechar': '"', - 'quoting': 3, - 'sep': ',', - } - processing_instructions = ProcessingInstructions() - records_format =\ - DelimitedRecordsFormat(variant='bluelabs', - hints={ - 'dateformat': 'MM-DD-YYYY', - 'datetimeformattz': 'MM-DD-YYYY HH24:MI:SS', - 'datetimeformat': 'MM-DD-YYYY HH24:MI:SS', - }) - unhandled_hints = set(records_format.hints) - actual = pandas_to_csv_options(records_format, unhandled_hints, processing_instructions) - self.assertEqual(expected, actual) - self.assertFalse(unhandled_hints) + # MM-DD-YYYY HH24:MI:SS datetimeformat not supported in spec + # + # def test_pandas_dateformat_MM_DD_YYYY_no_tz(self): + # expected = { + # 'compression': 'gzip', + # 'date_format': '%m-%d-%Y %H:%M:%S.%f', + # 'doublequote': False, + # 'encoding': 'UTF8', + # 'escapechar': '\\', + # 'header': False, + # 'line_terminator': '\n', + # 'quotechar': '"', + # 'quoting': 3, + # 'sep': ',', + # } + # processing_instructions = ProcessingInstructions() + # records_format =\ + # DelimitedRecordsFormat(variant='bluelabs', + # hints={ + # 'dateformat': 'MM-DD-YYYY', + # 'datetimeformattz': 'MM-DD-YYYY HH24:MI:SS', + # 'datetimeformat': 'MM-DD-YYYY HH24:MI:SS', + # }) + # unhandled_hints = set(records_format.hints) + # actual = pandas_to_csv_options(records_format, unhandled_hints, processing_instructions) + # self.assertEqual(expected, actual) + # self.assertFalse(unhandled_hints) - def test_pandas_dateformat_DD_MM_YYYY_no_tz(self): - expected = { - 'compression': 'gzip', - 'date_format': '%d-%m-%Y %H:%M:%S.%f', - 'doublequote': False, - 'encoding': 'UTF8', - 'escapechar': '\\', - 'header': False, - 'line_terminator': '\n', - 'quotechar': '"', - 'quoting': 3, - 'sep': ',', - } - processing_instructions = ProcessingInstructions() - records_format =\ - DelimitedRecordsFormat(variant='bluelabs', - hints={ - 'dateformat': 'DD-MM-YYYY', - 'datetimeformattz': 'DD-MM-YYYY HH24:MI:SS', - 'datetimeformat': 'DD-MM-YYYY HH24:MI:SS', - }) - unhandled_hints = set(records_format.hints) - actual = pandas_to_csv_options(records_format, unhandled_hints, processing_instructions) - self.assertEqual(expected, actual) - self.assertFalse(unhandled_hints) + # DD-MM format not yet fully supported - see + # https://app.asana.com/0/1128138765527694/1173779659264666 + # + # def test_pandas_dateformat_DD_MM_YYYY_no_tz(self): + # expected = { + # 'compression': 'gzip', + # 'date_format': '%d-%m-%Y %H:%M:%S.%f', + # 'doublequote': False, + # 'encoding': 'UTF8', + # 'escapechar': '\\', + # 'header': False, + # 'line_terminator': '\n', + # 'quotechar': '"', + # 'quoting': 3, + # 'sep': ',', + # } + # processing_instructions = ProcessingInstructions() + # records_format =\ + # DelimitedRecordsFormat(variant='bluelabs', + # hints={ + # 'dateformat': 'DD-MM-YYYY', + # 'datetimeformattz': 'DD-MM-YYYY HH24:MI:SS', + # 'datetimeformat': 'DD-MM-YYYY HH24:MI:SS', + # }) + # unhandled_hints = set(records_format.hints) + # actual = pandas_to_csv_options(records_format, unhandled_hints, processing_instructions) + # self.assertEqual(expected, actual) + # self.assertFalse(unhandled_hints) - def test_pandas_dateformat_MMsDDsYY_no_tz(self): - expected = { - 'compression': 'gzip', - 'date_format': '%m/%d/%y %H:%M:%S.%f', - 'doublequote': False, - 'encoding': 'UTF8', - 'escapechar': '\\', - 'header': False, - 'line_terminator': '\n', - 'quotechar': '"', - 'quoting': 3, - 'sep': ',', - } - processing_instructions = ProcessingInstructions() - records_format =\ - DelimitedRecordsFormat(variant='bluelabs', - hints={ - 'dateformat': 'MM/DD/YY', - 'datetimeformattz': 'MM/DD/YY HH24:MI:SS', - 'datetimeformat': 'MM/DD/YY HH24:MI:SS', - }) - unhandled_hints = set(records_format.hints) - actual = pandas_to_csv_options(records_format, unhandled_hints, processing_instructions) - self.assertEqual(expected, actual) - self.assertFalse(unhandled_hints) + # MM/DD/YY HH24:MI:SS not supported + # + # def test_pandas_dateformat_MMsDDsYY_no_tz(self): + # expected = { + # 'compression': 'gzip', + # 'date_format': '%m/%d/%y %H:%M:%S.%f', + # 'doublequote': False, + # 'encoding': 'UTF8', + # 'escapechar': '\\', + # 'header': False, + # 'line_terminator': '\n', + # 'quotechar': '"', + # 'quoting': 3, + # 'sep': ',', + # } + # processing_instructions = ProcessingInstructions() + # records_format =\ + # DelimitedRecordsFormat(variant='bluelabs', + # hints={ + # 'dateformat': 'MM/DD/YY', + # 'datetimeformattz': 'MM/DD/YY HH24:MI:SS', + # 'datetimeformat': 'MM/DD/YY HH24:MI:SS', + # }) + # unhandled_hints = set(records_format.hints) + # actual = pandas_to_csv_options(records_format, unhandled_hints, processing_instructions) + # self.assertEqual(expected, actual) + # self.assertFalse(unhandled_hints) diff --git a/tests/unit/url/test_resolver.py b/tests/unit/url/test_resolver.py index 939569cc7..b444d74b0 100644 --- a/tests/unit/url/test_resolver.py +++ b/tests/unit/url/test_resolver.py @@ -16,6 +16,16 @@ def __init__(self, class TestUrlResolver(unittest.TestCase): + def tearDown(self): + if 'dummy' in directory_url_ctors: + del directory_url_ctors['dummy'] + if 'dummy' in file_url_ctors: + del file_url_ctors['dummy'] + if 'needy' in directory_url_ctors: + del directory_url_ctors['needy'] + if 'needy' in file_url_ctors: + del file_url_ctors['needy'] + def setUp(self): self.mock_DummyFileUrl = Mock(name='DummyFileUrl', spec=BaseFileUrl) self.mock_DummyFileUrl.return_value = Mock(name='dummyfileurl', diff --git a/tests/unit/url/test_resolver_misconfigured.py b/tests/unit/url/test_resolver_misconfigured.py new file mode 100644 index 000000000..99b440a06 --- /dev/null +++ b/tests/unit/url/test_resolver_misconfigured.py @@ -0,0 +1,39 @@ +from records_mover.url.resolver import file_url_ctors, directory_url_ctors, UrlResolver +from records_mover.url.base import BaseFileUrl, BaseDirectoryUrl +import unittest + + +class SimpleFileUrl(BaseFileUrl): + def __init__(self, url): + pass + + +class SimpleDirectoryUrl(BaseDirectoryUrl): + def __init__(self, url): + pass + + +class TestUrlResolverMisconfigured(unittest.TestCase): + def tearDown(self): + if 'simple' in directory_url_ctors: + del directory_url_ctors['simple'] + if 'simple' in file_url_ctors: + del file_url_ctors['simple'] + + def test_file_url_misconfigured(self): + resolver = UrlResolver(boto3_session_getter=lambda: None, + gcs_client_getter=lambda: None, + gcp_credentials_getter=lambda: None) + directory_url_ctors['simple'] = SimpleFileUrl + simple_url = 'simple://foo/bar/baz?a=b&d=f' + with self.assertRaises(TypeError): + resolver.directory_url(simple_url) + + def test_directory_url_misconfigured(self): + resolver = UrlResolver(boto3_session_getter=lambda: None, + gcs_client_getter=lambda: None, + gcp_credentials_getter=lambda: None) + file_url_ctors['simple'] = SimpleDirectoryUrl + simple_url = 'simple://foo/bar/baz?a=b&d=f' + with self.assertRaises(TypeError): + resolver.file_url(simple_url) diff --git a/tests/unit/url/test_resolver_no_creds.py b/tests/unit/url/test_resolver_no_creds.py new file mode 100644 index 000000000..b0a2e447f --- /dev/null +++ b/tests/unit/url/test_resolver_no_creds.py @@ -0,0 +1,50 @@ +from records_mover.url.resolver import file_url_ctors, UrlResolver +from records_mover.url.base import BaseFileUrl +from mock import Mock +import unittest + + +class NeedyFileUrl(BaseFileUrl): + def __init__(self, + url, + boto3_session, + gcs_client, + gcp_credentials): + self.boto3_session = boto3_session + self.gcs_client = gcs_client + self.gcp_credentials = gcp_credentials + + +class TestUrlResolverNoCreds(unittest.TestCase): + def test_NeedyFileUrl_with_no_boto3(self): + mock_gcs_client = Mock(name='gcs_client') + mock_gcp_credentials = Mock(name='gcp_credentials') + resolver = UrlResolver(boto3_session_getter=lambda: None, + gcs_client_getter=lambda: mock_gcs_client, + gcp_credentials_getter=lambda: mock_gcp_credentials) + file_url_ctors['needy'] = NeedyFileUrl + needy_url = 'needy://foo/bar/baz?a=b&d=f' + with self.assertRaises(EnvironmentError): + resolver.file_url(needy_url) + + def test_NeedyFileUrl_with_no_gcs_client(self): + mock_boto3_session = Mock(name='boto3_session') + mock_gcp_credentials = Mock(name='gcp_credentials') + resolver = UrlResolver(boto3_session_getter=lambda: mock_boto3_session, + gcs_client_getter=lambda: None, + gcp_credentials_getter=lambda: mock_gcp_credentials) + file_url_ctors['needy'] = NeedyFileUrl + needy_url = 'needy://foo/bar/baz?a=b&d=f' + with self.assertRaises(EnvironmentError): + resolver.file_url(needy_url) + + def test_NeedyFileUrl_with_no_gcp_credentials(self): + mock_gcs_client = Mock(name='gcs_client') + mock_boto3_session = Mock(name='boto3_session') + resolver = UrlResolver(boto3_session_getter=lambda: mock_boto3_session, + gcs_client_getter=lambda: mock_gcs_client, + gcp_credentials_getter=lambda: None) + file_url_ctors['needy'] = NeedyFileUrl + needy_url = 'needy://foo/bar/baz?a=b&d=f' + with self.assertRaises(EnvironmentError): + resolver.file_url(needy_url)