Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor CLI hint code and types, more CLI hint arguments #66

Merged
merged 30 commits into from
Jun 2, 2020
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
5c04553
Give Hint class a description field and move in existing values
vinceatbluelabs May 23, 2020
1e17cc9
Initial work towards using Hint objects for CLI generation
vinceatbluelabs May 23, 2020
de1eb43
Unratchet temporarily
vinceatbluelabs May 23, 2020
53acc55
Lazily import JsonSchemaDocument
vinceatbluelabs May 23, 2020
04ac539
Drop unneeded import
vinceatbluelabs May 24, 2020
72fbb2e
Get rid of records/job/hints.py entirely
vinceatbluelabs May 24, 2020
d2852a3
Limit bootstrapping parameters
vinceatbluelabs May 24, 2020
d6a46c4
Add TypedRecordsHints type
vinceatbluelabs May 24, 2020
4b5315b
Reflect more realistic input of raw hints
vinceatbluelabs May 24, 2020
1be53a9
Validate hints after they come in from a user as initial hints.
vinceatbluelabs May 25, 2020
1b1c617
Introduce non-total TypedDict of hints
vinceatbluelabs May 25, 2020
af2967e
Fix tests
vinceatbluelabs May 25, 2020
fbe802d
Fix mypy issues
vinceatbluelabs May 26, 2020
0974b1f
flake8 fixes
vinceatbluelabs May 26, 2020
f1c51ce
Convert more hint code
vinceatbluelabs May 26, 2020
c04d857
Convert more uses of record_mover.hints
vinceatbluelabs May 26, 2020
84d74fb
Drop BootstrappingRecordsHints
vinceatbluelabs May 27, 2020
d70fe31
Standardize on PartialRecordsHints name
vinceatbluelabs May 27, 2020
9dd54bb
Move more code to PartialRecordsHints
vinceatbluelabs May 27, 2020
2b32314
Convert over one more use of PartialRecordsHints
vinceatbluelabs May 27, 2020
bf075b5
flake8 fixes
vinceatbluelabs May 28, 2020
2a987e9
Fix flake8 issues
vinceatbluelabs May 28, 2020
1481473
Merge remote-tracking branch 'origin/master' into refactor_cli_hints
vinceatbluelabs May 28, 2020
09a8266
Increase coverage
vinceatbluelabs May 28, 2020
e47454b
Unratchet
vinceatbluelabs May 30, 2020
30ee0ae
Remove unused import
vinceatbluelabs May 30, 2020
7d16295
Handle --header-row and --no_header_row correctly
vinceatbluelabs May 30, 2020
021d008
Remove dead code
vinceatbluelabs May 30, 2020
ee1ea55
Fix flake8 issue
vinceatbluelabs May 30, 2020
2262644
Merge remote-tracking branch 'origin/master' into refactor_cli_hints
vinceatbluelabs May 30, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 43 additions & 8 deletions records_mover/records/delimited/hint.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,37 @@
from typing_inspect import is_literal_type, get_args
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's teach the Hint type to have descriptions and be able to generate JSON schema information!

from abc import ABCMeta, abstractmethod
from .types import HintName, RecordsHints
from records_mover.utils.json_schema import JsonSchemaDocument
from typing import TypeVar, Generic, Type, List


HintT = TypeVar('HintT')


class Hint(Generic[HintT], metaclass=ABCMeta):
    """Abstract base for a single records hint whose value has type HintT.

    Holds the hint's name, its default value and a human-readable
    description.  Subclasses provide the validation logic and the JSON
    schema describing acceptable values.
    """

    def __init__(self,
                 hint_name: HintName,
                 default: HintT,
                 description: str) -> None:
        """Store the hint's name, default value and description."""
        self.hint_name = hint_name
        self.description = description
        self.default = default

    @abstractmethod
    def validate(self,
                 hints: RecordsHints,
                 fail_if_cant_handle_hint: bool) -> HintT:
        """Return this hint's value, of type HintT, derived from ``hints``.

        How ``fail_if_cant_handle_hint`` is honored is defined by each
        concrete subclass.
        """

    @abstractmethod
    def json_schema_document(self) -> JsonSchemaDocument:
        """Return the JsonSchemaDocument describing this hint."""


class StringHint(Hint[str]):
def __init__(self,
hint_name: HintName,
default: str) -> None:
self.default = default
self.hint_name = hint_name
def json_schema_document(self) -> JsonSchemaDocument:
return JsonSchemaDocument('string',
description=self.description)

def validate(self,
hints: RecordsHints,
Expand All @@ -44,12 +55,36 @@ class LiteralHint(Hint[LiteralHintT]):
def __init__(self,
type_: Type[LiteralHintT],
hint_name: HintName,
default: LiteralHintT) -> None:
default: LiteralHintT,
description: str) -> None:
assert is_literal_type(type_), f"{hint_name} is not a Literal[]"
self.default = default
self.type_ = type_
self.hint_name = hint_name
self.valid_values: List[LiteralHintT] = list(get_args(type_))
super().__init__(hint_name=hint_name,
default=default,
description=description)

def json_schema_document(self) -> JsonSchemaDocument:
    """Build the JSON schema document for this literal-valued hint.

    The schema's type list contains the JSON schema type name of every
    entry in ``self.valid_values`` (each name once, in sorted order),
    its enum is ``self.valid_values`` and its description is
    ``self.description``.

    Raises:
        KeyError: if a valid value is not a bool, str or None.
    """
    json_schema_types = {
        bool: 'boolean',
        str: 'string',
        # Even though Python prints the word NoneType in many
        # error messages, NoneType is not an identifier in
        # Python. It's not in builtins. You can only reach it with
        # type(None).
        #
        # https://realpython.com/null-in-python/
        type(None): 'null',
    }

    # De-duplicate through a set, then sort: iteration order of a set
    # of strings changes between interpreter runs (hash randomization),
    # which would make the generated schema differ from run to run.
    type_names = sorted({
        json_schema_types[type(valid_value)]
        for valid_value in self.valid_values
    })

    return JsonSchemaDocument(type_names,
                              enum=self.valid_values,
                              description=self.description)

def validate(self,
hints: RecordsHints,
Expand Down
68 changes: 53 additions & 15 deletions records_mover/records/delimited/hints.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,18 @@
from .hint import LiteralHint, StringHint
from .hint import LiteralHint, StringHint, Hint
from .types import (
HintHeaderRow, HintCompression, HintQuoting,
HintDoublequote, HintEscape, HintEncoding, HintDateFormat, HintTimeOnlyFormat,
HintDateTimeFormatTz, HintDateTimeFormat
)
from enum import Enum
import logging


logger = logging.getLogger(__name__)


class Hints:
class Hints(Enum):
value: Hint
# mypy gives this when we pass the HintBlahBlah aliases in as an
# argument here:
#
Expand All @@ -19,34 +21,70 @@ class Hints:
# Nonetheless, the validation works.
datetimeformattz = LiteralHint[HintDateTimeFormatTz](HintDateTimeFormatTz, # type: ignore
"datetimeformattz",
"YYYY-MM-DD HH24:MI:SSOF")
"YYYY-MM-DD HH24:MI:SSOF",
description=("Format used to write "
"'datetimetz' values"))
datetimeformat = LiteralHint[HintDateTimeFormat](HintDateTimeFormat, # type: ignore
"datetimeformat",
default="YYYY-MM-DD HH24:MI:SS")
default="YYYY-MM-DD HH24:MI:SS",
description=("Format used to write "
"'datetime' values"))
compression = LiteralHint[HintCompression](HintCompression, # type: ignore
'compression',
default=None)
default=None,
description='Compression type of the file.')
quoting = LiteralHint[HintQuoting](HintQuoting, # type: ignore
'quoting',
default='minimal')
default='minimal',
description=('How quotes are applied to individual fields. '
'all: quote all fields. '
'minimal: quote only fields that contain '
'ambiguous characters (the '
'delimiter, the escape character, or a line '
'terminator). '
'default: never quote fields.'))
escape = LiteralHint[HintEscape](HintEscape, # type: ignore
'escape',
default='\\')
default='\\',
description="Character used to escape strings")
encoding = LiteralHint[HintEncoding](HintEncoding, # type: ignore
'encoding',
default='UTF8')
default='UTF8',
description="Text encoding of file")
dateformat = LiteralHint[HintDateFormat](HintDateFormat, # type: ignore
'dateformat',
default='YYYY-MM-DD')
default='YYYY-MM-DD',
description=("Format used to write "
"'date' values"))
timeonlyformat = LiteralHint[HintTimeOnlyFormat](HintTimeOnlyFormat, # type: ignore
'timeonlyformat',
default="HH24:MI:SS")
default="HH24:MI:SS",
description=("Format used to write "
"'time' values"))
# https://docs.python.org/3/library/csv.html#csv.Dialect.doublequote
doublequote = LiteralHint[HintDoublequote](HintDoublequote, # type: ignore
'doublequote',
default=False)
default=False,
description=('Controls how instances of quotechar '
'appearing inside a field should '
'themselves be quoted. When True, the '
'character is doubled. When False, the '
'escapechar is used as a prefix to the '
'quotechar.'))
header_row = LiteralHint[HintHeaderRow](HintHeaderRow, # type: ignore
'header-row',
default=True)
quotechar = StringHint('quotechar', default='"')
record_terminator = StringHint('record-terminator', default='\n')
field_delimiter = StringHint('field-delimiter', default=',')
default=True,
description=('True if a header row is provided in '
'the delimited files.'))
# https://docs.python.org/3/library/csv.html#csv.Dialect.quotechar
quotechar = StringHint('quotechar',
default='"',
description=('A one-character string used to quote fields containing '
'special characters, such as the delimiter or quotechar, '
'or which contain new-line characters.'))
record_terminator = StringHint('record-terminator',
default='\n',
description='String used to close out individual rows of data.')
field_delimiter = StringHint('field-delimiter',
default=',',
description='Character used between fields.')
26 changes: 13 additions & 13 deletions records_mover/records/delimited/validated_records_hints.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,17 +33,17 @@ def v(hint: Hint[T]) -> T:
return hint.validate(hints, fail_if_cant_handle_hint)

return ValidatedRecordsHints(
header_row=v(Hints.header_row),
field_delimiter=v(Hints.field_delimiter),
compression=v(Hints.compression),
record_terminator=v(Hints.record_terminator),
quoting=v(Hints.quoting),
quotechar=v(Hints.quotechar),
doublequote=v(Hints.doublequote),
escape=v(Hints.escape),
encoding=v(Hints.encoding),
dateformat=v(Hints.dateformat),
timeonlyformat=v(Hints.timeonlyformat),
datetimeformattz=v(Hints.datetimeformattz),
datetimeformat=v(Hints.datetimeformat),
header_row=v(Hints.header_row.value),
field_delimiter=v(Hints.field_delimiter.value),
compression=v(Hints.compression.value),
record_terminator=v(Hints.record_terminator.value),
quoting=v(Hints.quoting.value),
quotechar=v(Hints.quotechar.value),
doublequote=v(Hints.doublequote.value),
escape=v(Hints.escape.value),
encoding=v(Hints.encoding.value),
dateformat=v(Hints.dateformat.value),
timeonlyformat=v(Hints.timeonlyformat.value),
datetimeformattz=v(Hints.datetimeformattz.value),
datetimeformat=v(Hints.datetimeformat.value),
)
55 changes: 12 additions & 43 deletions records_mover/records/job/hints.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,15 @@
"""Defines hints supported by the job config parser."""
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These are the duplicative descriptions of hints, which are no longer needed!

from records_mover.records.delimited.hints import Hints
from ...utils.json_schema import JsonParameter, JsonSchemaDocument
from typing import Optional


SUPPORTED_HINT_NAMES = [
'field-delimiter', 'compression', 'escape', 'quoting', 'encoding', 'header-row'
]


# TODO: This class is no longer needed if we can just provide the Hint class instead
vinceatbluelabs marked this conversation as resolved.
Show resolved Hide resolved
class SupportedHint:
"""Definition for supported hints"""

Expand All @@ -17,50 +24,12 @@ def config_name(self) -> str:
return self.schema.name


QUOTING_DESCRIPTION =\
('How quotes are applied to individual fields. '
'all: quote all fields. '
'minimal: quote only fields that contain ambiguous characters (the '
'delimiter, the escape character, or a line terminator). '
'default: never quote fields.')

#
# Note: Any expansion of these types should also be done in
# records.types
#
SUPPORTED_HINTS = [
SupportedHint(
JsonParameter('field-delimiter',
JsonSchemaDocument('string',
description=('Character used between fields '
'(default is comma)')),
optional=True)),
SupportedHint(
JsonParameter('compression',
JsonSchemaDocument(['string', 'null'],
enum=['BZIP', 'GZIP', 'LZO', None],
description='Compression type of the file.'),
optional=True)),
SupportedHint(
JsonParameter('escape',
JsonSchemaDocument(['string', 'null'],
enum=['\\', None],
description="Character used to escape strings"),
optional=True)),
SupportedHint(
JsonParameter('quoting',
JsonSchemaDocument(['string', 'null'],
enum=['all', 'minimal', 'nonnumeric', None],
description=QUOTING_DESCRIPTION),
optional=True)),
SupportedHint(
JsonParameter('encoding',
JsonSchemaDocument(['string'],
enum=[
'UTF8', 'UTF16', 'UTF16LE', 'UTF16BE',
'LATIN1', 'CP1252'
],
description="Text encoding of file"),
optional=True)),
JsonParameter(hint_enum.value.hint_name,
hint_enum.value.json_schema_document(),
optional=True))
for hint_enum in list(Hints)
]

SUPPORTED_HINT_LOOKUP = {hint.config_name: hint for hint in SUPPORTED_HINTS}