From 98638e1d09a6ffef6828cf0b37c7bf117f379ee1 Mon Sep 17 00:00:00 2001 From: vinceatbluelabs Date: Tue, 21 Apr 2020 19:52:53 -0400 Subject: [PATCH] Initial MySQL support (#31) --- .circleci/config.yml | 73 ++++- DRIVERS.md | 156 ++++++++++ Dockerfile | 6 +- deps.sh | 6 +- itest | 41 ++- metrics/bigfiles_high_water_mark | 2 +- metrics/coverage_high_water_mark | 2 +- metrics/flake8_high_water_mark | 2 +- metrics/mdl_high_water_mark | 2 +- metrics/mypy_high_water_mark | 2 +- records_mover/db/factory.py | 4 + records_mover/db/mysql/__init__.py | 0 records_mover/db/mysql/mysql_db_driver.py | 149 ++++++++++ .../db/postgres/sqlalchemy_postgres_copy.pyi | 9 +- records_mover/db/vertica/import_sql.py | 12 +- .../db/vertica/records_export_options.py | 5 +- records_mover/logging.py | 9 +- records_mover/pandas/__init__.py | 2 +- records_mover/records/__init__.py | 13 +- records_mover/records/hints.py | 11 + records_mover/records/pandas/__init__.py | 12 +- records_mover/records/pandas/prep_for_csv.py | 104 +++++++ .../records/pandas/read_csv_options.py | 44 ++- .../records/records_schema_json_file.py | 4 +- .../records/schema/field/__init__.py | 29 ++ .../schema/field/constraints/constraints.py | 2 +- .../records/schema/field/statistics.py | 2 +- .../records/schema/schema/__init__.py | 7 +- records_mover/records/sources/dataframes.py | 14 +- records_mover/records/sources/fileobjs.py | 1 + records_mover/records/targets/fileobj.py | 9 + records_mover/url/base.py | 6 +- records_mover/utils/limits.py | 4 + setup.py | 8 +- tests/integration/bin/db-mysql | 7 + tests/integration/circleci-dbfacts.yml | 9 + tests/integration/docker-compose.yml | 19 ++ tests/integration/inside-docker-dbfacts.yml | 9 + .../records/directory_validator.py | 17 +- .../records/expected_column_types.py | 130 ++++++--- tests/integration/records/mover_test_case.py | 120 ++++++++ .../multi_db/test_records_table2table.py | 7 +- .../records/purge_old_test_sheets.py | 5 +- .../records/records_database_fixture.py | 15 + .../records_numeric_database_fixture.py | 59 ++++ .../records/single_db/base_records_test.py | 7 +- .../records/single_db/numeric_expectations.py | 78 +++++ .../records/single_db/test_records_load.py | 2 +- .../records/single_db/test_records_numeric.py | 1 + .../records/table_timezone_validator.py | 169 +++++++++++ tests/integration/records/table_validator.py | 237 ++++++--------- tests/integration/resources/README.md | 7 + ...limited-bigquery-no-header-pandas-notz.csv | 2 + ...limited-bluelabs-no-header-pandas-notz.csv | 2 + ...mited-bluelabs-with-header-pandas-notz.csv | 3 + ...elimited-vertica-no-header-pandas-notz.csv | 2 + ...imited-vertica-with-header-pandas-notz.csv | 2 + tests/unit/db/mysql/__init__.py | 0 tests/unit/db/mysql/test_mysql_db_driver.py | 128 ++++++++ .../db/vertica/base_test_vertica_db_driver.py | 1 + .../unit/db/vertica/test_vertica_db_driver.py | 7 + tests/unit/records/pandas/__init__.py | 0 .../unit/records/pandas/test_prep_for_csv.py | 122 ++++++++ tests/unit/records/schema/field/test_field.py | 33 +++ .../records/schema/test_records_schema.py | 7 +- tests/unit/records/sources/test_dataframes.py | 25 +- tests/unit/records/sources/test_fileobjs.py | 5 +- tests/unit/records/targets/test_fileobj.py | 20 +- .../records/test_pandas_read_csv_options.py | 100 ++++++- tests/unit/test_session.py | 46 +++ tests/unit/url/test_filesystem.py | 3 +- tests/unit/utils/test_json_schema.py | 2 +- types/stubs/boto3/session/__init__.pyi | 3 + .../oauth2/service_account/__init__.pyi | 2 +- types/stubs/logging/__init__.pyi 
| 269 +++++++++++++++++ types/stubs/logging/config.pyi | 79 +++++ types/stubs/logging/filterer.py | 9 + types/stubs/logging/handlers.pyi | 273 ++++++++++++++++++ types/stubs/logging/log_record.pyi | 40 +++ types/stubs/logging/logger.pyi | 49 ++++ wait-for-mysql.sh | 7 + 81 files changed, 2579 insertions(+), 312 deletions(-) create mode 100644 DRIVERS.md create mode 100644 records_mover/db/mysql/__init__.py create mode 100644 records_mover/db/mysql/mysql_db_driver.py create mode 100644 records_mover/records/pandas/prep_for_csv.py create mode 100755 tests/integration/bin/db-mysql create mode 100644 tests/integration/records/mover_test_case.py create mode 100644 tests/integration/records/table_timezone_validator.py create mode 100644 tests/integration/resources/delimited-bigquery-no-header-pandas-notz.csv create mode 100644 tests/integration/resources/delimited-bluelabs-no-header-pandas-notz.csv create mode 100644 tests/integration/resources/delimited-bluelabs-with-header-pandas-notz.csv create mode 100644 tests/integration/resources/delimited-vertica-no-header-pandas-notz.csv create mode 100644 tests/integration/resources/delimited-vertica-with-header-pandas-notz.csv create mode 100644 tests/unit/db/mysql/__init__.py create mode 100644 tests/unit/db/mysql/test_mysql_db_driver.py create mode 100644 tests/unit/records/pandas/__init__.py create mode 100644 tests/unit/records/pandas/test_prep_for_csv.py create mode 100644 types/stubs/logging/__init__.pyi create mode 100644 types/stubs/logging/config.pyi create mode 100644 types/stubs/logging/filterer.py create mode 100644 types/stubs/logging/handlers.pyi create mode 100644 types/stubs/logging/log_record.pyi create mode 100644 types/stubs/logging/logger.pyi create mode 100755 wait-for-mysql.sh diff --git a/.circleci/config.yml b/.circleci/config.yml index af62b6b77..262e0f0ba 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -46,6 +46,29 @@ commands: key: deps-v1-<>-<>-<>-{{ .Branch }}-{{ checksum "requirements.txt" }}-{{ checksum "setup.py" }} paths: - "venv" + wait_for_db: + description: "Pause until database answers allowing time to startup. Abort if startup got hung in CircleCI." + parameters: + db_name: + type: string + connect_command: + type: string + steps: + - run: + name: Waiting for <> + command: | + # Bail out trying after 30 seconds + end=$((SECONDS+30)) + echo "Starting at second ${SECONDS:?} - ending at ${end:?}" + db_connect() { + <> + } + while ! db_connect && [[ "${SECONDS:?}" -lt "${end:?}" ]] + do + echo "Waiting for <>..." + sleep 5 + done + db_connect jobs: test: @@ -136,6 +159,17 @@ jobs: - image: postgres:latest environment: POSTGRES_PASSWORD: 'hunter2' + # MySQL after 5 (they bumped version to 8) uses a new auth protocol + # that is not well supported by clients - including the + # Debian-installable client packages. + # + # https://mysqlserverteam.com/mysql-8-0-4-new-default-authentication-plugin-caching_sha2_password/ + - image: mysql:5 + environment: + MYSQL_ROOT_PASSWORD: 'hunter2root' + MYSQL_DATABASE: 'mysqlitest' + MYSQL_USER: mysqluser + MYSQL_PASSWORD: 'hunter2' steps: - checkout - add_ssh_keys: @@ -175,17 +209,18 @@ jobs: command: | sudo apt-get update -y && sudo apt-get install -y postgresql-client - run: - name: Wait for vertica + name: Install mysql command: | - # Bail out trying after 30 seconds - end=$((SECONDS+30)) - echo "Starting at second ${SECONDS:?} - ending at ${end:?}" - while ! 
vsql -h 127.0.0.1 -U dbadmin -c 'select 1;' && [[ "${SECONDS:?}" -lt "${end:?}" ]] - do - echo "Waiting for vertica..." - sleep 5 - done - vsql -h 127.0.0.1 -U dbadmin -c 'select 1;' + sudo apt-get update -y && sudo apt-get install -y default-mysql-client + - wait_for_db: + db_name: Vertica + connect_command: vsql -h 127.0.0.1 -U dbadmin -c 'select 1;' + - wait_for_db: + db_name: MySQL + connect_command: echo 'select 1;' | mysql --password=hunter2 --host=127.0.0.1 -u mysqluser mysqlitest + - wait_for_db: + db_name: Postgres + connect_command: psql -h 127.0.0.1 -U postgres -c 'select 1;' - run: name: Run tests command: "<>" @@ -363,6 +398,23 @@ workflows: filters: tags: only: /v\d+\.\d+\.\d+(-[\w]+)?/ + - integration_test_with_dbs: + name: mysql-itest + extras: '[mysql,itest]' + python_version: "3.6" + command: | + . venv/bin/activate + export PATH=${PATH}:${PWD}/tests/integration/bin:/opt/vertica/bin + export DB_FACTS_PATH=${PWD}/tests/integration/circleci-dbfacts.yml + export RECORDS_MOVER_SESSION_TYPE=env + mkdir -p test-reports/itest + cd tests/integration/records/single_db + with-db dockerized-mysql nosetests --with-xunit --xunit-file=../../../../test-reports/itest/junit.xml . + requires: + - redshift-itest + filters: + tags: + only: /v\d+\.\d+\.\d+(-[\w]+)?/ - integration_test_with_dbs: name: vertica-s3-itest extras: '[vertica,aws,itest]' @@ -502,6 +554,7 @@ workflows: - redshift-itest-old-pandas - redshift-itest-no-pandas - postgres-itest + - mysql-itest - cli-1-itest - cli-2-itest - cli-3-itest diff --git a/DRIVERS.md b/DRIVERS.md new file mode 100644 index 000000000..be34e5a62 --- /dev/null +++ b/DRIVERS.md @@ -0,0 +1,156 @@ +# Database drivers + +Adding a database driver to records_mover can be divided up into three steps: + +* Add integration testing and subclass DBDriver to get tests to pass +* Add code for native bulk import support +* Add code for native bulk export support + +Here are the basic things you'll need to do to get through the +process. Every database is different, and this document only gets +updated periodically, so you may run into additional things you need +to do or figure out. If they seem like things people will hit in the +future, add them to this document! + +## Basic support + +1. Create a feature branch +2. Get a test database set up + * Modify `tests/integration/docker-compose.yml` to include a Docker + image for your new database. If your database can't be run in a + Docker image, you'll need some way to get to a database that can + be used during integration testing. Be sure to add a link from + the 'records_mover' container to your new container. + * Run `./itest-dc up -d` to bring up the docker-compose environment. + * Run `./itest-dc start` to start the docker-compose environment. + * Watch logs with `./itest-dc logs -f` + * Fix any issues and repeat until it is successful and the logs look right. +2. Set up the `itest` script to be able to test your new database. + * Modify the `local_dockerized_dbfacts` function in `./itest` to + point to the new database. + * Create a `wait-for-${your-new-db-type:?}.sh` script matching + `wait-for-postgres.sh`. + * Modify `tests/integration/inside-docker-dbfacts.yml` to include an + entry for your new database. + * Modify `tests/integration/bin/db-connect` to handle your new + database type if needed. + * Modify `Dockerfile` to add any new client binaries needed for your + database and run `./itest --docker build` to build the new image. 
+ * Run `./itest shell`, which will start the docker-compose and start + a shell with the db-facts you just created set. + * Run `db ${your-new-db-name:?}` within that shell and verify it + connects. + * Exit out of the `./itest shell` session. + * Run `./itest ${your-new-db-type:?}` and verify it doesn't + recognize the argument. + * Search down for instances of 'postgres' in the `itest` script and + come up with the equivalent for your new database. + * Run `./itest ${your-new-db-type:?}` again and verify things fail + somewhere else (e.g., a Python package not being installed or a + test failing, most likely). + * Push up your changes to the feature branch. +2. Now work to get the same failure out of CircleCI: + * Replicate the current `postgres_itest` in `.circleci/config.yml`, + including matching all of the references to it. + * Be sure to change the `with-db dockerized-postgres` line to refer + to your database type. + * Push up changes and verify that tests fail because your new + database "is not a valid DB name". + * Note that you can (temporarily!) allow your new integration test + to run without waiting for unit and Redshift tests to run by + commenting out the dependency like this - just be sure to leave an + annotation comment reminding you to fix it before the PR is + merged! + ```yaml + # requires: # T ODO restore this + # - redshift-itest + ``` + * Modify the `integration_test_with_dbs` job to include a Docker + image for your new database, similar to `docker-compose.yml` + above. + * Modify `tests/integration/circleci-dbfacts.yml` to point to your + new integration test database account, whether in Docker or + cloud-hosted. + * Iterate on the errors until you get the same errors you got in + your `./itest` runs. +3. Fix these "singledb" tests! Now that you have tests running (and + failing), you can address the problems one by one. Here are things + you are likely to need to do--I'd suggest waiting for the problem + to come up via the test and then applying the fix until the tests + pass. If you encounter things not on the list below, add them here + for the next person (unless the fix you put in will address it for all + future databases with the same issue). + * Add the Python driver (either a SQLAlchemy dialect or, if SQLAlchemy + supports it natively, maybe just the DBAPI driver) as a transitive dependency + in `setup.py`. Rerun `./deps.sh` and then `./itest --docker + build` to re-install locally. + * If database connections aren't working, you may want to insert + some debugging into `records_mover/db/connect.py` to figure out + what's going on. + * Access errors trying to drop a table in the `public` schema: + Probably means whatever default schema comes with your database + user doesn't match the default assumption - modify + `tests/integration/records/single_db/base_records_test.py` to + match. + * `NotImplementedError: Please teach me how to integration test + mysql`: Add information for your new database in + `tests/integration/records/expected_column_types.py`, + `tests/integration/records/mover_test_case.py`, + `tests/integration/records/records_database_fixture.py` and + `tests/integration/records/records_numeric_database_fixture.py`. + This is where you'll start to get familiar with the different + column types available for your database. Be sure to be as + thorough as practical for your database so we can support both + exporting a wide variety of column types and so that we can + support space-efficient use on import.
+ + For the numeric tests, when re-running you'll probably need to + start filling out a subclass of DBDriver. Relevant methods: + `type_for_fixed_point()`, `type_for_floating_point()`, + `fp_constraints()`, and `integer_limits()` (see the sketch after this list). + * `KeyError: 'mysql'` in + `tests/integration/records/single_db/test_records_numeric.py`: + There are test expectations to set here based on the numeric types + supported by your database. Once you set them, you'll probably + need to add a `type_for_integer()` method covering things + correctly. + * `AssertionError: ['INTEGER(11)', 'VARCHAR(3)', 'VARCHAR(3)', + 'VARCHAR(1)', 'VARCHAR(1)', 'VARCHAR(3)', 'VARCHAR(111)', 'DATE', + 'TIME', 'DATETIME', 'DATETIME']`: Double check the types assigned. + You may need to subclass DBDriver and implement methods to convince + records mover to create the types you expect. + * Errors from `tests/integration/records/directory_validator.py`: + ```console + AssertionError: + received ['integer', 'string', 'string', 'string', 'string', 'string', 'string', 'date', 'time', 'datetime', 'datetime'], + expected [['integer', 'string', 'string', 'string', 'string', 'string', 'string', 'date', 'time', 'datetime', 'datetimetz'], ['integer', 'string', 'string', 'string', 'string', 'string', 'string', 'date', 'string', 'datetime', 'datetimetz']] + ``` + + To address, make sure the types returned are as expected for this database. + * `KeyError: 'mysql'`: + `tests/integration/records/single_db/test_records_numeric.py` + needs to be modified to set expectations for this database type. + You can set this to 'bluelabs' as we haven't yet taught + records-mover to do bulk imports, so we have no idea what the + ideal records format variant is for that yet. + * `AssertionError` in + `tests/integration/records/table_validator.py`: There are various + checks here, including things dealing with how datetimes get + rendered. Examine carefully the existing predicates defined + within and add new ones judiciously if it appears the behavior + you are seeing is correct but not currently anticipated. +4. If there are things in the above list that you know are needed, + but the tests are passing, consider adding an + integration test to match. +5. Edit + `tests/integration/records/multi_db/test_records_table2table.py` to + include the new test database and run `./itest table2table` to run + tests. Fix errors as they pop up. +7. Add support for bulk import if the database supports it (and add + more detail here on how to do that!). + * `tests/integration/records/single_db/test_records_numeric.py` + needs to be modified to set the best loading records type for + this database type - pick a type which can be loaded natively + without using Pandas. +8. Add support for bulk export if the database supports it (and add + more detail here on how to do that!).
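To make the DBDriver-subclassing step above more concrete, here is a minimal sketch of what a new driver might start out looking like. The `NewDBDriver` name and the specific type choices are placeholders for illustration only, not part of this patch; the `MySQLDBDriver` added later in this patch (`records_mover/db/mysql/mysql_db_driver.py`) is the real, complete example, and a new driver also needs to be wired up in `records_mover/db/factory.py` the same way MySQL is below.

```python
# Hypothetical starting point for a new driver; NewDBDriver and the specific
# type choices below are placeholders, not part of this patch.
from typing import Optional, Tuple

import sqlalchemy

from records_mover.db.driver import DBDriver
from records_mover.utils.limits import (INT16_MIN, INT16_MAX,
                                        INT32_MIN, INT32_MAX)


class NewDBDriver(DBDriver):
    def integer_limits(self,
                       type_: sqlalchemy.types.Integer) -> Optional[Tuple[int, int]]:
        # Tell records-mover how large a value each integral column type can
        # hold, so exported schemas carry accurate numeric constraints.
        if isinstance(type_, sqlalchemy.sql.sqltypes.SMALLINT):
            return (INT16_MIN, INT16_MAX)
        return super().integer_limits(type_)

    def type_for_integer(self,
                         min_value: Optional[int],
                         max_value: Optional[int]) -> sqlalchemy.types.TypeEngine:
        # Pick the smallest column type that fits the observed range on load.
        if min_value is not None and max_value is not None:
            if min_value >= INT32_MIN and max_value <= INT32_MAX:
                return sqlalchemy.sql.sqltypes.INTEGER()
        return super().type_for_integer(min_value, max_value)
```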
diff --git a/Dockerfile b/Dockerfile index 9b13cb171..592f08e14 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,12 +1,10 @@ FROM python:3.6 -# # database connection scripts, psql CLI client for postgres and -# Redshift, Vertica vsql client and misc shell tools for +# Redshift, Vertica vsql client, MySQL client and misc shell tools for # integration tests -# -RUN apt-get update && apt-get install -y netcat jq postgresql-client curl +RUN apt-get update && apt-get install -y netcat jq postgresql-client curl default-mysql-client # google-cloud-sdk for dbcli and bigquery in integration tests RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - && apt-get update -y && apt-get install google-cloud-sdk -y diff --git a/deps.sh b/deps.sh index c40331adc..1538fc38d 100755 --- a/deps.sh +++ b/deps.sh @@ -16,4 +16,8 @@ pyenv virtualenv "${python_version:?}" records-mover-"${python_version:?}" || tr pyenv local records-mover-"${python_version:?}" pip3 install --upgrade pip -pip3 install -r requirements.txt -e '.[unittest,itest]' +# +# It's nice to unit test, integration test, and run the CLI in +# a development pyenv. +# +pip3 install -r requirements.txt -e '.[unittest,itest,cli]' diff --git a/itest b/itest index 14a17f21b..e8882a855 100755 --- a/itest +++ b/itest @@ -59,6 +59,10 @@ def local_dockerized_dbfacts(): subprocess.check_output([ './itest-dc', 'port', 'postgresdb', '5432' ]).decode('utf8').rstrip().split(':')[1] + mysql_port =\ + subprocess.check_output([ + './itest-dc', 'port', 'mysqldb', '3306' + ]).decode('utf8').rstrip().split(':')[1] db_facts = { 'dbs': { 'dockerized-vertica': { @@ -82,7 +86,23 @@ def local_dockerized_dbfacts(): 'user': 'postgres', 'password': 'hunter2', } - } + }, + 'dockerized-mysql': { + 'exports': { + # This needs to be 127.0.0.1, because if + # this is the string 'localhost', the + # MySQL driver wants to use Unix domain + # sockets to connect, which won't work + # because this is a tunnelled port. 
+ 'host': '127.0.0.1', + 'port': mysql_port, + 'database': 'mysqlitest', + 'type': 'mysql', + 'protocol': 'mysql', + 'user': 'mysqluser', + 'password': 'hunter2', + } + }, } } yaml_output = yaml.dump(db_facts) @@ -122,13 +142,15 @@ def docker_compose_shell() -> None: def docker_compose_start() -> None: - print("Running docker_compose start verticadb postgresdb...", file=sys.stderr) + print("Running docker_compose start verticadb postgresdb mysqldb...", file=sys.stderr) docker_compose(["up", "--no-start"]) - docker_compose(["start", "verticadb", "postgresdb"]) + docker_compose(["start", "verticadb", "postgresdb", "mysqldb"]) docker_compose_run(['./wait-for-vertica.sh']) print("Verified Vertica is up and listening", file=sys.stderr) docker_compose_run(['./wait-for-postgres.sh']) print("Verified Postgres is up and listening", file=sys.stderr) + docker_compose_run(['./wait-for-mysql.sh']) + print("Verified MySQL is up and listening", file=sys.stderr) def run_test(args, target, parser): @@ -196,6 +218,18 @@ def run_test(args, target, parser): "with-aws-creds", "circleci", "nosetests", "--xunit-file=nosetests.xml", "."], cwd="tests/integration/records/single_db") + elif (target == 'mysql'): + with dockerized_dbs(): + if (args.docker): + docker_compose_run(['with-db', 'dockerized-mysql', + 'nosetests', '--xunit-file=nosetests.xml', '.'], + prefixes=["with-aws-creds", "circleci"], + cwd="/usr/src/app/tests/integration/records/single_db") + else: + with local_dockerized_dbfacts(): + subprocess.check_call(["with-db", "dockerized-mysql", + "nosetests", "--xunit-file=nosetests.xml", "."], + cwd="tests/integration/records/single_db") elif (target == 'postgres'): with dockerized_dbs(): if (args.docker): @@ -250,6 +284,7 @@ def run_test(args, target, parser): def main(): tests = { 'cli': 'Run bash-based multi-source/target copy tests', + 'mysql': 'Run load/unload suite against Dockerized MySQL', 'postgres': 'Run load/unload suite against Dockerized PostgreSQL', 'vertica-s3': 'Run load/unload suite against Dockerized Vertica, using S3', 'vertica-no-s3': 'Run load/unload suite against Dockerized Vertica, using streams', diff --git a/metrics/bigfiles_high_water_mark b/metrics/bigfiles_high_water_mark index 8a655068e..ccb8fd57a 100644 --- a/metrics/bigfiles_high_water_mark +++ b/metrics/bigfiles_high_water_mark @@ -1 +1 @@ -897 +967 diff --git a/metrics/coverage_high_water_mark b/metrics/coverage_high_water_mark index 9c2857e0b..31efd80ed 100644 --- a/metrics/coverage_high_water_mark +++ b/metrics/coverage_high_water_mark @@ -1 +1 @@ -93.6200 +93.700 \ No newline at end of file diff --git a/metrics/flake8_high_water_mark b/metrics/flake8_high_water_mark index 6c412452b..eec49411b 100644 --- a/metrics/flake8_high_water_mark +++ b/metrics/flake8_high_water_mark @@ -1 +1 @@ -189 +177 diff --git a/metrics/mdl_high_water_mark b/metrics/mdl_high_water_mark index 00750edc0..7ed6ff82d 100644 --- a/metrics/mdl_high_water_mark +++ b/metrics/mdl_high_water_mark @@ -1 +1 @@ -3 +5 diff --git a/metrics/mypy_high_water_mark b/metrics/mypy_high_water_mark index 7015e71d2..7fc2b7b72 100644 --- a/metrics/mypy_high_water_mark +++ b/metrics/mypy_high_water_mark @@ -1 +1 @@ -90.1200 \ No newline at end of file +91.5500 \ No newline at end of file diff --git a/records_mover/db/factory.py b/records_mover/db/factory.py index 2b055d425..cd0f67512 100644 --- a/records_mover/db/factory.py +++ b/records_mover/db/factory.py @@ -24,5 +24,9 @@ def db_driver(db: Union[sqlalchemy.engine.Engine, from .postgres.postgres_db_driver import 
PostgresDBDriver return PostgresDBDriver(db, **kwargs) + elif engine.name == 'mysql': + from .mysql.mysql_db_driver import MySQLDBDriver + + return MySQLDBDriver(db, **kwargs) else: return DBDriver(db, **kwargs) diff --git a/records_mover/db/mysql/__init__.py b/records_mover/db/mysql/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/records_mover/db/mysql/mysql_db_driver.py b/records_mover/db/mysql/mysql_db_driver.py new file mode 100644 index 000000000..a02ca26f4 --- /dev/null +++ b/records_mover/db/mysql/mysql_db_driver.py @@ -0,0 +1,149 @@ +import sqlalchemy +import sqlalchemy.dialects.mysql +import logging +from ...utils.limits import (INT8_MIN, INT8_MAX, + UINT8_MIN, UINT8_MAX, + INT16_MIN, INT16_MAX, + UINT16_MIN, UINT16_MAX, + INT24_MIN, INT24_MAX, + UINT24_MIN, UINT24_MAX, + INT32_MIN, INT32_MAX, + UINT32_MIN, UINT32_MAX, + INT64_MIN, INT64_MAX, + UINT64_MIN, UINT64_MAX, + FLOAT32_SIGNIFICAND_BITS, + FLOAT64_SIGNIFICAND_BITS, + num_digits) +from ..driver import DBDriver +from typing import Optional, Tuple + + +logger = logging.getLogger(__name__) + + +class MySQLDBDriver(DBDriver): + # https://dev.mysql.com/doc/refman/8.0/en/integer-types.html + def integer_limits(self, + type_: sqlalchemy.types.Integer) ->\ + Optional[Tuple[int, int]]: + if isinstance(type_, sqlalchemy.dialects.mysql.TINYINT): + if type_.unsigned: + return (UINT8_MIN, UINT8_MAX) + else: + return (INT8_MIN, INT8_MAX) + elif isinstance(type_, sqlalchemy.dialects.mysql.SMALLINT): + if type_.unsigned: + return (UINT16_MIN, UINT16_MAX) + else: + return (INT16_MIN, INT16_MAX) + elif isinstance(type_, sqlalchemy.dialects.mysql.MEDIUMINT): + if type_.unsigned: + return (UINT24_MIN, UINT24_MAX) + else: + return (INT24_MIN, INT24_MAX) + elif isinstance(type_, sqlalchemy.dialects.mysql.INTEGER): + if type_.unsigned: + return (UINT32_MIN, UINT32_MAX) + else: + return (INT32_MIN, INT32_MAX) + elif isinstance(type_, sqlalchemy.dialects.mysql.BIGINT): + if type_.unsigned: + return (UINT64_MIN, UINT64_MAX) + else: + return (INT64_MIN, INT64_MAX) + return super().integer_limits(type_) + + def fp_constraints(self, + type_: sqlalchemy.types.Float) ->\ + Optional[Tuple[int, int]]: + if isinstance(type_, sqlalchemy.dialects.mysql.DOUBLE): + return (64, FLOAT64_SIGNIFICAND_BITS) + elif isinstance(type_, sqlalchemy.sql.sqltypes.FLOAT): + return (32, FLOAT32_SIGNIFICAND_BITS) + return super().fp_constraints(type_) + + def type_for_integer(self, + min_value: Optional[int], + max_value: Optional[int]) -> sqlalchemy.types.TypeEngine: + """Find correct integral column type to fit the given min and max integer values""" + + if min_value is not None and max_value is not None: + pass + if min_value >= INT8_MIN and max_value <= INT8_MAX: + return sqlalchemy.dialects.mysql.TINYINT() + elif min_value >= UINT8_MIN and max_value <= UINT8_MAX: + return sqlalchemy.dialects.mysql.TINYINT(unsigned=True) + elif min_value >= INT16_MIN and max_value <= INT16_MAX: + return sqlalchemy.sql.sqltypes.SMALLINT() + elif min_value >= UINT16_MIN and max_value <= UINT16_MAX: + return sqlalchemy.dialects.mysql.SMALLINT(unsigned=True) + elif min_value >= INT24_MIN and max_value <= INT24_MAX: + return sqlalchemy.dialects.mysql.MEDIUMINT() + elif min_value >= UINT24_MIN and max_value <= UINT24_MAX: + return sqlalchemy.dialects.mysql.MEDIUMINT(unsigned=True) + elif min_value >= INT32_MIN and max_value <= INT32_MAX: + return sqlalchemy.sql.sqltypes.INTEGER() + elif min_value >= UINT32_MIN and max_value <= UINT32_MAX: + return 
sqlalchemy.dialects.mysql.INTEGER(unsigned=True) + elif min_value >= INT64_MIN and max_value <= INT64_MAX: + return sqlalchemy.sql.sqltypes.BIGINT() + elif min_value >= UINT64_MIN and max_value <= UINT64_MAX: + return sqlalchemy.dialects.mysql.BIGINT(unsigned=True) + else: + num_digits_min = num_digits(min_value) + num_digits_max = num_digits(max_value) + digit_count = max(num_digits_min, num_digits_max) + return self.type_for_fixed_point(precision=digit_count, + scale=0) + return super().type_for_integer(min_value, max_value) + + def type_for_floating_point(self, + fp_total_bits: int, + fp_significand_bits: int) -> sqlalchemy.sql.sqltypes.Numeric: + # https://dev.mysql.com/doc/refman/8.0/en/floating-point-types.html + # + # "A precision from 0 to 23 results in a 4-byte + # single-precision FLOAT column. A precision from 24 to 53 + # results in an 8-byte double-precision DOUBLE column." + if fp_significand_bits > FLOAT64_SIGNIFICAND_BITS: + logger.warning(f"Falling back to MySQL DOUBLE type, as MySQL " + f"doesn't support fp_significand_bits>{FLOAT64_SIGNIFICAND_BITS} " + f"(requested: {fp_significand_bits})") + return sqlalchemy.sql.sqltypes.Float(precision=FLOAT64_SIGNIFICAND_BITS) + return super().type_for_floating_point(fp_total_bits=fp_total_bits, + fp_significand_bits=fp_significand_bits) + + def type_for_fixed_point(self, + precision: int, + scale: int) -> sqlalchemy.sql.sqltypes.Numeric: + # "The maximum number of digits for DECIMAL is 65, but the + # actual range for a given DECIMAL column can be constrained + # by the precision or scale for a given column. When such a + # column is assigned a value with more digits following the + # decimal point than are permitted by the specified scale, the + # value is converted to that scale. (The precise behavior is + # operating system-specific, but generally the effect is + # truncation to the permissible number of digits.)" + # + # https://dev.mysql.com/doc/refman/8.0/en/fixed-point-types.html + if precision > 65: + logger.warning('Using MySQL DOUBLE type to represent ' + f'NUMERIC({precision},{scale})') + return sqlalchemy.dialects.mysql.DOUBLE() + else: + return super().type_for_fixed_point(precision=precision, + scale=scale) + + def varchar_length_is_in_chars(self) -> bool: + # This is assuming folks are using MySQL 5+ + # https://stackoverflow.com/questions/1997540/mysql-varchar-lengths-and-utf-8 + return True + + def type_for_date_plus_time(self, has_tz: bool = False) -> sqlalchemy.sql.sqltypes.DateTime: + # Support six digits of fractional seconds to match other + # databases and general expectations for a datetime + # + # Never has timezone, as the one type with a timezone + # (TIMESTAMP) doesn't allow for dates before Jan 1, 1970, so + # it's not generally useful. + return sqlalchemy.dialects.mysql.DATETIME(fsp=6) diff --git a/records_mover/db/postgres/sqlalchemy_postgres_copy.pyi b/records_mover/db/postgres/sqlalchemy_postgres_copy.pyi index 1d60c4d0a..0d2e828b8 100644 --- a/records_mover/db/postgres/sqlalchemy_postgres_copy.pyi +++ b/records_mover/db/postgres/sqlalchemy_postgres_copy.pyi @@ -1,2 +1,9 @@ -def copy_from(source, dest, engine_or_conn, columns=(), **flags): +from typing import Union, IO +import sqlalchemy + + +def copy_from(source: IO[bytes], + dest: sqlalchemy.schema.Table, + engine_or_conn: Union[sqlalchemy.engine.Engine, sqlalchemy.engine.Connection], + **flags: object) -> None: ...
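Stepping back to the new `MySQLDBDriver` above: as a quick, hedged illustration of how its integer-range mapping behaves. This is not part of the patch; it assumes, as the unit tests added in this patch presumably do, that the driver can be constructed against a mock engine handle rather than a live MySQL connection.

```python
# Illustration only: assumes DBDriver's constructor just stores the engine/
# connection handle, so a mock stands in for a real MySQL connection here.
from unittest.mock import MagicMock

import sqlalchemy.dialects.mysql

from records_mover.db.mysql.mysql_db_driver import MySQLDBDriver

driver = MySQLDBDriver(MagicMock(name='mysql_engine'))

# 0..200 doesn't fit a signed TINYINT (max 127), but does fit an unsigned one.
assert isinstance(driver.type_for_integer(0, 200),
                  sqlalchemy.dialects.mysql.TINYINT)

# Ranges wider than BIGINT UNSIGNED fall back to a fixed-point type via
# type_for_fixed_point().
print(driver.type_for_integer(-10 ** 30, 10 ** 30))
```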
diff --git a/records_mover/db/vertica/import_sql.py b/records_mover/db/vertica/import_sql.py index 15755548c..da7662043 100644 --- a/records_mover/db/vertica/import_sql.py +++ b/records_mover/db/vertica/import_sql.py @@ -16,12 +16,12 @@ def vertica_import_sql(schema: str, abort_on_error: bool, load_method: str, no_commit: bool, - escape_as: Optional[str]=None, - enclosed_by: Optional[str]=None, - stream_name: Optional[str]=None, - skip=0, - rejected_data_table: Optional[str]=None, - rejected_data_schema: Optional[str]=None) -> str: + escape_as: Optional[str] = None, + enclosed_by: Optional[str] = None, + stream_name: Optional[str] = None, + skip: int = 0, + rejected_data_table: Optional[str] = None, + rejected_data_schema: Optional[str] = None) -> str: # https://my.vertica.com/docs/8.1.x/HTML/index.htm#Authoring/SQLReferenceManual/Statements/COPY/COPY.htm # https://my.vertica.com/docs/8.1.x/HTML/index.htm#Authoring/SQLReferenceManual/Statements/COPY/Parameters.htm diff --git a/records_mover/db/vertica/records_export_options.py b/records_mover/db/vertica/records_export_options.py index 6909cf3db..90cbb38e4 100644 --- a/records_mover/db/vertica/records_export_options.py +++ b/records_mover/db/vertica/records_export_options.py @@ -2,10 +2,11 @@ from ...records.hints import cant_handle_hint from ...records.unload_plan import RecordsUnloadPlan from ...records.records_format import DelimitedRecordsFormat -from typing import Set +from typing import Set, Dict, Any -def vertica_export_options(unhandled_hints: Set[str], unload_plan: RecordsUnloadPlan): +def vertica_export_options(unhandled_hints: Set[str], + unload_plan: RecordsUnloadPlan) -> Dict[str, Any]: if not isinstance(unload_plan.records_format, DelimitedRecordsFormat): raise NotImplementedError("Not currently able to export " f"{unload_plan.records_format.format_type}") diff --git a/records_mover/logging.py b/records_mover/logging.py index 12734059e..50b12ea43 100644 --- a/records_mover/logging.py +++ b/records_mover/logging.py @@ -64,14 +64,7 @@ def set_stream_logging(name: str = 'records_mover', logger = logging.getLogger(name) logger.setLevel(adjusted_level) wrapper = SecretsRedactingLogStream(stream) - # - # I don't understand exactly why, but TextIOBase doesn't seem to - # be compatible with IO[str] in mypy's mind. 
- # - # https://github.com/python/typeshed/blob/master/stdlib/3/io.pyi - # https://github.com/python/typeshed/issues/1229 - # - handler = logging.StreamHandler(stream=wrapper) # type: ignore + handler = logging.StreamHandler(stream=wrapper) handler.setLevel(adjusted_level) formatter = logging.Formatter(fmt, datefmt) handler.setFormatter(formatter) diff --git a/records_mover/pandas/__init__.py b/records_mover/pandas/__init__.py index 8133a99ff..51dacab2e 100644 --- a/records_mover/pandas/__init__.py +++ b/records_mover/pandas/__init__.py @@ -6,7 +6,7 @@ # http://stackoverflow.com/questions/27050108/convert-numpy-type-to-python class NumPyJSONEncoder(json.JSONEncoder): - def default(self, obj): + def default(self, obj: object) -> object: if isinstance(obj, np.integer): return int(obj) elif isinstance(obj, np.floating): diff --git a/records_mover/records/__init__.py b/records_mover/records/__init__.py index 425efc19a..e87f306c9 100644 --- a/records_mover/records/__init__.py +++ b/records_mover/records/__init__.py @@ -4,6 +4,7 @@ 'RecordsFormatType', 'RecordsSchema', 'RecordsFormat', + 'DelimitedVariant', 'DelimitedRecordsFormat', 'ParquetRecordsFormat', 'ProcessingInstructions', @@ -11,9 +12,9 @@ 'Records', ] -from .types import RecordsHints, BootstrappingRecordsHints, RecordsFormatType, DelimitedVariant # noqa -from .schema import RecordsSchema # noqa -from .records_format import RecordsFormat, DelimitedRecordsFormat, ParquetRecordsFormat # noqa -from .processing_instructions import ProcessingInstructions # noqa -from .existing_table_handling import ExistingTableHandling # noqa -from .records import Records # noqa +from .types import RecordsHints, BootstrappingRecordsHints, RecordsFormatType, DelimitedVariant +from .schema import RecordsSchema +from .records_format import RecordsFormat, DelimitedRecordsFormat, ParquetRecordsFormat +from .processing_instructions import ProcessingInstructions +from .existing_table_handling import ExistingTableHandling +from .records import Records diff --git a/records_mover/records/hints.py b/records_mover/records/hints.py index 007b68326..c3375c426 100644 --- a/records_mover/records/hints.py +++ b/records_mover/records/hints.py @@ -35,6 +35,17 @@ def cant_handle_hint(fail_if_cant_handle_hint: bool, hint_name: str, hints: Reco "or try again with fail_if_cant_handle_hint=False") +python_date_format_from_hints = { + 'YYYY-MM-DD': '%Y-%m-%d', + 'MM/DD/YY': '%m/%d/%Y', + 'DD/MM/YY': '%d/%m/%Y', +} + +python_time_format_from_hints = { + 'HH24:MI:SS': '%H:%M:%S', + 'HH12:MI AM': '%I:%M:%S %p', +} + hint_encoding_from_pandas = { 'utf-8': 'UTF8', 'utf-16': 'UTF16', diff --git a/records_mover/records/pandas/__init__.py b/records_mover/records/pandas/__init__.py index 51ba041bb..8c25fcd3d 100644 --- a/records_mover/records/pandas/__init__.py +++ b/records_mover/records/pandas/__init__.py @@ -1,6 +1,14 @@ from pandas import DataFrame -from .to_csv_options import pandas_to_csv_options # noqa -from .read_csv_options import pandas_read_csv_options # noqa + +__all__ = [ + 'pandas_to_csv_options', + 'pandas_read_csv_options', + 'prep_df_for_csv_output', + 'prep_df_for_loading', +] +from .to_csv_options import pandas_to_csv_options +from .read_csv_options import pandas_read_csv_options +from .prep_for_csv import prep_df_for_csv_output def _lowercase_column_names(df: DataFrame) -> DataFrame: diff --git a/records_mover/records/pandas/prep_for_csv.py b/records_mover/records/pandas/prep_for_csv.py new file mode 100644 index 000000000..45e6b7255 --- /dev/null +++ 
b/records_mover/records/pandas/prep_for_csv.py @@ -0,0 +1,104 @@ +from pandas import DataFrame +import pandas as pd +from records_mover.records import ProcessingInstructions +from records_mover.records.schema import RecordsSchema +from records_mover.records.schema.field import RecordsSchemaField +from records_mover.records import DelimitedRecordsFormat +from records_mover.records.hints import (python_date_format_from_hints, + python_time_format_from_hints, + cant_handle_hint) +import logging +from typing import Optional, Union, TypeVar + +logger = logging.getLogger(__name__) + + +T = TypeVar('T', bound=Union[pd.Series, pd.Index]) + + +def _convert_series_or_index(series_or_index: T, + field: RecordsSchemaField, + records_format: DelimitedRecordsFormat, + processing_instructions: ProcessingInstructions) -> Optional[T]: + if field.field_type == 'date': + if not isinstance(series_or_index[0], pd.Timestamp): + logger.warning(f"Found {series_or_index.name} as unexpected type " + f"{type(series_or_index[0])}") + else: + logger.info(f"Converting {series_or_index.name} from np.datetime64 to " + "string in CSV's format") + hint_date_format = records_format.hints['dateformat'] + assert isinstance(hint_date_format, str) + pandas_date_format = python_date_format_from_hints.get(hint_date_format) + if pandas_date_format is None: + cant_handle_hint(processing_instructions.fail_if_cant_handle_hint, + 'dateformat', + records_format.hints) + pandas_date_format = '%Y-%m-%d' + if isinstance(series_or_index, pd.Series): + return series_or_index.dt.strftime(pandas_date_format) + else: + return series_or_index.strftime(pandas_date_format) + elif field.field_type == 'time': + if not isinstance(series_or_index[0], pd.Timestamp): + logger.warning(f"Found {series_or_index.name} as unexpected " + f"type {type(series_or_index[0])}") + else: + logger.info(f"Converting {series_or_index.name} from np.datetime64 to string " + "in CSV's format") + hint_time_format = records_format.hints['timeonlyformat'] + assert isinstance(hint_time_format, str) + pandas_time_format = python_time_format_from_hints.get(hint_time_format) + if pandas_time_format is None: + cant_handle_hint(processing_instructions.fail_if_cant_handle_hint, + 'timeonlyformat', + records_format.hints) + pandas_time_format = '%H:%M:%S' + if isinstance(series_or_index, pd.Series): + return series_or_index.dt.strftime(pandas_time_format) + else: + return series_or_index.strftime(pandas_time_format) + else: + logger.debug(f"Not converting field type {field.field_type}") + + return None + + +def prep_df_for_csv_output(df: DataFrame, + include_index: bool, + records_schema: RecordsSchema, + records_format: DelimitedRecordsFormat, + processing_instructions: ProcessingInstructions) -> DataFrame: + # + # Pandas dataframes only have a native 'datetime'/'datetimetz' + # datatype (pd.Timestamp), not an individal 'date', 'time' or + # 'timetz' class. To generate the correct thing when writing out + # a 'date' or 'time' type to a CSV with Pandas' .to_csv() method, + # we need to convert those values to strings that represent + # exactly what we want. + # + # An example of when we can get these pd.Timestamp values inside a + # dataframe from read_csv() is when we tell it that a given column + # represents a date and/or time, allowing us to pick the format on + # the way out. 
+ # + formatted_df = df.copy(deep=False) + remaining_fields = records_schema.fields.copy() + if include_index: + field = remaining_fields.pop(0) + formatted_index = _convert_series_or_index(formatted_df.index, + field, + records_format, + processing_instructions) + if formatted_index is not None: + formatted_df.index = formatted_index + + for index, field in enumerate(remaining_fields): + series = formatted_df.iloc[:, index] + formatted_series = _convert_series_or_index(series, + field, + records_format, + processing_instructions) + if formatted_series is not None: + formatted_df.iloc[:, index] = formatted_series + return formatted_df diff --git a/records_mover/records/pandas/read_csv_options.py b/records_mover/records/pandas/read_csv_options.py index faf17bf08..6982b9484 100644 --- a/records_mover/records/pandas/read_csv_options.py +++ b/records_mover/records/pandas/read_csv_options.py @@ -3,6 +3,7 @@ from ..hints import cant_handle_hint from ..processing_instructions import ProcessingInstructions from ..records_format import DelimitedRecordsFormat +from records_mover.records.schema import RecordsSchema import logging from typing import Set, Dict, Any @@ -11,6 +12,7 @@ def pandas_read_csv_options(records_format: DelimitedRecordsFormat, + records_schema: RecordsSchema, unhandled_hints: Set[str], processing_instructions: ProcessingInstructions) -> Dict[str, Any]: ... @@ -370,14 +372,12 @@ def pandas_read_csv_options(records_format: DelimitedRecordsFormat, # Note: A fast-path exists for iso8601-formatted dates. # - # (we don't yet pass in a records schema which would provide - # ability to know in advance which columns are datetimes--sounds - # like it may be very helpful to do so!) - - quiet_remove(unhandled_hints, 'dateformat') - quiet_remove(unhandled_hints, 'timeonlyformat') - quiet_remove(unhandled_hints, 'datetimeformat') - quiet_remove(unhandled_hints, 'datetimeformattz') + pandas_options['parse_dates'] = [ + index + for index, field + in enumerate(records_schema.fields) + if field.field_type in ['date', 'time', 'datetime', 'datetimetz'] + ] # # infer_datetime_format : bool, default False @@ -388,7 +388,9 @@ def pandas_read_csv_options(records_format: DelimitedRecordsFormat, # cases this can increase the parsing speed by 5-10x. # - # (won't be used since we're not yet able to pass in parse_dates) + # Left as default for now because presumably Pandas has some + # reason why this isn't the default that they didn't spell out in + # the docs. # # keep_date_col : bool, default False @@ -415,7 +417,8 @@ def pandas_read_csv_options(records_format: DelimitedRecordsFormat, # defined by parse_dates) as arguments. # - # (N/A as we don't pass anything as parse_dates) + # (So far the default parser has handled what we've thrown at it, + # so we'll leave this at the default) # # dayfirst : bool, default False @@ -423,7 +426,26 @@ def pandas_read_csv_options(records_format: DelimitedRecordsFormat, # DD/MM format dates, international and European format. 
# - # (N/A as we don't pass anything as parse_dates) + def day_first(dateish_format: str) -> bool: + return (dateish_format.startswith('DD-MM-') or + dateish_format.startswith('DD/MM/')) + + assert isinstance(hints['dateformat'], str) + assert isinstance(hints['datetimeformat'], str) + assert isinstance(hints['datetimeformattz'], str) + consistent_formats = (day_first(hints['dateformat']) == + day_first(hints['datetimeformat']) == + day_first(hints['datetimeformattz'])) + + if not consistent_formats: + cant_handle_hint(fail_if_cant_handle_hint, 'dateformat', hints) + + pandas_options['dayfirst'] = day_first(hints['dateformat']) + + quiet_remove(unhandled_hints, 'dateformat') + quiet_remove(unhandled_hints, 'timeonlyformat') + quiet_remove(unhandled_hints, 'datetimeformat') + quiet_remove(unhandled_hints, 'datetimeformattz') # # iterator : bool, default False diff --git a/records_mover/records/records_schema_json_file.py b/records_mover/records/records_schema_json_file.py index da040595b..0bde8e570 100644 --- a/records_mover/records/records_schema_json_file.py +++ b/records_mover/records/records_schema_json_file.py @@ -16,6 +16,6 @@ def save_schema_json(self, json: str) -> None: def load_schema_json(self) -> Optional[str]: try: return self.schema_loc.string_contents() - except FileNotFoundError as e: - logger.debug(e) + except FileNotFoundError: + logger.debug('No schema JSON found', exc_info=True, stack_info=True) return None diff --git a/records_mover/records/schema/field/__init__.py b/records_mover/records/schema/field/__init__.py index ec8cb5cce..def76154f 100644 --- a/records_mover/records/schema/field/__init__.py +++ b/records_mover/records/schema/field/__init__.py @@ -155,6 +155,35 @@ def to_sqlalchemy_column(self, driver: 'DBDriver') -> 'Column': return field_to_sqlalchemy_column(self, driver) + def cast_series_type(self, series: 'Series') -> 'Series': + import pandas as pd + if self.field_type == 'time': + if series.size > 0: + # https://stackoverflow.com/questions/34501930/how-to-convert-timedelta-to-time-of-day-in-pandas + # + # Some databases (e.g., MySQL) contains a TIME type + # which is ambiguous - it can either represent a + # particular time of day or it can represent an + # elapsed amount of time. + # + # Clever, right? + # + # Unfortunately, Pandas knows about time deltas, but + # not about times of day, so upon use of read_sql(), + # these objects will come out as as a timedelta64[ns] + # type. + # + # Since that's not what our 'time' field type means, + # we have to convert it back to a string, or when it + # gets turned into a CSV later, it'll look really + # goofy - 1pm will come out as: "0 days 01:00:00". 
+ # + if type(series[0]) == pd.Timedelta: + # Convert from "0 days 12:34:56.000000000" to "12:34:56" + return series.astype(str).str.split().str[-1].str.split('.').str[0] + + return series.astype(self.to_numpy_dtype()) + def to_numpy_dtype(self) -> Union[Type[Any], str]: if self.field_type == 'integer': int_constraints =\ diff --git a/records_mover/records/schema/field/constraints/constraints.py b/records_mover/records/schema/field/constraints/constraints.py index 68bbc3a31..5ecc203db 100644 --- a/records_mover/records/schema/field/constraints/constraints.py +++ b/records_mover/records/schema/field/constraints/constraints.py @@ -162,5 +162,5 @@ def from_numpy_dtype(dtype: np.dtype, else: return RecordsSchemaFieldConstraints(required=False, unique=unique) - def __str__(self): + def __str__(self) -> str: return f"{type(self).__name__}({self.to_data()})" diff --git a/records_mover/records/schema/field/statistics.py b/records_mover/records/schema/field/statistics.py index 20cfca977..262f83078 100644 --- a/records_mover/records/schema/field/statistics.py +++ b/records_mover/records/schema/field/statistics.py @@ -48,7 +48,7 @@ def from_data(data: Optional[Union['FieldStatisticsDict', 'StringFieldStatistics return RecordsSchemaFieldStatistics(rows_sampled=rows_sampled, total_rows=total_rows) - def __str__(self): + def __str__(self) -> str: return f"{type(self)}({self.to_data()})" diff --git a/records_mover/records/schema/schema/__init__.py b/records_mover/records/schema/schema/__init__.py index abf65afa2..8f7ca8c7b 100644 --- a/records_mover/records/schema/schema/__init__.py +++ b/records_mover/records/schema/schema/__init__.py @@ -153,11 +153,8 @@ def cast_dataframe_types(self, """ Returns a new dataframe with types that match what we know from this records schema. """ - col_mappings = {field.name: field.to_numpy_dtype() for field in self.fields} - if len(col_mappings) == 0: - # .as_type() doesn't like being given an empty map! 
- return df - return df.astype(col_mappings) + fields = {field.name: field for field in self.fields} + return df.apply(lambda series: fields[series.name].cast_series_type(series)) def assign_dataframe_names(self, include_index: bool, diff --git a/records_mover/records/sources/dataframes.py b/records_mover/records/sources/dataframes.py index 981dca8d2..0a988cd1e 100644 --- a/records_mover/records/sources/dataframes.py +++ b/records_mover/records/sources/dataframes.py @@ -11,6 +11,7 @@ import logging from typing import Iterator, Iterable, Optional, Union, Dict, IO, Callable, TYPE_CHECKING from records_mover.pandas import purge_unnamed_unused_columns +from records_mover.records.pandas import prep_df_for_csv_output if TYPE_CHECKING: from pandas import DataFrame @@ -61,10 +62,10 @@ def initial_records_schema(self, def serialize_dfs(self, processing_instructions: ProcessingInstructions, + records_schema: RecordsSchema, records_format: BaseRecordsFormat, save_df: Callable[['DataFrame', str], None])\ -> Iterator[FileobjsSource]: - records_schema = self.initial_records_schema(processing_instructions) target_names_to_input_fileobjs: Dict[str, IO[bytes]] = {} i = 1 @@ -122,6 +123,7 @@ def to_fileobjs_source(self, records_format_if_possible: Optional[BaseRecordsFormat]= None) -> Iterator[FileobjsSource]: records_format = self.pick_best_records_format(records_format_if_possible) + records_schema = self.initial_records_schema(processing_instructions) if isinstance(records_format, DelimitedRecordsFormat): unhandled_hints = set(records_format.hints.keys()) options = pandas_to_csv_options(records_format, @@ -131,7 +133,15 @@ def to_fileobjs_source(self, complain_on_unhandled_hints(self.processing_instructions.fail_if_dont_understand, unhandled_hints, records_format.hints) + # Convince mypy that this type will stay the same + delimited_records_format = records_format + def save_df(df: 'DataFrame', output_filename: str) -> None: + df = prep_df_for_csv_output(df, + include_index=self.include_index, + records_schema=records_schema, + records_format=delimited_records_format, + processing_instructions=processing_instructions) df.to_csv(path_or_buf=output_filename, index=self.include_index, **options) @@ -155,4 +165,4 @@ def save_df(df: 'DataFrame', output_filename: str) -> None: else: raise NotImplementedError(f"Teach me how to write to {records_format}") - return self.serialize_dfs(processing_instructions, records_format, save_df) + return self.serialize_dfs(processing_instructions, records_schema, records_format, save_df) diff --git a/records_mover/records/sources/fileobjs.py b/records_mover/records/sources/fileobjs.py index 6cf25ae50..a0ce3a0a6 100644 --- a/records_mover/records/sources/fileobjs.py +++ b/records_mover/records/sources/fileobjs.py @@ -115,6 +115,7 @@ def to_dataframes_source(self, f"{self.records_format.format_type} to dataframe") unhandled_hints = set(self.records_format.hints.keys()) options = pandas_read_csv_options(self.records_format, + self.records_schema, unhandled_hints, processing_instructions) complain_on_unhandled_hints(processing_instructions.fail_if_dont_understand, diff --git a/records_mover/records/targets/fileobj.py b/records_mover/records/targets/fileobj.py index 3fd2af25a..a47ad9395 100644 --- a/records_mover/records/targets/fileobj.py +++ b/records_mover/records/targets/fileobj.py @@ -27,6 +27,7 @@ def move_from_dataframes_source(self, processing_instructions: ProcessingInstructions) -> MoveResult: from ..pandas import pandas_to_csv_options + from records_mover.records.pandas 
import prep_df_for_csv_output if not isinstance(self.records_format, DelimitedRecordsFormat): raise NotImplementedError("Teach me to export from dataframe to " @@ -40,6 +41,9 @@ def move_from_dataframes_source(self, logger.info(f"Writing CSV file to {self.fileobj} with options {options}...") encoding: str = self.records_format.hints['encoding'] # type: ignore + records_schema = dfs_source.initial_records_schema(processing_instructions) + records_format = self.records_format + def write_dfs(path_or_buf: Union[str, IO[str]]) -> int: first_row = True move_count = 0 @@ -50,6 +54,11 @@ def write_dfs(path_or_buf: Union[str, IO[str]]) -> int: include_header_row = options['header'] and first_row first_row = False options['header'] = include_header_row + df = prep_df_for_csv_output(df, + include_index=dfs_source.include_index, + records_schema=records_schema, + records_format=records_format, + processing_instructions=processing_instructions) df.to_csv(path_or_buf=path_or_buf, mode="a", index=dfs_source.include_index, diff --git a/records_mover/url/base.py b/records_mover/url/base.py index feb97cdbf..6362274e0 100644 --- a/records_mover/url/base.py +++ b/records_mover/url/base.py @@ -176,9 +176,9 @@ def open(self, mode: str = "rb") -> IO[Any]: "exist in the directory." raise NotImplementedError() - def wait_to_exist(self): - "Returns true after the file exists--useful for eventually consistent stores (e.g., S3)" - return True + def wait_to_exist(self) -> None: + "Returns after the file exists--useful for eventually consistent stores (e.g., S3)" + return def exists(self) -> bool: try: diff --git a/records_mover/utils/limits.py b/records_mover/utils/limits.py index 16fecc344..f6081b6cc 100644 --- a/records_mover/utils/limits.py +++ b/records_mover/utils/limits.py @@ -8,6 +8,10 @@ INT16_MIN = -32768 UINT16_MAX = 65535 UINT16_MIN = 0 +INT24_MAX = 8388607 +INT24_MIN = -8388608 +UINT24_MAX = 16777215 +UINT24_MIN = 0 INT32_MAX = 2147483647 INT32_MIN = -2147483648 UINT32_MAX = 4294967295 diff --git a/setup.py b/setup.py index a73778cdd..9ec58bac6 100755 --- a/setup.py +++ b/setup.py @@ -164,6 +164,10 @@ def initialize_options(self) -> None: 'pandas<2', ] +mysql_dependencies = [ + 'mysqlclient' +] + db_dependencies + redshift_dependencies_base = [ # sqlalchemy-redshift 0.7.7 introduced support for Parquet # in UNLOAD @@ -206,7 +210,8 @@ def initialize_options(self) -> None: vertica_dependencies + postgres_dependencies_binary + redshift_dependencies_binary + - bigquery_dependencies + bigquery_dependencies + + mysql_dependencies ) unittest_dependencies = ( @@ -261,6 +266,7 @@ def initialize_options(self) -> None: 'cli': cli_dependencies_base, 'bigquery': bigquery_dependencies, 'aws': aws_dependencies, + 'mysql': mysql_dependencies, 'redshift-binary': redshift_dependencies_binary, 'redshift-source': redshift_dependencies_source, 'postgres-binary': postgres_dependencies_binary, diff --git a/tests/integration/bin/db-mysql b/tests/integration/bin/db-mysql new file mode 100755 index 000000000..b343687d0 --- /dev/null +++ b/tests/integration/bin/db-mysql @@ -0,0 +1,7 @@ +#!/bin/bash -e + +>&2 echo "SHOW DATABASES: List all schemas (MySQL calls them 'databases')" +>&2 echo "SHOW TABLES: List all tables in schema" +>&2 echo "DESCRIBE foo: Show table structure" + +mysql -A "-h${DB_HOST:?}" -P "${DB_PORT:?}" "-u${DB_USERNAME:?}" -p"${DB_PASSWORD}" -D"${DB_DATABASE:?}" --protocol=TCP diff --git a/tests/integration/circleci-dbfacts.yml b/tests/integration/circleci-dbfacts.yml index 72dbfa76f..a445b02fe 100644 --- 
a/tests/integration/circleci-dbfacts.yml +++ b/tests/integration/circleci-dbfacts.yml @@ -29,6 +29,15 @@ dbs: protocol: postgres user: postgres password: hunter2 + dockerized-mysql: + exports: + host: 127.0.0.1 + port: 3306 + database: mysqlitest + type: mysql + protocol: mysql + user: mysqluser + password: hunter2 demo-itest: jinja_context_name: env exports: diff --git a/tests/integration/docker-compose.yml b/tests/integration/docker-compose.yml index 4a9e40767..263198ba7 100644 --- a/tests/integration/docker-compose.yml +++ b/tests/integration/docker-compose.yml @@ -7,11 +7,30 @@ postgresdb: image: postgres:latest ports: - 5432 + environment: + POSTGRES_PASSWORD: 'hunter2' +mysqldb: + # https://hub.docker.com/_/mysql + + # MySQL after 5 (they bumped version to 8) uses a new auth protocol + # that is not well supported by clients - including the + # Debian-installable client packages. + # + # https://mysqlserverteam.com/mysql-8-0-4-new-default-authentication-plugin-caching_sha2_password/ + image: mysql:5 + ports: + - 3306 + environment: + MYSQL_ROOT_PASSWORD: 'hunter2root' + MYSQL_DATABASE: 'mysqlitest' + MYSQL_USER: mysqluser + MYSQL_PASSWORD: 'hunter2' records_mover: image: records-mover:${BUILD_TAG} links: - verticadb:verticadb - postgresdb:postgresdb + - mysqldb:mysqldb volumes: - ../..:/usr/src/app environment: diff --git a/tests/integration/inside-docker-dbfacts.yml b/tests/integration/inside-docker-dbfacts.yml index 53fefadef..39e6332f8 100644 --- a/tests/integration/inside-docker-dbfacts.yml +++ b/tests/integration/inside-docker-dbfacts.yml @@ -18,6 +18,15 @@ dbs: protocol: postgres user: postgres password: hunter2 + dockerized-mysql: + exports: + host: mysqldb + port: 3306 + database: mysqlitest + type: mysql + protocol: mysql + user: mysqluser + password: hunter2 bltoolsdevbq-bq_itest: jinja_context_name: standard exports: diff --git a/tests/integration/records/directory_validator.py b/tests/integration/records/directory_validator.py index 56cb55ad2..28920c5a3 100644 --- a/tests/integration/records/directory_validator.py +++ b/tests/integration/records/directory_validator.py @@ -69,7 +69,20 @@ def validate_records_schema(self) -> None: 'integer', 'string', 'string', 'string', 'string', 'string', 'string', 'date', 'string', 'datetime', 'datetimetz' - ] + ], + # MySQL's datetimetz type ("TIMESTAMP") doesn't + # support dates before the Unix epoch (Jan 1 1970), + # and records-mover does not yet support using + # inference to determine if the data in question will + # fit into it. 
+ # + # https://app.asana.com/0/1128138765527694/1166526213569051 + # https://stackoverflow.com/questions/31761047/what-difference-between-the-date-time-datetime-and-timestamp-types/56138746 + 'mysql': [ + 'integer', 'string', 'string', 'string', + 'string', 'string', 'string', 'date', 'time', + 'datetime', 'datetime' + ], } if actual_field_types == acceptable_field_types_by_db.get(self.source_db_type): field_types_are_ok = True @@ -112,7 +125,7 @@ def validate(self) -> None: outputs = {} success = False - for alt in ['', '-pandas', '-pandas-utc', '-utc']: + for alt in ['', '-pandas', '-pandas-utc', '-utc', '-pandas-notz']: expected_file = f"{dir_path}/../resources/{self.test_name}{alt}.csv" logger.info(f"expected_file: {expected_file}") try: diff --git a/tests/integration/records/expected_column_types.py b/tests/integration/records/expected_column_types.py index e127b4fe5..615283006 100644 --- a/tests/integration/records/expected_column_types.py +++ b/tests/integration/records/expected_column_types.py @@ -1,32 +1,22 @@ # Note that Redshift doesn't support TIME type: # https://docs.aws.amazon.com/redshift/latest/dg/r_Datetime_types.html -expected_column_types = [ - # Vertica - [ +expected_single_database_column_types = { + 'vertica': [ 'INTEGER', 'VARCHAR(3)', 'VARCHAR(3)', 'VARCHAR(1)', 'VARCHAR(1)', 'VARCHAR(3)', 'VARCHAR(111)', 'DATE', 'TIME', 'TIMESTAMP', 'TIMESTAMP' ], - # Redshift - [ + 'redshift': [ 'INTEGER', 'VARCHAR(3)', 'VARCHAR(3)', 'VARCHAR(1)', 'VARCHAR(1)', 'VARCHAR(3)', 'VARCHAR(111)', 'DATE', 'VARCHAR(8)', 'TIMESTAMP WITHOUT TIME ZONE', 'TIMESTAMP WITH TIME ZONE' ], - # Postgres - [ + 'postgresql': [ 'INTEGER', 'VARCHAR(3)', 'VARCHAR(3)', 'VARCHAR(1)', 'VARCHAR(1)', 'VARCHAR(3)', 'VARCHAR(111)', 'DATE', 'TIME WITHOUT TIME ZONE', 'TIMESTAMP WITHOUT TIME ZONE', 'TIMESTAMP WITH TIME ZONE' ], - # Postgres when loaded from a dataframe - [ - 'BIGINT', 'VARCHAR(12)', 'VARCHAR(12)', 'VARCHAR(4)', 'VARCHAR(4)', - 'VARCHAR(12)', 'VARCHAR(444)', 'DATE', 'TIME WITHOUT TIME ZONE', - 'TIMESTAMP WITHOUT TIME ZONE', 'TIMESTAMP WITH TIME ZONE' - ], - # BigQuery - [ + 'bigquery': [ "", "", "", @@ -39,6 +29,35 @@ "", "" ], + 'mysql': [ + 'INTEGER(11)', 'VARCHAR(3)', 'VARCHAR(3)', 'VARCHAR(1)', 'VARCHAR(1)', 'VARCHAR(3)', + 'VARCHAR(111)', 'DATE', 'TIME', 'DATETIME(6)', 'DATETIME(6)' + ], +} + +expected_df_loaded_database_column_types = { + 'postgresql': [ + 'BIGINT', 'VARCHAR(12)', 'VARCHAR(12)', 'VARCHAR(4)', 'VARCHAR(4)', + 'VARCHAR(12)', 'VARCHAR(444)', 'DATE', 'TIME WITHOUT TIME ZONE', + 'TIMESTAMP WITHOUT TIME ZONE', 'TIMESTAMP WITH TIME ZONE' + ], + 'mysql': [ + 'BIGINT(20)', 'VARCHAR(3)', 'VARCHAR(3)', 'VARCHAR(1)', 'VARCHAR(1)', 'VARCHAR(3)', + 'VARCHAR(111)', 'DATE', 'TIME', 'DATETIME(6)', 'DATETIME(6)' + ], + 'vertica': [ + 'INTEGER', 'VARCHAR(12)', 'VARCHAR(12)', 'VARCHAR(4)', 'VARCHAR(4)', + 'VARCHAR(12)', 'VARCHAR(444)', 'DATE', 'TIME', + 'TIMESTAMP', 'TIMESTAMP' + ], + 'redshift': [ + 'BIGINT', 'VARCHAR(12)', 'VARCHAR(12)', 'VARCHAR(4)', 'VARCHAR(4)', + 'VARCHAR(12)', 'VARCHAR(444)', 'DATE', 'VARCHAR(8)', + 'TIMESTAMP WITHOUT TIME ZONE', 'TIMESTAMP WITH TIME ZONE' + ], +} + +expected_table2table_column_types = { # Notes on table2table triggered results: # # @@ -84,49 +103,41 @@ # # # - # postgres2postgres - [ + ('postgresql', 'postgresql'): [ 'INTEGER', 'VARCHAR(256)', 'VARCHAR(256)', 'VARCHAR(256)', 'VARCHAR(256)', 'VARCHAR(256)', 'VARCHAR(256)', 'DATE', 'TIME WITHOUT TIME ZONE', 'TIMESTAMP WITHOUT TIME ZONE', 'TIMESTAMP WITH TIME ZONE' ], - # postgres2vertica - [ + 
('postgresql', 'vertica'): [ 'INTEGER', 'VARCHAR(256)', 'VARCHAR(256)', 'VARCHAR(256)', 'VARCHAR(256)', 'VARCHAR(256)', 'VARCHAR(256)', 'DATE', 'TIME', 'TIMESTAMP', 'TIMESTAMP' ], - # postgres2redshift - [ + ('postgresql', 'redshift'): [ 'INTEGER', 'VARCHAR(256)', 'VARCHAR(256)', 'VARCHAR(256)', 'VARCHAR(256)', 'VARCHAR(256)', 'VARCHAR(256)', 'DATE', 'VARCHAR(8)', 'TIMESTAMP WITHOUT TIME ZONE', 'TIMESTAMP WITH TIME ZONE' ], - # redshift2vertica - [ + ('redshift', 'vertica'): [ 'INTEGER', 'VARCHAR(3)', 'VARCHAR(3)', 'VARCHAR(1)', 'VARCHAR(1)', 'VARCHAR(3)', 'VARCHAR(111)', 'DATE', 'VARCHAR(8)', 'TIMESTAMP', 'TIMESTAMP' ], - # bigquery2redshift - [ + ('bigquery', 'redshift'): [ 'BIGINT', 'VARCHAR(256)', 'VARCHAR(256)', 'VARCHAR(256)', 'VARCHAR(256)', 'VARCHAR(256)', 'VARCHAR(256)', 'DATE', 'VARCHAR(8)', 'TIMESTAMP WITHOUT TIME ZONE', 'TIMESTAMP WITH TIME ZONE' ], - # bigquery2postgres - [ + ('bigquery', 'postgresql'): [ 'BIGINT', 'VARCHAR(256)', 'VARCHAR(256)', 'VARCHAR(256)', 'VARCHAR(256)', 'VARCHAR(256)', 'VARCHAR(256)', 'DATE', 'TIME WITHOUT TIME ZONE', 'TIMESTAMP WITHOUT TIME ZONE', 'TIMESTAMP WITH TIME ZONE' ], - # bigquery2vertica - [ + ('bigquery', 'vertica'): [ 'INTEGER', 'VARCHAR(256)', 'VARCHAR(256)', 'VARCHAR(256)', 'VARCHAR(256)', 'VARCHAR(256)', 'VARCHAR(256)', 'DATE', 'TIME', 'TIMESTAMP', 'TIMESTAMP' ], - # redshift2bigquery - [ + ('redshift', 'bigquery'): [ "", "", "", @@ -139,28 +150,51 @@ "", "", ], - # vertica2postgres - [ + ('mysql', 'bigquery'): [ + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "" + ], + ('redshift', 'mysql'): [ + 'INTEGER(11)', 'VARCHAR(3)', 'VARCHAR(3)', 'VARCHAR(1)', 'VARCHAR(1)', 'VARCHAR(3)', + 'VARCHAR(111)', 'DATE', 'VARCHAR(8)', 'DATETIME(6)', 'DATETIME(6)' + ], + ('postgresql', 'mysql'): [ + 'INTEGER(11)', 'VARCHAR(256)', 'VARCHAR(256)', 'VARCHAR(256)', 'VARCHAR(256)', + 'VARCHAR(256)', + 'VARCHAR(256)', 'DATE', 'TIME', 'DATETIME(6)', 'DATETIME(6)' + ], + ('bigquery', 'mysql'): [ + 'BIGINT(20)', 'VARCHAR(256)', 'VARCHAR(256)', 'VARCHAR(256)', 'VARCHAR(256)', + 'VARCHAR(256)', + 'VARCHAR(256)', 'DATE', 'TIME', 'DATETIME(6)', 'DATETIME(6)' + ], + ('mysql', 'postgresql'): [ + 'INTEGER', 'VARCHAR(12)', 'VARCHAR(12)', 'VARCHAR(4)', 'VARCHAR(4)', 'VARCHAR(12)', + 'VARCHAR(444)', 'DATE', 'TIME WITHOUT TIME ZONE', 'TIMESTAMP WITHOUT TIME ZONE', + 'TIMESTAMP WITHOUT TIME ZONE' + ], + ('mysql', 'redshift'): [ + 'INTEGER', 'VARCHAR(12)', 'VARCHAR(12)', 'VARCHAR(4)', 'VARCHAR(4)', 'VARCHAR(12)', + 'VARCHAR(444)', 'DATE', 'VARCHAR(8)', 'TIMESTAMP WITHOUT TIME ZONE', + 'TIMESTAMP WITHOUT TIME ZONE' + ], + ('vertica', 'postgresql'): [ 'BIGINT', 'VARCHAR(3)', 'VARCHAR(3)', 'VARCHAR(1)', 'VARCHAR(1)', 'VARCHAR(3)', 'VARCHAR(111)', 'DATE', 'TIME WITHOUT TIME ZONE', 'TIMESTAMP WITHOUT TIME ZONE', 'TIMESTAMP WITH TIME ZONE' ], - # vertica2redshift - [ + ('vertica', 'redshift'): [ 'BIGINT', 'VARCHAR(3)', 'VARCHAR(3)', 'VARCHAR(1)', 'VARCHAR(1)', 'VARCHAR(3)', 'VARCHAR(111)', 'DATE', 'VARCHAR(8)', 'TIMESTAMP WITHOUT TIME ZONE', 'TIMESTAMP WITH TIME ZONE' ], - # Vertica when loaded from a dataframe - [ - 'INTEGER', 'VARCHAR(12)', 'VARCHAR(12)', 'VARCHAR(4)', 'VARCHAR(4)', - 'VARCHAR(12)', 'VARCHAR(444)', 'DATE', 'TIME', - 'TIMESTAMP', 'TIMESTAMP' - ], - # Redshift when loaded from a dataframe - [ - 'BIGINT', 'VARCHAR(12)', 'VARCHAR(12)', 'VARCHAR(4)', 'VARCHAR(4)', - 'VARCHAR(12)', 'VARCHAR(444)', 'DATE', 'VARCHAR(8)', - 'TIMESTAMP WITHOUT TIME ZONE', 'TIMESTAMP WITH TIME ZONE' - ], -] +} diff --git a/tests/integration/records/mover_test_case.py 
b/tests/integration/records/mover_test_case.py
new file mode 100644
index 000000000..46620a8e7
--- /dev/null
+++ b/tests/integration/records/mover_test_case.py
@@ -0,0 +1,120 @@
+from sqlalchemy.engine import Engine
+from typing import Optional, List
+from records_mover.records import DelimitedVariant
+
+
+class MoverTestCase:
+    def __init__(self,
+                 target_db_engine: Engine,
+                 source_db_engine: Optional[Engine] = None,
+                 file_variant: Optional[DelimitedVariant] = None) -> None:
+        """
+        :param target_db_engine: Target database of the records move.
+
+        :param source_db_engine: Source database of the records
+        move.  None if we are loading from a file or a dataframe
+        instead of copying from one database to another.
+
+        :param file_variant: None means the data was given to records mover via a Pandas
+        dataframe or by copying from another database instead of a CSV.
+        """
+        self.target_db_engine = target_db_engine
+        self.source_db_engine = source_db_engine
+        self.file_variant = file_variant
+
+    def database_has_no_usable_timestamptz_type(self, engine: Engine) -> bool:
+        # If true, timestamptzs are treated like timestamps: the
+        # timezone is stripped off without being looked at, and the
+        # timestamp itself is stored with its hour number
+        # unmodified.
+        return engine.name == 'mysql'
+
+    def database_default_store_timezone_is_us_eastern(self) -> bool:
+        """
+        If we don't specify a timezone in a timestamptz string, does the
+        database assign the US/Eastern timezone when it's stored?
+        """
+
+        # We've seen this for some Vertica servers in the past, but it
+        # doesn't affect our current integration test targets.
+
+        # This seems to be controlled in Vertica by what timezone is
+        # set on the cluster servers at Vertica install-time.  The
+        # Docker image (jbfavre/vertica) uses UTC, but our physical
+        # servers used when integration tests are run by hand do not.
+        return False
+
+    def selects_time_types_as_timedelta(self) -> bool:
+        return self.target_db_engine.name == 'mysql'
+
+    def supports_time_without_date(self) -> bool:
+        # Redshift as a source or destination doesn't support a time
+        # type, meaning the net result will be time as a string type.
+        return (self.target_db_engine.name != 'redshift'
+                and (self.source_db_engine is None or
+                     self.source_db_engine.name != 'redshift'))
+
+    def variant_doesnt_support_seconds(self, variant: DelimitedVariant):
+        # things are represented as second-denominated date + time
+        #
+        # e.g. - 1/1/00,12:00 AM
+        return variant == 'csv'
+
+    def variant_doesnt_support_timezones(self,
+                                         variant: Optional[DelimitedVariant]) -> bool:
+        return variant in ['csv', 'bigquery']
+
+    def variant_uses_am_pm(self, variant: DelimitedVariant) -> bool:
+        return variant == 'csv'
+
+    def supported_load_variants(self, db_engine: Engine) -> List[DelimitedVariant]:
+        if db_engine.name == 'bigquery':
+            return ['bigquery']
+        elif db_engine.name == 'vertica':
+            return ['bluelabs', 'vertica']
+        elif db_engine.name == 'redshift':
+            # This isn't really true, but is good enough to make the
+            # tests pass for now.  We need to create a new
+            # variant name for the CSV-esque variant that we now
+            # prefer for Redshift.
+ return ['bluelabs', 'csv', 'bigquery'] + elif db_engine.name == 'postgresql': + return ['bluelabs', 'csv', 'bigquery'] + elif db_engine.name == 'mysql': + return [] + else: + raise NotImplementedError(f"Teach me about database type {db_engine.name}") + + def default_load_variant(self, db_engine: Engine) -> Optional[DelimitedVariant]: + supported = self.supported_load_variants(db_engine) + if len(supported) == 0: + return None + return supported[0] + + def determine_load_variant(self) -> Optional[DelimitedVariant]: + if self.loaded_from_file(): + if self.file_variant in self.supported_load_variants(self.target_db_engine): + return self.file_variant + else: + return self.default_load_variant(self.target_db_engine) + else: + # If we're not loading from a file, we're copying from a database + if self.loaded_from_dataframe(): + # Loading from a dataframe + return self.default_load_variant(self.target_db_engine) + else: + # Loading from a database + assert self.source_db_engine is not None + if self.source_db_engine.name == 'bigquery': + return 'bigquery' + else: + return 'vertica' + + def loaded_from_database(self) -> bool: + return self.source_db_engine is not None + + def loaded_from_dataframe(self) -> bool: + return self.file_variant is None and self.source_db_engine is None + + def loaded_from_file(self) -> bool: + return self.file_variant is not None diff --git a/tests/integration/records/multi_db/test_records_table2table.py b/tests/integration/records/multi_db/test_records_table2table.py index e739ac69a..c63d4f922 100644 --- a/tests/integration/records/multi_db/test_records_table2table.py +++ b/tests/integration/records/multi_db/test_records_table2table.py @@ -16,13 +16,14 @@ TARGET_TABLE_NAME_PREFIX = "itest_target" TARGET_TABLE_NAME = f'{TARGET_TABLE_NAME_PREFIX}_{BUILD_NUM}_{CURRENT_EPOCH}' -DB_TYPES = ['vertica', 'redshift', 'bigquery', 'postgres'] +DB_TYPES = ['vertica', 'redshift', 'bigquery', 'postgres', 'mysql'] DB_NAMES = { 'vertica': 'dockerized-vertica', 'redshift': 'demo-itest', 'bigquery': 'bltoolsdevbq-bq_itest', 'postgres': 'dockerized-postgres', + 'mysql': 'dockerized-mysql', } @@ -31,6 +32,8 @@ def schema_name(db_name): return 'itest' elif db_name == 'dockerized-vertica': return 'public' + elif db_name == 'dockerized-mysql': + return 'mysqlitest' elif db_name == 'dockerized-postgres': return 'public' elif db_name == 'bltoolsdevbq-bq_itest': @@ -82,7 +85,7 @@ def move_and_verify(self, source_dbname: str, target_dbname: str) -> None: # will be None or 1 self.assertNotEqual(0, out.move_count) validator = RecordsTableValidator(target_engine, - source_data_db_engine=source_engine) + source_db_engine=source_engine) validator.validate(schema_name=target_schema_name, table_name=TARGET_TABLE_NAME) diff --git a/tests/integration/records/purge_old_test_sheets.py b/tests/integration/records/purge_old_test_sheets.py index c8935515f..434dbf3b4 100755 --- a/tests/integration/records/purge_old_test_sheets.py +++ b/tests/integration/records/purge_old_test_sheets.py @@ -36,8 +36,9 @@ def delete_sheet_by_id(service: SheetsService, body=batch_update_spreadsheet_request_body) try: request.execute() - except HttpError as e: - logger.info(e) + except HttpError: + logger.exception('Transient problem cleaning up temporary sheet ' + '- will try again some other time.') # continue on - likely because something else in parallel hit # this at once, and even if not, we can garbage collect this # sheet in the next attempt with the next batch of tests diff --git 
a/tests/integration/records/records_database_fixture.py b/tests/integration/records/records_database_fixture.py index 8b4d780c6..6c10e1b56 100644 --- a/tests/integration/records/records_database_fixture.py +++ b/tests/integration/records/records_database_fixture.py @@ -84,6 +84,21 @@ def bring_up(self): '00:00:00'::TIME AS "time", '2000-01-02 12:34:56.789012'::TIMESTAMP AS "timestamp", '2000-01-02 12:34:56.789012 US/Eastern'::TIMESTAMPTZ as "timestamptz"; +""" # noqa + elif self.engine.name == 'mysql': + create_tables = f""" + CREATE TABLE {self.schema_name}.{self.table_name} AS + SELECT 123 AS num, + '123' AS numstr, + 'foo' AS str, + ',' AS comma, + '"' AS doublequote, + '","' AS quotecommaquote, + '* SQL unload would generate multiple files (one for each slice/part)\n* Filecat would produce a single data file' AS newlinestr, + DATE '2000-01-01' AS "date", + TIME '00:00:00' AS "time", + TIMESTAMP '2000-01-02 12:34:56.789012' AS "timestamp", + TIMESTAMP '2000-01-02 12:34:56.789012-05' AS "timestamptz"; """ # noqa else: raise NotImplementedError(f"Please teach me how to integration test {self.engine.name}") diff --git a/tests/integration/records/records_numeric_database_fixture.py b/tests/integration/records/records_numeric_database_fixture.py index 58d12a53e..c46ce322e 100644 --- a/tests/integration/records/records_numeric_database_fixture.py +++ b/tests/integration/records/records_numeric_database_fixture.py @@ -50,6 +50,65 @@ def bring_up(self): 1234.56::NUMERIC(6, 2) AS fixed_6_2, 12147483647.78::REAL AS float32, 19223372036854775807.78::FLOAT8 AS float64; +""" # noqa + elif self.engine.name == 'mysql': + # MySQL supports a number of different numeric types + # https://dev.mysql.com/doc/refman/8.0/en/numeric-types.html + # + create_tables = f""" + CREATE TABLE {self.schema_name}.{self.table_name} ( + `int8` TINYINT, + `uint8` TINYINT UNSIGNED, + `int16` SMALLINT, + `uint16` SMALLINT UNSIGNED, + `int24` MEDIUMINT, + `uint24` MEDIUMINT UNSIGNED, + `int32` INT, + `uint32` INT UNSIGNED, + `int64` BIGINT, + `uint64` BIGINT UNSIGNED, + `fixed_6_2` DECIMAL(6, 2), + `fixed_38_9` DECIMAL(38, 9), + `fixed_65_30` DECIMAL(65, 30), + `float32` FLOAT, + `float64` DOUBLE + ); + INSERT INTO {self.schema_name}.{self.table_name} + ( + `int8`, + `uint8`, + `int16`, + `uint16`, + `int24`, + `uint24`, + `int32`, + `uint32`, + `int64`, + `uint64`, + `fixed_6_2`, + `fixed_38_9`, + `fixed_65_30`, + `float32`, + `float64` + ) + VALUES + ( + 127, + 128, + 32767, + 32768, + 8388607, + 8388608, + 2147483647, + 2147483648, + 9223372036854775807, + 9223372036854775808, + 1234.56, + 1234.56, + 1234.56, + 12147483647.78, + 19223372036854775807.78 + ); """ # noqa else: raise NotImplementedError(f"Please teach me how to integration test {self.engine.name}") diff --git a/tests/integration/records/single_db/base_records_test.py b/tests/integration/records/single_db/base_records_test.py index 208f6abde..d836cd188 100644 --- a/tests/integration/records/single_db/base_records_test.py +++ b/tests/integration/records/single_db/base_records_test.py @@ -47,6 +47,8 @@ def setUp(self): if self.engine.name == 'bigquery': self.schema_name = 'bq_itest' # avoid per-table rate limits + elif self.engine.name == 'mysql': + self.schema_name = 'mysqlitest' else: self.schema_name = 'public' table_name_prefix = "itest_" @@ -80,11 +82,8 @@ def resource_name(self, format_type, variant, hints): else: return f"{format_type}-{variant}-no-header" - def running_from_laptop(self): - return sys.platform == 'darwin' - def has_scratch_bucket(self): - 
return os.environ.get('SCRATCH_S3_URL') is not None or self.running_from_laptop() + return os.environ.get('SCRATCH_S3_URL') is not None def has_pandas(self): try: diff --git a/tests/integration/records/single_db/numeric_expectations.py b/tests/integration/records/single_db/numeric_expectations.py index 02ab588ec..156712f2b 100644 --- a/tests/integration/records/single_db/numeric_expectations.py +++ b/tests/integration/records/single_db/numeric_expectations.py @@ -1,4 +1,18 @@ expected_field_info = { + 'int8': { + 'type': 'integer', + 'constraints': { + 'min': '-128', + 'max': '127' + } + }, + 'uint8': { + 'type': 'integer', + 'constraints': { + 'min': '0', + 'max': '255' + } + }, 'int16': { 'type': 'integer', 'constraints': { @@ -6,6 +20,27 @@ 'max': '32767' } }, + 'uint16': { + 'type': 'integer', + 'constraints': { + 'min': '0', + 'max': '65535' + } + }, + 'int24': { + 'type': 'integer', + 'constraints': { + 'min': '-8388608', + 'max': '8388607' + } + }, + 'uint24': { + 'type': 'integer', + 'constraints': { + 'min': '0', + 'max': '16777215' + } + }, 'int32': { 'type': 'integer', 'constraints': { @@ -13,6 +48,13 @@ 'max': '2147483647' } }, + 'uint32': { + 'type': 'integer', + 'constraints': { + 'min': '0', + 'max': '4294967295' + } + }, 'int64': { 'type': 'integer', 'constraints': { @@ -20,6 +62,13 @@ 'max': '9223372036854775807' } }, + 'uint64': { + 'type': 'integer', + 'constraints': { + 'min': '0', + 'max': '18446744073709551615' + } + }, 'fixed_6_2': { 'type': 'decimal', 'constraints': { @@ -34,6 +83,13 @@ 'fixed_scale': 9 } }, + 'fixed_65_30': { + 'type': 'decimal', + 'constraints': { + 'fixed_precision': 65, + 'fixed_scale': 30 + } + }, 'float32': { 'type': 'decimal', 'constraints': { @@ -130,4 +186,26 @@ 'fixed_38_9': 'NUMERIC(38, 9)', 'fixed_100_4': 'NUMERIC(100, 4)', }, + # The numbers after the integer types are display widths - how + # many spaces to save to render them on output. Not especially + # relevant and records-mover just uses the defaults which end up + # as the below. 
+ 'mysql': { + 'int8': 'TINYINT(4)', + 'int16': 'SMALLINT(6)', + 'int32': 'INTEGER(11)', + 'int64': 'BIGINT(20)', + 'ubyte': 'TINYINT(3) UNSIGNED', + 'uint8': 'TINYINT(3) UNSIGNED', + 'uint16': 'SMALLINT(5) UNSIGNED', + 'uint32': 'INTEGER(10) UNSIGNED', + 'uint64': 'BIGINT(20) UNSIGNED', + 'float16': 'FLOAT', + 'float32': 'FLOAT', + 'float64': 'DOUBLE', + 'float128': 'DOUBLE', # MySQL doesn't support >float64 + 'fixed_6_2': 'DECIMAL(6, 2)', + 'fixed_38_9': 'DECIMAL(38, 9)', + 'fixed_100_4': 'DOUBLE', # MySQL doesn't support NUMERIC(n,d) where n>65 + }, } diff --git a/tests/integration/records/single_db/test_records_load.py b/tests/integration/records/single_db/test_records_load.py index 9f55f757f..fc7d9283b 100644 --- a/tests/integration/records/single_db/test_records_load.py +++ b/tests/integration/records/single_db/test_records_load.py @@ -152,6 +152,6 @@ def load(self, format_type, variant, hints={}, broken=False, sourcefn=None): else: self.assertIn(out.move_count, [None, 1]) - def verify_db_table(self, variant): + def verify_db_table(self, variant) -> None: validator = RecordsTableValidator(self.engine, file_variant=variant) validator.validate(schema_name=self.schema_name, table_name=self.table_name) diff --git a/tests/integration/records/single_db/test_records_numeric.py b/tests/integration/records/single_db/test_records_numeric.py index ff52df69d..e12489279 100644 --- a/tests/integration/records/single_db/test_records_numeric.py +++ b/tests/integration/records/single_db/test_records_numeric.py @@ -89,6 +89,7 @@ def test_numeric_database_columns_created(self): 'bigquery': 'bigquery', 'vertica': 'vertica', 'postgresql': 'bluelabs', + 'mysql': 'bluelabs', } records_format = DelimitedRecordsFormat(variant=preferred_records_format[self.engine.name]) source = self.records.sources.\ diff --git a/tests/integration/records/table_timezone_validator.py b/tests/integration/records/table_timezone_validator.py new file mode 100644 index 000000000..b6dc90bec --- /dev/null +++ b/tests/integration/records/table_timezone_validator.py @@ -0,0 +1,169 @@ +import pytz +import datetime +from sqlalchemy.engine import Engine +from typing import Optional +from records_mover.records import DelimitedVariant +from .mover_test_case import MoverTestCase + + +class RecordsTableTimezoneValidator: + def __init__(self, + tc: MoverTestCase, + target_db_engine: Engine, + source_db_engine: Optional[Engine] = None, + file_variant: Optional[DelimitedVariant] = None) -> None: + self.tc = tc + self.target_db_engine = target_db_engine + self.source_db_engine = source_db_engine + self.file_variant = file_variant + + def validate(self, + timestampstr: str, + timestamptzstr: str, + timestamptz: datetime.datetime) -> None: + load_variant = self.tc.determine_load_variant() + + if (self.source_db_engine is not None and + self.tc.database_has_no_usable_timestamptz_type(self.source_db_engine)): + # The source database doesn't know anything about + # timezones, and records_database_fixture.py inserts + # something like "2000-01-02 12:34:56.789012-05" - and the + # timezone parts gets ignored by the database. 
+            #
+            utc_hour = 12
+        elif (self.tc.loaded_from_file() and
+              self.tc.database_has_no_usable_timestamptz_type(self.target_db_engine)):
+            # In this case, we're trying to load a string that looks like this:
+            #
+            # 2000-01-02 12:34:56.789012-05
+            #
+            # But since we're loading it into a column type that
+            # doesn't store timezones, the database in question just
+            # strips off the timezone and stores the '12'
+            utc_hour = 12
+        elif (self.tc.loaded_from_dataframe() and
+              self.tc.database_has_no_usable_timestamptz_type(self.target_db_engine)):
+            #
+            # In this case, we correctly tell Pandas that we are
+            # at noon:34 US/Eastern, and tell Pandas how to format
+            # the datetime.
+            #
+            # But since we're loading it into a column type that
+            # doesn't store timezones, the database in question just
+            # strips off the timezone and stores the '12'
+            #
+            utc_hour = 12
+        elif (self.tc.loaded_from_file() and
+              load_variant != self.file_variant and
+              self.tc.variant_doesnt_support_timezones(load_variant) and
+              not self.tc.database_default_store_timezone_is_us_eastern()):
+            # In this case, we're trying to load a string that looks like this:
+            #
+            # 2000-01-02 12:34:56.789012-05
+            #
+            # That gets correctly turned into a dataframe representing
+            # noon:34 US/Eastern.  We tell Pandas how to format the
+            # datetime in the load variant.  Unfortunately, if
+            # you don't specify a timezone as part of that format,
+            # Pandas just prints the TZ-naive hour.
+            utc_hour = 12
+        elif (self.tc.loaded_from_dataframe() and
+              self.tc.variant_doesnt_support_timezones(load_variant) and
+              not self.tc.database_default_store_timezone_is_us_eastern()):
+            #
+            # In this case, we correctly tell Pandas that we are
+            # at noon:34 US/Eastern, and tell Pandas how to format the
+            # datetime.  Unfortunately, if you don't specify a
+            # timezone as part of that format, Pandas just prints the
+            # TZ-naive hour.
+            #
+            utc_hour = 12
+        elif (self.tc.variant_doesnt_support_timezones(self.file_variant) and
+              not self.tc.database_default_store_timezone_is_us_eastern()):
+            # In this case we're loading from one of our example
+            # files, but the example file doesn't contain a timezone.
+            # Per tests/integration/resources/README.md:
+            #
+            # On systems where a timezone can't be represented,
+            # this should be represented as if the implicit
+            # timezone was US/Eastern.
+            #
+            # Example date that we'd be loading as a string:
+            #
+            # 2000-01-02 12:34:56.789012
+            #
+            # Per our tests/integration/resources/README.md, this is
+            # representing noon:34 on the east coast.
+            #
+            # However, if we load into a database that assumes that
+            # timezoneless times coming in are in UTC, when we
+            # select it back out in UTC form, it'll come back as noon
+            # UTC!
+            utc_hour = 12
+        else:
+            # ...if, however, the variant *does* support
+            # timezones, then when it gets rendered back as UTC,
+            # it'll be at hour 17 UTC - which is noon US/Eastern.
+
+            # ...and if the database assumes US/Eastern when storing,
+            # the same result will happen, as the database will
+            # understand that noon on the east coast is hour 17 UTC.
+ utc_hour = 17 + if ((load_variant is not None and + self.tc.variant_doesnt_support_seconds(load_variant)) or + ((self.file_variant is not None and + self.tc.variant_doesnt_support_seconds(self.file_variant)))): + seconds = '00' + micros = '000000' + else: + seconds = '56' + micros = '789012' + + assert timestampstr == f'2000-01-02 12:34:{seconds}.{micros}',\ + f"expected '2000-01-02 12:34:{seconds}.{micros}' got '{timestampstr}'" + + if (self.source_db_engine is not None and + self.tc.database_has_no_usable_timestamptz_type(self.source_db_engine)): + # Depending on the capabilities of the target database, we + # may not be able to get a rendered version that includes + # the UTC tz - but either way we won't have transferred a + # timezone in. + assert timestamptzstr in [ + f'2000-01-02 {utc_hour}:34:{seconds}.{micros} ', + f'2000-01-02 {utc_hour}:34:{seconds}.{micros} UTC', + f'2000-01-02 {utc_hour}:34:{seconds}.{micros}+00' + ],\ + (f"translated timestamptzstr was {timestamptzstr} and " + f"class is {type(timestamptzstr)} - expected " + f"hour to be {utc_hour}") + else: + assert timestamptzstr in [ + f'2000-01-02 {utc_hour}:34:{seconds}.{micros} UTC', + f'2000-01-02 {utc_hour}:34:{seconds}.{micros}+00' + ],\ + (f"translated timestamptzstr was {timestamptzstr} and " + f"class is {type(timestamptzstr)} - expected " + f"hour to be {utc_hour}") + + utc = pytz.timezone('UTC') + if ((load_variant is not None and + self.tc.variant_doesnt_support_seconds(load_variant)) or + (self.file_variant is not None and + self.tc.variant_doesnt_support_seconds(self.file_variant))): + utc_naive_expected_time = datetime.datetime(2000, 1, 2, utc_hour, 34) + else: + utc_naive_expected_time = datetime.datetime(2000, 1, 2, utc_hour, 34, 56, 789012) + utc_expected_time = utc.localize(utc_naive_expected_time) + + # Dunno why sqlalchemy doesn't return this instead, but + # timestamptzstr shows that db knows what's up internally: + # + actual_time = timestamptz + if actual_time.tzinfo is None: + assert actual_time - utc_naive_expected_time == datetime.timedelta(0),\ + f"Delta is {actual_time - utc_naive_expected_time}, " \ + f"actual_time is {actual_time}, tz-naive expected time is {utc_naive_expected_time}" + else: + assert actual_time - utc_expected_time == datetime.timedelta(0),\ + f"Delta is {actual_time - utc_expected_time}, " \ + f"actual_time is {actual_time}, expected time is {utc_expected_time}" diff --git a/tests/integration/records/table_validator.py b/tests/integration/records/table_validator.py index fd20a385e..80d5e0bdd 100644 --- a/tests/integration/records/table_validator.py +++ b/tests/integration/records/table_validator.py @@ -1,4 +1,3 @@ -import pytz import datetime import logging from sqlalchemy.engine import Engine @@ -6,8 +5,14 @@ from sqlalchemy.sql.elements import TextClause from typing import Optional, Dict, Any, Union from .timezone import set_session_tz -from .expected_column_types import expected_column_types +from .expected_column_types import ( + expected_single_database_column_types, + expected_df_loaded_database_column_types, + expected_table2table_column_types +) from records_mover.records import DelimitedVariant +from .mover_test_case import MoverTestCase +from .table_timezone_validator import RecordsTableTimezoneValidator logger = logging.getLogger(__name__) @@ -25,57 +30,30 @@ # load_variant: Variant which the target database will eventually load # from, or None if the database will be loaded via INSERT.. 
class RecordsTableValidator: - def __init__(self, db_engine: Engine, - source_data_db_engine: Optional[Engine] = None, + def __init__(self, + target_db_engine: Engine, + source_db_engine: Optional[Engine] = None, file_variant: Optional[DelimitedVariant] = None) -> None: """ :param db_engine: Target database of the records move. - - :param source_data_db_engine: Source database of the records + :param source_db_engine: Source database of the records move. None if we are loading from a file or a dataframe instead of copying from one database to another. - + :param target_db_engine: Target database of the records + move. :param file_variant: None means the data was given to records mover via a Pandas dataframe or by copying from another database instead of a CSV. """ - self.engine = db_engine - self.source_data_db_engine = source_data_db_engine + self.target_db_engine = target_db_engine + self.source_db_engine = source_db_engine self.file_variant = file_variant - - def database_default_store_timezone_is_us_eastern(self) -> bool: - """ - If we don't specify a timezone in a timestamptz string, does the - database assign the US/Eastern timezone when it's stored? - """ - - # We've seen this for some Vertica servers in the past, but it - # doesn't affect our current integration test targets. - - # This seems to be controlled in Vertica by what timezone is - # set on the cluster servers at Vertica install-time. The - # Docker image (jbfavre/vertica) uses UTC, but our physical - # servers when integration tests are run by hand does not. - return False - - def supports_time_without_date(self) -> bool: - # Redshift as a source or destination doesn't support a time - # type, meaning the net result will be time as a string type. - return (self.engine.name != 'redshift' - and (self.source_data_db_engine is None or - self.source_data_db_engine.name != 'redshift')) - - def variant_doesnt_support_seconds(self, variant: DelimitedVariant): - # things are represented as second-denominated date + time - # - # e.g. 
- 1/1/00,12:00 AM - return variant == 'csv' - - def variant_doesnt_support_timezones(self, - variant: Optional[DelimitedVariant]) -> bool: - return variant in ['csv', 'bigquery'] - - def variant_uses_am_pm(self, variant: DelimitedVariant) -> bool: - return variant == 'csv' + self.tc = MoverTestCase(target_db_engine=target_db_engine, + source_db_engine=source_db_engine, + file_variant=file_variant) + self.tz_validator = RecordsTableTimezoneValidator(tc=self.tc, + target_db_engine=target_db_engine, + source_db_engine=source_db_engine, + file_variant=file_variant) def validate(self, schema_name: str, @@ -84,7 +62,9 @@ def validate(self, self.validate_data_values(schema_name, table_name) def validate_data_types(self, schema_name: str, table_name: str) -> None: - columns = self.engine.dialect.get_columns(self.engine, table_name, schema=schema_name) + columns = self.target_db_engine.dialect.get_columns(self.target_db_engine, + table_name, + schema=schema_name) expected_column_names = [ 'num', 'numstr', 'str', 'comma', 'doublequote', 'quotecommaquote', 'newlinestr', 'date', 'time', 'timestamp', 'timestamptz' @@ -99,46 +79,44 @@ def format_type(column: Dict[str, Any]) -> str: return str(column['type']) + suffix actual_column_types = [format_type(column) for column in columns] - assert actual_column_types in expected_column_types, actual_column_types - - def default_load_variant(self, db_engine: Engine) -> DelimitedVariant: - if db_engine.name == 'bigquery': - return 'bigquery' - elif db_engine.name == 'vertica': - return 'vertica' - else: - return 'bluelabs' - - def determine_load_variant(self) -> DelimitedVariant: - if self.file_variant is None: - - # If we're not loading from a file, we're copying from a database - if self.source_data_db_engine is None: - # Loading from a dataframe - return self.default_load_variant(self.engine) + if self.source_db_engine is None: + if self.file_variant is None: + assert actual_column_types in\ + (expected_df_loaded_database_column_types.get(self.target_db_engine.name), + expected_single_database_column_types[self.target_db_engine.name]),\ + f'Could not find column types filed under ' \ + f"{('df', self.target_db_engine.name)} or : " \ + f"{self.target_db_engine.name}: " \ + f'{actual_column_types}' else: - # Loading from a database - if self.source_data_db_engine.name == 'bigquery': - return 'bigquery' - else: - return 'vertica' + assert actual_column_types ==\ + expected_single_database_column_types[self.target_db_engine.name],\ + f'Could not find column types filed under {self.target_db_engine.name}: ' +\ + f'{actual_column_types}' else: - return self.file_variant - - def loaded_from_dataframe(self) -> bool: - return self.file_variant is None and self.source_data_db_engine is None + assert (actual_column_types in + (expected_table2table_column_types.get((self.source_db_engine.name, + self.target_db_engine.name)), + expected_single_database_column_types[self.source_db_engine.name], + expected_single_database_column_types[self.target_db_engine.name], + expected_df_loaded_database_column_types.get(self.target_db_engine.name))),\ + f'Could not find column types filed under '\ + f"{(self.source_db_engine.name, self.target_db_engine.name)} "\ + 'or either individually: '\ + f'{actual_column_types}' def validate_data_values(self, schema_name: str, table_name: str) -> None: params = {} - load_variant = self.determine_load_variant() - with self.engine.connect() as connection: + load_variant = self.tc.determine_load_variant() + + with self.target_db_engine.connect() as 
connection: set_session_tz(connection) select_sql: Union[TextClause, str] - if self.engine.name == 'bigquery': + if self.target_db_engine.name == 'bigquery': # # According to Google, "DATETIME is not supported for # uploading from Parquet" - @@ -153,18 +131,32 @@ def validate_data_values(self, # timezone on this column in this test validation code # for uniformity with a CAST(). # + # Similarly, when moving from MySQL, which doesn't + # support a usable datetimetz type, we'll end up + # creating a datetime type for the 'timestamptz' + # column, and will need to cast. select_sql = text(f""" SELECT num, numstr, comma, doublequote, quotecommaquote, date, `time`, CAST(`timestamp` AS datetime) as `timestamp`, format_datetime(:formatstr, CAST(`timestamp` as datetime)) as timestampstr, timestamptz, - format_timestamp(:tzformatstr, timestamptz) as timestamptzstr + format_timestamp(:tzformatstr, CAST(`timestamptz` as timestamp)) + as timestamptzstr FROM {schema_name}.{table_name} """) params = { "tzformatstr": "%E4Y-%m-%d %H:%M:%E*S %Z", "formatstr": "%E4Y-%m-%d %H:%M:%E*S", } + elif self.target_db_engine.name == 'mysql': + select_sql = f""" + SELECT num, numstr, comma, doublequote, quotecommaquote, date, `time`, + `timestamp`, + DATE_FORMAT(`timestamp`, '%%Y-%%m-%%d %%H:%%i:%%s.%%f') as timestampstr, + timestamptz, + DATE_FORMAT(timestamptz, '%%Y-%%m-%%d %%H:%%i:%%s.%%f+00') as timestamptzstr + FROM {schema_name}.{table_name} + """ else: select_sql = f""" SELECT num, numstr, comma, doublequote, quotecommaquote, date, "time", @@ -186,93 +178,32 @@ def validate_data_values(self, assert ret['quotecommaquote'] == '","' assert ret['date'] == datetime.date(2000, 1, 1),\ f"Expected datetime.date(2000, 1, 1), got {ret['date']}" - if self.supports_time_without_date(): - assert ret['time'] == datetime.time(0, 0), f"Incorrect time: {ret['time']}" + + if self.tc.supports_time_without_date(): + if self.tc.selects_time_types_as_timedelta(): + assert ret['time'] == datetime.timedelta(0, 0),\ + f"Incorrect time: {ret['time']} (of type {type(ret['time'])})" + else: + assert ret['time'] == datetime.time(0, 0),\ + f"Incorrect time: {ret['time']} (of type {type(ret['time'])})" else: # fall back to storing as string - if self.variant_uses_am_pm(load_variant): + if load_variant is not None and self.tc.variant_uses_am_pm(load_variant): assert ret['time'] == '12:00 AM', f"time was {ret['time']}" else: assert ret['time'] == '00:00:00', f"time was {ret['time']}" - if self.variant_doesnt_support_seconds(load_variant): + if (((load_variant is not None) and + self.tc.variant_doesnt_support_seconds(load_variant)) or + ((self.file_variant is not None) and + self.tc.variant_doesnt_support_seconds(self.file_variant))): assert ret['timestamp'] ==\ datetime.datetime(2000, 1, 2, 12, 34),\ f"Found timestamp {ret['timestamp']}" else: assert (ret['timestamp'] == datetime.datetime(2000, 1, 2, 12, 34, 56, 789012)),\ - f"ret['timestamp'] was {ret['timestamp']}" - - if (self.loaded_from_dataframe() and - self.variant_doesnt_support_timezones(load_variant) and - not self.database_default_store_timezone_is_us_eastern()): - # - # In this case, we correctly tell Pandas that we have are - # at noon:34 US/Eastern, and tell Pandas to format the - # datetime format. Unfortunately, if you don't specify a - # timezone as part of that format, Pandas just prints the - # TZ-naive hour. 
- # - utc_hour = 12 - elif (self.variant_doesnt_support_timezones(self.file_variant) and - not self.database_default_store_timezone_is_us_eastern()): - # In this case we're loading from one of our example - # files, but the example file doesn't contain a timezone. - # Per tests/integration/resources/README.md: - # - # On systems where a timezone can't be represented, - # this should be represented as if the implicit - # timezone was US/Eastern. - # - # Example date that we'd be loading as a string: - # - # 2000-01-02 12:34:56.789012 - # - # Per our tests/integration/resources/README.md, this is - # representing noon:34 on the east coast. - # - # However, if we load into a database who assumes that - # timezoneless times coming in are in in UTC, when we - # select it back out in UTC form, it'll come back as noon - # UTC! - utc_hour = 12 - else: - # ...if, however, either the variant *does* support - # timezones, if it gets rendered back as UTC, it'll be at - # hour 17 UTC - which is noon US/Eastern. - - # ...and if the database assumes US/Eastern when storing, - # the same result will happen, as the database will - # understand that noon on the east coast is hour 17 UTC. - utc_hour = 17 - if self.variant_doesnt_support_seconds(load_variant): - seconds = '00' - micros = '000000' - else: - seconds = '56' - micros = '789012' - - assert ret['timestampstr'] == f'2000-01-02 12:34:{seconds}.{micros}', ret['timestampstr'] - - assert ret['timestamptzstr'] in [ - f'2000-01-02 {utc_hour}:34:{seconds}.{micros} UTC', - f'2000-01-02 {utc_hour}:34:{seconds}.{micros}+00' - ],\ - (f"translated ret['timestamptzstr'] was {ret['timestamptzstr']} and " - f"class is {type(ret['timestamptzstr'])} - expected " - f"hour to be {utc_hour}") - - utc = pytz.timezone('UTC') - if self.variant_doesnt_support_seconds(load_variant): - utc_naive_expected_time = datetime.datetime(2000, 1, 2, utc_hour, 34) - else: - utc_naive_expected_time = datetime.datetime(2000, 1, 2, utc_hour, 34, 56, 789012) - utc_expected_time = utc.localize(utc_naive_expected_time) + f"ret['timestamp'] was {ret['timestamp']} of type {type(ret['timestamp'])}" - # Dunno why sqlalchemy doesn't return this instead, but - # timestamptzstr shows that db knows what's up internally: - # - actual_time = ret['timestamptz'] - assert actual_time - utc_expected_time == datetime.timedelta(0),\ - f"Delta is {actual_time - utc_expected_time}, " \ - f"actual_time is {actual_time}, expected time is {utc_expected_time}" + self.tz_validator.validate(timestampstr=ret['timestampstr'], + timestamptzstr=ret['timestamptzstr'], + timestamptz=ret['timestamptz']) diff --git a/tests/integration/resources/README.md b/tests/integration/resources/README.md index 271774edd..6f7ffa8e1 100644 --- a/tests/integration/resources/README.md +++ b/tests/integration/resources/README.md @@ -61,3 +61,10 @@ timezones in timestamps are expressed. output being the UTC time without offset (so, "17:34:56.789012" for our fixtures). Otherwise they would appear as "12:34:56.789012", the time in the US/Eastern timezone that our fixtures assign. + +### notz + +* Some databases (e.g., MySQL) don't have a generally usable column + type for our `datetimetz` type. As a result, when data is exported + from a table without a records schema defined, it's of course not + going to have a timezone offset recorded. 
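Editorial aside (not part of this patch): the following is a minimal, standalone Python sketch of the behavior the new `*-pandas-notz.csv` fixtures below capture. A column type with no zone information, such as MySQL's DATETIME used here for `datetimetz`, hands values back as naive datetimes, so an export of such a table has no offset to write. Only the standard library is used; the values mirror the test fixtures.

```python
import datetime

# The fixture value as it appears in a timezone-aware source
# (noon:34 US/Eastern, i.e. an explicit -05:00 offset):
aware = datetime.datetime(2000, 1, 2, 12, 34, 56, 789012,
                          tzinfo=datetime.timezone(datetime.timedelta(hours=-5)))

# A column type without zone information keeps the wall-clock value but
# drops the offset, so what comes back out of the database is naive:
naive = aware.replace(tzinfo=None)

# Rendering the naive value - as pandas does when writing a CSV - therefore
# cannot include an offset, which is what the -notz fixtures expect:
print(naive.isoformat(sep=' '))  # 2000-01-02 12:34:56.789012
print(aware.isoformat(sep=' '))  # 2000-01-02 12:34:56.789012-05:00
```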
diff --git a/tests/integration/resources/delimited-bigquery-no-header-pandas-notz.csv b/tests/integration/resources/delimited-bigquery-no-header-pandas-notz.csv new file mode 100644 index 000000000..c6bbc19e7 --- /dev/null +++ b/tests/integration/resources/delimited-bigquery-no-header-pandas-notz.csv @@ -0,0 +1,2 @@ +123,123,foo,",","""",""",""","* SQL unload would generate multiple files (one for each slice/part) +* Filecat would produce a single data file",2000-01-01,00:00:00,2000-01-02 12:34:56.789012,2000-01-02 12:34:56.789012 diff --git a/tests/integration/resources/delimited-bluelabs-no-header-pandas-notz.csv b/tests/integration/resources/delimited-bluelabs-no-header-pandas-notz.csv new file mode 100644 index 000000000..f542ef910 --- /dev/null +++ b/tests/integration/resources/delimited-bluelabs-no-header-pandas-notz.csv @@ -0,0 +1,2 @@ +123,123,foo,\,,","\,",* SQL unload would generate multiple files (one for each slice/part)\ +* Filecat would produce a single data file,2000-01-01,00:00:00,2000-01-02 12:34:56.789012,2000-01-02 12:34:56.789012 diff --git a/tests/integration/resources/delimited-bluelabs-with-header-pandas-notz.csv b/tests/integration/resources/delimited-bluelabs-with-header-pandas-notz.csv new file mode 100644 index 000000000..a52f2a461 --- /dev/null +++ b/tests/integration/resources/delimited-bluelabs-with-header-pandas-notz.csv @@ -0,0 +1,3 @@ +num,numstr,str,comma,doublequote,quotecommaquote,newlinestr,date,time,timestamp,timestamptz +123,123,foo,\,,","\,",* SQL unload would generate multiple files (one for each slice/part)\ +* Filecat would produce a single data file,2000-01-01,00:00:00,2000-01-02 12:34:56.789012,2000-01-02 12:34:56.789012 diff --git a/tests/integration/resources/delimited-vertica-no-header-pandas-notz.csv b/tests/integration/resources/delimited-vertica-no-header-pandas-notz.csv new file mode 100644 index 000000000..b608a15ae --- /dev/null +++ b/tests/integration/resources/delimited-vertica-no-header-pandas-notz.csv @@ -0,0 +1,2 @@ +123123foo,"","* SQL unload would generate multiple files (one for each slice/part) +* Filecat would produce a single data file2000-01-0100:00:002000-01-02 12:34:56.7890122000-01-02 12:34:56.789012 \ No newline at end of file diff --git a/tests/integration/resources/delimited-vertica-with-header-pandas-notz.csv b/tests/integration/resources/delimited-vertica-with-header-pandas-notz.csv new file mode 100644 index 000000000..581f2abe9 --- /dev/null +++ b/tests/integration/resources/delimited-vertica-with-header-pandas-notz.csv @@ -0,0 +1,2 @@ +numnumstrstrcommadoublequotequotecommaquotenewlinestrdatetimetimestamptimestamptz123123foo,"","* SQL unload would generate multiple files (one for each slice/part) +* Filecat would produce a single data file2000-01-0100:00:002000-01-02 12:34:56.7890122000-01-02 12:34:56.789012 \ No newline at end of file diff --git a/tests/unit/db/mysql/__init__.py b/tests/unit/db/mysql/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/unit/db/mysql/test_mysql_db_driver.py b/tests/unit/db/mysql/test_mysql_db_driver.py new file mode 100644 index 000000000..3863ff493 --- /dev/null +++ b/tests/unit/db/mysql/test_mysql_db_driver.py @@ -0,0 +1,128 @@ +import unittest +from records_mover.db.mysql.mysql_db_driver import MySQLDBDriver +from mock import MagicMock, Mock +import sqlalchemy + + +class TestMySQLDBDriver(unittest.TestCase): + def setUp(self): + self.mock_db_engine = MagicMock(name='db_engine') + self.mock_url_resolver = Mock(name='url_resolver') + 
self.mock_db_engine.engine = self.mock_db_engine + self.mysql_db_driver = MySQLDBDriver(db=self.mock_db_engine, + url_resolver=self.mock_url_resolver) + + def test_integer_limits(self): + expectations = { + sqlalchemy.dialects.mysql.TINYINT(): (-128, 127), + sqlalchemy.dialects.mysql.TINYINT(unsigned=True): (0, 255), + sqlalchemy.dialects.mysql.SMALLINT(): (-32768, 32767), + sqlalchemy.dialects.mysql.SMALLINT(unsigned=True): (0, 65535), + sqlalchemy.dialects.mysql.MEDIUMINT(): (-8388608, 8388607), + sqlalchemy.dialects.mysql.MEDIUMINT(unsigned=True): (0, 16777215), + sqlalchemy.dialects.mysql.INTEGER(): (-2147483648, 2147483647), + sqlalchemy.dialects.mysql.INTEGER(unsigned=True): (0, 4294967295), + sqlalchemy.dialects.mysql.BIGINT(): (-9223372036854775808, 9223372036854775807), + sqlalchemy.dialects.mysql.BIGINT(unsigned=True): (0, 18446744073709551615), + } + for mock_type, (expected_min_int, expected_max_int) in expectations.items(): + min_int, max_int = self.mysql_db_driver.integer_limits(mock_type) + self.assertEqual(min_int, expected_min_int) + self.assertEqual(max_int, expected_max_int) + + def test_integer_limits_unexpected_type(self): + out = self.mysql_db_driver.integer_limits(Mock(name='unexpected')) + self.assertEqual(None, out) + + def test_fp_constraints_double(self): + db_col_type = sqlalchemy.dialects.mysql.DOUBLE() + total_bits, significand_bits = self.mysql_db_driver.fp_constraints(db_col_type) + self.assertEqual(total_bits, 64) + self.assertEqual(significand_bits, 53) + + def test_fp_constraints_float(self): + db_col_type = sqlalchemy.sql.sqltypes.FLOAT() + total_bits, significand_bits = self.mysql_db_driver.fp_constraints(db_col_type) + self.assertEqual(total_bits, 32) + self.assertEqual(significand_bits, 23) + + def test_fp_constraints_unexpected_type(self): + out = self.mysql_db_driver.fp_constraints(Mock(name='unexpected')) + self.assertEqual(None, out) + + def test_type_for_tiny_fits(self): + out = self.mysql_db_driver.type_for_integer(-123, 123) + self.assertEqual(type(out), sqlalchemy.dialects.mysql.TINYINT) + self.assertEqual(out.unsigned, False) + + def test_type_for_unsigned_tiny_fits(self): + out = self.mysql_db_driver.type_for_integer(123, 253) + self.assertEqual(type(out), sqlalchemy.dialects.mysql.TINYINT) + self.assertEqual(out.unsigned, True) + + def test_type_for_smallint_fits(self): + out = self.mysql_db_driver.type_for_integer(-123, 300) + self.assertEqual(type(out), sqlalchemy.sql.sqltypes.SMALLINT) + + def test_type_for_unsigned_smallint_fits(self): + out = self.mysql_db_driver.type_for_integer(0, 60000) + self.assertEqual(type(out), sqlalchemy.dialects.mysql.SMALLINT) + self.assertEqual(out.unsigned, True) + + def test_type_for_mediumint_fits(self): + out = self.mysql_db_driver.type_for_integer(-123, 123000) + self.assertEqual(type(out), sqlalchemy.dialects.mysql.types.MEDIUMINT) + + def test_type_for_unsigned_integer_fits(self): + out = self.mysql_db_driver.type_for_integer(123, 2147483658) + self.assertEqual(type(out), sqlalchemy.dialects.mysql.INTEGER) + self.assertEqual(out.unsigned, True) + + def test_type_for_bigint_fits(self): + out = self.mysql_db_driver.type_for_integer(-123, 9223372036854775807) + self.assertEqual(type(out), sqlalchemy.sql.sqltypes.BIGINT) + + def test_type_for_unsigned_bigint_fits(self): + out = self.mysql_db_driver.type_for_integer(123, 9223372036854775808) + self.assertEqual(type(out), sqlalchemy.dialects.mysql.types.BIGINT) + self.assertEqual(out.unsigned, True) + + def test_type_for_integer_too_big(self): + out = 
self.mysql_db_driver.type_for_integer(-12300000000000000000, 123000000000000000000) + self.assertEqual(type(out), sqlalchemy.sql.sqltypes.Numeric) + + def test_type_for_integer_unspecified(self): + out = self.mysql_db_driver.type_for_integer(None, None) + self.assertEqual(type(out), sqlalchemy.sql.sqltypes.Integer) + + def test_type_for_floating_point_too_big(self): + out = self.mysql_db_driver.type_for_floating_point(100, 80) + self.assertEqual(type(out), sqlalchemy.sql.sqltypes.Float) + self.assertEqual(out.precision, 53) + + def test_type_for_floating_point_fits(self): + out = self.mysql_db_driver.type_for_floating_point(12, 8) + self.assertEqual(type(out), sqlalchemy.sql.sqltypes.Float) + self.assertEqual(out.precision, 8) + + def test_type_for_fixed_point_big(self): + type_ = self.mysql_db_driver.type_for_fixed_point(123, 45) + self.assertEqual(type(type_), sqlalchemy.dialects.mysql.types.DOUBLE) + + def test_type_for_fixed_point_small(self): + type_ = self.mysql_db_driver.type_for_fixed_point(12, 3) + self.assertEqual(type(type_), sqlalchemy.types.Numeric) + + def test_varchar_length_is_in_chars(self): + out = self.mysql_db_driver.varchar_length_is_in_chars() + self.assertEqual(out, True) + + def test_type_for_date_plus_time_with_tz(self): + out = self.mysql_db_driver.type_for_date_plus_time(has_tz=True) + self.assertEqual(type(out), sqlalchemy.dialects.mysql.DATETIME) + self.assertEqual(out.fsp, 6) + + def test_type_for_date_plus_time_with_no_tz(self): + out = self.mysql_db_driver.type_for_date_plus_time(has_tz=False) + self.assertEqual(type(out), sqlalchemy.dialects.mysql.DATETIME) + self.assertEqual(out.fsp, 6) diff --git a/tests/unit/db/vertica/base_test_vertica_db_driver.py b/tests/unit/db/vertica/base_test_vertica_db_driver.py index 949f72836..9661fb18f 100644 --- a/tests/unit/db/vertica/base_test_vertica_db_driver.py +++ b/tests/unit/db/vertica/base_test_vertica_db_driver.py @@ -24,6 +24,7 @@ def setUp(self): s3_temp_base_loc=self.mock_s3_temp_base_loc, url_resolver=self.mock_url_resolver) self.mock_VerticaLoader = mock_VerticaLoader + self.mock_vertica_loader = mock_VerticaLoader.return_value mock_records_unload_plan = create_autospec(RecordsUnloadPlan) mock_records_unload_plan.records_format = create_autospec(DelimitedRecordsFormat) diff --git a/tests/unit/db/vertica/test_vertica_db_driver.py b/tests/unit/db/vertica/test_vertica_db_driver.py index e096363da..693e7fdfd 100644 --- a/tests/unit/db/vertica/test_vertica_db_driver.py +++ b/tests/unit/db/vertica/test_vertica_db_driver.py @@ -95,6 +95,13 @@ def test_can_load_from_fileobjs(self): out = self.vertica_db_driver.can_load_from_fileobjs() self.assertEqual(True, out) + def test_can_load_this_format(self): + mock_source_records_format = Mock(name='source_records_format') + out = self.vertica_db_driver.can_load_this_format(mock_source_records_format) + self.assertEqual(self.mock_vertica_loader.can_load_this_format.return_value, + out) + self.mock_vertica_loader.can_load_this_format.assert_called_with(mock_source_records_format) + def test_best_records_format_variant(self): out = self.vertica_db_driver.best_records_format_variant('blah') self.assertEqual(None, out) diff --git a/tests/unit/records/pandas/__init__.py b/tests/unit/records/pandas/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/unit/records/pandas/test_prep_for_csv.py b/tests/unit/records/pandas/test_prep_for_csv.py new file mode 100644 index 000000000..6d53f2100 --- /dev/null +++ b/tests/unit/records/pandas/test_prep_for_csv.py @@ 
-0,0 +1,122 @@ +import pandas as pd +# import pytz +import unittest +from records_mover.records.pandas import prep_df_for_csv_output +from records_mover.records.schema import RecordsSchema +from records_mover.records import DelimitedRecordsFormat, ProcessingInstructions + + +class TestPrepForCsv(unittest.TestCase): + def test_prep_df_for_csv_output_no_include_index(self): + schema_data = { + 'schema': "bltypes/v1", + 'fields': { + "date": { + "type": "date", + "index": 1, + }, + "time": { + "type": "time", + "index": 2, + }, + "timetz": { + "type": "timetz", + "index": 3, + }, + } + } + records_format = DelimitedRecordsFormat(variant='bluelabs') + records_schema = RecordsSchema.from_data(schema_data) + processing_instructions = ProcessingInstructions() + # us_eastern = pytz.timezone('US/Eastern') + data = { + 'date': [pd.Timestamp(year=1970, month=1, day=1)], + 'time': [ + pd.Timestamp(year=1970, month=1, day=1, + hour=12, minute=33, second=53, microsecond=1234) + ], + # timetz is not well supported in records mover yet. For + # instance, specifying how it's turned into a CSV is not + # currently part of the records spec: + # + # https://app.asana.com/0/1128138765527694/1169941483931186 + # + # In addition, Vertica suffers from a driver limitation: + # + # https://app.asana.com/0/search/1169941483931185/1126315736470782 + # + # 'timetz': [ + # us_eastern.localize(pd.Timestamp(year=1970, month=1, day=1, + # hour=12, minute=33, second=53, + # microsecond=1234)), + # ], + } + df = pd.DataFrame(data, + columns=['date', 'time', 'timetz']) + + new_df = prep_df_for_csv_output(df=df, + include_index=False, + records_schema=records_schema, + records_format=records_format, + processing_instructions=processing_instructions) + self.assertEqual(new_df['date'][0], '1970-01-01') + self.assertEqual(new_df['time'][0], '12:33:53') + # self.assertEqual(new_df['timetz'][0], '12:33:53-05') + self.assertIsNotNone(new_df) + + def test_prep_df_for_csv_output_include_index(self): + schema_data = { + 'schema': "bltypes/v1", + 'fields': { + "date": { + "type": "date", + "index": 1, + }, + "time": { + "type": "time", + "index": 2, + }, + "timetz": { + "type": "timetz", + "index": 3, + }, + } + } + records_format = DelimitedRecordsFormat(variant='bluelabs') + records_schema = RecordsSchema.from_data(schema_data) + processing_instructions = ProcessingInstructions() + # us_eastern = pytz.timezone('US/Eastern') + data = { + 'time': [ + pd.Timestamp(year=1970, month=1, day=1, + hour=12, minute=33, second=53, microsecond=1234) + ], + # timetz is not well supported in records mover yet. 
For + # instance, specifying how it's turned into a CSV is not + # currently part of the records spec: + # + # https://app.asana.com/0/1128138765527694/1169941483931186 + # + # In addition, Vertica suffers from a driver limitation: + # + # https://app.asana.com/0/search/1169941483931185/1126315736470782 + # + # 'timetz': [ + # us_eastern.localize(pd.Timestamp(year=1970, month=1, day=1, + # hour=12, minute=33, second=53, + # microsecond=1234)), + # ], + } + df = pd.DataFrame(data, + index=[pd.Timestamp(year=1970, month=1, day=1)], + columns=['time', 'timetz']) + + new_df = prep_df_for_csv_output(df=df, + include_index=True, + records_schema=records_schema, + records_format=records_format, + processing_instructions=processing_instructions) + self.assertEqual(new_df.index[0], '1970-01-01') + self.assertEqual(new_df['time'][0], '12:33:53') + # self.assertEqual(new_df['timetz'][0], '12:33:53-05') + self.assertIsNotNone(new_df) diff --git a/tests/unit/records/schema/field/test_field.py b/tests/unit/records/schema/field/test_field.py index fdd0aed88..9c22c5863 100644 --- a/tests/unit/records/schema/field/test_field.py +++ b/tests/unit/records/schema/field/test_field.py @@ -2,6 +2,7 @@ from mock import Mock, patch # , ANY from records_mover.records.schema.field import RecordsSchemaField import numpy as np +import pandas as pd class TestField(unittest.TestCase): @@ -209,3 +210,35 @@ def test_python_type_to_field_type(self): mock_unknown_type = Mock(name='unknown_type') out = RecordsSchemaField.python_type_to_field_type(mock_unknown_type) self.assertIsNone(out) + + def test_cast_series_type_time_empty(self): + mock_name = Mock(name='name') + mock_field_type = 'time' + mock_constraints = Mock(name='constraints') + mock_statistics = Mock(name='statistics') + mock_representations = Mock(name='representations') + field = RecordsSchemaField(name=mock_name, + field_type=mock_field_type, + constraints=mock_constraints, + statistics=mock_statistics, + representations=mock_representations) + data = np.array([]) + series = pd.Series(data) + new_series = field.cast_series_type(series) + self.assertIsNotNone(new_series) + + def test_cast_series_type_time_timedelta_entries(self): + mock_name = Mock(name='name') + mock_field_type = 'time' + mock_constraints = Mock(name='constraints') + mock_statistics = Mock(name='statistics') + mock_representations = Mock(name='representations') + field = RecordsSchemaField(name=mock_name, + field_type=mock_field_type, + constraints=mock_constraints, + statistics=mock_statistics, + representations=mock_representations) + data = np.array([pd.Timedelta(hours=1, minutes=23, seconds=45)]) + series = pd.Series(data) + new_series = field.cast_series_type(series) + self.assertEqual(new_series[0], '01:23:45') diff --git a/tests/unit/records/schema/test_records_schema.py b/tests/unit/records/schema/test_records_schema.py index ecccc4611..18ad2e20e 100644 --- a/tests/unit/records/schema/test_records_schema.py +++ b/tests/unit/records/schema/test_records_schema.py @@ -164,10 +164,9 @@ def test_cast_dataframe_types(self): schema = RecordsSchema(fields=mock_fields, known_representations=mock_known_representations) mock_df = Mock(name='df') - mock_col_mappings = {mock_field_a.name: mock_field_a.to_numpy_dtype.return_value} out = schema.cast_dataframe_types(mock_df) - mock_df.astype.assert_called_with(mock_col_mappings) - self.assertEqual(out, mock_df.astype.return_value) + mock_df.apply.assert_called() + self.assertEqual(out, mock_df.apply.return_value) def 
test_cast_dataframe_types_no_fields(self): mock_fields = [] @@ -176,7 +175,7 @@ def test_cast_dataframe_types_no_fields(self): known_representations=mock_known_representations) mock_df = Mock(name='df') out = schema.cast_dataframe_types(mock_df) - self.assertEqual(out, mock_df) + self.assertEqual(out, mock_df.apply.return_value) def test_assign_dataframe_names_no_index(self): data = [{'a': 1}] diff --git a/tests/unit/records/sources/test_dataframes.py b/tests/unit/records/sources/test_dataframes.py index 92c40677c..8c0e2cdd4 100644 --- a/tests/unit/records/sources/test_dataframes.py +++ b/tests/unit/records/sources/test_dataframes.py @@ -5,6 +5,7 @@ class TestDataframesRecordsSource(unittest.TestCase): + @patch('records_mover.records.sources.dataframes.prep_df_for_csv_output') @patch('records_mover.records.sources.dataframes.purge_unnamed_unused_columns') @patch('records_mover.records.sources.dataframes.RecordsSchema') @patch('records_mover.records.sources.dataframes.FileobjsSource') @@ -19,7 +20,8 @@ def test_to_delimited_fileobjs_source(self, mock_complain_on_unhandled_hints, mock_FileobjsSource, mock_RecordsSchema, - mock_purge_unnamed_unused_columns): + mock_purge_unnamed_unused_columns, + mock_prep_df_for_csv_output): mock_df_1 = Mock(name='df_1') mock_df_2 = Mock(name='df_2') mock_processing_instructions = Mock(name='processing_instructions') @@ -60,12 +62,21 @@ def generate_filename(prefix): assert_called_with(mock_processing_instructions.fail_if_dont_understand, mock_unhandled_hints, mock_target_records_format.hints) - mock_df_1.to_csv.assert_called_with(path_or_buf=mock_output_filename, - index=mock_include_index, - **mock_options) - mock_df_2.to_csv.assert_called_with(path_or_buf=mock_output_filename, - index=mock_include_index, - **mock_options) + mock_pi = mock_processing_instructions + mock_prep_df_for_csv_output.assert_any_call(mock_df_1, + include_index=mock_include_index, + records_schema=mock_target_records_schema, + records_format=mock_target_records_format, + processing_instructions=mock_pi) + mock_prep_df_for_csv_output.assert_any_call(mock_df_2, + include_index=mock_include_index, + records_schema=mock_target_records_schema, + records_format=mock_target_records_format, + processing_instructions=mock_pi) + mock_formatted_df = mock_prep_df_for_csv_output.return_value + mock_formatted_df.to_csv.assert_called_with(path_or_buf=mock_output_filename, + index=mock_include_index, + **mock_options) mock_FileobjsSource.\ assert_called_with(target_names_to_input_fileobjs={ "data001.csv": mock_data_fileobj_1, diff --git a/tests/unit/records/sources/test_fileobjs.py b/tests/unit/records/sources/test_fileobjs.py index e8b169e05..76fa5285f 100644 --- a/tests/unit/records/sources/test_fileobjs.py +++ b/tests/unit/records/sources/test_fileobjs.py @@ -123,7 +123,10 @@ def test_str(self): @patch('records_mover.records.sources.fileobjs.io') @patch('pandas.read_csv') def test_to_dataframes_source(self, mock_read_csv, mock_io, mock_pandas_read_csv_options): - def read_csv_options(records_format, unhandled_hints, processing_instructions): + def read_csv_options(records_format, + records_schema, + unhandled_hints, + processing_instructions): unhandled_hints.clear() return {} diff --git a/tests/unit/records/targets/test_fileobj.py b/tests/unit/records/targets/test_fileobj.py index 83aa42e4a..1831f8874 100644 --- a/tests/unit/records/targets/test_fileobj.py +++ b/tests/unit/records/targets/test_fileobj.py @@ -6,11 +6,13 @@ class TestFileobjTarget(unittest.TestCase): + 
@patch('records_mover.records.pandas.prep_df_for_csv_output') @patch('records_mover.records.targets.fileobj.io') @patch('records_mover.records.targets.fileobj.complain_on_unhandled_hints') def test_move_from_dataframe_uncompressed_no_header_row(self, mock_complain_on_unhandled_hints, - mock_io): + mock_io, + mock_prep_df_for_csv_output): mock_fileobj = Mock(name='fileobj') mock_records_format = DelimitedRecordsFormat(hints={ 'encoding': 'mumble', @@ -28,6 +30,7 @@ def test_move_from_dataframe_uncompressed_no_header_row(self, mock_processing_instructions = Mock(name='processing_instructions') mock_dfs_source = Mock(name='dfs_source') mock_dfs_source.dfs = [mock_df_1, mock_df_2] + mock_prep_df_for_csv_output.side_effect = [mock_df_1, mock_df_2] out = fileobj_target.move_from_dataframes_source(mock_dfs_source, mock_processing_instructions) mock_text_fileobj = mock_io.TextIOWrapper.return_value @@ -57,11 +60,13 @@ def test_move_from_dataframe_uncompressed_no_header_row(self, sep=',') self.assertEqual(out, MoveResult(move_count=2, output_urls=None)) + @patch('records_mover.records.pandas.prep_df_for_csv_output') @patch('records_mover.records.targets.fileobj.io') @patch('records_mover.records.targets.fileobj.complain_on_unhandled_hints') def test_move_from_dataframe_uncompressed_with_header_row(self, mock_complain_on_unhandled_hints, - mock_io): + mock_io, + mock_prep_df_for_csv_output): mock_fileobj = Mock(name='fileobj') mock_records_format = DelimitedRecordsFormat(hints={ 'encoding': 'mumble', @@ -79,6 +84,7 @@ def test_move_from_dataframe_uncompressed_with_header_row(self, mock_processing_instructions = Mock(name='processing_instructions') mock_dfs_source = Mock(name='dfs_source') mock_dfs_source.dfs = [mock_df_1, mock_df_2] + mock_prep_df_for_csv_output.side_effect = [mock_df_1, mock_df_2] out = fileobj_target.move_from_dataframes_source(mock_dfs_source, mock_processing_instructions) mock_text_fileobj = mock_io.TextIOWrapper.return_value @@ -108,11 +114,13 @@ def test_move_from_dataframe_uncompressed_with_header_row(self, sep=',') self.assertEqual(out, MoveResult(move_count=2, output_urls=None)) + @patch('records_mover.records.pandas.prep_df_for_csv_output') @patch('records_mover.records.targets.fileobj.io') @patch('records_mover.records.targets.fileobj.complain_on_unhandled_hints') def test_move_from_dataframe_compressed_no_header_row(self, mock_complain_on_unhandled_hints, - mock_io): + mock_io, + mock_prep_df_for_csv_output): mock_fileobj = Mock(name='fileobj') mock_records_format = DelimitedRecordsFormat(hints={ 'encoding': 'mumble', @@ -130,6 +138,7 @@ def test_move_from_dataframe_compressed_no_header_row(self, mock_processing_instructions = Mock(name='processing_instructions') mock_dfs_source = Mock(name='dfs_source') mock_dfs_source.dfs = [mock_df_1, mock_df_2] + mock_prep_df_for_csv_output.side_effect = [mock_df_1, mock_df_2] out = fileobj_target.move_from_dataframes_source(mock_dfs_source, mock_processing_instructions) mock_df_1.to_csv.assert_called_with(path_or_buf=ANY, @@ -160,11 +169,13 @@ def test_move_from_dataframe_compressed_no_header_row(self, sep=',') self.assertEqual(out, MoveResult(move_count=2, output_urls=None)) + @patch('records_mover.records.pandas.prep_df_for_csv_output') @patch('records_mover.records.targets.fileobj.io') @patch('records_mover.records.targets.fileobj.complain_on_unhandled_hints') def test_move_from_dataframe_compressed_with_header_row(self, mock_complain_on_unhandled_hints, - mock_io): + mock_io, + mock_prep_df_for_csv_output): mock_fileobj = 
Mock(name='fileobj') mock_records_format = DelimitedRecordsFormat(hints={ 'encoding': 'mumble', @@ -182,6 +193,7 @@ def test_move_from_dataframe_compressed_with_header_row(self, mock_processing_instructions = Mock(name='processing_instructions') mock_dfs_source = Mock(name='dfs_source') mock_dfs_source.dfs = [mock_df_1, mock_df_2] + mock_prep_df_for_csv_output.side_effect = [mock_df_1, mock_df_2] out = fileobj_target.move_from_dataframes_source(mock_dfs_source, mock_processing_instructions) mock_df_1.to_csv.assert_called_with(path_or_buf=ANY, diff --git a/tests/unit/records/test_pandas_read_csv_options.py b/tests/unit/records/test_pandas_read_csv_options.py index e4d27e7e2..10e9c3dca 100644 --- a/tests/unit/records/test_pandas_read_csv_options.py +++ b/tests/unit/records/test_pandas_read_csv_options.py @@ -3,11 +3,36 @@ from records_mover.records.pandas import pandas_read_csv_options from records_mover.records.processing_instructions import ProcessingInstructions from records_mover.records.records_format import DelimitedRecordsFormat +from records_mover.records.schema import RecordsSchema class TestPandasReadCsvOptions(unittest.TestCase): + def setUp(self): + self.records_schema = RecordsSchema.from_data({ + 'schema': 'bltypes/v1', + 'fields': { + "date": { + "type": "date", + "index": 1, + }, + "time": { + "type": "time", + "index": 2, + }, + "timestamp": { + "type": "datetime", + "index": 3, + }, + "timestamptz": { + "type": "datetimetz", + "index": 4, + } + } + }) + def test_pandas_read_csv_options_bluelabs(self): expected = { + 'dayfirst': False, 'compression': 'gzip', 'delimiter': ',', 'doublequote': False, @@ -19,17 +44,71 @@ def test_pandas_read_csv_options_bluelabs(self): 'prefix': 'untitled_', 'quotechar': '"', 'quoting': 3, - 'warn_bad_lines': True + 'warn_bad_lines': True, + 'parse_dates': [0, 1, 2, 3], } processing_instructions = ProcessingInstructions() records_format = DelimitedRecordsFormat(hints=bluelabs_format_hints) unhandled_hints = set() - actual = pandas_read_csv_options(records_format, unhandled_hints, processing_instructions) + actual = pandas_read_csv_options(records_format, + self.records_schema, + unhandled_hints, + processing_instructions) + self.assertEqual(expected, actual) + self.assertFalse(unhandled_hints) + + def test_pandas_read_csv_options_bleulabs(self): + expected = { + 'dayfirst': True, + 'compression': 'gzip', + 'delimiter': ',', + 'doublequote': False, + 'encoding': 'UTF8', + 'engine': 'python', + 'error_bad_lines': True, + 'escapechar': '\\', + 'header': None, + 'prefix': 'untitled_', + 'quotechar': '"', + 'quoting': 3, + 'warn_bad_lines': True, + 'parse_dates': [0, 1, 2, 3], + } + processing_instructions = ProcessingInstructions() + hints = bluelabs_format_hints.copy() + hints.update({ + 'dateformat': 'DD-MM-YYYY', + 'datetimeformattz': 'DD-MM-YYYY HH24:MIOF', + 'datetimeformat': 'DD-MM-YYYY HH24:MI', + }) + records_format = DelimitedRecordsFormat(hints=hints) + unhandled_hints = set() + actual = pandas_read_csv_options(records_format, + self.records_schema, + unhandled_hints, + processing_instructions) self.assertEqual(expected, actual) self.assertFalse(unhandled_hints) + def test_pandas_read_csv_options_inconsistent_date_format(self): + processing_instructions = ProcessingInstructions() + hints = bluelabs_format_hints.copy() + hints.update({ + 'dateformat': 'DD-MM-YYYY', + 'datetimeformattz': 'MM-DD-YYYY HH24:MIOF', + 'datetimeformat': 'DD-MM-YYYY HH24:MI', + }) + records_format = DelimitedRecordsFormat(hints=hints) + unhandled_hints = set() + 
with self.assertRaises(NotImplementedError): + pandas_read_csv_options(records_format, + self.records_schema, + unhandled_hints, + processing_instructions) + def test_pandas_read_csv_options_csv(self): expected = { + 'dayfirst': False, 'compression': 'gzip', 'delimiter': ',', 'doublequote': True, @@ -40,18 +119,23 @@ def test_pandas_read_csv_options_csv(self): 'prefix': 'untitled_', 'quotechar': '"', 'quoting': 0, - 'warn_bad_lines': True + 'warn_bad_lines': True, + 'parse_dates': [0, 1, 2, 3], } processing_instructions = ProcessingInstructions() records_format = DelimitedRecordsFormat(hints=csv_format_hints) unhandled_hints = set() - actual = pandas_read_csv_options(records_format, unhandled_hints, processing_instructions) + actual = pandas_read_csv_options(records_format, + self.records_schema, + unhandled_hints, + processing_instructions) self.assertEqual(expected, actual) self.assertFalse(unhandled_hints) def test_pandas_read_csv_options_vertica(self): self.maxDiff = None expected = { + 'dayfirst': False, 'compression': None, 'delimiter': '\x01', 'doublequote': False, @@ -62,11 +146,15 @@ def test_pandas_read_csv_options_vertica(self): 'prefix': 'untitled_', 'quotechar': '"', 'quoting': 3, - 'warn_bad_lines': True + 'warn_bad_lines': True, + 'parse_dates': [0, 1, 2, 3], } processing_instructions = ProcessingInstructions() records_format = DelimitedRecordsFormat(hints=vertica_format_hints) unhandled_hints = set() - actual = pandas_read_csv_options(records_format, unhandled_hints, processing_instructions) + actual = pandas_read_csv_options(records_format, + self.records_schema, + unhandled_hints, + processing_instructions) self.assertEqual(expected, actual) self.assertFalse(unhandled_hints) diff --git a/tests/unit/test_session.py b/tests/unit/test_session.py index a99f4f7cd..77cecce8f 100644 --- a/tests/unit/test_session.py +++ b/tests/unit/test_session.py @@ -50,3 +50,49 @@ def test_get_default_db_engine_no_default(self, mock_db_facts_from_env.assert_called_with() mock_db_facts = mock_db_facts_from_env.return_value mock_engine_from_db_facts.assert_called_with(mock_db_facts) + + @patch('records_mover.session.db_facts_from_env') + @patch('records_mover.db.connect.engine_from_db_facts') + def test_get_default_db_facts_no_default(self, + mock_engine_from_db_facts, + mock_db_facts_from_env, + mock_os, + mock_subprocess): + session = Session() + self.assertEqual(session.get_default_db_facts(), mock_db_facts_from_env.return_value) + mock_db_facts_from_env.assert_called_with() + + @patch('records_mover.db.connect.engine_from_db_facts') + def test_get_default_db_facts_with_default(self, + mock_engine_from_db_facts, + mock_os, + mock_subprocess): + mock_creds = Mock(name='creds') + mock_default_db_creds_name = Mock(name='default_db_creds_name') + session = Session(creds=mock_creds, + default_db_creds_name=mock_default_db_creds_name) + self.assertEqual(session.get_default_db_facts(), + mock_creds.db_facts.return_value) + mock_creds.db_facts.assert_called_with(mock_default_db_creds_name) + + @patch('records_mover.session.set_stream_logging') + def test_set_stream_logging(self, + mock_set_stream_logging, + mock_os, + mock_subprocess): + session = Session() + mock_name = Mock(name='name') + mock_level = Mock(name='level') + mock_stream = Mock(name='stream') + mock_fmt = Mock(name='fmt') + mock_datefmt = Mock(name='datefmt') + session.set_stream_logging(name=mock_name, + level=mock_level, + stream=mock_stream, + fmt=mock_fmt, + datefmt=mock_datefmt) + 
mock_set_stream_logging.assert_called_with(name=mock_name, + level=mock_level, + stream=mock_stream, + fmt=mock_fmt, + datefmt=mock_datefmt) diff --git a/tests/unit/url/test_filesystem.py b/tests/unit/url/test_filesystem.py index dc72a3730..a82785109 100644 --- a/tests/unit/url/test_filesystem.py +++ b/tests/unit/url/test_filesystem.py @@ -79,7 +79,8 @@ def test_repr(self): repr(self.filesystem_file_url)) def test_wait_to_exist(self): - self.assertTrue(self.filesystem_file_url.wait_to_exist()) + self.filesystem_file_url.wait_to_exist() + # ensure it returns def test_is_directory(self): self.assertFalse(self.filesystem_file_url.is_directory()) diff --git a/tests/unit/utils/test_json_schema.py b/tests/unit/utils/test_json_schema.py index 799599e46..9cf951586 100644 --- a/tests/unit/utils/test_json_schema.py +++ b/tests/unit/utils/test_json_schema.py @@ -37,7 +37,7 @@ def m(a_str: str, a_ignorable: int, a_special: int, a_optional_impractical_type: Optional[Callable[[int], int]], - a_defaulted_string: str="foo") -> None: + a_defaulted_string: str = "foo") -> None: """ :param a_str: a_str desc :param a_int: a_int desc diff --git a/types/stubs/boto3/session/__init__.pyi b/types/stubs/boto3/session/__init__.pyi index 7707f2dff..f9521b314 100644 --- a/types/stubs/boto3/session/__init__.pyi +++ b/types/stubs/boto3/session/__init__.pyi @@ -164,6 +164,9 @@ class Session: region_name: str resource: Any + def __init__(self) -> None: + ... + def get_credentials(self) -> Optional[Credentials]: ... diff --git a/types/stubs/google/oauth2/service_account/__init__.pyi b/types/stubs/google/oauth2/service_account/__init__.pyi index 806d3b334..a5c3c7a9b 100644 --- a/types/stubs/google/oauth2/service_account/__init__.pyi +++ b/types/stubs/google/oauth2/service_account/__init__.pyi @@ -6,6 +6,6 @@ import google.auth.service_account class Credentials(google.auth.credentials.Credentials): @classmethod - def from_service_account_info(cls, info: Mapping[str, str], scopes: Iterable[str], **kwargs) ->\ + def from_service_account_info(cls, info: Mapping[str, str], scopes: Iterable[str]) ->\ google.auth.service_account.Credentials: ... diff --git a/types/stubs/logging/__init__.pyi b/types/stubs/logging/__init__.pyi new file mode 100644 index 000000000..db7379717 --- /dev/null +++ b/types/stubs/logging/__init__.pyi @@ -0,0 +1,269 @@ +__all__ = [ + 'Logger', + 'Filterer', + 'LogRecord', +] + +from typing import Any, Optional +from .log_record import LogRecord +from .logger import Logger +from .filterer import Filterer + + +raiseExceptions: bool +CRITICAL: int +FATAL: int +ERROR: int +WARNING: int +WARN: int +INFO: int +DEBUG: int +NOTSET: int + + +def getLevelName(level: Any): ... + + +def addLevelName(level: Any, levelName: Any) -> None: ... + + +def setLogRecordFactory(factory: Any) -> None: ... + + +def getLogRecordFactory(): ... + + +def makeLogRecord(dict: Any): ... + + +class PercentStyle: + default_format: str = ... + asctime_format: str = ... + asctime_search: str = ... + validation_pattern: Any = ... + def __init__(self, fmt: Any) -> None: ... + def usesTime(self): ... + def validate(self) -> None: ... + def format(self, record: Any): ... + + +class StrFormatStyle(PercentStyle): + default_format: str = ... + asctime_format: str = ... + asctime_search: str = ... + fmt_spec: Any = ... + field_spec: Any = ... + def validate(self) -> None: ... + + +class StringTemplateStyle(PercentStyle): + default_format: str = ... + asctime_format: str = ... + asctime_search: str = ... + def __init__(self, fmt: Any) -> None: ... 
+ def usesTime(self): ... + def validate(self) -> None: ... + + +BASIC_FORMAT: str + + +class Formatter: + converter: Any = ... + datefmt: Any = ... + + def __init__( + self, + fmt: Optional[Any] = ..., + datefmt: Optional[Any] = ..., + style: str = ..., + validate: bool = ..., + ) -> None: ... + + default_time_format: str = ... + default_msec_format: str = ... + def formatTime(self, record: Any, datefmt: Optional[Any] = ...): ... + def formatException(self, ei: Any): ... + def usesTime(self): ... + def formatMessage(self, record: Any): ... + def formatStack(self, stack_info: Any): ... + def format(self, record: Any): ... + + +class BufferingFormatter: + linefmt: Any = ... + def __init__(self, linefmt: Optional[Any] = ...) -> None: ... + def formatHeader(self, records: Any): ... + def formatFooter(self, records: Any): ... + def format(self, records: Any): ... + + +class Filter: + name: Any = ... + nlen: Any = ... + def __init__(self, name: str = ...) -> None: ... + def filter(self, record: Any): ... + + +class Handler(Filterer): + level: Any = ... + formatter: Any = ... + def __init__(self, level: Any = ...) -> None: ... + def get_name(self): ... + def set_name(self, name: Any) -> None: ... + name: Any = ... + lock: Any = ... + def createLock(self) -> None: ... + def acquire(self) -> None: ... + def release(self) -> None: ... + def setLevel(self, level: Any) -> None: ... + def format(self, record: Any): ... + def emit(self, record: Any) -> None: ... + def handle(self, record: Any): ... + def setFormatter(self, fmt: Any) -> None: ... + def flush(self) -> None: ... + def close(self) -> None: ... + def handleError(self, record: Any) -> None: ... + + +class StreamHandler(Handler): + terminator: str = ... + stream: Any = ... + def __init__(self, stream: Optional[Any] = ...) -> None: ... + def flush(self) -> None: ... + def emit(self, record: Any) -> None: ... + def setStream(self, stream: Any): ... + + +class FileHandler(StreamHandler): + baseFilename: Any = ... + mode: Any = ... + encoding: Any = ... + delay: Any = ... + stream: Any = ... + + def __init__( + self, + filename: Any, + mode: str = ..., + encoding: Optional[Any] = ..., + delay: bool = ..., + ) -> None: ... + def close(self) -> None: ... + def emit(self, record: Any) -> None: ... + + +class _StderrHandler(StreamHandler): + def __init__(self, level: Any = ...) -> None: ... + @property + def stream(self): ... + + +lastResort: Any + + +class PlaceHolder: + loggerMap: Any = ... + def __init__(self, alogger: Any) -> None: ... + def append(self, alogger: Any) -> None: ... + + +def setLoggerClass(klass: Any) -> None: ... + + +def getLoggerClass(): ... + + +class Manager: + root: Any = ... + disable: int = ... + emittedNoHandlerWarning: bool = ... + loggerDict: Any = ... + loggerClass: Any = ... + logRecordFactory: Any = ... + def __init__(self, rootnode: Any) -> None: ... + def getLogger(self, name: str) -> "Logger": ... + def setLoggerClass(self, klass: Any) -> None: ... + def setLogRecordFactory(self, factory: Any) -> None: ... + + +class RootLogger(Logger): + def __init__(self, level: Any) -> None: ... + def __reduce__(self): ... + + +class LoggerAdapter: + logger: Any = ... + extra: Any = ... + def __init__(self, logger: Any, extra: Any) -> None: ... + def process(self, msg: Any, kwargs: Any): ... + def debug(self, msg: str, *args: Any, **kwargs: Any) -> None: ... + def info(self, msg: str, *args: Any, **kwargs: Any) -> None: ... + def warning(self, msg: str, *args: Any, **kwargs: Any) -> None: ... 
+ def warn(self, msg: str, *args: Any, **kwargs: Any) -> None: ... + def error(self, msg: str, *args: Any, **kwargs: Any) -> None: ... + + def exception( + self, msg: str, *args: Any, exc_info: bool = ..., **kwargs: Any + ) -> None: ... + def critical(self, msg: str, *args: Any, **kwargs: Any) -> None: ... + def log(self, level: Any, msg: str, *args: Any, **kwargs: Any) -> None: ... + def isEnabledFor(self, level: Any): ... + def setLevel(self, level: Any) -> None: ... + def getEffectiveLevel(self): ... + def hasHandlers(self): ... + @property + def manager(self): ... + @manager.setter + def manager(self, value: Any) -> None: ... + @property + def name(self): ... + + +def basicConfig(**kwargs: Any) -> None: ... + + +def getLogger(name: Optional[str] = ...) -> Logger: ... + + +def critical(msg: str, *args: Any, **kwargs: Any) -> None: ... + + +fatal = critical + + +def error(msg: str, *args: Any, **kwargs: Any) -> None: ... + + +def exception(msg: str, *args: Any, exc_info: bool = ..., **kwargs: Any) -> None: ... + + +def warning(msg: str, *args: Any, **kwargs: Any) -> None: ... + + +def warn(msg: str, *args: Any, **kwargs: Any) -> None: ... + + +def info(msg: str, *args: Any, **kwargs: Any) -> None: ... + + +def debug(msg: str, *args: Any, **kwargs: Any) -> None: ... + + +def log(level: Any, msg: str, *args: Any, **kwargs: Any) -> None: ... + + +def disable(level: Any = ...) -> None: ... + + +def shutdown(handlerList: Any = ...) -> None: ... + + +class NullHandler(Handler): + def handle(self, record: Any) -> None: ... + def emit(self, record: Any) -> None: ... + lock: Any = ... + def createLock(self) -> None: ... + + +def captureWarnings(capture: Any) -> None: ... diff --git a/types/stubs/logging/config.pyi b/types/stubs/logging/config.pyi new file mode 100644 index 000000000..ce7efc77f --- /dev/null +++ b/types/stubs/logging/config.pyi @@ -0,0 +1,79 @@ +from typing import Any, Optional + +DEFAULT_LOGGING_CONFIG_PORT: int +RESET_ERROR: Any + + +def fileConfig( + fname: Any, defaults: Optional[Any] = ..., disable_existing_loggers: bool = ... +) -> None: ... + + +IDENTIFIER: Any + + +def valid_ident(s: Any): ... + + +class ConvertingMixin: + def convert_with_key(self, key: Any, value: Any, replace: bool = ...): ... + def convert(self, value: Any): ... + + +class ConvertingDict(dict, ConvertingMixin): + def __getitem__(self, key: Any): ... + def get(self, key: Any, default: Optional[Any] = ...): ... + def pop(self, key: Any, default: Optional[Any] = ...): ... + + +class ConvertingList(list, ConvertingMixin): + def __getitem__(self, key: Any): ... + def pop(self, idx: int = ...): ... + + +class ConvertingTuple(tuple, ConvertingMixin): + def __getitem__(self, key: Any): ... + + +class BaseConfigurator: + CONVERT_PATTERN: Any = ... + WORD_PATTERN: Any = ... + DOT_PATTERN: Any = ... + INDEX_PATTERN: Any = ... + DIGIT_PATTERN: Any = ... + value_converters: Any = ... + importer: Any = ... + config: Any = ... + def __init__(self, config: Any) -> None: ... + def resolve(self, s: Any): ... + def ext_convert(self, value: Any): ... + def cfg_convert(self, value: Any): ... + def convert(self, value: Any): ... + def configure_custom(self, config: Any): ... + def as_tuple(self, value: Any): ... + + +class DictConfigurator(BaseConfigurator): + def configure(self) -> None: ... + def configure_formatter(self, config: Any): ... + def configure_filter(self, config: Any): ... + def add_filters(self, filterer: Any, filters: Any) -> None: ... + def configure_handler(self, config: Any): ... 
+ def add_handlers(self, logger: Any, handlers: Any) -> None: ... + + def common_logger_config( + self, logger: Any, config: Any, incremental: bool = ... + ) -> None: ... + + def configure_logger( + self, name: Any, config: Any, incremental: bool = ... + ) -> None: ... + def configure_root(self, config: Any, incremental: bool = ...) -> None: ... + + +dictConfigClass = DictConfigurator + + +def dictConfig(config: Any) -> None: ... +def listen(port: Any = ..., verify: Optional[Any] = ...): ... +def stopListening() -> None: ... diff --git a/types/stubs/logging/filterer.py b/types/stubs/logging/filterer.py new file mode 100644 index 000000000..084a7b607 --- /dev/null +++ b/types/stubs/logging/filterer.py @@ -0,0 +1,9 @@ +from typing import Any + + +class Filterer: + filters: Any = ... + def __init__(self) -> None: ... + def addFilter(self, filter: Any) -> None: ... + def removeFilter(self, filter: Any) -> None: ... + def filter(self, record: Any): ... diff --git a/types/stubs/logging/handlers.pyi b/types/stubs/logging/handlers.pyi new file mode 100644 index 000000000..99eb4b7a1 --- /dev/null +++ b/types/stubs/logging/handlers.pyi @@ -0,0 +1,273 @@ +import logging +from typing import Any, Optional + +DEFAULT_TCP_LOGGING_PORT: int +DEFAULT_UDP_LOGGING_PORT: int +DEFAULT_HTTP_LOGGING_PORT: int +DEFAULT_SOAP_LOGGING_PORT: int +SYSLOG_UDP_PORT: int +SYSLOG_TCP_PORT: int + + +class BaseRotatingHandler(logging.FileHandler): + mode: Any = ... + encoding: Any = ... + namer: Any = ... + rotator: Any = ... + + def __init__( + self, filename: Any, mode: Any, encoding: Optional[Any] = ..., delay: bool = ... + ) -> None: ... + def emit(self, record: Any) -> None: ... + def rotation_filename(self, default_name: Any): ... + def rotate(self, source: Any, dest: Any) -> None: ... + + +class RotatingFileHandler(BaseRotatingHandler): + maxBytes: Any = ... + backupCount: Any = ... + + def __init__( + self, + filename: Any, + mode: str = ..., + maxBytes: int = ..., + backupCount: int = ..., + encoding: Optional[Any] = ..., + delay: bool = ..., + ) -> None: ... + stream: Any = ... + def doRollover(self) -> None: ... + def shouldRollover(self, record: Any): ... + + +class TimedRotatingFileHandler(BaseRotatingHandler): + when: Any = ... + backupCount: Any = ... + utc: Any = ... + atTime: Any = ... + interval: int = ... + suffix: str = ... + extMatch: str = ... + dayOfWeek: Any = ... + rolloverAt: Any = ... + + def __init__( + self, + filename: Any, + when: str = ..., + interval: int = ..., + backupCount: int = ..., + encoding: Optional[Any] = ..., + delay: bool = ..., + utc: bool = ..., + atTime: Optional[Any] = ..., + ) -> None: ... + def computeRollover(self, currentTime: Any): ... + def shouldRollover(self, record: Any): ... + def getFilesToDelete(self): ... + stream: Any = ... + def doRollover(self) -> None: ... + + +class WatchedFileHandler(logging.FileHandler): + def __init__( + self, + filename: Any, + mode: str = ..., + encoding: Optional[Any] = ..., + delay: bool = ..., + ) -> None: ... + stream: Any = ... + def reopenIfNeeded(self) -> None: ... + def emit(self, record: Any) -> None: ... + + +class SocketHandler(logging.Handler): + host: Any = ... + port: Any = ... + address: Any = ... + sock: Any = ... + closeOnError: bool = ... + retryTime: Any = ... + retryStart: float = ... + retryMax: float = ... + retryFactor: float = ... + def __init__(self, host: Any, port: Any) -> None: ... + def makeSocket(self, timeout: int = ...): ... + retryPeriod: Any = ... + def createSocket(self) -> None: ... 
+ def send(self, s: Any) -> None: ... + def makePickle(self, record: Any): ... + def handleError(self, record: Any) -> None: ... + def emit(self, record: Any) -> None: ... + def close(self) -> None: ... + + +class DatagramHandler(SocketHandler): + closeOnError: bool = ... + def __init__(self, host: Any, port: Any) -> None: ... + def makeSocket(self): ... + def send(self, s: Any) -> None: ... + + +class SysLogHandler(logging.Handler): + LOG_EMERG: int = ... + LOG_ALERT: int = ... + LOG_CRIT: int = ... + LOG_ERR: int = ... + LOG_WARNING: int = ... + LOG_NOTICE: int = ... + LOG_INFO: int = ... + LOG_DEBUG: int = ... + LOG_KERN: int = ... + LOG_USER: int = ... + LOG_MAIL: int = ... + LOG_DAEMON: int = ... + LOG_AUTH: int = ... + LOG_SYSLOG: int = ... + LOG_LPR: int = ... + LOG_NEWS: int = ... + LOG_UUCP: int = ... + LOG_CRON: int = ... + LOG_AUTHPRIV: int = ... + LOG_FTP: int = ... + LOG_LOCAL0: int = ... + LOG_LOCAL1: int = ... + LOG_LOCAL2: int = ... + LOG_LOCAL3: int = ... + LOG_LOCAL4: int = ... + LOG_LOCAL5: int = ... + LOG_LOCAL6: int = ... + LOG_LOCAL7: int = ... + priority_names: Any = ... + facility_names: Any = ... + priority_map: Any = ... + address: Any = ... + facility: Any = ... + socktype: Any = ... + unixsocket: bool = ... + socket: Any = ... + + def __init__( + self, address: Any = ..., facility: Any = ..., socktype: Optional[Any] = ... + ) -> None: ... + def encodePriority(self, facility: Any, priority: Any): ... + def close(self) -> None: ... + def mapPriority(self, levelName: Any): ... + ident: str = ... + append_nul: bool = ... + def emit(self, record: Any) -> None: ... + + +class SMTPHandler(logging.Handler): + username: Any = ... + fromaddr: Any = ... + toaddrs: Any = ... + subject: Any = ... + secure: Any = ... + timeout: Any = ... + + def __init__( + self, + mailhost: Any, + fromaddr: Any, + toaddrs: Any, + subject: Any, + credentials: Optional[Any] = ..., + secure: Optional[Any] = ..., + timeout: float = ..., + ) -> None: ... + def getSubject(self, record: Any): ... + def emit(self, record: Any) -> None: ... + + +class NTEventLogHandler(logging.Handler): + appname: Any = ... + dllname: Any = ... + logtype: Any = ... + deftype: Any = ... + typemap: Any = ... + + def __init__( + self, appname: Any, dllname: Optional[Any] = ..., logtype: str = ... + ) -> None: ... + def getMessageID(self, record: Any): ... + def getEventCategory(self, record: Any): ... + def getEventType(self, record: Any): ... + def emit(self, record: Any) -> None: ... + def close(self) -> None: ... + + +class HTTPHandler(logging.Handler): + host: Any = ... + url: Any = ... + method: Any = ... + secure: Any = ... + credentials: Any = ... + context: Any = ... + + def __init__( + self, + host: Any, + url: Any, + method: str = ..., + secure: bool = ..., + credentials: Optional[Any] = ..., + context: Optional[Any] = ..., + ) -> None: ... + def mapLogRecord(self, record: Any): ... + def emit(self, record: Any) -> None: ... + + +class BufferingHandler(logging.Handler): + capacity: Any = ... + buffer: Any = ... + def __init__(self, capacity: Any) -> None: ... + def shouldFlush(self, record: Any): ... + def emit(self, record: Any) -> None: ... + def flush(self) -> None: ... + def close(self) -> None: ... + + +class MemoryHandler(BufferingHandler): + flushLevel: Any = ... + target: Any = ... + flushOnClose: Any = ... + + def __init__( + self, + capacity: Any, + flushLevel: Any = ..., + target: Optional[Any] = ..., + flushOnClose: bool = ..., + ) -> None: ... + def shouldFlush(self, record: Any): ... 
+ def setTarget(self, target: Any) -> None: ... + buffer: Any = ... + def flush(self) -> None: ... + def close(self) -> None: ... + + +class QueueHandler(logging.Handler): + queue: Any = ... + def __init__(self, queue: Any) -> None: ... + def enqueue(self, record: Any) -> None: ... + def prepare(self, record: Any): ... + def emit(self, record: Any) -> None: ... + + +class QueueListener: + queue: Any = ... + handlers: Any = ... + respect_handler_level: Any = ... + + def __init__( + self, queue: Any, *handlers: Any, respect_handler_level: bool = ... + ) -> None: ... + def dequeue(self, block: Any): ... + def start(self) -> None: ... + def prepare(self, record: Any): ... + def handle(self, record: Any) -> None: ... + def enqueue_sentinel(self) -> None: ... + def stop(self) -> None: ... diff --git a/types/stubs/logging/log_record.pyi b/types/stubs/logging/log_record.pyi new file mode 100644 index 000000000..e09bbe75d --- /dev/null +++ b/types/stubs/logging/log_record.pyi @@ -0,0 +1,40 @@ +from typing import Any, Optional + + +class LogRecord: + name: Any = ... + msg: Any = ... + args: Any = ... + levelname: Any = ... + levelno: Any = ... + pathname: Any = ... + filename: Any = ... + module: Any = ... + exc_info: Any = ... + exc_text: Any = ... + stack_info: Any = ... + lineno: Any = ... + funcName: Any = ... + created: Any = ... + msecs: Any = ... + relativeCreated: Any = ... + thread: Any = ... + threadName: Any = ... + processName: Any = ... + process: Any = ... + + def __init__( + self, + name: Any, + level: Any, + pathname: Any, + lineno: Any, + msg: Any, + args: Any, + exc_info: Any, + func: Optional[Any] = ..., + sinfo: Optional[Any] = ..., + **kwargs: Any + ) -> None: ... + + def getMessage(self): ... diff --git a/types/stubs/logging/logger.pyi b/types/stubs/logging/logger.pyi new file mode 100644 index 000000000..c6e03af4d --- /dev/null +++ b/types/stubs/logging/logger.pyi @@ -0,0 +1,49 @@ +from typing import Any, Optional +from .filterer import Filterer # noqa + + +class Logger(Filterer): + name: Any = ... + level: Any = ... + parent: Any = ... + propagate: bool = ... + handlers: Any = ... + disabled: bool = ... + def __init__(self, name: Any, level: Any = ...) -> None: ... + def setLevel(self, level: Any) -> None: ... + def debug(self, msg: str, *args: Any, **kwargs: Any) -> None: ... + def info(self, msg: str, *args: Any, **kwargs: Any) -> None: ... + def warning(self, msg: str, *args: Any, **kwargs: Any) -> None: ... + def warn(self, msg: str, *args: Any, **kwargs: Any) -> None: ... + def error(self, msg: str, *args: Any, **kwargs: Any) -> None: ... + + def exception( + self, msg: str, *args: Any, exc_info: bool = ..., **kwargs: Any + ) -> None: ... + def critical(self, msg: str, *args: Any, **kwargs: Any) -> None: ... + fatal: Any = ... + def log(self, level: Any, msg: str, *args: Any, **kwargs: Any) -> None: ... + def findCaller(self, stack_info: bool = ..., stacklevel: int = ...): ... + + def makeRecord( + self, + name: Any, + level: Any, + fn: Any, + lno: Any, + msg: Any, + args: Any, + exc_info: Any, + func: Optional[Any] = ..., + extra: Optional[Any] = ..., + sinfo: Optional[Any] = ..., + ): ... + def handle(self, record: Any) -> None: ... + def addHandler(self, hdlr: Any) -> None: ... + def removeHandler(self, hdlr: Any) -> None: ... + def hasHandlers(self): ... + def callHandlers(self, record: Any) -> None: ... + def getEffectiveLevel(self): ... + def isEnabledFor(self, level: Any): ... + def getChild(self, suffix: Any): ... + def __reduce__(self): ... 
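Aside: several of the new unit tests in this patch (for example test_cast_series_type_time_timedelta_entries above) expect a pandas Timedelta in a 'time' field to be rendered as the string '01:23:45'. A minimal standalone sketch of that expected transformation follows; it only illustrates the output format the tests assert, not records_mover's actual cast_series_type implementation.

import pandas as pd

def timedelta_to_hhmmss(td: pd.Timedelta) -> str:
    # Render a Timedelta as an HH:MM:SS string -- the form the new
    # 'time' field tests expect (e.g. 1h 23m 45s -> '01:23:45').
    total_seconds = int(td.total_seconds())
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}"

series = pd.Series([pd.Timedelta(hours=1, minutes=23, seconds=45)])
print(series.apply(timedelta_to_hhmmss)[0])  # 01:23:45
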
diff --git a/wait-for-mysql.sh b/wait-for-mysql.sh
new file mode 100755
index 000000000..bdf0ea5dd
--- /dev/null
+++ b/wait-for-mysql.sh
@@ -0,0 +1,7 @@
+#!/bin/bash -e
+
+while ! db dockerized-mysql </dev/null >/dev/null 2>&1
+do
+  >&2 echo "Waiting 5 seconds for MySQL to be available"
+  sleep 5
+done
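
For completeness, the same "retry until MySQL accepts connections" pattern used by wait-for-mysql.sh can be sketched in Python with SQLAlchemy. This is an illustration only, not part of the patch: the script itself relies on the project's db CLI, the connection URL below is a placeholder, and a MySQL driver such as pymysql is assumed to be installed.

import time

from sqlalchemy import create_engine
from sqlalchemy.exc import OperationalError

def wait_for_mysql(url: str = "mysql+pymysql://user:pass@127.0.0.1:3306/itest",
                   delay_seconds: int = 5) -> None:
    # Poll until a connection succeeds, mirroring the shell loop above.
    engine = create_engine(url)
    while True:
        try:
            with engine.connect():
                return  # connection succeeded; MySQL is ready
        except OperationalError:
            print(f"Waiting {delay_seconds} seconds for MySQL to be available")
            time.sleep(delay_seconds)

if __name__ == "__main__":
    wait_for_mysql()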