Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

RM-86 Error on Future and Deprecation Warnings #244

Merged
merged 16 commits into from
May 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 19 additions & 4 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,12 @@ commands:
default: ""
steps:
- restore_cache:
key: deps-v7-<<parameters.python_version>>-<<parameters.pandas_version>>-<<parameters.extras>>-<<parameters.include_dev_dependencies>>-{{ .Branch }}-{{ checksum "requirements.txt" }}-{{ checksum "setup.py" }}
key: deps-v8-<<parameters.python_version>>-<<parameters.pandas_version>>-<<parameters.extras>>-<<parameters.include_dev_dependencies>>-{{ .Branch }}-{{ checksum "requirements.txt" }}-{{ checksum "setup.py" }}
- run:
name: Install python deps in venv
environment:
PYENV_VERSION: <<parameters.python_version>>
SQLALCHEMY_SILENCE_UBER_WARNING: 1
command: |
if [ -f venv/bin/activate ]
then
Expand Down Expand Up @@ -63,7 +64,7 @@ commands:
fi
fi
- save_cache:
key: deps-v6-<<parameters.python_version>>-<<parameters.pandas_version>>-<<parameters.extras>>-<<parameters.include_dev_dependencies>>-{{ .Branch }}-{{ checksum "requirements.txt" }}-{{ checksum "setup.py" }}
key: deps-v8-<<parameters.python_version>>-<<parameters.pandas_version>>-<<parameters.extras>>-<<parameters.include_dev_dependencies>>-{{ .Branch }}-{{ checksum "requirements.txt" }}-{{ checksum "setup.py" }}
paths:
- "venv"
wait_for_db:
Expand Down Expand Up @@ -113,6 +114,8 @@ jobs:
description: "Enforce coverage not slipping"
docker:
- image: cimg/python:<<parameters.python_version>>
environment:
SQLALCHEMY_SILENCE_UBER_WARNING: 1
steps:
- checkout
- add_ssh_keys:
Expand Down Expand Up @@ -185,6 +188,8 @@ jobs:
command:
type: string
description: "Command to run in records-mover venv"
environment:
SQLALCHEMY_SILENCE_UBER_WARNING: 1
docker:
- image: cimg/python:<<parameters.python_version>>
- image: jbfavre/vertica:8.1.1-16_centos-7
Expand Down Expand Up @@ -294,7 +299,12 @@ jobs:
type: boolean
description: "If true, pass in the env variable specifying an GCS scratch bucket to Records Mover (for BigQuery)"
default: true

pytest_flags:
type: string
description: "Any flags you'd like to add to the pytest call"
default: ""
environment:
SQLALCHEMY_SILENCE_UBER_WARNING: 1
docker:
- image: cimg/python:<<parameters.python_version>>
steps:
Expand Down Expand Up @@ -327,7 +337,7 @@ jobs:
# This is set by default in the CircleCI environment
unset SCRATCH_GCS_URL
fi
with-db <<parameters.db_name>> pytest
with-db <<parameters.db_name>> pytest <<parameters.pytest_flags>>
- store_test_results:
path: test-reports
- store_artifacts:
Expand All @@ -351,6 +361,8 @@ jobs:
default: '3.9'
docker:
- image: cimg/python:<<parameters.python_version>>
environment:
SQLALCHEMY_SILENCE_UBER_WARNING: 1
steps:
- checkout
- installvenv:
Expand Down Expand Up @@ -400,6 +412,8 @@ jobs:
cli-extra-test:
docker:
- image: cimg/python:3.9
environment:
SQLALCHEMY_SILENCE_UBER_WARNING: 1
steps:
- checkout
- installvenv:
Expand Down Expand Up @@ -622,6 +636,7 @@ workflows:
numpy_version: "<1.24"
context: slack-secrets
db_name: demo-itest
pytest_flags: "-W default::FutureWarning"
filters:
tags:
only: /v\d+\.\d+\.\d+(-[\w]+)?/
Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ jobs:
fail-fast: true
matrix:
python-version: ["3.8", "3.9"]

env:
SQLALCHEMY_SILENCE_UBER_WARNING: 1
steps:
- uses: actions/checkout@v3
- name: Install python ${{ matrix.python-version }}
Expand Down
2 changes: 1 addition & 1 deletion metrics/mypy_high_water_mark
Original file line number Diff line number Diff line change
@@ -1 +1 @@
89.7400
89.7000
5 changes: 5 additions & 0 deletions pytest.ini
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Brunope especially for your visibility, this pytest.ini is what makes the pytest command error on Deprecation and Future warnings

Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# pytest.ini
[pytest]
filterwarnings =
error::DeprecationWarning:.*records_mover.*
error::FutureWarning:.*records_mover.*
15 changes: 8 additions & 7 deletions records_mover/db/connect.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,13 +113,14 @@ def create_sqlalchemy_url(db_facts: DBFacts,

return create_bigquery_sqlalchemy_url(db_facts)
else:
return sa.engine.url.URL(drivername=driver,
username=username,
password=db_facts['password'],
host=db_facts['host'],
port=db_facts['port'],
database=db_facts['database'],
query=query_for_type.get(db_type))
return sa.engine.url.URL.create( # type: ignore
drivername=driver,
username=username,
password=db_facts['password'],
host=db_facts['host'],
port=db_facts['port'],
database=db_facts['database'],
query=query_for_type.get(db_type))


def engine_from_lpass_entry(lpass_entry_name: str) -> sa.engine.Engine:
Expand Down
7 changes: 4 additions & 3 deletions records_mover/pandas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ def purge_unnamed_unused_columns(df: DataFrame) -> DataFrame:
# "unnamed: 1", or maybe "Unnamed: 1" (not sure why/when
# that differs). Let's clean those up.
for column in df:
if column.startswith('Unnamed: ') or column.startswith('unnamed: '):
if not df[column].notnull().any():
df = df.drop(column, axis=1)
if type(column) == str:
if column.startswith('Unnamed: ') or column.startswith('unnamed: '):
if not df[column].notnull().any():
df = df.drop(column, axis=1)
return df
12 changes: 0 additions & 12 deletions records_mover/records/delimited/csv_streamer.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,18 +44,6 @@ def stream_csv(filepath_or_buffer: Union[str, IO[bytes]],
'iterator': True,
'engine': 'python'
}
if header is None:
# Pandas only accepts the prefix argument (which makes for
# tidier column names when otherwise not provided) when the
# header is explicitly marked as missing, not when it's
# available or even when we ask Pandas to infer it. Bummer,
# as this means that when Pandas infers that there's no
# header, the column names will end up different than folks
# explicitly tell records mover that there is no header.
#
# https://github.com/pandas-dev/pandas/issues/27394
# https://github.com/pandas-dev/pandas/pull/31383
kwargs['prefix'] = 'untitled_'
if 'quoting' in hints:
quoting = hints['quoting']
kwargs['quoting'] = pandas_quoting_from_hint[quoting]
Expand Down
2 changes: 1 addition & 1 deletion records_mover/records/pandas/prep_for_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,5 +110,5 @@ def prep_df_for_csv_output(df: DataFrame,
records_format,
processing_instructions)
if formatted_series is not None:
formatted_df.iloc[:, index] = formatted_series
formatted_df[formatted_df.columns[index]] = formatted_series
return formatted_df
40 changes: 10 additions & 30 deletions records_mover/records/pandas/read_csv_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
from records_mover.records.schema import RecordsSchema
import logging
from typing import Set, Dict, Any
from packaging import version
import pandas as pd


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -146,29 +148,6 @@ def pandas_read_csv_options(records_format: DelimitedRecordsFormat,
# (better to keep a standard format, no matter how many columns)
#

#
# prefix : str, optional
#
# Prefix to add to column numbers when no header, e.g. ‘X’ for X0, X1,
#

#
# Not sure this actually does anything - when loading a CSV format
# file with an empty final column name - e.g.,
# tests/integration/resources/delimited-csv-with-header.csv - the
# column still comes out as 'unnamed: 11' instead of 'untitled_11'.
#
# Leaving this in case a future version of Pandas behaves
# better.
#
if pandas_options['header'] is None:
# Pandas only accepts the prefix argument when the
# header is marked as missing.
#
# https://github.com/pandas-dev/pandas/issues/27394
# https://github.com/pandas-dev/pandas/pull/31383
pandas_options['prefix'] = 'untitled_'

#
# mangle_dupe_cols : bool, default True
#
Expand Down Expand Up @@ -522,7 +501,10 @@ def day_first(dateish_format: str) -> bool:
# Character to break file into lines. Only valid with C parser.
#
if non_standard_record_terminator:
pandas_options['lineterminator'] = hints.record_terminator
if version.parse(pd.__version__) >= version.parse('1.5.0'):
pandas_options['lineterminator'] = hints.record_terminator
else:
pandas_options['line_terminator'] = hints.record_terminator
quiet_remove(unhandled_hints, 'record-terminator')

#
Expand Down Expand Up @@ -630,25 +612,23 @@ def day_first(dateish_format: str) -> bool:
# (deprecated, so not supplying)

#
# error_bad_lines : bool, default True
# on_bad_lines : string default 'error'
#
# Lines with too many fields (e.g. a csv line with too many
# commas) will by default cause an exception to be raised, and no
# DataFrame will be returned. If set to 'warn', these “bad lines”
# will be dropped from the DataFrame that is returned.
#

pandas_options['error_bad_lines'] = processing_instructions.fail_if_row_invalid
pandas_options['on_bad_lines'] = 'error' if processing_instructions.fail_if_row_invalid else 'warn'

#
# warn_bad_lines : bool, default True
#
#
# If error_bad_lines is False, and warn_bad_lines is True, a
# If processing_instructions.fail_if_row_invalid is False, a
# warning for each “bad line” will be output.
#

pandas_options['warn_bad_lines'] = True

#
# delim_whitespace : bool, default False
#
Expand Down
7 changes: 6 additions & 1 deletion records_mover/records/pandas/to_csv_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
from records_mover.mover_types import _assert_never
import logging
from typing import Set, Dict
from packaging import version
import pandas as pd


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -111,7 +113,10 @@ def pandas_to_csv_options(records_format: DelimitedRecordsFormat,
pandas_options['sep'] = hints.field_delimiter
quiet_remove(unhandled_hints, 'field-delimiter')

pandas_options['line_terminator'] = hints.record_terminator
if version.parse(pd.__version__) >= version.parse('1.5.0'):
pandas_options['lineterminator'] = hints.record_terminator
else:
pandas_options['line_terminator'] = hints.record_terminator
quiet_remove(unhandled_hints, 'record-terminator')

return pandas_options
8 changes: 4 additions & 4 deletions tests/component/records/pandas/test_read_csv_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ class DateFormatExpectations(TypedDict):
fileobj = io.StringIO(create_sample(dateformat))
df = pandas.read_csv(filepath_or_buffer=fileobj,
**options)
timestamp = df['untitled_0'][0]
timestamp = df[0][0]
self.assertEqual(timestamp.year, SAMPLE_YEAR)
self.assertEqual(timestamp.month, SAMPLE_MONTH)
self.assertEqual(timestamp.day, SAMPLE_DAY)
Expand Down Expand Up @@ -145,7 +145,7 @@ class DateTimeFormatTzExpectations(TypedDict):
fileobj = io.StringIO(datetimetz)
df = pandas.read_csv(filepath_or_buffer=fileobj,
**options)
timestamp = df['untitled_0'][0]
timestamp = df[0][0]
self.assertIsInstance(timestamp, pandas.Timestamp,
f"Pandas did not parse {datetimetz} as a timestamp object")
self.assertEqual(timestamp.year, SAMPLE_YEAR)
Expand Down Expand Up @@ -208,7 +208,7 @@ class DateTimeFormatExpectations(TypedDict):
fileobj = io.StringIO(datetimetz)
df = pandas.read_csv(filepath_or_buffer=fileobj,
**options)
timestamp = df['untitled_0'][0]
timestamp = df[0][0]
self.assertIsInstance(timestamp, pandas.Timestamp,
f"Pandas did not parse {datetimetz} as a timestamp object")
self.assertEqual(timestamp.year, SAMPLE_YEAR)
Expand Down Expand Up @@ -249,7 +249,7 @@ def test_timeonlyformat(self) -> None:
fileobj = io.StringIO(timeonly)
df = pandas.read_csv(filepath_or_buffer=fileobj,
**options)
timestamp = df['untitled_0'][0]
timestamp = df[0][0]
self.assertIsInstance(timestamp, pandas.Timestamp,
f"Pandas did not parse {timeonly} as a timestamp object")
self.assertEqual(timestamp.hour, SAMPLE_HOUR)
Expand Down
Loading