Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

RM-86 Error on Future and Deprecation Warnings #244

Merged
merged 16 commits into from
May 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 19 additions & 4 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,12 @@ commands:
default: ""
steps:
- restore_cache:
key: deps-v7-<<parameters.python_version>>-<<parameters.pandas_version>>-<<parameters.extras>>-<<parameters.include_dev_dependencies>>-{{ .Branch }}-{{ checksum "requirements.txt" }}-{{ checksum "setup.py" }}
key: deps-v8-<<parameters.python_version>>-<<parameters.pandas_version>>-<<parameters.extras>>-<<parameters.include_dev_dependencies>>-{{ .Branch }}-{{ checksum "requirements.txt" }}-{{ checksum "setup.py" }}
- run:
name: Install python deps in venv
environment:
PYENV_VERSION: <<parameters.python_version>>
SQLALCHEMY_SILENCE_UBER_WARNING: 1
command: |
if [ -f venv/bin/activate ]
then
Expand Down Expand Up @@ -63,7 +64,7 @@ commands:
fi
fi
- save_cache:
key: deps-v6-<<parameters.python_version>>-<<parameters.pandas_version>>-<<parameters.extras>>-<<parameters.include_dev_dependencies>>-{{ .Branch }}-{{ checksum "requirements.txt" }}-{{ checksum "setup.py" }}
key: deps-v8-<<parameters.python_version>>-<<parameters.pandas_version>>-<<parameters.extras>>-<<parameters.include_dev_dependencies>>-{{ .Branch }}-{{ checksum "requirements.txt" }}-{{ checksum "setup.py" }}
paths:
- "venv"
wait_for_db:
Expand Down Expand Up @@ -113,6 +114,8 @@ jobs:
description: "Enforce coverage not slipping"
docker:
- image: cimg/python:<<parameters.python_version>>
environment:
SQLALCHEMY_SILENCE_UBER_WARNING: 1
steps:
- checkout
- add_ssh_keys:
Expand Down Expand Up @@ -185,6 +188,8 @@ jobs:
command:
type: string
description: "Command to run in records-mover venv"
environment:
SQLALCHEMY_SILENCE_UBER_WARNING: 1
docker:
- image: cimg/python:<<parameters.python_version>>
- image: jbfavre/vertica:8.1.1-16_centos-7
Expand Down Expand Up @@ -294,7 +299,12 @@ jobs:
type: boolean
description: "If true, pass in the env variable specifying an GCS scratch bucket to Records Mover (for BigQuery)"
default: true

pytest_flags:
type: string
description: "Any flags you'd like to add to the pytest call"
default: ""
environment:
SQLALCHEMY_SILENCE_UBER_WARNING: 1
docker:
- image: cimg/python:<<parameters.python_version>>
steps:
Expand Down Expand Up @@ -327,7 +337,7 @@ jobs:
# This is set by default in the CircleCI environment
unset SCRATCH_GCS_URL
fi
with-db <<parameters.db_name>> pytest
with-db <<parameters.db_name>> pytest <<parameters.pytest_flags>>
- store_test_results:
path: test-reports
- store_artifacts:
Expand All @@ -351,6 +361,8 @@ jobs:
default: '3.9'
docker:
- image: cimg/python:<<parameters.python_version>>
environment:
SQLALCHEMY_SILENCE_UBER_WARNING: 1
steps:
- checkout
- installvenv:
Expand Down Expand Up @@ -400,6 +412,8 @@ jobs:
cli-extra-test:
docker:
- image: cimg/python:3.9
environment:
SQLALCHEMY_SILENCE_UBER_WARNING: 1
steps:
- checkout
- installvenv:
Expand Down Expand Up @@ -622,6 +636,7 @@ workflows:
numpy_version: "<1.24"
context: slack-secrets
db_name: demo-itest
pytest_flags: "-W default::FutureWarning"
filters:
tags:
only: /v\d+\.\d+\.\d+(-[\w]+)?/
Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ jobs:
fail-fast: true
matrix:
python-version: ["3.8", "3.9"]

env:
SQLALCHEMY_SILENCE_UBER_WARNING: 1
steps:
- uses: actions/checkout@v3
- name: Install python ${{ matrix.python-version }}
Expand Down
2 changes: 1 addition & 1 deletion metrics/mypy_high_water_mark
Original file line number Diff line number Diff line change
@@ -1 +1 @@
89.7400
89.7000
5 changes: 5 additions & 0 deletions pytest.ini
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Brunope especially for your visibility, this pytest.ini is what makes the pytest command error on Deprecation and Future warnings

Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# pytest.ini
[pytest]
filterwarnings =
error::DeprecationWarning:.*records_mover.*
error::FutureWarning:.*records_mover.*
15 changes: 8 additions & 7 deletions records_mover/db/connect.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,13 +113,14 @@ def create_sqlalchemy_url(db_facts: DBFacts,

return create_bigquery_sqlalchemy_url(db_facts)
else:
return sa.engine.url.URL(drivername=driver,
username=username,
password=db_facts['password'],
host=db_facts['host'],
port=db_facts['port'],
database=db_facts['database'],
query=query_for_type.get(db_type))
return sa.engine.url.URL.create( # type: ignore
drivername=driver,
username=username,
password=db_facts['password'],
host=db_facts['host'],
port=db_facts['port'],
database=db_facts['database'],
query=query_for_type.get(db_type))


def engine_from_lpass_entry(lpass_entry_name: str) -> sa.engine.Engine:
Expand Down
7 changes: 4 additions & 3 deletions records_mover/pandas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ def purge_unnamed_unused_columns(df: DataFrame) -> DataFrame:
# "unnamed: 1", or maybe "Unnamed: 1" (not sure why/when
# that differs). Let's clean those up.
for column in df:
if column.startswith('Unnamed: ') or column.startswith('unnamed: '):
if not df[column].notnull().any():
df = df.drop(column, axis=1)
if type(column) == str:
if column.startswith('Unnamed: ') or column.startswith('unnamed: '):
if not df[column].notnull().any():
df = df.drop(column, axis=1)
return df
12 changes: 0 additions & 12 deletions records_mover/records/delimited/csv_streamer.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,18 +44,6 @@ def stream_csv(filepath_or_buffer: Union[str, IO[bytes]],
'iterator': True,
'engine': 'python'
}
if header is None:
# Pandas only accepts the prefix argument (which makes for
# tidier column names when otherwise not provided) when the
# header is explicitly marked as missing, not when it's
# available or even when we ask Pandas to infer it. Bummer,
# as this means that when Pandas infers that there's no
# header, the column names will end up different than folks
# explicitly tell records mover that there is no header.
#
# https://github.com/pandas-dev/pandas/issues/27394
# https://github.com/pandas-dev/pandas/pull/31383
kwargs['prefix'] = 'untitled_'
if 'quoting' in hints:
quoting = hints['quoting']
kwargs['quoting'] = pandas_quoting_from_hint[quoting]
Expand Down
2 changes: 1 addition & 1 deletion records_mover/records/pandas/prep_for_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,5 +110,5 @@ def prep_df_for_csv_output(df: DataFrame,
records_format,
processing_instructions)
if formatted_series is not None:
formatted_df.iloc[:, index] = formatted_series
formatted_df[formatted_df.columns[index]] = formatted_series
return formatted_df
40 changes: 10 additions & 30 deletions records_mover/records/pandas/read_csv_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
from records_mover.records.schema import RecordsSchema
import logging
from typing import Set, Dict, Any
from packaging import version
import pandas as pd


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -146,29 +148,6 @@ def pandas_read_csv_options(records_format: DelimitedRecordsFormat,
# (better to keep a standard format, no matter how many columns)
#

#
# prefix : str, optional
#
# Prefix to add to column numbers when no header, e.g. ‘X’ for X0, X1,
#

#
# Not sure this actually does anything - when loading a CSV format
# file with an empty final column name - e.g.,
# tests/integration/resources/delimited-csv-with-header.csv - the
# column still comes out as 'unnamed: 11' instead of 'untitled_11'.
#
# Leaving this in case a future version of Pandas behaves
# better.
#
if pandas_options['header'] is None:
# Pandas only accepts the prefix argument when the
# header is marked as missing.
#
# https://github.com/pandas-dev/pandas/issues/27394
# https://github.com/pandas-dev/pandas/pull/31383
pandas_options['prefix'] = 'untitled_'

#
# mangle_dupe_cols : bool, default True
#
Expand Down Expand Up @@ -522,7 +501,10 @@ def day_first(dateish_format: str) -> bool:
# Character to break file into lines. Only valid with C parser.
#
if non_standard_record_terminator:
pandas_options['lineterminator'] = hints.record_terminator
if version.parse(pd.__version__) >= version.parse('1.5.0'):
pandas_options['lineterminator'] = hints.record_terminator
else:
pandas_options['line_terminator'] = hints.record_terminator
quiet_remove(unhandled_hints, 'record-terminator')

#
Expand Down Expand Up @@ -630,25 +612,23 @@ def day_first(dateish_format: str) -> bool:
# (deprecated, so not supplying)

#
# error_bad_lines : bool, default True
# on_bad_lines : string default 'error'
#
# Lines with too many fields (e.g. a csv line with too many
# commas) will by default cause an exception to be raised, and no
# DataFrame will be returned. If set to 'warn', these “bad lines”
# will be dropped from the DataFrame that is returned.
#

pandas_options['error_bad_lines'] = processing_instructions.fail_if_row_invalid
pandas_options['on_bad_lines'] = 'error' if processing_instructions.fail_if_row_invalid else 'warn'

#
# warn_bad_lines : bool, default True
#
#
# If error_bad_lines is False, and warn_bad_lines is True, a
# If processing_instructions.fail_if_row_invalid is False, a
# warning for each “bad line” will be output.
#

pandas_options['warn_bad_lines'] = True

#
# delim_whitespace : bool, default False
#
Expand Down
7 changes: 6 additions & 1 deletion records_mover/records/pandas/to_csv_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
from records_mover.mover_types import _assert_never
import logging
from typing import Set, Dict
from packaging import version
import pandas as pd


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -111,7 +113,10 @@ def pandas_to_csv_options(records_format: DelimitedRecordsFormat,
pandas_options['sep'] = hints.field_delimiter
quiet_remove(unhandled_hints, 'field-delimiter')

pandas_options['line_terminator'] = hints.record_terminator
if version.parse(pd.__version__) >= version.parse('1.5.0'):
pandas_options['lineterminator'] = hints.record_terminator
else:
pandas_options['line_terminator'] = hints.record_terminator
quiet_remove(unhandled_hints, 'record-terminator')

return pandas_options
8 changes: 4 additions & 4 deletions tests/component/records/pandas/test_read_csv_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ class DateFormatExpectations(TypedDict):
fileobj = io.StringIO(create_sample(dateformat))
df = pandas.read_csv(filepath_or_buffer=fileobj,
**options)
timestamp = df['untitled_0'][0]
timestamp = df[0][0]
self.assertEqual(timestamp.year, SAMPLE_YEAR)
self.assertEqual(timestamp.month, SAMPLE_MONTH)
self.assertEqual(timestamp.day, SAMPLE_DAY)
Expand Down Expand Up @@ -145,7 +145,7 @@ class DateTimeFormatTzExpectations(TypedDict):
fileobj = io.StringIO(datetimetz)
df = pandas.read_csv(filepath_or_buffer=fileobj,
**options)
timestamp = df['untitled_0'][0]
timestamp = df[0][0]
self.assertIsInstance(timestamp, pandas.Timestamp,
f"Pandas did not parse {datetimetz} as a timestamp object")
self.assertEqual(timestamp.year, SAMPLE_YEAR)
Expand Down Expand Up @@ -208,7 +208,7 @@ class DateTimeFormatExpectations(TypedDict):
fileobj = io.StringIO(datetimetz)
df = pandas.read_csv(filepath_or_buffer=fileobj,
**options)
timestamp = df['untitled_0'][0]
timestamp = df[0][0]
self.assertIsInstance(timestamp, pandas.Timestamp,
f"Pandas did not parse {datetimetz} as a timestamp object")
self.assertEqual(timestamp.year, SAMPLE_YEAR)
Expand Down Expand Up @@ -249,7 +249,7 @@ def test_timeonlyformat(self) -> None:
fileobj = io.StringIO(timeonly)
df = pandas.read_csv(filepath_or_buffer=fileobj,
**options)
timestamp = df['untitled_0'][0]
timestamp = df[0][0]
self.assertIsInstance(timestamp, pandas.Timestamp,
f"Pandas did not parse {timeonly} as a timestamp object")
self.assertEqual(timestamp.hour, SAMPLE_HOUR)
Expand Down
Loading