From 52069b082358e2f1c3f4c954ed1637effc5e9af5 Mon Sep 17 00:00:00 2001
From: Vince Broz <vince.broz@bluelabs.com>
Date: Tue, 25 Feb 2020 13:51:46 -0500
Subject: [PATCH 01/12] Add README.md, INSTALL.md and move psycopg2 to
 mover-cli extra

---
 INSTALL.md | 55 ++++++++++++++++++++++++++++++++++++++
 README.md  | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 deps.sh    | 10 -------
 setup.py   |  2 +-
 4 files changed, 132 insertions(+), 12 deletions(-)
 create mode 100644 INSTALL.md

diff --git a/INSTALL.md b/INSTALL.md
new file mode 100644
index 000000000..ccc30e294
--- /dev/null
+++ b/INSTALL.md
@@ -0,0 +1,55 @@
+# Installing Records Mover
+
+You can install records-mover with the following 'extras':
+
+* `pip3 install records-mover` - Install minimal version, not
+  including Pandas (needed only for local data copy) or psycopg2
+  (needed for Redshift or PostgreSQL connections).
+* `pip3 install records-mover[gsheets]` - Minimal install plus API
+  libraries to access Google Sheets.
+* `pip3 install records-mover[mover-cli]` - Install everything and
+  make assumptions compatible with using mvrec on the command line.
+  Installs `pandas` as well as `psycopg2-binary` and `pyarrow` for
+  local Parquet support.  Don't use if you plan on using the library
+  because of the `psycopg2-binary` risk below.
+
+## Why this is complicated
+
+Records mover relies on a number of external libraries.  Here are some
+things to keep in mind when using `pip install`:
+
+## pandas
+
+Only when installing with `pip3 install 'records-mover[movercli]'`
+will you get pandas installed by default.
+
+Pandas a large dependency which is needed in cases where we need to
+process data locally.  If you are using cloud-native import/export
+functionality only, you shouldn't need it and can avoid the bloat.
+
+## psycopg2
+
+psycopg2 is a library used for access to both Redshift and PostgreSQL databases.
+
+The project is
+[dealing](https://www.postgresql.org/message-id/CA%2Bmi_8bd6kJHLTGkuyHSnqcgDrJ1uHgQWvXCKQFD3tPQBUa2Bw%40mail.gmail.com)
+[with](https://www.psycopg.org/articles/2018/02/08/psycopg-274-released/)
+a thorny compatibility issue with native code and threading.  They've
+published three separate versions of their library to PyPI as a
+result:
+
+* `psycopg2` - requires local compilation, and as such you need certain
+  tools and maybe configuration set up.  This is the hardest one to
+  install as a result.
+* `psycopg2-binary` - pre-compiled version that might have threading
+  issues if you try to use it in a multi-threaded environment with
+  other code that might be using libssl from a different source.
+* `psycopg2cffi` - The version to use if you use `pypy`
+
+If you are using the mvrec command line only, you can use `pip3
+install 'records-mover[movercli]` and it just uses `psycopg2-binary`.
+
+## pyarrow
+
+`pyarrow` is a Python wrapper around the Apache Arrow native library.
+It's used by records mover to manipulate Parquet files locally.
diff --git a/README.md b/README.md
index d03b99a71..2a0ec679e 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,76 @@
-# Records Mover
+# Records Mover - mvrec
+
+Records Mover is a command-line tool and Python library you can
+use to move relational data from one place to another.
+
+Relational data here means anything roughly "rectangular" - with
+columns and rows.  For example, it supports reading and writing
+data in:
+
+* Databases, including using native high-speed methods of
+  import/export of bulk data.  Redshift and Vertica are
+  well-supported, with some support for BigQuery and PostgreSQL.
+* Google Sheets
+* Pandas DataFrames
+* CSV files, either alone or in a records directory - a structured
+  directory of CSV/Parquet/etc files containing some JSON metadata
+  about their format and origins.  Records directories are especially
+  helpful for the ever-ambiguous CSV format, where they solve the
+  problem of 'hey, this may be a CSV - but what's the schema?  What's
+  the format of the CSV itself?  How is it escaped?'
+
+Records Mover can be extended to handle additional database
+and data file types by building on top of their
+[SQLAlchemy](https://www.sqlalchemy.org/) drivers, and is able to
+auto-negotiate the most efficient way of moving data from one to the
+other.
+
+Example CLI use:
+
+```sh
+pip3 install 'records_mover[movercli]'
+mvrec --help
+mvrec table2table mydb1 myschema1 mytable1 mydb2 myschema2 mytable2
+```
+
+For more installation notes, see [INSTALL.md](./INSTALL.md)
+
+Note that the connection details for the database names here must be
+configured using
+[db-facts](https://github.com/bluelabsio/db-facts/blob/master/CONFIGURATION.md).
+
+Example Python library use:
+
+```python
+#!/usr/bin/env python3
+
+# Pull in the records_mover library - be sure to run the pip install above first!
+from records_mover import Session
+from pandas import DataFrame
+
+session = Session()
+records = session.records
+
+# This is a SQLAlchemy database engine.
+#
+# You can instead call job_context.get_db_engine('cred name').
+#
+# On your laptop, 'cred name' is the same thing passed to dbcli (mapping to something in LastPass).
+#
+# In Airflow, 'cred name' maps to the connection ID in the admin Connnections UI.
+#
+# Or you can build your own and pass it in!
+db_engine = job_context.get_default_db_engine()
+
+df = DataFrame.from_dict([{'a': 1}]) # or make your own!
+
+source = records.sources.dataframe(df=df)
+target = records.targets.table(schema_name='myschema',
+                               table_name='mytable',
+                               db_engine=db_engine)
+results = records.move(source, target)
+```
+
+When moving data, the sources supported can be found
+[here](./records_mover/records/sources/factory.py), and the
+targets supported can be found [here](./records_mover/records/targets/factory.py).
diff --git a/deps.sh b/deps.sh
index 0b42abfe1..ac73c7b96 100755
--- a/deps.sh
+++ b/deps.sh
@@ -11,16 +11,6 @@ python_version=3.8.1
 #    You may need `xcode-select --install` on OS X
 #    https://github.com/pyenv/pyenv/issues/451#issuecomment-151336786
 pyenv install -s "${python_version:?}"
-if [ "$(uname)" == Darwin ]
-then
-  # Python has needed this in the past when installed by 'pyenv
-  # install'.  The current version of 'psycopg2' seems to require it
-  # now, but Python complains when it *is* set.  🤦
-  CFLAGS="-I$(brew --prefix openssl)/include"
-  export CFLAGS
-  LDFLAGS="-L$(brew --prefix openssl)/lib"
-  export LDFLAGS
-fi
 pyenv virtualenv "${python_version:?}" records-mover-"${python_version:?}" || true
 pyenv local records-mover-"${python_version:?}"
 
diff --git a/setup.py b/setup.py
index bb46dc826..72cad1f87 100755
--- a/setup.py
+++ b/setup.py
@@ -40,7 +40,7 @@
       install_requires=[
           'boto>=2,<3', 'boto3',
           'jsonschema', 'timeout_decorator', 'awscli',
-          'PyYAML', 'psycopg2',
+          'PyYAML',
           # sqlalchemy-vertica-python 0.5.5 introduced
           # https://github.com/bluelabsio/sqlalchemy-vertica-python/pull/7
           # which fixed a bug pulling schema information from Vertica

From cf8189cf60ef2a5cc69a342f5a25ae32eff6f079 Mon Sep 17 00:00:00 2001
From: Vince Broz <vince.broz@bluelabs.com>
Date: Tue, 25 Feb 2020 13:55:13 -0500
Subject: [PATCH 02/12] Clarify pyarrow need

---
 INSTALL.md | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/INSTALL.md b/INSTALL.md
index ccc30e294..1cd401e33 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -3,15 +3,17 @@
 You can install records-mover with the following 'extras':
 
 * `pip3 install records-mover` - Install minimal version, not
-  including Pandas (needed only for local data copy) or psycopg2
-  (needed for Redshift or PostgreSQL connections).
+  including `pandas` (needed only for local data copy), `psycopg2`
+  (needed for Redshift or PostgreSQL connections) or `pyarrow` (needed
+  for local Parquet manipulation).
 * `pip3 install records-mover[gsheets]` - Minimal install plus API
   libraries to access Google Sheets.
 * `pip3 install records-mover[mover-cli]` - Install everything and
   make assumptions compatible with using mvrec on the command line.
-  Installs `pandas` as well as `psycopg2-binary` and `pyarrow` for
-  local Parquet support.  Don't use if you plan on using the library
-  because of the `psycopg2-binary` risk below.
+  Installs `pandas`, `psycopg2-binary` and `pyarrow`.
+
+  Don't use this extra if you plan on using the library because of the
+  `psycopg2-binary` risk below.
 
 ## Why this is complicated
 
@@ -52,4 +54,7 @@ install 'records-mover[movercli]` and it just uses `psycopg2-binary`.
 ## pyarrow
 
 `pyarrow` is a Python wrapper around the Apache Arrow native library.
-It's used by records mover to manipulate Parquet files locally.
+It's used by records mover to manipulate Parquet files locally.  The
+Apache Arrow native library can require build tools to install and is
+large; if you don't need to deal with Parquet files in the local
+environment you can work without it.

From 48648a28819ac75c0fe2f3321a3991970919cae0 Mon Sep 17 00:00:00 2001
From: Vince Broz <vince.broz@bluelabs.com>
Date: Tue, 25 Feb 2020 13:55:38 -0500
Subject: [PATCH 03/12] Tab in headers

---
 INSTALL.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/INSTALL.md b/INSTALL.md
index 1cd401e33..712e7f65c 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -20,7 +20,7 @@ You can install records-mover with the following 'extras':
 Records mover relies on a number of external libraries.  Here are some
 things to keep in mind when using `pip install`:
 
-## pandas
+### pandas
 
 Only when installing with `pip3 install 'records-mover[movercli]'`
 will you get pandas installed by default.
@@ -29,7 +29,7 @@ Pandas a large dependency which is needed in cases where we need to
 process data locally.  If you are using cloud-native import/export
 functionality only, you shouldn't need it and can avoid the bloat.
 
-## psycopg2
+### psycopg2
 
 psycopg2 is a library used for access to both Redshift and PostgreSQL databases.
 
@@ -51,7 +51,7 @@ result:
 If you are using the mvrec command line only, you can use `pip3
 install 'records-mover[movercli]` and it just uses `psycopg2-binary`.
 
-## pyarrow
+### pyarrow
 
 `pyarrow` is a Python wrapper around the Apache Arrow native library.
 It's used by records mover to manipulate Parquet files locally.  The

From 4e10cd2811cb230ede2528eac1fa88d07e25b71c Mon Sep 17 00:00:00 2001
From: Vince Broz <vince.broz@bluelabs.com>
Date: Tue, 25 Feb 2020 14:11:32 -0500
Subject: [PATCH 04/12] Whoops, initial install should have been movercli

---
 .circleci/config.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index c18a99afc..745380b61 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -24,7 +24,7 @@ commands:
               . venv/bin/activate
               # venv/ dir doesn't seem to save enough info to keep the
               # editable installation
-              pip install --progress-bar=off -e .
+              pip install --progress-bar=off -e '.[movercli]'
             else
               python -m venv venv
               . venv/bin/activate

From f487adc3458cb0e03254b00400513e0e99f0f8df Mon Sep 17 00:00:00 2001
From: Vince Broz <vince.broz@bluelabs.com>
Date: Tue, 25 Feb 2020 14:13:39 -0500
Subject: [PATCH 05/12] Add psycopg2-binary to movercli extra

---
 .circleci/config.yml | 4 ++--
 setup.py             | 4 +++-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 745380b61..cfaba5ff0 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -15,7 +15,7 @@ commands:
         type: string
     steps:
       - restore_cache:
-          key: deps-v1-<<parameters.python_version>>-<<parameters.pandas_version>>-{{ .Branch }}-{{ checksum "requirements.txt" }}-{{ checksum "setup.py" }}
+          key: deps-v2-<<parameters.python_version>>-<<parameters.pandas_version>>-{{ .Branch }}-{{ checksum "requirements.txt" }}-{{ checksum "setup.py" }}
       - run:
           name: Install python deps in venv
           command: |
@@ -37,7 +37,7 @@ commands:
               fi
             fi
       - save_cache:
-          key: deps-v1-<<parameters.python_version>>-<<parameters.pandas_version>>-{{ .Branch }}-{{ checksum "requirements.txt" }}-{{ checksum "setup.py" }}
+          key: deps-v2-<<parameters.python_version>>-<<parameters.pandas_version>>-{{ .Branch }}-{{ checksum "requirements.txt" }}-{{ checksum "setup.py" }}
           paths:
             - "venv"
 
diff --git a/setup.py b/setup.py
index 72cad1f87..fea4d2fe4 100755
--- a/setup.py
+++ b/setup.py
@@ -64,7 +64,9 @@
       ],
       extras_require={
           'gsheets': gsheet_dependencies,
-          'movercli': gsheet_dependencies + ['typing_inspect', 'docstring_parser',
+          'movercli': gsheet_dependencies + ['typing_inspect',
+                                             'docstring_parser',
+                                             'psycopg2-binary'
                                              'pandas<2',
                                              'pyarrow'],
       },

From 5ab62fe33c4cc17252e6421e4323984cce9a58d1 Mon Sep 17 00:00:00 2001
From: Vince Broz <vince.broz@bluelabs.com>
Date: Tue, 25 Feb 2020 14:14:54 -0500
Subject: [PATCH 06/12] Add missing comma

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index fea4d2fe4..6741a1120 100755
--- a/setup.py
+++ b/setup.py
@@ -66,7 +66,7 @@
           'gsheets': gsheet_dependencies,
           'movercli': gsheet_dependencies + ['typing_inspect',
                                              'docstring_parser',
-                                             'psycopg2-binary'
+                                             'psycopg2-binary',
                                              'pandas<2',
                                              'pyarrow'],
       },

From 50fdbe3e3652da9c07049f6892606244bf8485d8 Mon Sep 17 00:00:00 2001
From: Vince Broz <vince.broz@bluelabs.com>
Date: Tue, 25 Feb 2020 14:20:18 -0500
Subject: [PATCH 07/12] Ratchet mypy

---
 metrics/mypy_high_water_mark | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/metrics/mypy_high_water_mark b/metrics/mypy_high_water_mark
index 6287f4489..fed8f6ec6 100644
--- a/metrics/mypy_high_water_mark
+++ b/metrics/mypy_high_water_mark
@@ -1 +1 @@
-88.8400
\ No newline at end of file
+88.8500
\ No newline at end of file

From 968de10de9095c157a0eb7bd876b6c84bcb07571 Mon Sep 17 00:00:00 2001
From: Vince Broz <vince.broz@bluelabs.com>
Date: Tue, 25 Feb 2020 14:40:42 -0500
Subject: [PATCH 08/12] Add note on installing for library use

---
 README.md | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/README.md b/README.md
index 2a0ec679e..66451b542 100644
--- a/README.md
+++ b/README.md
@@ -41,6 +41,15 @@ configured using
 
 Example Python library use:
 
+First, install records_mover.  We'll also use Pandas, so we'll install
+that, too:
+
+```sh
+pip3 install records_mover pandas
+```
+
+Now we can run this code:
+
 ```python
 #!/usr/bin/env python3
 

From e05c210b898c0829022ec801375461e504b25722 Mon Sep 17 00:00:00 2001
From: Vince Broz <vince.broz@bluelabs.com>
Date: Tue, 25 Feb 2020 14:43:46 -0500
Subject: [PATCH 09/12] deps-v2 -> deps-v1

---
 .circleci/config.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index cfaba5ff0..745380b61 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -15,7 +15,7 @@ commands:
         type: string
     steps:
       - restore_cache:
-          key: deps-v2-<<parameters.python_version>>-<<parameters.pandas_version>>-{{ .Branch }}-{{ checksum "requirements.txt" }}-{{ checksum "setup.py" }}
+          key: deps-v1-<<parameters.python_version>>-<<parameters.pandas_version>>-{{ .Branch }}-{{ checksum "requirements.txt" }}-{{ checksum "setup.py" }}
       - run:
           name: Install python deps in venv
           command: |
@@ -37,7 +37,7 @@ commands:
               fi
             fi
       - save_cache:
-          key: deps-v2-<<parameters.python_version>>-<<parameters.pandas_version>>-{{ .Branch }}-{{ checksum "requirements.txt" }}-{{ checksum "setup.py" }}
+          key: deps-v1-<<parameters.python_version>>-<<parameters.pandas_version>>-{{ .Branch }}-{{ checksum "requirements.txt" }}-{{ checksum "setup.py" }}
           paths:
             - "venv"
 

From f043117d920264d3cd0569da8cfe6ca1c81eab93 Mon Sep 17 00:00:00 2001
From: Vince Broz <vince.broz@bluelabs.com>
Date: Tue, 25 Feb 2020 19:39:37 -0500
Subject: [PATCH 10/12] Make PyYAML constraint transitive

---
 requirements.txt |  8 --------
 setup.py         | 15 +++++++++++++--
 2 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 5ccf61a73..0ac58414e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,14 +2,6 @@
 setuptools>34.3.0
 wheel
 twine
-#
-# awscli seems to artifically limit the max version of PyYAML:
-#
-# https://github.com/aws/aws-cli/pull/4403/files
-#
-# pip._vendor.pkg_resources.ContextualVersionConflict: (PyYAML 5.2 (/home/circleci/project/venv/lib/python3.6/site-packages), Requirement.parse('PyYAML<5.2,>=3.10; python_version != "2.6" and python_version != "3.3"'), {'awscli'})
-#
-PyYAML<5.2,>3.10
 flake8
 nose
 nose-progressive
diff --git a/setup.py b/setup.py
index 6741a1120..7b6b224ee 100755
--- a/setup.py
+++ b/setup.py
@@ -39,8 +39,19 @@
       },
       install_requires=[
           'boto>=2,<3', 'boto3',
-          'jsonschema', 'timeout_decorator', 'awscli',
-          'PyYAML',
+          'jsonschema', 'timeout_decorator',
+          'awscli>=1,<2',
+          # awscli pins PyYAML below 5.3 so they can maintain support
+          # for old versions of Python.  This can cause issues at
+          # run-time if we don't constrain things here as well, as a
+          # newer version seems to sneak in:
+          #
+          # pkg_resources.ContextualVersionConflict:
+          #   (PyYAML 5.3 (.../lib/python3.7/site-packages),
+          #     Requirement.parse('PyYAML<5.3,>=3.10'), {'awscli'})
+          #
+          # https://github.com/aws/aws-cli/blob/develop/setup.py
+          'PyYAML<5.3',
           # sqlalchemy-vertica-python 0.5.5 introduced
           # https://github.com/bluelabsio/sqlalchemy-vertica-python/pull/7
           # which fixed a bug pulling schema information from Vertica

From e576564d3a3295adf279ce01e54c9662303728a7 Mon Sep 17 00:00:00 2001
From: Vince Broz <vince.broz@bluelabs.com>
Date: Wed, 26 Feb 2020 09:47:36 -0500
Subject: [PATCH 11/12] Remove -

---
 INSTALL.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/INSTALL.md b/INSTALL.md
index 712e7f65c..a56a9f59b 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -8,7 +8,7 @@ You can install records-mover with the following 'extras':
   for local Parquet manipulation).
 * `pip3 install records-mover[gsheets]` - Minimal install plus API
   libraries to access Google Sheets.
-* `pip3 install records-mover[mover-cli]` - Install everything and
+* `pip3 install records-mover[movercli]` - Install everything and
   make assumptions compatible with using mvrec on the command line.
   Installs `pandas`, `psycopg2-binary` and `pyarrow`.
 

From eb8cc0907715d2ee43941d9f00fd1447b8a8a847 Mon Sep 17 00:00:00 2001
From: Vince Broz <vince.broz@bluelabs.com>
Date: Wed, 26 Feb 2020 09:49:23 -0500
Subject: [PATCH 12/12] job_context -> session

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 66451b542..088be362a 100644
--- a/README.md
+++ b/README.md
@@ -62,14 +62,14 @@ records = session.records
 
 # This is a SQLAlchemy database engine.
 #
-# You can instead call job_context.get_db_engine('cred name').
+# You can instead call session.get_db_engine('cred name').
 #
 # On your laptop, 'cred name' is the same thing passed to dbcli (mapping to something in LastPass).
 #
 # In Airflow, 'cred name' maps to the connection ID in the admin Connnections UI.
 #
 # Or you can build your own and pass it in!
-db_engine = job_context.get_default_db_engine()
+db_engine = session.get_default_db_engine()
 
 df = DataFrame.from_dict([{'a': 1}]) # or make your own!