Add support for BigQuery bulk export (to Avro, for now) #136

Merged
merged 28 commits on Dec 1, 2020
Changes from 6 commits
Commits
28 commits
e095590
Add support for BigQuery bulk export (to Avro, for now)
vinceatbluelabs Dec 1, 2020
0fa5321
Improve test coverage
vinceatbluelabs Dec 1, 2020
28f48ab
Improve test coverage
vinceatbluelabs Dec 1, 2020
5ebb19b
Improve test coverage
vinceatbluelabs Dec 1, 2020
de77caa
Improve test coverage
vinceatbluelabs Dec 1, 2020
09ceea5
Improve test coverage
vinceatbluelabs Dec 1, 2020
2a721af
Improve test coverage
vinceatbluelabs Dec 1, 2020
f9f88ad
Improve test coverage
vinceatbluelabs Dec 1, 2020
8b10df9
Ratchet coverage
vinceatbluelabs Dec 1, 2020
5c04074
Ratchet
vinceatbluelabs Dec 1, 2020
c00afd2
Flake8 fixes
vinceatbluelabs Dec 1, 2020
d5c8e84
Old Python fixes
vinceatbluelabs Dec 1, 2020
35e7e29
Break up big file
vinceatbluelabs Dec 1, 2020
99631b6
Fix known_supported_records_formats_for_load()
vinceatbluelabs Dec 1, 2020
ba7d15b
Try with config.use_avro_logical_types
vinceatbluelabs Dec 1, 2020
2dcfda0
Set config.use_avro_logical_types on import as well
vinceatbluelabs Dec 1, 2020
8898433
Deal with Avro limitations
vinceatbluelabs Dec 1, 2020
16b041c
Improve coverage
vinceatbluelabs Dec 1, 2020
059b2a7
Unratchet
vinceatbluelabs Dec 1, 2020
cba3e2e
Ratchet
vinceatbluelabs Dec 1, 2020
f69b823
Bump bigfiles
vinceatbluelabs Dec 1, 2020
03f1e28
Drop unneeded import
vinceatbluelabs Dec 1, 2020
ebbf53f
Ratchet
vinceatbluelabs Dec 1, 2020
0179a71
Reformat array literal
vinceatbluelabs Dec 1, 2020
63114b5
Move schema adjustment logic
vinceatbluelabs Dec 1, 2020
1568ada
Improve coverage
vinceatbluelabs Dec 1, 2020
48a7fdb
Ratchet coverage
vinceatbluelabs Dec 1, 2020
e0aab23
Ratchet
vinceatbluelabs Dec 1, 2020
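
Taken together, these commits teach records_mover to bulk-export a BigQuery table to Avro files in GCS. A rough usage sketch of the kind of move this enables; the bucket, schema, and table names are placeholders, and the keyword spellings may not match the library's exact public API:

from records_mover import Session
from records_mover.records.records_format import AvroRecordsFormat

session = Session()
records = session.records
db_engine = session.get_default_db_engine()  # assumes a default database is configured

records.move(
    records.sources.table(schema_name='my_dataset',   # keyword names here are
                          table_name='mytable',       # illustrative only
                          db_engine=db_engine),
    records.targets.directory_from_url('gs://my-bucket/export/',
                                       records_format=AvroRecordsFormat()),
)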
10 changes: 8 additions & 2 deletions records_mover/db/bigquery/bigquery_db_driver.py
@@ -8,7 +8,9 @@
from ...url.resolver import UrlResolver
import sqlalchemy
from .loader import BigQueryLoader
from .unloader import BigQueryUnloader
from ..loader import LoaderFromFileobj, LoaderFromRecordsDirectory
from ..unloader import Unloader
from ...url.base import BaseDirectoryUrl


@@ -26,15 +28,19 @@ def __init__(self,
BigQueryLoader(db=self.db,
url_resolver=url_resolver,
gcs_temp_base_loc=gcs_temp_base_loc)
self._bigquery_unloader =\
BigQueryUnloader(db=self.db,
url_resolver=url_resolver,
gcs_temp_base_loc=gcs_temp_base_loc)

def loader(self) -> Optional[LoaderFromRecordsDirectory]:
return self._bigquery_loader

def loader_from_fileobj(self) -> LoaderFromFileobj:
return self._bigquery_loader

def unloader(self) -> None:
return None
def unloader(self) -> Unloader:
return self._bigquery_unloader

def type_for_date_plus_time(self, has_tz: bool=False) -> sqlalchemy.sql.sqltypes.DateTime:
# https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types
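With this change the driver hands back a real Unloader instead of None. A minimal sketch, assuming callers still treat unloader() as optional, of how downstream code can probe a driver for Avro export support; the helper function itself is hypothetical, not part of records_mover:

from records_mover.records.records_format import AvroRecordsFormat

def supports_avro_export(driver) -> bool:
    # Drivers without bulk-export support return None from unloader().
    unloader = driver.unloader()
    if unloader is None:
        return False
    return unloader.can_unload_format(AvroRecordsFormat())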
8 changes: 7 additions & 1 deletion records_mover/db/bigquery/load_job_config_options.py
@@ -2,7 +2,9 @@
from ...records.delimited import cant_handle_hint
from typing import Set
from ...records.load_plan import RecordsLoadPlan
from ...records.records_format import DelimitedRecordsFormat, ParquetRecordsFormat
from ...records.records_format import (
DelimitedRecordsFormat, ParquetRecordsFormat, AvroRecordsFormat
)
from records_mover.records.delimited import ValidatedRecordsHints
from records_mover.mover_types import _assert_never
from google.cloud.bigquery.job import CreateDisposition, WriteDisposition
@@ -118,6 +120,10 @@ def load_job_config(unhandled_hints: Set[str],
config.source_format = 'PARQUET'
return config

if isinstance(load_plan.records_format, AvroRecordsFormat):
config.source_format = 'AVRO'
return config

raise NotImplementedError("Not currently able to load "
f"{load_plan.records_format.format_type}")

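For reference, a minimal sketch of the LoadJobConfig this new Avro branch produces. The use_avro_logical_types line reflects the later "Set config.use_avro_logical_types on import as well" commit rather than this six-commit snapshot:

from google.cloud.bigquery.job import LoadJobConfig

config = LoadJobConfig()
config.source_format = 'AVRO'
# Map Avro logical types (e.g. timestamp-micros) onto the corresponding
# BigQuery types instead of leaving them as raw longs/strings.
config.use_avro_logical_types = True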
6 changes: 3 additions & 3 deletions records_mover/db/bigquery/loader.py
@@ -6,7 +6,7 @@
import sqlalchemy
from ...records.load_plan import RecordsLoadPlan
from ...records.records_format import (
BaseRecordsFormat, DelimitedRecordsFormat, ParquetRecordsFormat
BaseRecordsFormat, DelimitedRecordsFormat, ParquetRecordsFormat, AvroRecordsFormat
)
from ...records.records_directory import RecordsDirectory
from ...records.processing_instructions import ProcessingInstructions
@@ -87,8 +87,6 @@ def load_from_fileobj(self, schema: str, table: str,
# https://googleapis.dev/python/bigquery/latest/generated/google.cloud.bigquery.client.Client.html#google.cloud.bigquery.client.Client.load_table_from_file
job = client.load_table_from_file(fileobj,
f"{schema}.{table}",
# Must match the destination dataset location.
location="US",
Review comment from vinceatbluelabs (PR author) on the removed location argument (see the sketch after this file's diff):

I'm not convinced this was ever needed - it's in the example code, but everything seems to run fine without it, so there must be some inference logic to figure out the dataset location based on the client object.

job_config=job_config)

try:
@@ -165,6 +163,8 @@ def can_load_this_format(self, source_records_format: BaseRecordsFormat) -> bool
processing_instructions=processing_instructions)
if isinstance(load_plan.records_format, ParquetRecordsFormat):
return True
if isinstance(load_plan.records_format, AvroRecordsFormat):
return True
if not isinstance(load_plan.records_format, DelimitedRecordsFormat):
return False
unhandled_hints = set(load_plan.records_format.hints.keys())
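A standalone sketch of the load call after the location argument was dropped, assuming standard google-cloud-bigquery behavior (the author's comment above suggests the client works out the dataset location on its own); the table and file names are placeholders:

from google.cloud import bigquery

client = bigquery.Client()
job_config = bigquery.LoadJobConfig(source_format='AVRO')
with open('output.avro', 'rb') as fileobj:
    job = client.load_table_from_file(fileobj,
                                      'my_project.my_dataset.mytable',
                                      job_config=job_config)  # no location= argument
job.result()  # wait for the load job to finish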
107 changes: 107 additions & 0 deletions records_mover/db/bigquery/unloader.py
@@ -0,0 +1,107 @@
import sqlalchemy
import pprint
from contextlib import contextmanager
from typing import List, Iterator, Optional, Union, Tuple
import logging
from google.cloud.bigquery.dbapi.connection import Connection
from google.cloud.bigquery.client import Client
from google.cloud.bigquery.job import ExtractJobConfig
from records_mover.db.unloader import Unloader
from records_mover.records.records_format import BaseRecordsFormat, AvroRecordsFormat
from records_mover.url.base import BaseDirectoryUrl
from records_mover.url.resolver import UrlResolver
from records_mover.records.unload_plan import RecordsUnloadPlan
from records_mover.records.records_directory import RecordsDirectory
from records_mover.db.errors import NoTemporaryBucketConfiguration

logger = logging.getLogger(__name__)


class BigQueryUnloader(Unloader):
def __init__(self,
db: Union[sqlalchemy.engine.Connection, sqlalchemy.engine.Engine],
url_resolver: UrlResolver,
gcs_temp_base_loc: Optional[BaseDirectoryUrl])\
-> None:
self.db = db
self.url_resolver = url_resolver
self.gcs_temp_base_loc = gcs_temp_base_loc
super().__init__(db=db)

def can_unload_format(self, target_records_format: BaseRecordsFormat) -> bool:
if isinstance(target_records_format, AvroRecordsFormat):
return True
return False

def can_unload_to_scheme(self, scheme: str) -> bool:
return scheme == 'gs'

def known_supported_records_formats_for_unload(self) -> List[BaseRecordsFormat]:
return [AvroRecordsFormat()]

@contextmanager
def temporary_unloadable_directory_loc(self) -> Iterator[BaseDirectoryUrl]:
if self.gcs_temp_base_loc is None:
raise NoTemporaryBucketConfiguration('Please provide a scratch GCS URL in your config '
'(e.g., set SCRATCH_GCS_URL to a gs:// URL)')
else:
with self.gcs_temp_base_loc.temporary_directory() as temp_loc:
yield temp_loc

def _parse_bigquery_schema_name(self, schema: str) -> Tuple[Optional[str], str]:
# https://github.com/mxmzdlv/pybigquery/blob/master/pybigquery/sqlalchemy_bigquery.py#L320
dataset = None
project = None

schema_split = schema.split('.')
if len(schema_split) == 1:
dataset, = schema_split
elif len(schema_split) == 2:
project, dataset = schema_split
else:
raise ValueError(f"Could not understand schema name {schema}")

return (project, dataset)

def _extract_job_config(self, unload_plan: RecordsUnloadPlan) -> ExtractJobConfig:
config = ExtractJobConfig()
if isinstance(unload_plan.records_format, AvroRecordsFormat):
config.destination_format = 'AVRO'
else:
raise NotImplementedError(f'Please add support for {unload_plan.records_format}')
return config

def unload(self,
schema: str,
table: str,
unload_plan: RecordsUnloadPlan,
directory: RecordsDirectory) -> Optional[int]:
if directory.scheme != 'gs':
with self.temporary_unloadable_directory_loc() as temp_gcs_loc:
temp_directory = RecordsDirectory(temp_gcs_loc)
out = self.unload(schema=schema,
table=table,
unload_plan=unload_plan,
directory=temp_directory)
temp_directory.copy_to(directory.loc)
return out
logger.info("Loading from records directory into BigQuery")
# https://googleapis.github.io/google-cloud-python/latest/bigquery/usage/tables.html#creating-a-table
connection: Connection =\
self.db.engine.raw_connection().connection
# https://google-cloud.readthedocs.io/en/latest/bigquery/generated/google.cloud.bigquery.client.Client.html
client: Client = connection._client
project_id, dataset_id = self._parse_bigquery_schema_name(schema)
job_config = self._extract_job_config(unload_plan)

records_format = unload_plan.records_format
filename = records_format.generate_filename('output')
destination_uri = directory.loc.file_in_this_directory(filename)
job = client.extract_table(f"{schema}.{table}",
destination_uri.url,
job_config=job_config)
job.result()  # Waits for the extract job to complete.
logger.info(f"Unloaded from {dataset_id}:{table} into {filename}")
directory.save_preliminary_manifest()
return None
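For orientation, a standalone sketch of the extract job this unloader issues, using the standard google-cloud-bigquery API; the table and bucket names are placeholders:

from google.cloud import bigquery
from google.cloud.bigquery.job import ExtractJobConfig

client = bigquery.Client()
job_config = ExtractJobConfig()
job_config.destination_format = 'AVRO'  # mirrors _extract_job_config() above

extract_job = client.extract_table('my_project.my_dataset.mytable',
                                   # single destination URI; BigQuery caps single-file
                                   # extracts at 1 GB, larger tables need a wildcard URI
                                   'gs://my-scratch-bucket/output.avro',
                                   job_config=job_config)
extract_job.result()  # wait for the extract to finish, as unload() does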
2 changes: 1 addition & 1 deletion records_mover/db/unloader.py
@@ -34,7 +34,7 @@ def unload(self,
def known_supported_records_formats_for_unload(self) -> List[BaseRecordsFormat]:
"""Supplies a list of the records formats which can be bulk exported
from this database. This may not be the full set - see
can_unlaod_this_format() to test other possibilities.
can_unload_this_format() to test other possibilities.
"""
...

16 changes: 16 additions & 0 deletions records_mover/records/records_format.py
@@ -10,6 +10,22 @@
logger = logging.getLogger(__name__)


class AvroRecordsFormat(BaseRecordsFormat):
"Describes records files in `Avro <https://avro.apache.org/>`_ format"

def __init__(self) -> None:
"Create a new instance of AvroRecordsFormat"
self.format_type = 'avro'

def __str__(self) -> str:
return "AvroRecordsFormat"

def __repr__(self) -> str:
return str(self)

def generate_filename(self, basename: str) -> str:
return f"{basename}.avro"

class ParquetRecordsFormat(BaseRecordsFormat):
"Describes records files in `Parquet <https://parquet.apache.org/>`_ format"

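A quick illustration of the new format class as defined above:

from records_mover.records.records_format import AvroRecordsFormat

fmt = AvroRecordsFormat()
fmt.format_type                  # 'avro'
fmt.generate_filename('output')  # 'output.avro'
str(fmt)                         # 'AvroRecordsFormat'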
6 changes: 5 additions & 1 deletion records_mover/records/records_format_file.py
@@ -1,4 +1,6 @@
from .records_format import BaseRecordsFormat, DelimitedRecordsFormat, ParquetRecordsFormat
from .records_format import (
BaseRecordsFormat, DelimitedRecordsFormat, ParquetRecordsFormat, AvroRecordsFormat
)
from ..url.base import BaseDirectoryUrl, BaseFileUrl
from .processing_instructions import ProcessingInstructions
from .delimited import PartialRecordsHints
@@ -22,6 +24,8 @@ def load_format(self, fail_if_dont_understand: bool) -> BaseRecordsFormat:
return self.load_delimited_format(format_loc, fail_if_dont_understand)
elif format_type == 'parquet':
return ParquetRecordsFormat()
elif format_type == 'avro':
return AvroRecordsFormat()
else:
raise TypeError(f"Format type {format_type} not yet supported in this library")

2 changes: 1 addition & 1 deletion records_mover/records/records_types.py
@@ -36,6 +36,6 @@ class UrlDetailsEntry(TypedDict):
UrlDetails = Dict[Url, UrlDetailsEntry]


RecordsFormatType = Literal['delimited', 'parquet']
RecordsFormatType = Literal['avro', 'delimited', 'parquet']

DelimitedVariant = Literal['dumb', 'csv', 'bigquery', 'bluelabs', 'vertica']
88 changes: 84 additions & 4 deletions tests/unit/db/bigquery/test_bigquery_loader.py
@@ -1,12 +1,19 @@
import unittest

from records_mover.db.bigquery.loader import BigQueryLoader
from records_mover.records.records_format import DelimitedRecordsFormat, ParquetRecordsFormat
from records_mover.records.records_format import (
DelimitedRecordsFormat, ParquetRecordsFormat, AvroRecordsFormat,
BaseRecordsFormat
)
from records_mover.db.errors import NoTemporaryBucketConfiguration
from mock import MagicMock, Mock
from unittest.mock import patch


class NewRecordsFormat(BaseRecordsFormat):
...


class TestBigQueryLoader(unittest.TestCase):
@patch('records_mover.db.bigquery.loader.load_job_config')
def test_load_with_bad_schema_name(self, mock_load_job_config):
@@ -125,6 +132,33 @@ def test_can_load_this_format_true(self,
mock_load_job_config.assert_called_with(set(), mock_load_plan)
self.assertEqual(True, out)


@patch('records_mover.db.bigquery.loader.load_job_config')
@patch('records_mover.db.bigquery.loader.ProcessingInstructions')
@patch('records_mover.db.bigquery.loader.RecordsLoadPlan')
def test_can_load_this_format_delimited_false(self,
mock_RecordsLoadPlan,
mock_ProcessingInstructions,
mock_load_job_config):
mock_db = Mock(name='db')
mock_source_records_format = Mock(name='source_records_format', spec=DelimitedRecordsFormat)
mock_source_records_format.format_type = 'delimited'
mock_processing_instructions = mock_ProcessingInstructions.return_value
mock_load_plan = mock_RecordsLoadPlan.return_value
mock_load_plan.records_format = mock_source_records_format
mock_url_resolver = Mock(name='url_resolver')
mock_load_job_config.side_effect = NotImplementedError
mock_source_records_format.hints = {}
bigquery_loader = BigQueryLoader(db=mock_db, url_resolver=mock_url_resolver,
gcs_temp_base_loc=None)
out = bigquery_loader.can_load_this_format(mock_source_records_format)
mock_ProcessingInstructions.assert_called_with()
mock_RecordsLoadPlan.\
assert_called_with(records_format=mock_source_records_format,
processing_instructions=mock_processing_instructions)
mock_load_job_config.assert_called_with(set(), mock_load_plan)
self.assertEqual(False, out)

@patch('records_mover.db.bigquery.loader.load_job_config')
def test_load_from_fileobj_true(self, mock_load_job_config):
mock_db = Mock(name='mock_db')
@@ -156,7 +190,6 @@ def test_load_from_fileobj_true(self, mock_load_job_config):
mock_client.load_table_from_file.\
assert_called_with(mock_fileobj,
'my_project.my_dataset.mytable',
location="US",
job_config=mock_load_job_config.return_value)
mock_job.result.assert_called_with()

@@ -200,7 +233,6 @@ def test_load_with_fileobj_fallback(self, mock_load_job_config):
mock_client.load_table_from_file.\
assert_called_with(mock_fileobj,
'my_project.my_dataset.mytable',
location="US",
job_config=mock_load_job_config.return_value)
mock_job.result.assert_called_with()

@@ -215,7 +247,7 @@ def test_can_load_this_format_true_parquet(self,
mock_load_job_config):
mock_db = Mock(name='db')
mock_source_records_format = Mock(name='source_records_format', spec=ParquetRecordsFormat)
mock_source_records_format.format_type = 'delimited'
mock_source_records_format.format_type = 'parquet'
mock_processing_instructions = mock_ProcessingInstructions.return_value
mock_load_plan = mock_RecordsLoadPlan.return_value
mock_load_plan.records_format = mock_source_records_format
@@ -230,6 +262,54 @@
processing_instructions=mock_processing_instructions)
self.assertTrue(out)

@patch('records_mover.db.bigquery.loader.load_job_config')
@patch('records_mover.db.bigquery.loader.ProcessingInstructions')
@patch('records_mover.db.bigquery.loader.RecordsLoadPlan')
def test_can_load_this_format_true_avro(self,
mock_RecordsLoadPlan,
mock_ProcessingInstructions,
mock_load_job_config):
mock_db = Mock(name='db')
mock_source_records_format = Mock(name='source_records_format', spec=AvroRecordsFormat)
mock_source_records_format.format_type = 'avro'
mock_processing_instructions = mock_ProcessingInstructions.return_value
mock_load_plan = mock_RecordsLoadPlan.return_value
mock_load_plan.records_format = mock_source_records_format
mock_url_resolver = Mock(name='url_resolver')
mock_source_records_format.hints = {}
bigquery_loader = BigQueryLoader(db=mock_db, url_resolver=mock_url_resolver,
gcs_temp_base_loc=None)
out = bigquery_loader.can_load_this_format(mock_source_records_format)
mock_ProcessingInstructions.assert_called_with()
mock_RecordsLoadPlan.\
assert_called_with(records_format=mock_source_records_format,
processing_instructions=mock_processing_instructions)
self.assertTrue(out)

@patch('records_mover.db.bigquery.loader.load_job_config')
@patch('records_mover.db.bigquery.loader.ProcessingInstructions')
@patch('records_mover.db.bigquery.loader.RecordsLoadPlan')
def test_can_load_this_format_false_newformat(self,
mock_RecordsLoadPlan,
mock_ProcessingInstructions,
mock_load_job_config):
mock_db = Mock(name='db')
mock_source_records_format = Mock(name='source_records_format', spec=NewRecordsFormat)
mock_source_records_format.format_type = 'new'
mock_processing_instructions = mock_ProcessingInstructions.return_value
mock_load_plan = mock_RecordsLoadPlan.return_value
mock_load_plan.records_format = mock_source_records_format
mock_url_resolver = Mock(name='url_resolver')
mock_source_records_format.hints = {}
bigquery_loader = BigQueryLoader(db=mock_db, url_resolver=mock_url_resolver,
gcs_temp_base_loc=None)
out = bigquery_loader.can_load_this_format(mock_source_records_format)
mock_ProcessingInstructions.assert_called_with()
mock_RecordsLoadPlan.\
assert_called_with(records_format=mock_source_records_format,
processing_instructions=mock_processing_instructions)
self.assertFalse(out)

def test_known_supported_records_formats_for_load(self):
mock_db = Mock(name='db')
mock_url_resolver = Mock(name='url_resolver')