Add support for BigQuery bulk export (to Avro, for now) #136
Changes from all commits
Three single-value metric files are bumped:

```diff
@@ -1 +1 @@
-1103
+1125
```

```diff
@@ -1 +1 @@
-93.6200
+93.6500
```

```diff
@@ -1 +1 @@
-92.4400
+92.5500
```
BigQuery DB driver:

```diff
@@ -1,14 +1,16 @@
 from ..driver import DBDriver
 import logging
 from ...records import RecordsSchema
-from ...records.records_format import BaseRecordsFormat, ParquetRecordsFormat
+from ...records.records_format import BaseRecordsFormat, ParquetRecordsFormat, AvroRecordsFormat
 from ...utils.limits import INT64_MAX, INT64_MIN, FLOAT64_SIGNIFICAND_BITS, num_digits
 import re
 from typing import Union, Optional, Tuple
 from ...url.resolver import UrlResolver
 import sqlalchemy
 from .loader import BigQueryLoader
+from .unloader import BigQueryUnloader
 from ..loader import LoaderFromFileobj, LoaderFromRecordsDirectory
+from ..unloader import Unloader
 from ...url.base import BaseDirectoryUrl
 
 
@@ -26,15 +28,19 @@ def __init__(self,
             BigQueryLoader(db=self.db,
                            url_resolver=url_resolver,
                            gcs_temp_base_loc=gcs_temp_base_loc)
+        self._bigquery_unloader =\
+            BigQueryUnloader(db=self.db,
+                             url_resolver=url_resolver,
+                             gcs_temp_base_loc=gcs_temp_base_loc)
 
     def loader(self) -> Optional[LoaderFromRecordsDirectory]:
         return self._bigquery_loader
 
     def loader_from_fileobj(self) -> LoaderFromFileobj:
         return self._bigquery_loader
 
-    def unloader(self) -> None:
-        return None
+    def unloader(self) -> Unloader:
+        return self._bigquery_unloader
 
     def type_for_date_plus_time(self, has_tz: bool=False) -> sqlalchemy.sql.sqltypes.DateTime:
         # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types
@@ -110,6 +116,25 @@ def tweak_records_schema_for_load(self,
             #
             # So we need to make sure we don't create any DATETIME
             # columns if we're loading from a Parquet file.
+            #
+            # https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-parquet
             return records_schema.convert_datetimes_to_datetimetz()
         else:
             return records_schema
+
+    def tweak_records_schema_after_unload(self,
+                                          records_schema: RecordsSchema,
+                                          records_format: BaseRecordsFormat) -> RecordsSchema:
+        if isinstance(records_format, AvroRecordsFormat):
+            # https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-avro#logical_types
+            #
+            # "Note: There is no logical type that directly
+            # corresponds to DATETIME, and BigQuery currently doesn't
+            # support any direct conversion from an Avro type into a
+            # DATETIME field."
+            #
+            # BigQuery exports this as an Avro string type
+            return records_schema.convert_datetimes_to_string()
+        else:
+            return records_schema
```

Review comment: This came up during the table2table integration test; BigQuery fails to load if you try to load the same string it exports from a DATETIME column into a DATETIME column.

Reply: 😬
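For illustration, here is roughly what that post-unload schema tweak amounts to. This is a minimal sketch, not records_mover's actual implementation: the real logic lives in `RecordsSchema.convert_datetimes_to_string()`, and the dict-based field representation below is hypothetical.

```python
from typing import Dict

# Hypothetical, simplified stand-in for a RecordsSchema: a mapping from
# column name to records-schema field type.
schema: Dict[str, str] = {
    'id': 'integer',
    'created_at': 'datetime',    # no timezone; a BigQuery DATETIME column
    'updated_at': 'datetimetz',  # has timezone; a BigQuery TIMESTAMP column
}


def convert_datetimes_to_string(fields: Dict[str, str]) -> Dict[str, str]:
    # BigQuery writes DATETIME values to Avro as plain strings, so after an
    # unload the schema must describe those columns as strings; otherwise a
    # later load of that data back into a DATETIME column fails (see the
    # review thread above).
    return {name: ('string' if ftype == 'datetime' else ftype)
            for name, ftype in fields.items()}


assert convert_datetimes_to_string(schema)['created_at'] == 'string'
assert convert_datetimes_to_string(schema)['updated_at'] == 'datetimetz'
```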
BigQuery loader:

```diff
@@ -6,7 +6,7 @@
 import sqlalchemy
 from ...records.load_plan import RecordsLoadPlan
 from ...records.records_format import (
-    BaseRecordsFormat, DelimitedRecordsFormat, ParquetRecordsFormat
+    BaseRecordsFormat, DelimitedRecordsFormat, ParquetRecordsFormat, AvroRecordsFormat
 )
 from ...records.records_directory import RecordsDirectory
 from ...records.processing_instructions import ProcessingInstructions
@@ -87,8 +87,6 @@ def load_from_fileobj(self, schema: str, table: str,
         # https://googleapis.dev/python/bigquery/latest/generated/google.cloud.bigquery.client.Client.html#google.cloud.bigquery.client.Client.load_table_from_file
         job = client.load_table_from_file(fileobj,
                                           f"{schema}.{table}",
-                                          # Must match the destination dataset location.
-                                          location="US",
                                           job_config=job_config)
 
         try:
```

Review comment: I'm not convinced this was ever needed - it's in the example code, but everything seems to run fine without it, so there must be some inference logic that figures out the dataset location from the client object.
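A plausible explanation for why dropping `location="US"` works: the BigQuery client itself can carry a default location, which jobs started through it then inherit. A minimal sketch, assuming the standard `google.cloud.bigquery` client (the project name is a placeholder):

```python
from google.cloud import bigquery

# If the client is constructed with a default location, load and extract
# jobs started through it inherit that location, so a per-job
# location="US" argument is redundant when the two match.
client = bigquery.Client(project="my-project", location="US")
```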
```diff
@@ -165,6 +163,8 @@ def can_load_this_format(self, source_records_format: BaseRecordsFormat) -> bool
             processing_instructions=processing_instructions)
         if isinstance(load_plan.records_format, ParquetRecordsFormat):
             return True
+        if isinstance(load_plan.records_format, AvroRecordsFormat):
+            return True
         if not isinstance(load_plan.records_format, DelimitedRecordsFormat):
             return False
         unhandled_hints = set(load_plan.records_format.hints.keys())
@@ -176,4 +176,8 @@ def can_load_this_format(self, source_records_format: BaseRecordsFormat) -> bool
             return False
 
     def known_supported_records_formats_for_load(self) -> List[BaseRecordsFormat]:
-        return [DelimitedRecordsFormat(variant='bigquery'), ParquetRecordsFormat()]
+        return [
+            DelimitedRecordsFormat(variant='bigquery'),
+            ParquetRecordsFormat(),
+            AvroRecordsFormat()
+        ]
```
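With Avro now among the supported load formats, the load itself is the standard BigQuery Avro path. For context, a sketch of what that looks like with the plain `google.cloud.bigquery` client, outside records_mover (URIs and table names are placeholders):

```python
from google.cloud import bigquery

client = bigquery.Client()

# Placeholder source and destination, for illustration only.
uri = "gs://my-scratch-bucket/exports/output.avro"
table_id = "my-project.my_dataset.my_table"

job_config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.AVRO,
    # Interpret Avro logical types (e.g. timestamp-micros) as the
    # corresponding BigQuery types instead of raw longs/strings.
    use_avro_logical_types=True,
)

load_job = client.load_table_from_uri(uri, table_id, job_config=job_config)
load_job.result()  # Wait for the load to finish.
```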
New file: the BigQuery unloader, added in full (@@ -0,0 +1,108 @@):

```python
import sqlalchemy
from contextlib import contextmanager
from typing import List, Iterator, Optional, Union, Tuple
import logging
from google.cloud.bigquery.dbapi.connection import Connection
from google.cloud.bigquery.client import Client
from google.cloud.bigquery.job import ExtractJobConfig
from records_mover.db.unloader import Unloader
from records_mover.records.records_format import BaseRecordsFormat, AvroRecordsFormat
from records_mover.url.base import BaseDirectoryUrl
from records_mover.url.resolver import UrlResolver
from records_mover.records.unload_plan import RecordsUnloadPlan
from records_mover.records.records_directory import RecordsDirectory
from records_mover.db.errors import NoTemporaryBucketConfiguration

logger = logging.getLogger(__name__)


class BigQueryUnloader(Unloader):
    def __init__(self,
                 db: Union[sqlalchemy.engine.Connection, sqlalchemy.engine.Engine],
                 url_resolver: UrlResolver,
                 gcs_temp_base_loc: Optional[BaseDirectoryUrl])\
            -> None:
        self.db = db
        self.url_resolver = url_resolver
        self.gcs_temp_base_loc = gcs_temp_base_loc
        super().__init__(db=db)

    def can_unload_format(self, target_records_format: BaseRecordsFormat) -> bool:
        if isinstance(target_records_format, AvroRecordsFormat):
            return True
        return False

    def can_unload_to_scheme(self, scheme: str) -> bool:
        return scheme == 'gs'

    def known_supported_records_formats_for_unload(self) -> List[BaseRecordsFormat]:
        return [AvroRecordsFormat()]

    @contextmanager
    def temporary_unloadable_directory_loc(self) -> Iterator[BaseDirectoryUrl]:
        if self.gcs_temp_base_loc is None:
            raise NoTemporaryBucketConfiguration('Please provide a scratch GCS URL in your config '
                                                 '(e.g., set SCRATCH_GCS_URL to a gs:// URL)')
        else:
            with self.gcs_temp_base_loc.temporary_directory() as temp_loc:
                yield temp_loc

    def _parse_bigquery_schema_name(self, schema: str) -> Tuple[Optional[str], str]:
        # https://github.com/mxmzdlv/pybigquery/blob/master/pybigquery/sqlalchemy_bigquery.py#L320
        dataset = None
        project = None

        schema_split = schema.split('.')
        if len(schema_split) == 1:
            dataset, = schema_split
        elif len(schema_split) == 2:
            project, dataset = schema_split
        else:
            raise ValueError(f"Could not understand schema name {schema}")

        return (project, dataset)

    def _extract_job_config(self, unload_plan: RecordsUnloadPlan) -> ExtractJobConfig:
        config = ExtractJobConfig()
        if isinstance(unload_plan.records_format, AvroRecordsFormat):
            config.destination_format = 'AVRO'
            # https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-avro#logical_types
            config.use_avro_logical_types = True
        else:
            raise NotImplementedError(f'Please add support for {unload_plan.records_format}')
        return config

    def unload(self,
               schema: str,
               table: str,
               unload_plan: RecordsUnloadPlan,
               directory: RecordsDirectory) -> Optional[int]:
        if directory.scheme != 'gs':
            # Unload to a temporary GCS directory, then copy to the
            # requested destination.
            with self.temporary_unloadable_directory_loc() as temp_gcs_loc:
                temp_directory = RecordsDirectory(temp_gcs_loc)
                out = self.unload(schema=schema,
                                  table=table,
                                  unload_plan=unload_plan,
                                  directory=temp_directory)
                temp_directory.copy_to(directory.loc)
                return out
        logger.info("Unloading from BigQuery into records directory")
        # https://googleapis.github.io/google-cloud-python/latest/bigquery/usage/tables.html#creating-a-table
        connection: Connection =\
            self.db.engine.raw_connection().connection
        # https://google-cloud.readthedocs.io/en/latest/bigquery/generated/google.cloud.bigquery.client.Client.html
        client: Client = connection._client
        project_id, dataset_id = self._parse_bigquery_schema_name(schema)
        job_config = self._extract_job_config(unload_plan)

        records_format = unload_plan.records_format
        filename = records_format.generate_filename('output')
        destination_uri = directory.loc.file_in_this_directory(filename)
        job = client.extract_table(f"{schema}.{table}",
                                   destination_uri.url,
                                   job_config=job_config)
        job.result()  # Waits for the extract job to complete.
        logger.info(f"Unloaded from {dataset_id}:{table} into {filename}")
        directory.save_preliminary_manifest()
        return None
```
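Stripped of the records-directory plumbing, the unload boils down to a single BigQuery extract job. A standalone sketch of that core (table and bucket names are placeholders):

```python
from google.cloud import bigquery
from google.cloud.bigquery.job import ExtractJobConfig

client = bigquery.Client()

# Placeholders for illustration.
source = "my-project.my_dataset.my_table"
destination_uri = "gs://my-scratch-bucket/exports/output.avro"

config = ExtractJobConfig()
config.destination_format = 'AVRO'
# Export DATE/TIME/TIMESTAMP columns as Avro logical types rather than
# strings; DATETIME still comes out as a string (hence the schema tweak
# in the driver above).
config.use_avro_logical_types = True

job = client.extract_table(source, destination_uri, job_config=config)
job.result()  # Wait for the extract job to finish.
```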