-
Notifications
You must be signed in to change notification settings - Fork 4
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Load via INSERT on Redshift when scratch bucket not available #114
Changes from 15 commits
a66f4db
d20d77f
e4bd0ea
f53587a
ef0b8c1
643ee51
2ec1930
97dc95c
575a3cd
3314b64
ce9371e
66a5f2f
bdd6099
7b239cb
dc7eb21
ce539ec
f45011a
ad67b78
4caebd9
9ca1a7a
b03f064
504b39b
b58f999
b6fff82
d9ea8a1
4edebe3
82b940e
f2f1f0c
e5a49ea
44e7e68
aa05ef7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
93.5700 | ||
93.6900 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
92.3400 | ||
92.3500 |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,8 +10,8 @@ | |
import logging | ||
from .records_copy import redshift_copy_options | ||
from ...records.load_plan import RecordsLoadPlan | ||
from ..errors import CredsDoNotSupportS3Import | ||
from typing import Optional, Union, Callable, ContextManager, List, Iterator | ||
from ..errors import CredsDoNotSupportS3Import, NoTemporaryBucketConfiguration | ||
from typing import Optional, Union, List, Iterator | ||
from ...url import BaseDirectoryUrl | ||
from botocore.credentials import Credentials | ||
from ...records.delimited import complain_on_unhandled_hints | ||
|
@@ -23,11 +23,19 @@ class RedshiftLoader(LoaderFromRecordsDirectory): | |
def __init__(self, | ||
db: Union[sqlalchemy.engine.Engine, sqlalchemy.engine.Connection], | ||
meta: sqlalchemy.MetaData, | ||
temporary_s3_directory_loc: Callable[[], ContextManager[BaseDirectoryUrl]])\ | ||
s3_temp_base_loc: Optional[BaseDirectoryUrl])\ | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I was doing a too-clever-by-half thing before and passing in a function that creates a temporary directory in the right place in S3. Now I need to know things like "do I actually have a bucket in which to create the temporary directory", so I'm passing in something more low-level (at the price of duplicating a three-line function between two classes with no inheritance relationship). |
||
-> None: | ||
self.db = db | ||
self.meta = meta | ||
self.temporary_s3_directory_loc = temporary_s3_directory_loc | ||
self.s3_temp_base_loc = s3_temp_base_loc | ||
|
||
@contextmanager | ||
def temporary_s3_directory_loc(self) -> Iterator[BaseDirectoryUrl]: | ||
if self.s3_temp_base_loc is None: | ||
raise NoTemporaryBucketConfiguration('Please provide a scratch S3 URL in your config') | ||
else: | ||
with self.s3_temp_base_loc.temporary_directory() as temp_loc: | ||
yield temp_loc | ||
|
||
def load(self, | ||
schema: str, | ||
|
@@ -158,3 +166,6 @@ def best_scheme_to_load_from(self) -> str: | |
def temporary_loadable_directory_loc(self) -> Iterator[BaseDirectoryUrl]: | ||
with self.temporary_s3_directory_loc() as temp_loc: | ||
yield temp_loc | ||
|
||
def has_temporary_loadable_directory_loc(self) -> bool: | ||
return self.s3_temp_base_loc is not None |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,7 +2,7 @@ | |
SupportsToFileobjsSource, | ||
FileobjsSource, SupportsToDataframesSource) | ||
from .targets.base import (RecordsTarget, SupportsMoveFromRecordsDirectory, | ||
SupportsMoveFromTempLocAfterFillingIt, | ||
MightSupportMoveFromTempLocAfterFillingIt, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is another one of those interfaces implemented by sources and targets (in this case, targets). I renamed this interface to be more tentative - as now it can tell you at runtime whether it is able to do what it says on the tin. Specifically, this interface has a function which tells the target to:
|
||
MightSupportMoveFromFileobjsSource, | ||
SupportsMoveFromDataframes) | ||
from .sources import base as sources_base | ||
|
@@ -123,8 +123,9 @@ def move(records_source: RecordsSource, | |
as fileobjs_source: | ||
return move(fileobjs_source, records_target, processing_instructions) | ||
elif (isinstance(records_source, SupportsMoveToRecordsDirectory) and | ||
isinstance(records_target, SupportsMoveFromTempLocAfterFillingIt) and | ||
records_source.has_compatible_format(records_target)): | ||
isinstance(records_target, MightSupportMoveFromTempLocAfterFillingIt) and | ||
records_source.has_compatible_format(records_target) and | ||
records_target.can_move_from_temp_loc_after_filling_it()): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is a key bit - I'm changing the move() function (the key algorithm for Records Mover) to only use this interface's function if the interface says it's OK. In the most common case of a Table target, that means that the database in question has a temporary bucket location configured that it can do a bulk export to. |
||
logger.info(f"Mover: copying from {records_source} to {records_target} " | ||
f"by filling in a temporary location...") | ||
return records_target.move_from_temp_loc_after_filling_it(records_source, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -30,25 +30,27 @@ def __init__(self, | |
super().__init__(prep, target_table_details, processing_instructions) | ||
|
||
def move(self) -> MoveResult: | ||
if len(self.table_target.known_supported_records_formats()) != 0: | ||
if self.table_target.can_move_from_fileobjs_source(): | ||
return self.move_from_dataframes_source_via_fileobjs() | ||
else: | ||
# Some databases, like Redshift, can't load from a | ||
# stream, but can load from files on an object store | ||
# they're pointed to. | ||
return self.move_from_dataframes_source_via_records_directory() | ||
target_supports_formats = len(self.table_target.known_supported_records_formats()) != 0 | ||
if (target_supports_formats and self.table_target.can_move_from_fileobjs_source()): | ||
return self.move_from_dataframes_source_via_fileobjs() | ||
elif (target_supports_formats and | ||
self.table_target.can_move_from_temp_loc_after_filling_it()): | ||
# Some databases, like Redshift, can't load from a | ||
# stream, but can load from files on an object store | ||
# they're pointed to. | ||
return self.move_from_dataframes_source_via_temporary_records_directory() | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is a slightly more esoteric usage of this interface, where the database table target has been told to take a dataframe and load the data inside. There's a bit of a re-creation of a couple of parts of the move() algorithm here. Not sure I'm happy with that re-creation; it might be an area for future refactoring if I can figure out how to make it work as part of the main move() algorithm. But I digress. This changes things so that when we are given a dataframe (or create one as an intermediary step), we can ingest it into a database even if that database doesn't have a temporary bucket configured. |
||
else: | ||
logger.info("Known formats for target database: " | ||
f"{self.table_target.known_supported_records_formats()}") | ||
logger.info("Table target can move from fileobjs source? " | ||
f"{self.table_target.can_move_from_fileobjs_source()}") | ||
logger.warning("Loading via INSERT statement as this DB " | ||
"driver does not yet support more direct LOAD methods. " | ||
"driver does not yet support or is not configured for " | ||
"more direct load methods. " | ||
"This may be very slow.") | ||
return self.move_from_dataframes_source_via_insert() | ||
|
||
def move_from_dataframes_source_via_records_directory(self) -> MoveResult: | ||
def move_from_dataframes_source_via_temporary_records_directory(self) -> MoveResult: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Clarify that this function uses a temp directory, as the original name made that a bit of a surprise to me as well. |
||
records_format = next(iter(self.table_target.known_supported_records_formats()), None) | ||
with self.dfs_source.to_fileobjs_source(self.processing_instructions, | ||
records_format) as fileobjs_source: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
from records_mover.records.targets.base import ( | ||
SupportsMoveFromRecordsDirectory, | ||
SupportsMoveFromTempLocAfterFillingIt, | ||
MightSupportMoveFromTempLocAfterFillingIt, | ||
MightSupportMoveFromFileobjsSource, | ||
SupportsMoveFromDataframes, | ||
) | ||
|
@@ -31,7 +31,7 @@ | |
|
||
|
||
class TableRecordsTarget(SupportsMoveFromRecordsDirectory, | ||
SupportsMoveFromTempLocAfterFillingIt, | ||
MightSupportMoveFromTempLocAfterFillingIt, | ||
MightSupportMoveFromFileobjsSource, | ||
SupportsMoveFromDataframes, | ||
TargetTableDetails): | ||
|
@@ -109,6 +109,13 @@ def can_move_from_this_format(self, | |
return False | ||
return loader.can_load_this_format(source_records_format) | ||
|
||
def can_move_from_temp_loc_after_filling_it(self) -> bool: | ||
driver = self.db_driver(self.db_engine) | ||
loader = driver.loader() | ||
if loader is None: | ||
return False | ||
return loader.has_temporary_loadable_directory_loc() | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The key implementation of the expanded interface. |
||
|
||
def move_from_temp_loc_after_filling_it(self, | ||
records_source: | ||
SupportsMoveToRecordsDirectory, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
from airflow.contrib.hooks.gcp_api_base_hook import GoogleCloudBaseHook | ||
from records_mover.airflow.hooks.google_cloud_credentials_hook import GoogleCloudCredentialsHook | ||
from mock import Mock | ||
import unittest | ||
|
||
|
||
class TestGoogleCloudCredentialsHook(unittest.TestCase): | ||
def test_get_conn(self): | ||
mock_init = Mock('__init__') | ||
GoogleCloudBaseHook.__init__ = mock_init | ||
mock_init.return_value = None | ||
hook = GoogleCloudCredentialsHook() | ||
mock_get_credentials = Mock('get_credentials') | ||
hook._get_credentials = mock_get_credentials | ||
conn = hook.get_conn() | ||
self.assertEqual(conn, mock_get_credentials.return_value) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Loader is an abstract class made concrete by each of the database types which can load out of a records directory.