-
Notifications
You must be signed in to change notification settings - Fork 4
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Load via INSERT on Redshift when scratch bucket not available #114
Changes from 2 commits
a66f4db
d20d77f
e4bd0ea
f53587a
ef0b8c1
643ee51
2ec1930
97dc95c
575a3cd
3314b64
ce9371e
66a5f2f
bdd6099
7b239cb
dc7eb21
ce539ec
f45011a
ad67b78
4caebd9
9ca1a7a
b03f064
504b39b
b58f999
b6fff82
d9ea8a1
4edebe3
82b940e
f2f1f0c
e5a49ea
44e7e68
aa05ef7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
92.3400 | ||
92.3600 |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,7 +2,7 @@ | |
SupportsToFileobjsSource, | ||
FileobjsSource, SupportsToDataframesSource) | ||
from .targets.base import (RecordsTarget, SupportsMoveFromRecordsDirectory, | ||
SupportsMoveFromTempLocAfterFillingIt, | ||
MightSupportMoveFromTempLocAfterFillingIt, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is another one of those interfaces implemented by sources and targets (in this case, targets). I renamed this interface to be more tentative, as it can now tell you at runtime whether it is able to do what it says on the tin. Specifically, this interface has a function which tells the target to:
|
||
MightSupportMoveFromFileobjsSource, | ||
SupportsMoveFromDataframes) | ||
from .sources import base as sources_base | ||
|
@@ -123,8 +123,9 @@ def move(records_source: RecordsSource, | |
as fileobjs_source: | ||
return move(fileobjs_source, records_target, processing_instructions) | ||
elif (isinstance(records_source, SupportsMoveToRecordsDirectory) and | ||
isinstance(records_target, SupportsMoveFromTempLocAfterFillingIt) and | ||
records_source.has_compatible_format(records_target)): | ||
isinstance(records_target, MightSupportMoveFromTempLocAfterFillingIt) and | ||
records_source.has_compatible_format(records_target) and | ||
records_target.can_move_from_temp_loc_after_filling_it()): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is a key bit - I'm changing the move() function (the key algorithm for Records Mover) to only use this interface's function if the interface says it's OK. In the most common case of a Table target, that means that the database in question has a temporary bucket location configured that it can do a bulk export to. |
||
logger.info(f"Mover: copying from {records_source} to {records_target} " | ||
f"by filling in a temporary location...") | ||
return records_target.move_from_temp_loc_after_filling_it(records_source, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -30,25 +30,29 @@ def __init__(self, | |
super().__init__(prep, target_table_details, processing_instructions) | ||
|
||
def move(self) -> MoveResult: | ||
if len(self.table_target.known_supported_records_formats()) != 0: | ||
if self.table_target.can_move_from_fileobjs_source(): | ||
return self.move_from_dataframes_source_via_fileobjs() | ||
else: | ||
# Some databases, like Redshift, can't load from a | ||
# stream, but can load from files on an object store | ||
# they're pointed to. | ||
return self.move_from_dataframes_source_via_records_directory() | ||
# TODO: Clean up redundant clause | ||
vinceatbluelabs marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if (len(self.table_target.known_supported_records_formats()) != 0 and | ||
self.table_target.can_move_from_fileobjs_source()): | ||
return self.move_from_dataframes_source_via_fileobjs() | ||
elif (len(self.table_target.known_supported_records_formats()) != 0 and | ||
self.table_target.can_move_from_fileobjs_source() and | ||
self.table_target.can_move_from_temp_loc_after_filling_it()): | ||
# Some databases, like Redshift, can't load from a | ||
# stream, but can load from files on an object store | ||
# they're pointed to. | ||
return self.move_from_dataframes_source_via_temporary_records_directory() | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is a slightly more esoteric usage of this interface, where the database table target has been told to take a dataframe and load the data inside. There's a bit of a re-creation of a couple of parts of the move() algorithm here. Not sure I'm happy with that re-creation; it might be an area for future refactoring if I can figure out how to make it work as part of the main move() algorithm. But I digress. This changes things so that when we are given a dataframe (or create one as an intermediary step), we can ingest it into a database even if that database doesn't have a temporary bucket configured. |
||
else: | ||
logger.info("Known formats for target database: " | ||
f"{self.table_target.known_supported_records_formats()}") | ||
logger.info("Table target can move from fileobjs source? " | ||
f"{self.table_target.can_move_from_fileobjs_source()}") | ||
logger.warning("Loading via INSERT statement as this DB " | ||
"driver does not yet support more direct LOAD methods. " | ||
"driver does not yet support or is not configured for " | ||
"more direct load methods. " | ||
"This may be very slow.") | ||
return self.move_from_dataframes_source_via_insert() | ||
|
||
def move_from_dataframes_source_via_records_directory(self) -> MoveResult: | ||
def move_from_dataframes_source_via_temporary_records_directory(self) -> MoveResult: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Renamed to clarify that this function uses a temp directory; the original name didn't make that obvious, which was a bit of a surprise to me as well. |
||
records_format = next(iter(self.table_target.known_supported_records_formats()), None) | ||
with self.dfs_source.to_fileobjs_source(self.processing_instructions, | ||
records_format) as fileobjs_source: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
from records_mover.records.targets.base import ( | ||
SupportsMoveFromRecordsDirectory, | ||
SupportsMoveFromTempLocAfterFillingIt, | ||
MightSupportMoveFromTempLocAfterFillingIt, | ||
MightSupportMoveFromFileobjsSource, | ||
SupportsMoveFromDataframes, | ||
) | ||
|
@@ -31,7 +31,7 @@ | |
|
||
|
||
class TableRecordsTarget(SupportsMoveFromRecordsDirectory, | ||
SupportsMoveFromTempLocAfterFillingIt, | ||
MightSupportMoveFromTempLocAfterFillingIt, | ||
MightSupportMoveFromFileobjsSource, | ||
SupportsMoveFromDataframes, | ||
TargetTableDetails): | ||
|
@@ -109,6 +109,13 @@ def can_move_from_this_format(self, | |
return False | ||
return loader.can_load_this_format(source_records_format) | ||
|
||
def can_move_from_temp_loc_after_filling_it(self) -> bool: | ||
driver = self.db_driver(self.db_engine) | ||
loader = driver.loader() | ||
if loader is None: | ||
return False | ||
return loader.has_temporary_loadable_directory_loc() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The key implementation of the expanded interface. |
||
|
||
def move_from_temp_loc_after_filling_it(self, | ||
records_source: | ||
SupportsMoveToRecordsDirectory, | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Loader is an abstract class made concrete by each of the database types which can load out of a records directory.