From 299a29e46488cee1d3c0769c5c3eb689be93d26a Mon Sep 17 00:00:00 2001 From: Matt Kornfield Date: Wed, 6 Sep 2023 12:07:07 -0700 Subject: [PATCH] Lower limit size --- src/gretel_trainer/relational/extractor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/gretel_trainer/relational/extractor.py b/src/gretel_trainer/relational/extractor.py index 7365763c..b1fefc6f 100644 --- a/src/gretel_trainer/relational/extractor.py +++ b/src/gretel_trainer/relational/extractor.py @@ -422,7 +422,7 @@ def handle_partition(df: pd.DataFrame, lock: Lock): table_session = self._get_table_session(pk_values.table_name) nonlocal row_count - chunk_size = 15_000 # limit how many checks go into a WHERE clause + chunk_size = 150 # limit how many checks go into a WHERE clause for _, chunk_df in df.groupby(np.arange(len(df)) // chunk_size): values_list = chunk_df.to_records(index=False).tolist() @@ -542,6 +542,7 @@ def _sample_table( if self._config.entire_table: logger.debug(f"Extracting entire table: {table_name}") with engine.connect() as conn: + # TODO: Add a loading percentage here? df_iter = pd.read_sql_table( table_name, conn, chunksize=self._chunk_size )