From 4071fbaca33de71852b32ee8619e38032e6a18ab Mon Sep 17 00:00:00 2001 From: Matt Kornfield Date: Fri, 16 Jun 2023 12:45:36 -0700 Subject: [PATCH 1/2] Do not encode keys for transforms by default --- src/gretel_trainer/relational/multi_table.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/gretel_trainer/relational/multi_table.py b/src/gretel_trainer/relational/multi_table.py index 2cf5436f..a5df6505 100644 --- a/src/gretel_trainer/relational/multi_table.py +++ b/src/gretel_trainer/relational/multi_table.py @@ -653,6 +653,7 @@ def run_transforms( identifier: Optional[str] = None, in_place: bool = False, data: Optional[dict[str, pd.DataFrame]] = None, + encode_keys: bool = False, ) -> None: """ identifier: (str, optional): Unique string identifying a specific call to this method. Defaults to `transforms_` + current timestamp @@ -663,6 +664,9 @@ def run_transforms( If `data` is supplied, runs only the supplied data through the corresponding transforms models. Otherwise runs source data through all existing transforms models. + + If `encode_keys` is set to True, then we'll internally track the keys and update them + instead of relying on whatever was transformed """ if data is not None: unrunnable_tables = [ @@ -706,9 +710,11 @@ def run_transforms( ) run_task(task, self._extended_sdk) - output_tables = self._strategy.label_encode_keys( - self.relational_data, task.output_tables - ) + output_tables = task.output_tables + if encode_keys: + output_tables = self._strategy.label_encode_keys( + self.relational_data, task.output_tables + ) if in_place: for table_name, transformed_table in output_tables.items(): From 2cbce02b29d3881a7d412d9c54068495d96b98c0 Mon Sep 17 00:00:00 2001 From: Matt Kornfield Date: Fri, 16 Jun 2023 12:49:04 -0700 Subject: [PATCH 2/2] John's better docs :) --- src/gretel_trainer/relational/multi_table.py | 21 ++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/gretel_trainer/relational/multi_table.py b/src/gretel_trainer/relational/multi_table.py index a5df6505..d36fb508 100644 --- a/src/gretel_trainer/relational/multi_table.py +++ b/src/gretel_trainer/relational/multi_table.py @@ -656,17 +656,18 @@ def run_transforms( encode_keys: bool = False, ) -> None: """ - identifier: (str, optional): Unique string identifying a specific call to this method. Defaults to `transforms_` + current timestamp + Run pre-trained Gretel Transform models on Relational table data: - If `in_place` set to True, overwrites source data in all locations - (internal Python state, local working directory, project artifact archive). - Used for transforms->synthetics workflows. - - If `data` is supplied, runs only the supplied data through the corresponding transforms models. - Otherwise runs source data through all existing transforms models. - - If `encode_keys` is set to True, then we'll internally track the keys and update them - instead of relying on whatever was transformed + Args: + identifier: Unique string identifying a specific call to this method. Defaults to `transforms_` + current timestamp + in_place: If True, overwrites source data in all locations + (internal Python state, local working directory, project artifact archive). + Used for transforms->synthetics workflows. + data: If supplied, runs only the supplied data through the corresponding transforms models. + Otherwise runs source data through all existing transforms models. + encode_keys: If set, primary and foreign keys will be replaced with label encoded variants. This can add + an additional level of privacy at the cost of referential integrity between transformed and + original data. """ if data is not None: unrunnable_tables = [