
adding model docs (#28)
* adding model docs

* Update src/gretel_trainer/models.py

Co-authored-by: Scott Bailey <[email protected]>

* Update src/gretel_trainer/models.py

Co-authored-by: Scott Bailey <[email protected]>

* Update src/gretel_trainer/models.py

Co-authored-by: Scott Bailey <[email protected]>

* Update src/gretel_trainer/models.py

Co-authored-by: Scott Bailey <[email protected]>

* updating docs

Co-authored-by: Scott Bailey <[email protected]>
MasonEgger and csbailey5t authored Oct 27, 2022
1 parent ba27d10 commit a50a38d
Showing 3 changed files with 34 additions and 2 deletions.
1 change: 1 addition & 0 deletions docs/conf.py
@@ -36,6 +36,7 @@
"_build",
"Thumbs.db",
".DS_Store",
"venv/*",
]


3 changes: 2 additions & 1 deletion docs/models.rst
@@ -2,4 +2,5 @@ Models
======

.. automodule:: gretel_trainer.models
   :members:
   :members:
   :inherited-members:
32 changes: 31 additions & 1 deletion src/gretel_trainer/models.py
@@ -16,6 +16,16 @@


def determine_best_model(df: pd.DataFrame):
    """
    Determine the Gretel model best suited for generating synthetic data
    for your dataset.

    Args:
        df (pd.DataFrame): Pandas DataFrame containing the data used to train a synthetic model.

    Returns:
        A Gretel Model object preconfigured for your use case.
    """
    row_count, column_count = df.shape

    if row_count > HIGH_RECORD_THRESHOLD or column_count > HIGH_COLUMN_THRESHOLD:
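For orientation, a minimal sketch of how this helper might be called; the CSV filename is only a placeholder, and the import path is assumed from the module shown in this diff:

    import pandas as pd

    from gretel_trainer.models import determine_best_model

    # Load any tabular dataset; the filename here is hypothetical.
    df = pd.read_csv("my_dataset.csv")

    # Returns a preconfigured model object, e.g. GretelLSTM or GretelCTGAN,
    # chosen from the row and column counts of df.
    model = determine_best_model(df)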
@@ -93,7 +103,17 @@ def _replace_nested_key(self, data, key, value) -> dict:


class GretelLSTM(_BaseConfig):
    """
    This model works for a variety of synthetic data tasks, including time-series, tabular, and text data.
    Generally useful for a few thousand records and upward. Datasets typically have a mix of categorical,
    continuous, and numerical values. Source data should have fewer than 150 columns.

    Args:
        config (str/dict, optional): Either a string representing the path to the config on the local filesystem, a string representing a path to the default Gretel configurations, or a dictionary containing the configurations. Default: "synthetics/default", a default Gretel configuration.
        max_rows (int, optional): The number of rows of synthetic data to generate. Default: 50000.
        max_header_clusters (int, optional): Default: 20.
        enable_privacy_filters (bool, optional): Default: False.
    """

    _max_header_clusters_limit: int = 30
    _max_rows_limit: int = 5000000
    _model_slug: str = "synthetics"
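A rough usage sketch, spelling out the defaults documented in the Args section above; the keyword names are assumed from that section rather than verified against the final __init__ signature:

    from gretel_trainer.models import GretelLSTM

    # All values below are the documented defaults, passed explicitly for clarity.
    model = GretelLSTM(
        config="synthetics/default",
        max_rows=50000,
        max_header_clusters=20,
        enable_privacy_filters=False,
    )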
@@ -114,7 +134,17 @@ def __init__(


class GretelCTGAN(_BaseConfig):
    """
    This model works well for high-dimensional, largely numeric data. Use it for datasets with more than
    20 columns and/or 50,000 rows. Not ideal if the dataset contains free text fields.

    Args:
        config (str/dict, optional): Either a string representing the path to the config on the local filesystem, a string representing a path to the default Gretel configurations, or a dictionary containing the configurations. Default: "synthetics/default", a default Gretel configuration.
        max_rows (int, optional): The number of rows of synthetic data to generate. Default: 50000.
        max_header_clusters (int, optional): Default: 20.
        enable_privacy_filters (bool, optional): Default: False.
    """

    _max_header_clusters_limit: int = 1000
    _max_rows_limit: int = 5000000
    _model_slug: str = "ctgan"
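Again only a sketch, using the keyword names and defaults stated in the Args section above; the config string shown is taken from that docstring and may differ in practice:

    from gretel_trainer.models import GretelCTGAN

    # Suited to wide, mostly numeric tables; documented defaults passed explicitly.
    model = GretelCTGAN(
        config="synthetics/default",
        max_rows=50000,
        max_header_clusters=20,
        enable_privacy_filters=False,
    )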
@@ -131,4 +161,4 @@ def __init__(
            max_rows=max_rows,
            max_header_clusters=max_header_clusters,
            enable_privacy_filters=enable_privacy_filters,
        )
        )
