Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

adding model docs #28

Merged
merged 6 commits into from
Oct 27, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
"_build",
"Thumbs.db",
".DS_Store",
"venv/*",
]


Expand Down
3 changes: 2 additions & 1 deletion docs/models.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@ Models
======

.. automodule:: gretel_trainer.models
:members:
:members:
:inherited-members:
32 changes: 31 additions & 1 deletion src/gretel_trainer/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,16 @@


def determine_best_model(df: pd.DataFrame):
"""
Determine the Gretel model best suited for generating synthetic data
for your dataset.

Args:
df (pd.DataFrame): Pandas DataFrame containing the data used to train a synthetic model.

Returns:
A Gretel Model object preconfigured for your use case.
"""
row_count, column_count = df.shape

if row_count > HIGH_RECORD_THRESHOLD or column_count > HIGH_COLUMN_THRESHOLD:
Expand Down Expand Up @@ -93,7 +103,17 @@ def _replace_nested_key(self, data, key, value) -> dict:


class GretelLSTM(_BaseConfig):
"""
This model works for a variety of synthetic data tasks, including time-series, tabular, and text data. It is generally useful for a few thousand records and upward, typically where the dataset has a mix of categorical, continuous, and numerical values.

Source data should have <150 columns.

Args:
config (str/dict, optional): Either a string representing the path to the config on the local filesystem, a string representing a path to the default Gretel configurations, or a dictionary containing the configurations. Default: "synthetics/default", a default Gretel configuration
max_rows (int, optional): The number of rows of synthetic data to generate. Defaults to 50000
max_header_clusters (int, optional): Default: 20
enable_privacy_filters (bool, optional): Default: False
"""
_max_header_clusters_limit: int = 30
_max_rows_limit: int = 5000000
_model_slug: str = "synthetics"
Expand All @@ -114,7 +134,17 @@ def __init__(


class GretelCTGAN(_BaseConfig):
"""
This model works well for high-dimensional, largely numeric data. Use for datasets with more than 20 columns and/or 50,000 rows.

Not ideal if the dataset contains free-text fields.

Args:
config (str/dict, optional): Either a string representing the path to the config on the local filesystem, a string representing a path to the default Gretel configurations, or a dictionary containing the configurations. Default: "synthetics/default", a default Gretel configuration
max_rows (int, optional): The number of rows of synthetic data to generate. Defaults to 50000
max_header_clusters (int, optional): Default: 20
enable_privacy_filters (bool, optional): Default: False
"""
_max_header_clusters_limit: int = 1000
_max_rows_limit: int = 5000000
_model_slug: str = "ctgan"
Expand All @@ -131,4 +161,4 @@ def __init__(
max_rows=max_rows,
max_header_clusters=max_header_clusters,
enable_privacy_filters=enable_privacy_filters,
)
)