diff --git a/docs/conf.py b/docs/conf.py index f35dc2a9..cf15d1fc 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -36,6 +36,7 @@ "_build", "Thumbs.db", ".DS_Store", + "venv/*", ] diff --git a/docs/models.rst b/docs/models.rst index b491b504..0ca566c8 100644 --- a/docs/models.rst +++ b/docs/models.rst @@ -2,4 +2,5 @@ Models ====== .. automodule:: gretel_trainer.models - :members: \ No newline at end of file + :members: + :inherited-members: \ No newline at end of file diff --git a/src/gretel_trainer/models.py b/src/gretel_trainer/models.py index 5676c60e..4ae7dc0f 100644 --- a/src/gretel_trainer/models.py +++ b/src/gretel_trainer/models.py @@ -16,6 +16,16 @@ def determine_best_model(df: pd.DataFrame): + """ + Determine the Gretel model best suited for generating synthetic data + for your dataset. + + Args: + df (pd.DataFrame): Pandas DataFrame containing the data used to train a synthetic model. + + Returns: + A Gretel Model object preconfigured for your use case. + """ row_count, column_count = df.shape if row_count > HIGH_RECORD_THRESHOLD or column_count > HIGH_COLUMN_THRESHOLD: @@ -93,7 +103,17 @@ def _replace_nested_key(self, data, key, value) -> dict: class GretelLSTM(_BaseConfig): + """ + This model works for a variety of synthetic data tasks including time-series, tabular, and text data. Generally useful for a few thousand records and upward. Intended for datasets with a mix of categorical, continuous, and numerical values. + + Source data should have <150 columns. + Args: + config (str/dict, optional): Either a string representing the path to the config on the local filesystem, a string representing a path to the default Gretel configurations, or a dictionary containing the configurations. Default: "synthetics/default", a default Gretel configuration + max_rows (int, optional): The number of rows of synthetic data to generate. 
Defaults to 50000 + max_header_clusters (int, optional): Default: 20 + enable_privacy_filters (bool, optional): Default: False + """ _max_header_clusters_limit: int = 30 _max_rows_limit: int = 5000000 _model_slug: str = "synthetics" @@ -114,7 +134,17 @@ def __init__( class GretelCTGAN(_BaseConfig): + """ + This model works well for high-dimensional, largely numeric data. Use for datasets with more than 20 columns and/or 50,000 rows. + + Not ideal if the dataset contains free-text fields. + Args: + config (str/dict, optional): Either a string representing the path to the config on the local filesystem, a string representing a path to the default Gretel configurations, or a dictionary containing the configurations. Default: "synthetics/default", a default Gretel configuration + max_rows (int, optional): The number of rows of synthetic data to generate. Defaults to 50000 + max_header_clusters (int, optional): Default: 20 + enable_privacy_filters (bool, optional): Default: False + """ _max_header_clusters_limit: int = 1000 _max_rows_limit: int = 5000000 _model_slug: str = "ctgan" @@ -131,4 +161,4 @@ def __init__( max_rows=max_rows, max_header_clusters=max_header_clusters, enable_privacy_filters=enable_privacy_filters, - ) \ No newline at end of file + )