
adding model docs (#28)
* adding model docs

* Update src/gretel_trainer/models.py

Co-authored-by: Scott Bailey <[email protected]>

* Update src/gretel_trainer/models.py

Co-authored-by: Scott Bailey <[email protected]>

* Update src/gretel_trainer/models.py

Co-authored-by: Scott Bailey <[email protected]>

* Update src/gretel_trainer/models.py

Co-authored-by: Scott Bailey <[email protected]>

* updating docs

Co-authored-by: Scott Bailey <[email protected]>
MasonEgger and csbailey5t authored Oct 27, 2022
1 parent ba27d10 commit a50a38d
Showing 3 changed files with 34 additions and 2 deletions.
1 change: 1 addition & 0 deletions docs/conf.py
@@ -36,6 +36,7 @@
"_build",
"Thumbs.db",
".DS_Store",
"venv/*",
]


3 changes: 2 additions & 1 deletion docs/models.rst
@@ -2,4 +2,5 @@ Models
======

.. automodule:: gretel_trainer.models
   :members:
   :members:
   :inherited-members:
32 changes: 31 additions & 1 deletion src/gretel_trainer/models.py
@@ -16,6 +16,16 @@


def determine_best_model(df: pd.DataFrame):
    """
    Determine the Gretel model best suited for generating synthetic data
    for your dataset.

    Args:
        df (pd.DataFrame): Pandas DataFrame containing the data used to train a synthetic model.

    Returns:
        A Gretel Model object preconfigured for your use case.
    """
    row_count, column_count = df.shape

    if row_count > HIGH_RECORD_THRESHOLD or column_count > HIGH_COLUMN_THRESHOLD:
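For orientation, a minimal sketch of how this helper might be called; the CSV filename is only a placeholder, and the import path is assumed from the module shown in this diff:

    import pandas as pd

    from gretel_trainer.models import determine_best_model

    # Load any tabular dataset; the filename here is hypothetical.
    df = pd.read_csv("my_dataset.csv")

    # Returns a preconfigured model object, e.g. GretelLSTM or GretelCTGAN,
    # chosen from the row and column counts of df.
    model = determine_best_model(df)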
@@ -93,7 +103,17 @@ def _replace_nested_key(self, data, key, value) -> dict:


class GretelLSTM(_BaseConfig):
    """
    This model works for a variety of synthetic data tasks, including time-series, tabular, and text data.
    Generally useful for a few thousand records and upward. Datasets typically have a mix of categorical,
    continuous, and numerical values. Source data should have fewer than 150 columns.

    Args:
        config (str/dict, optional): Either a string representing the path to the config on the local filesystem, a string representing a path to the default Gretel configurations, or a dictionary containing the configurations. Default: "synthetics/default", a default Gretel configuration.
        max_rows (int, optional): The number of rows of synthetic data to generate. Default: 50000.
        max_header_clusters (int, optional): Default: 20.
        enable_privacy_filters (bool, optional): Default: False.
    """

    _max_header_clusters_limit: int = 30
    _max_rows_limit: int = 5000000
    _model_slug: str = "synthetics"
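A rough usage sketch, spelling out the defaults documented in the Args section above; the keyword names are assumed from that section rather than verified against the final __init__ signature:

    from gretel_trainer.models import GretelLSTM

    # All values below are the documented defaults, passed explicitly for clarity.
    model = GretelLSTM(
        config="synthetics/default",
        max_rows=50000,
        max_header_clusters=20,
        enable_privacy_filters=False,
    )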
@@ -114,7 +134,17 @@ def __init__(


class GretelCTGAN(_BaseConfig):
    """
    This model works well for high-dimensional, largely numeric data. Use it for datasets with more than
    20 columns and/or 50,000 rows. Not ideal if the dataset contains free text fields.

    Args:
        config (str/dict, optional): Either a string representing the path to the config on the local filesystem, a string representing a path to the default Gretel configurations, or a dictionary containing the configurations. Default: "synthetics/default", a default Gretel configuration.
        max_rows (int, optional): The number of rows of synthetic data to generate. Default: 50000.
        max_header_clusters (int, optional): Default: 20.
        enable_privacy_filters (bool, optional): Default: False.
    """

    _max_header_clusters_limit: int = 1000
    _max_rows_limit: int = 5000000
    _model_slug: str = "ctgan"
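Again only a sketch, using the keyword names and defaults stated in the Args section above; the config string shown is taken from that docstring and may differ in practice:

    from gretel_trainer.models import GretelCTGAN

    # Suited to wide, mostly numeric tables; documented defaults passed explicitly.
    model = GretelCTGAN(
        config="synthetics/default",
        max_rows=50000,
        max_header_clusters=20,
        enable_privacy_filters=False,
    )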
@@ -131,4 +161,4 @@ def __init__(
            max_rows=max_rows,
            max_header_clusters=max_header_clusters,
            enable_privacy_filters=enable_privacy_filters,
        )
        )
