diff --git a/README.md b/README.md index d4cf697b..e0369552 100644 --- a/README.md +++ b/README.md @@ -22,13 +22,13 @@ If you want to quickly get started synthesizing data with **Gretel.ai**, simply [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/gretelai/trainer/blob/main/notebooks/trainer-examples.ipynb) -## Join our Slack Workspace +## Join the Synthetic Data Community Discord -If you want to be part of the Gretel synthetic data community to receive announcements of the latest releases, +If you want to be part of the Synthetic Data Community to receive announcements of the latest releases, ask questions, suggest new features or participate in the development meetings, please join -our Slack Workspace! +the Synthetic Data Community Server! -[![Slack](https://img.shields.io/badge/Slack%20Workspace-Join%20now!-36C5F0?logo=slack)](https://gretel.ai/slackinvite) +[![Discord](https://img.shields.io/discord/1007817822614847500?label=Discord&logo=Discord)](https://gretel.ai/discord) # Install @@ -40,13 +40,13 @@ pip install -U gretel-trainer # Quickstart -### 1. Add your [Gretel API](https://console.gretel.cloud) key via the Gretel CLI. +## 1. Add your [Gretel API](https://console.gretel.cloud) key via the Gretel CLI. Use the Gretel client to store your API key to disk. This step is optional, the trainer will prompt for an API key in the next step. ```bash gretel configure ``` -### 2. Train or fine-tune a model using the Gretel API +## 2. Train or fine-tune a model using the Gretel API ```python3 from gretel_trainer import trainer @@ -57,7 +57,7 @@ model = trainer.Trainer() model.train(dataset) ``` -### 3. Generate synthetic data! +## 3. Generate synthetic data! 
```python3 df = model.generate() ``` diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 00000000..d4bb2cbb --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 00000000..f35dc2a9 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,86 @@ +# Configuration file for the Sphinx documentation builder. 
+#
+# For the full list of built-in configuration values, see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Project information -----------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
+import os
+import sys
+
+import sphinx  # used further down to patch sphinx.registry.SphinxComponentRegistry
+
+sys.path.insert(0, os.path.abspath("../src"))  # put ../src on the import path (presumably for autodoc; confirm)
+
+
+project = "Gretel Trainer"
+copyright = "2022, Gretel Team"
+author = "Gretel.ai"
+release = "0.4.0"  # NOTE(review): hard-coded; presumably must track the package version - confirm
+
+# -- General configuration ---------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
+
+extensions = [
+    "sphinx.ext.autodoc",
+    "sphinx.ext.coverage",
+    "sphinx.ext.napoleon",
+    "m2r",  # NOTE(review): presumably what makes the ".md" suffix below parseable - confirm
+    "sphinx_rtd_theme",
+]
+
+source_suffix = [".rst", ".md"]
+
+templates_path = ["_templates"]
+exclude_patterns = [
+    "_build",
+    "Thumbs.db",
+    ".DS_Store",
+]
+
+
+# -- Options for HTML output -------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
+
+# The theme to use for HTML and HTML Help pages. See the documentation for
+# a list of builtin themes.
+
+html_theme = "sphinx_rtd_theme"
+html_logo = "img/gretel_logo_white.png"
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ["_static"]
+html_css_files = ["styles.css"]
+
+html_theme_options = {
+    "logo_only": True,
+    "display_version": True,
+    "style_nav_header_background": "#0c0c0d",
+}
+
+
+def monkeypatch(cls):
+    """Return a decorator that replaces ``cls.<f.__name__>`` with a wrapper calling ``f(old_method, self, *args, **kwargs)``."""
+
+    def decorator(f):
+        method = f.__name__
+        old_method = getattr(cls, method)  # original method, looked up once at decoration time
+        setattr(
+            cls,
+            method,
+            lambda self, *args, **kwargs: f(old_method, self, *args, **kwargs),
+        )
+
+    return decorator
+
+
+# workaround until https://github.com/miyakogi/m2r/pull/55 is merged
+@monkeypatch(sphinx.registry.SphinxComponentRegistry)
+def add_source_parser(_old_add_source_parser, self, *args, **kwargs):
+    # signature is (parser: Type[Parser], **kwargs), but m2r expects
+    # the removed (str, parser: Type[Parser], **kwargs); drop that leading str.
+    if args and isinstance(args[0], str):  # ``args`` is empty when everything is passed by keyword
+        args = args[1:]
+    return _old_add_source_parser(self, *args, **kwargs)
diff --git a/docs/img/gretel-logo.png b/docs/img/gretel-logo.png
new file mode 100644
index 00000000..43890250
Binary files /dev/null and b/docs/img/gretel-logo.png differ
diff --git a/docs/img/gretel_logo_white.png b/docs/img/gretel_logo_white.png
new file mode 100644
index 00000000..4d4215e2
Binary files /dev/null and b/docs/img/gretel_logo_white.png differ
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 00000000..b8df5e2d
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,105 @@
+.. Gretel Trainer documentation master file, created by
+   sphinx-quickstart on Tue Oct 11 09:08:14 2022.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+Gretel Trainer
+==============
+
+This module is designed to provide a simple interface to help users successfully train synthetic models on complex datasets with high row and column counts, and offers features such as Cloud SaaS based training and multi-GPU based parallelization. Get started for free with an API key from `Gretel.ai `_.
+ +Current functionality and features: +----------------------------------- + +* Synthetic data generators for text, tabular, and time-series data with the following features: + * Balance datasets or boost a minority class using Conditional Data Generation. + * Automated data validation. + * Synthetic data quality reports. + * Privacy filters and optional differential privacy support. +* Multiple `model types supported `_\: + * `Gretel-LSTM` model type supports text, tabular, time-series, and conditional data generation. + * `Gretel-CTGAN` model type supports tabular and conditional data generation. + * `Gretel-GPT` natural language synthesis based on an open-source implementation of GPT-3 (coming soon). + * `Gretel-DGAN` multi-variate time series based on DoppelGANger (coming soon). + +Train Synthetic Data in as Little as Three Lines of Code! +--------------------------------------------------------- + +#. Install the Gretel CLI and Gretel Trainer either on your system or in your Notebook. + + .. code-block:: bash + + # Command line installation + pip install -U gretel-client gretel-trainer + + # Notebook installation + !pip install -Uqq gretel-client gretel-trainer + +#. Add your `Gretel API <https://console.gretel.cloud>`_ key via the Gretel CLI. + + Use the Gretel client to store your API key to disk. This step is optional; the trainer will prompt for an API key in the next step. + + .. code-block:: bash + + gretel configure + +#. Train or fine-tune a model using the Gretel API. + + .. code-block:: python3 + + from gretel_trainer import trainer + + dataset = "https://gretel-public-website.s3-us-west-2.amazonaws.com/datasets/USAdultIncome5k.csv" + + model = trainer.Trainer() + model.train(dataset) + +#. Generate synthetic data! + + .. code-block:: python3 + + df = model.generate() + +Try it out now! +--------------- + +If you want to quickly get started synthesizing data with **Gretel.ai**, simply click the button below and follow the examples. 
See additional Python3 and Jupyter Notebook examples in the `./notebooks` folder.
+
+.. image:: https://colab.research.google.com/assets/colab-badge.svg
+   :target: https://colab.research.google.com/github/gretelai/trainer/blob/main/notebooks/trainer-examples.ipynb
+   :alt: Open in Colab
+
+Join the Synthetic Data Community Discord
+-----------------------------------------
+
+If you want to be part of the Synthetic Data Community to receive announcements of the latest releases,
+ask questions, suggest new features, or participate in the development meetings, please join
+the Synthetic Data Community Server!
+
+.. image:: https://img.shields.io/discord/1007817822614847500?label=Discord&logo=Discord
+   :target: https://gretel.ai/discord
+   :alt: Discord
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Contents:
+
+
+Modules
+=======
+
+.. toctree::
+   :maxdepth: 2
+
+   quickstart.rst
+   trainer.rst
+   models.rst
+
+
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
diff --git a/docs/make.bat b/docs/make.bat
new file mode 100644
index 00000000..747ffb7b
--- /dev/null
+++ b/docs/make.bat
@@ -0,0 +1,36 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+REM Same layout as docs/Makefile: sources sit next to this script (see pushd above), output goes to _build
+set SOURCEDIR=.
+set BUILDDIR=_build
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+ echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/models.rst b/docs/models.rst new file mode 100644 index 00000000..b491b504 --- /dev/null +++ b/docs/models.rst @@ -0,0 +1,5 @@ +Models +====== + +.. automodule:: gretel_trainer.models + :members: \ No newline at end of file diff --git a/docs/quickstart.rst b/docs/quickstart.rst new file mode 100644 index 00000000..f590bf33 --- /dev/null +++ b/docs/quickstart.rst @@ -0,0 +1,93 @@ +Quickstart +========== + +Initial Setup +------------- + +#. Install the Gretel CLI and Gretel Trainer either on your system or in your Notebook. + + .. code-block:: bash + + # Command line installation + pip install -U gretel-client gretel-trainer + + # Notebook installation + !pip install -Uqq gretel-client gretel-trainer + +#. Add your `Gretel API <https://console.gretel.cloud>`_ key via the Gretel CLI. + + Use the Gretel client to store your API key to disk. This step is optional; the trainer will prompt for an API key in the next step. + + .. code-block:: bash + + gretel configure + +Train Synthetic Data +-------------------- +.. image:: https://colab.research.google.com/assets/colab-badge.svg + :target: https://colab.research.google.com/github/gretelai/trainer/blob/main/notebooks/trainer-examples.ipynb + :alt: Open in Colab + +#. Train or fine-tune a model using the Gretel API. + + .. code-block:: python3 + + from gretel_trainer import trainer + + dataset = "https://gretel-public-website.s3-us-west-2.amazonaws.com/datasets/USAdultIncome5k.csv" + + model = trainer.Trainer() + model.train(dataset) + +#. Generate synthetic data! + + .. code-block:: python3 + + df = model.generate() + +Conditional Data Generation +--------------------------- +.. 
image:: https://colab.research.google.com/assets/colab-badge.svg + :target: https://colab.research.google.com/github/gretelai/trainer/blob/main/notebooks/simple-conditional-generation.ipynb + :alt: Open in Colab + +#. Load and preview the dataset, and set seed fields. + + .. code-block:: python3 + + # Load and preview the patient dataset + import pandas as pd + from gretel_trainer import trainer + + DATASET_PATH = 'https://gretel-public-website.s3.amazonaws.com/datasets/mitre-synthea-health.csv' + SEED_FIELDS = ["RACE", "ETHNICITY", "GENDER"] + + print("\nPreviewing real world dataset\n") + pd.read_csv(DATASET_PATH) + +#. Train the model. + + .. code-block:: python3 + + # Train model + model = trainer.Trainer() + model.train(DATASET_PATH, seed_fields=SEED_FIELDS) + +#. Conditionally generate data. + + .. code-block:: python3 + + # Conditionally generate data + seed_df = pd.DataFrame(data=[ + ["black", "african", "F"], + ["black", "african", "F"], + ["black", "african", "F"], + ["black", "african", "F"], + ["asian", "chinese", "F"], + ["asian", "chinese", "F"], + ["asian", "chinese", "F"], + ["asian", "chinese", "F"], + ["asian", "chinese", "F"] + ], columns=SEED_FIELDS) + + model.generate(seed_df=seed_df) \ No newline at end of file diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 00000000..e223617d --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,88 @@ +absl-py==1.2.0 +alabaster==0.7.12 +attrs==22.1.0 +Babel==2.10.3 +backports.cached-property==1.0.0.post2 +boto3==1.24.89 +botocore==1.27.89 +category-encoders==2.5.1.post0 +certifi==2022.9.24 +chardet==3.0.4 +charset-normalizer==2.1.1 +click==7.1.2 +cloudpickle==2.2.0 +contourpy==1.0.5 +cycler==0.11.0 +decorator==5.1.1 +dill==0.3.5.1 +dm-tree==0.1.7 +docker==4.4.1 +docutils==0.17.1 +dython==0.7.2 +etils==0.8.0 +fonttools==4.37.4 +gast==0.5.3 +googleapis-common-protos==1.56.4 +gretel-client==0.15.1 +gretel-synthetics==0.19.0 +idna==2.10 +imagesize==1.4.1 
+importlib-resources==5.10.0 +Jinja2==3.0.3 +jmespath==1.0.1 +joblib==1.2.0 +kiwisolver==1.4.4 +loky==2.9.0 +m2r==0.2.1 +MarkupSafe==2.1.1 +matplotlib==3.6.1 +mistune==0.8.4 +mpmath==1.2.1 +numpy==1.23.3 +packaging==21.3 +pandas==1.5.0 +patsy==0.5.3 +Pillow==9.2.0 +promise==2.3 +protobuf==4.21.7 +psutil==5.9.2 +pydantic==1.10.2 +Pygments==2.13.0 +pyparsing==3.0.9 +python-dateutil==2.8.2 +pytz==2022.4 +PyYAML==5.4.1 +requests==2.25.0 +s3transfer==0.6.0 +scikit-learn==1.1.2 +scikit-plot==0.3.7 +scipy==1.9.2 +seaborn==0.12.0 +sentencepiece==0.1.96 +six==1.16.0 +smart-open==5.2.1 +snowballstemmer==2.2.0 +Sphinx==3.0.3 +sphinx-rtd-theme==1.0.0 +sphinxcontrib-applehelp==1.0.2 +sphinxcontrib-devhelp==1.0.2 +sphinxcontrib-htmlhelp==2.0.0 +sphinxcontrib-jsmath==1.0.1 +sphinxcontrib-qthelp==1.0.3 +sphinxcontrib-serializinghtml==1.1.5 +statsmodels==0.13.2 +tabulate==0.8.9 +tenacity==6.2.0 +tensorflow-datasets==4.7.0 +tensorflow-estimator==2.10.0 +tensorflow-metadata==1.10.0 +tensorflow-privacy==0.7.3 +tensorflow-probability==0.16.0 +termcolor==2.0.1 +threadpoolctl==3.1.0 +toml==0.10.2 +tqdm==4.64.1 +typing-extensions==4.4.0 +urllib3==1.25.11 +websocket-client==1.4.1 +zipp==3.9.0 diff --git a/docs/trainer.rst b/docs/trainer.rst new file mode 100644 index 00000000..a0b8d0d3 --- /dev/null +++ b/docs/trainer.rst @@ -0,0 +1,5 @@ +Trainer +======= + +.. automodule:: gretel_trainer.trainer + :members: \ No newline at end of file