Merge pull request #4 from gretelai/jm/readme_scraping

Require README.md in each blueprint dir, added READMEs for existing BPs
gretelai · Oct 14, 2020 · 0d8a428 · 0d8a428
2 parents f502d08 + 9cbfe40
commit 0d8a428
Show file tree

Hide file tree

Showing 7 changed files with 102 additions and 3 deletions.
diff --git a/construct_manifests.py b/construct_manifests.py
@@ -7,6 +7,7 @@
 from typing import Dict, List
 from pathlib import Path
 import json
+import base64
 
 from marshmallow import Schema, fields, validate, ValidationError
 import requests
@@ -26,6 +27,9 @@
 SAMPLE_DATA_KEY = "sample_data_key"
 FEATURED = "featured"
 SHIP = "ship-gretel"
+README_FILE = "README.md"
+README = "readme"
+README_MAX = 64 * 1024
 
 REPO_BASE = "https://github.com/gretelai/gretel-blueprints"
 
@@ -61,6 +65,10 @@ class ManifestSchema(Schema):
     language = fields.String(required=True, validate=validate.OneOf(LANGS))
     blog_url = fields.String(missing=None)
 
+    # NOTE: This gets added separately as it's not part of the
+    # manifest.json file
+    readme = fields.String(missing=None)
+
 
 @dataclass
 class PrimaryManifest:
@@ -120,6 +128,17 @@ def process_manifest_dir(manifest_dir: str, subdir: str, sample_data_map: dict)
             f"Invalid sample data key: {sample_data} in {manifest_dir}"
         )  # noqa
 
+    # scrape the README contents from the directory and b64 encode it
+    readme_file = _base / README_FILE
+    if not readme_file.is_file():
+        raise ManifestError(f"Directory {manifest_dir} missing {README_FILE}!")  # noqa
+
+    readme_contents = open(readme_file).read()
+    if len(readme_contents) > README_MAX:
+        raise ManifestError(f"README must be less than f{README_MAX} bytes")
+
+    manifest_dict[README] = base64.b64encode(readme_contents.encode()).decode()
+
     return manifest_dict
 
 
@@ -176,5 +195,8 @@ def deploy_manifest(manifest: dict, deploy_mode: str, manifest_type: str):
     for base in (GRETEL,):
         manifest_dict = create_manifest(base)
 
+        if not deploy_mode:
+            print(json.dumps(manifest_dict))
+
         if deploy_mode in (SHIP,):
             deploy_manifest(manifest_dict, deploy_mode, base)
diff --git a/gretel/create_synthetic_data_from_csv_or_df/README.md b/gretel/create_synthetic_data_from_csv_or_df/README.md
@@ -0,0 +1,8 @@
+# Create synthetic data from your own CSV or DataFrame
+
+This blueprint will walk you through the process of creating your own synthetic data from a CSV or a DataFrame of your choosing.
+
+No data will be sent to Gretel Cloud; to get started, you just need to provide your own DF or CSV file.
+
+Please get an API key from the Integration menu, then start the Notebook flow. An example public dataset is provided for you 
+but you can switch out your own dataset as needed.
diff --git a/gretel/create_synthetic_data_from_gretel_project/README.md b/gretel/create_synthetic_data_from_gretel_project/README.md
@@ -0,0 +1,15 @@
+# Create synthetic data from Gretel Cloud
+
+If you've uploaded records to a Gretel Cloud project, this Blueprint is for you! It shows you how to quickly download
+records from the project and create a synthetic version of these records.
+
+This blueprint assumes you have data in a Gretel Project. If you have not done this, you can log in, create a new project, and select this
+blueprint.
+
+To get started with this blueprint in the console, visit https://console.gretel.cloud, select this blueprint, and upload the sample data!
+
+To get started:
+
+- Head to the **Transform** tab
+- Get your Gretel URI from the **Integration** menu
+- Launch this blueprint's notebook
diff --git a/gretel/download_synthetic_model_and_generate/README.md b/gretel/download_synthetic_model_and_generate/README.md
@@ -0,0 +1,14 @@
+# Downloading a saved synthetic model
+
+This blueprint does not require a Gretel Cloud Project; simply launch the notebook and get started!
+
+When generating synthetic data, you are probably familiar with using our "bundle" interface. Models can take a long time to train and
+you may want to save that model for use later to generate more data. As you've seen in other blueprints, this can be done by doing:
+
+```
+bundle.save("my_model.tar.gz")
+```
+
+The same `SyntheticDataBundle` class has a factory method that can load an unarchived model and generate data from it.
+
+This blueprint has sample code to download a remotely saved model, decompress and un-tar it, and load it back into the bundle interface.
diff --git a/gretel/labeling_pub_sub_basic/README.md b/gretel/labeling_pub_sub_basic/README.md
@@ -0,0 +1,8 @@
+# Using Gretel Cloud for Publish and Subscribe Data Labeling
+
+This blueprint demonstrates how to send raw records and consume labeled records from Gretel's APIs.  It automatically
+creates a temporary project for you with use of a context manager. You can, of course, create a project more permanently
+and write records to it and consume from them in a similar way.
+
+When you call `start()`, the Gretel Console URL will be printed. Feel free to visit this link and observe how the
+records are ingested, labeled, and explorable in our console!
diff --git a/gretel/labeling_pub_sub_basic/blueprint.ipynb b/gretel/labeling_pub_sub_basic/blueprint.ipynb
@@ -19,7 +19,7 @@
    "source": [
     "%%capture\n",
     "\n",
-    "!pip install gretel-client faker"
+    "!pip install -U gretel-client faker"
    ]
   },
   {
@@ -66,12 +66,17 @@
     "            \"phone\": fake.phone_number()\n",
     "        }\n",
     "        project.send(rec)\n",
-    "        time.sleep(1)\n",
+    "        time.sleep(2)\n",
     "\n",
     "\n",
     "def start(api_key: str):\n",
     "    client = get_cloud_client(\"api\", api_key)\n",
+    "    \n",
+    "    # NOTE: When this context handler exits, the project will\n",
+    "    # be automatically deleted.\n",
     "    with temporary_project(client) as project:\n",
+    "        print(f\"*** Console Project URL: {project.get_console_url()} ***\\n\")\n",
+    "        print(\"Visit the link above while this example runs to see your records in Gretel Cloud!\\n\\n\")\n",
     "        publish_event = threading.Event()\n",
     "        publish_thread = threading.Thread(target=publish, args=(project, publish_event))\n",
     "        publish_thread.start()\n",

diff --git a/test_construct_manifest.py b/test_construct_manifest.py
@@ -29,12 +29,21 @@ def default_manifest(name: str):
     }
 
 
-def create_manifest(target_dir: str, manifest: dict, bp_fname: str):
+def create_manifest(
+    target_dir: str,
+    manifest: dict,
+    bp_fname: str,
+    readme_fname: str = cm.README_FILE,
+    readme_contents: str = "booyah"
+):
     dest = Path(target_dir / uuid.uuid4().hex)
     dest.mkdir()
     with open(dest / bp_fname, "w") as fout:
         fout.write("# nada")
 
+    with open(dest / readme_fname, "w") as fout:
+        fout.write(readme_contents)
+
     with open(dest / "manifest.json", "w") as fout:
         fout.write(json.dumps(manifest))
 
@@ -88,3 +97,21 @@ def test_bad_data_sample_name(tmpdir):
 
     with pytest.raises(cm.ManifestError):
         cm.create_manifest(tmpdir)
+
+
+def test_missing_readme(tmpdir):
+    m1 = default_manifest("one")
+    create_manifest(tmpdir, m1, "blueprint.py", readme_fname="nope.md")
+
+    with pytest.raises(cm.ManifestError):
+        cm.create_manifest(tmpdir)
+
+
+def test_readme_too_big(tmpdir):
+    m1 = default_manifest("one")
+    contents = "A" * (cm.README_MAX + 1)
+    create_manifest(tmpdir, m1, "blueprint.py", readme_contents=contents)
+
+    with pytest.raises(cm.ManifestError) as err:
+        cm.create_manifest(tmpdir)
+    assert "README must be less" in str(err.value)