Skip to content

Commit

Permalink
Merge pull request #4 from gretelai/jm/readme_scraping
Browse files Browse the repository at this point in the history
Require README.md in each blueprint dir, added READMEs for existing BPs
  • Loading branch information
johntmyers authored Oct 14, 2020
2 parents f502d08 + 9cbfe40 commit 0d8a428
Show file tree
Hide file tree
Showing 7 changed files with 102 additions and 3 deletions.
22 changes: 22 additions & 0 deletions construct_manifests.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from typing import Dict, List
from pathlib import Path
import json
import base64

from marshmallow import Schema, fields, validate, ValidationError
import requests
Expand All @@ -26,6 +27,9 @@
SAMPLE_DATA_KEY = "sample_data_key"
FEATURED = "featured"
SHIP = "ship-gretel"
README_FILE = "README.md"
README = "readme"
README_MAX = 64 * 1024

REPO_BASE = "https://github.com/gretelai/gretel-blueprints"

Expand Down Expand Up @@ -61,6 +65,10 @@ class ManifestSchema(Schema):
language = fields.String(required=True, validate=validate.OneOf(LANGS))
blog_url = fields.String(missing=None)

# NOTE: This gets added separately as it's not part of the
# manifest.json file
readme = fields.String(missing=None)


@dataclass
class PrimaryManifest:
Expand Down Expand Up @@ -120,6 +128,17 @@ def process_manifest_dir(manifest_dir: str, subdir: str, sample_data_map: dict)
f"Invalid sample data key: {sample_data} in {manifest_dir}"
) # noqa

# Scrape the README contents from the directory and base64-encode them.
# The README is mandatory: a blueprint directory without one is rejected.
readme_file = _base / README_FILE
if not readme_file.is_file():
    raise ManifestError(f"Directory {manifest_dir} missing {README_FILE}!")  # noqa

# read_text() opens and closes the file itself, so no handle is leaked.
readme_bytes = readme_file.read_text().encode()
# Enforce the limit on the encoded size, since the error reports bytes.
if len(readme_bytes) > README_MAX:
    raise ManifestError(f"README must be less than {README_MAX} bytes")

manifest_dict[README] = base64.b64encode(readme_bytes).decode()

return manifest_dict


Expand Down Expand Up @@ -176,5 +195,8 @@ def deploy_manifest(manifest: dict, deploy_mode: str, manifest_type: str):
for base in (GRETEL,):
manifest_dict = create_manifest(base)

if not deploy_mode:
print(json.dumps(manifest_dict))

if deploy_mode in (SHIP,):
deploy_manifest(manifest_dict, deploy_mode, base)
8 changes: 8 additions & 0 deletions gretel/create_synthetic_data_from_csv_or_df/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Create synthetic data from your own CSV or DataFrame

This blueprint will walk you through the process of creating your own synthetic data from a CSV or a DataFrame of your choosing.

No data will be sent to Gretel Cloud. To get started, you just need to provide your own DataFrame or CSV file.

Please get an API key from the Integration menu, then start the Notebook flow. An example public dataset is provided for you
but you can switch out your own dataset as needed.
15 changes: 15 additions & 0 deletions gretel/create_synthetic_data_from_gretel_project/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Create synthetic data from Gretel Cloud

If you've uploaded records to a Gretel Cloud project, this Blueprint is for you! It shows you how to quickly download
records from the project and create a synthetic version of these records.

This blueprint assumes you have data in a Gretel Project. If you have not done this, you can log in, create a new project, and select this
blueprint.

To get started with this blueprint in the console, visit https://console.gretel.cloud, select this blueprint, and upload the sample data!

To get started:

- Head to the **Transform** tab
- Get your Gretel URI from the **Integration** menu
- Launch this blueprint's notebook
14 changes: 14 additions & 0 deletions gretel/download_synthetic_model_and_generate/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Downloading a saved synthetic model

This blueprint does not require a Gretel Cloud Project; simply launch the notebook and get started!

When generating synthetic data, you are probably familiar with using our "bundle" interface. Models can take a long time to train and
you may want to save that model for use later to generate more data. As you've seen in other blueprints, this can be done by doing:

```
bundle.save("my_model.tar.gz")
```

The same `SyntheticDataBundle` class has a factory method that can load an unarchived model and generate data from it.

This blueprint has sample code to download a remotely saved model, decompress and un-tar it, and load it back into the bundle interface.
8 changes: 8 additions & 0 deletions gretel/labeling_pub_sub_basic/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Using Gretel Cloud for Publish and Subscribe Data Labeling

This blueprint demonstrates how to send raw records and consume labeled records from Gretel's APIs. It automatically
creates a temporary project for you with use of a context manager. You can, of course, create a project more permanently
and write records to it and consume from them in a similar way.

When you call `start()`, the Gretel Console URL will be printed. Feel free to visit this link and observe how the
records are ingested, labeled, and explorable in our console!
9 changes: 7 additions & 2 deletions gretel/labeling_pub_sub_basic/blueprint.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
"source": [
"%%capture\n",
"\n",
"!pip install gretel-client faker"
"!pip install -U gretel-client faker"
]
},
{
Expand Down Expand Up @@ -66,12 +66,17 @@
" \"phone\": fake.phone_number()\n",
" }\n",
" project.send(rec)\n",
" time.sleep(1)\n",
" time.sleep(2)\n",
"\n",
"\n",
"def start(api_key: str):\n",
" client = get_cloud_client(\"api\", api_key)\n",
" \n",
" # NOTE: When this context handler exits, the project will\n",
" # be automatically deleted.\n",
" with temporary_project(client) as project:\n",
" print(f\"*** Console Project URL: {project.get_console_url()} ***\\n\")\n",
" print(\"Visit the link above while this example runs to see your records in Gretel Cloud!\\n\\n\")\n",
" publish_event = threading.Event()\n",
" publish_thread = threading.Thread(target=publish, args=(project, publish_event))\n",
" publish_thread.start()\n",
Expand Down
29 changes: 28 additions & 1 deletion test_construct_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,21 @@ def default_manifest(name: str):
}


def create_manifest(target_dir: str, manifest: dict, bp_fname: str):
def create_manifest(
target_dir: str,
manifest: dict,
bp_fname: str,
readme_fname: str = cm.README_FILE,
readme_contents: str = "booyah"
):
dest = Path(target_dir / uuid.uuid4().hex)
dest.mkdir()
with open(dest / bp_fname, "w") as fout:
fout.write("# nada")

with open(dest / readme_fname, "w") as fout:
fout.write(readme_contents)

with open(dest / "manifest.json", "w") as fout:
fout.write(json.dumps(manifest))

Expand Down Expand Up @@ -88,3 +97,21 @@ def test_bad_data_sample_name(tmpdir):

with pytest.raises(cm.ManifestError):
cm.create_manifest(tmpdir)


def test_missing_readme(tmpdir):
    """A blueprint directory without the required README must be rejected."""
    manifest = default_manifest("one")
    # Write the README under a different filename so the expected one is absent.
    create_manifest(tmpdir, manifest, "blueprint.py", readme_fname="nope.md")

    with pytest.raises(cm.ManifestError):
        cm.create_manifest(tmpdir)


def test_readme_too_big(tmpdir):
    """A README one character over ``cm.README_MAX`` must be rejected."""
    oversized = "A" * (cm.README_MAX + 1)
    manifest = default_manifest("one")
    create_manifest(tmpdir, manifest, "blueprint.py", readme_contents=oversized)

    with pytest.raises(cm.ManifestError) as excinfo:
        cm.create_manifest(tmpdir)

    # Inspect the message after the ``with`` block; a statement placed after
    # the raising call inside the block would never execute.
    message = str(excinfo.value)
    assert "README must be less" in message

0 comments on commit 0d8a428

Please sign in to comment.