From 2241d9ad4daeacb25e18240eb742ed493a7c1308 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Thu, 3 Dec 2020 07:36:19 -0800 Subject: [PATCH 1/9] wip --- gretel/gc-nlp_text_analysis/README.md | 5 + gretel/gc-nlp_text_analysis/blueprint.ipynb | 452 ++++++++++++++++++++ 2 files changed, 457 insertions(+) create mode 100644 gretel/gc-nlp_text_analysis/README.md create mode 100644 gretel/gc-nlp_text_analysis/blueprint.ipynb diff --git a/gretel/gc-nlp_text_analysis/README.md b/gretel/gc-nlp_text_analysis/README.md new file mode 100644 index 00000000..6d6cdd90 --- /dev/null +++ b/gretel/gc-nlp_text_analysis/README.md @@ -0,0 +1,5 @@ +# Working Safely with Sensitive Free Text Using Gretel Cloud and NLP + +Using Gretel.ai's [NER and NLP features](https://gretel.ai/platform/data-cataloghttps://gretel.ai/platform/data-catalog), we analyze and label chat logs looking for PII and other potentially sensitive information. After labeling the dataset, we build a transformation pipeline that will redact and replace any sensitive strings from chat messages. + +At the end of the notebook we'll have a dataset that is safe to share without compromising a user's personal information. \ No newline at end of file diff --git a/gretel/gc-nlp_text_analysis/blueprint.ipynb b/gretel/gc-nlp_text_analysis/blueprint.ipynb new file mode 100644 index 00000000..15b0d048 --- /dev/null +++ b/gretel/gc-nlp_text_analysis/blueprint.ipynb @@ -0,0 +1,452 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install -Uqq spacy gretel-client" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Working Safely with Sensitive Free Text Using Gretel Cloud and NLP\n", + "\n", + "Using Gretel.ai's [NER and NLP features](https://gretel.ai/platform/data-cataloghttps://gretel.ai/platform/data-catalog), we analyze and label chat logs looking for PII and other potentially sensitive information. After labeling the dataset, we build a transformation pipeline that will redact and replace any sensitive strings from chat messages.\n", + "\n", + "At the end of the notebook we'll have a dataset that is safe to share without compromising a user's personal information." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from gretel_client import get_cloud_client\n", + "\n", + "client = get_cloud_client(prefix=\"api\", api_key=\"prompt\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "client.install_packages()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load the dataset\n", + "\n", + "For this blueprint, we use a modified dataset from the Ubuntu Chat Corpus. It represents an archived set of IRC logs from Ubuntu's technical support channel. This data primarily contains free form text that we will pass through a NER pipeline for labeling and PII discovery." + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [], + "source": [ + "source_df = pd.read_csv(\"https://gretel-public-website.s3.us-west-2.amazonaws.com/blueprints/nlp_text_analysis/chat_logs.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
folderdialogueIDdatefromtotext
0371195.tsv2010-08-27T11:31:00.000ZKB1JWQmjwalkerphone number 123.453.8920
13126125.tsv2008-04-23T14:55:00.000Zbad_imageNaNlocation San Diego
23126125.tsv2008-04-23T14:56:00.000Zbad_imageNaNmy name is Linus
33126125.tsv2008-04-23T14:57:00.000Zlordleemobad_imagelocation United States
4364545.tsv2009-08-01T06:22:00.000ZmechtechNaNcity is San Diego email address is test@exampl...
\n", + "
" + ], + "text/plain": [ + " folder dialogueID date from to \\\n", + "0 3 71195.tsv 2010-08-27T11:31:00.000Z KB1JWQ mjwalker \n", + "1 3 126125.tsv 2008-04-23T14:55:00.000Z bad_image NaN \n", + "2 3 126125.tsv 2008-04-23T14:56:00.000Z bad_image NaN \n", + "3 3 126125.tsv 2008-04-23T14:57:00.000Z lordleemo bad_image \n", + "4 3 64545.tsv 2009-08-01T06:22:00.000Z mechtech NaN \n", + "\n", + " text \n", + "0 phone number 123.453.8920 \n", + "1 location San Diego \n", + "2 my name is Linus \n", + "3 location United States \n", + "4 city is San Diego email address is test@exampl... " + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "source_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Label the source text\n", + "\n", + "With the data loaded into the notebook, we now create a Gretel Project, and upload the records to the project for labeling." + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [], + "source": [ + "project = client.get_project(create=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`detection_mode` configures the NER pipeline that is responsible for labeling the data. Using `detection_mode=all` we configure the records to be labeled using all of " + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "6 records [00:00, 15.01 records/s] \n" + ] + } + ], + "source": [ + "project.send_dataframe(source_df, detection_mode=\"all\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For extra credit, you can navigate to the project's console view to better inspect and visualize the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'https://console.gretel.cloud/drew-0a1c3'" + ] + }, + "execution_count": 91, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "project.get_console_url()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Inspect labeled data\n", + "\n", + "In this next cell, we download the labeled records and inspect each chat message to see what entities were detected. Gretel uses a combination of NLP models, regex, and custom heuristics to detect named entities in structured and unstructured data.\n", + "\n", + "For a list of entities that Gretel can detect, [click here](https://gretel.ai/gretel-cloud-faqs/what-types-of-entities-can-gretel-identify)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from gretel_helpers.spacy import display_entities\n", + "\n", + "TEXT_FIELD = \"text\"\n", + "\n", + "for record in project.iter_records(direction=\"backward\"):\n", + " display_entities(record, text_field)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Build a transformation pipeline\n", + "\n", + "After labeling the dataset, we've identified messages that contain PII, such as names and emails. The final step in this blueprint is to build a transformation pipeline that will replace names and other identifying information with fake representations of the data.\n", + "\n", + "We make a point to replace rather than redact sensitive information. 
This preservation ensures the dataset remains valuable for downstream use cases such as machine learning, where the structure and contents of the data are essential.\n", + "\n", + "To learn more about data transformation pipelines with Gretel, check our [website](https://gretel.ai/platform/transform) or [SDK documentation](https://gretel-client.readthedocs.io/en/stable/transformers/api_ref.html)." + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [], + "source": [ + "import uuid\n", + "\n", + "from gretel_client.transformers import DataPath, DataTransformPipeline\n", + "from gretel_client.transformers import FakeConstantConfig\n", + "\n", + "SEED = uuid.uuid1().int" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Configure the pipeline. `FakeConstantConfig` will replace any entities configured under `labels` with a fake version of the entity." + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [], + "source": [ + "fake_xf = FakeConstantConfig(seed=SEED, labels=[\"person_name\", \"email_address\", \"phone_number\"])\n", + "\n", + "paths = [\n", + " DataPath(input=TEXT_FIELD, xforms=[fake_xf]),\n", + " DataPath(input=\"*\"),\n", + "]\n", + "\n", + "pipeline = DataTransformPipeline(paths)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Run the pipeline to redact any sensitive strings" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [], + "source": [ + "xf_records = [\n", + " pipeline.transform_record(record)[\"record\"]\n", + " for record in \n", + " project.iter_records(direction=\"backward\")\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Inspect the transformed version of the dataset. Notice that entities such as names and emails have been replace with fake instances of the entity." + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
text
0location United States
1my name is Darrell Long
2city is San Diego email address is garciajim@y...
3location San Diego
4phone number 532-677-7284
\n", + "
" + ], + "text/plain": [ + " text\n", + "0 location United States\n", + "1 my name is Darrell Long\n", + "2 city is San Diego email address is garciajim@y...\n", + "3 location San Diego\n", + "4 phone number 532-677-7284" + ] + }, + "execution_count": 101, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "xf_df = pd.DataFrame(xf_records)\n", + "\n", + "xf_df[[TEXT_FIELD]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that you've completed this notebook, you've seen how it's possible to take a corpus of free text, label it using Gretel's NER pipeline, and safely anonymize the dataset while retaining its utility." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From dc266b39bf3ea717528bbfe4b99f873411d568d3 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Thu, 3 Dec 2020 07:40:42 -0800 Subject: [PATCH 2/9] add output --- gretel/gc-nlp_text_analysis/blueprint.ipynb | 223 +++++++++++++++++++- 1 file changed, 217 insertions(+), 6 deletions(-) diff --git a/gretel/gc-nlp_text_analysis/blueprint.ipynb b/gretel/gc-nlp_text_analysis/blueprint.ipynb index 15b0d048..ccc76fcb 100644 --- a/gretel/gc-nlp_text_analysis/blueprint.ipynb +++ b/gretel/gc-nlp_text_analysis/blueprint.ipynb @@ -13,7 +13,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Working Safely with Sensitive Free Text Using Gretel Cloud and NLP\n", + "# Working Safely with Sensitive Free Text Using Gretel.ai and NLP\n", "\n", "Using Gretel.ai's [NER and NLP features](https://gretel.ai/platform/data-cataloghttps://gretel.ai/platform/data-catalog), we analyze and label chat logs looking for PII and other potentially sensitive information. After labeling the dataset, we build a transformation pipeline that will redact and replace any sensitive strings from chat messages.\n", "\n", @@ -197,7 +197,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "`detection_mode` configures the NER pipeline that is responsible for labeling the data. Using `detection_mode=all` we configure the records to be labeled using all of " + "`detection_mode` configures what models the NER pipeline uses for labeling. Using `detection_mode=all` we configure records to be labeled using all available models." ] }, { @@ -221,7 +221,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "For extra credit, you can navigate to the project's console view to better inspect and visualize the dataset." + "For extra credit, you can navigate to the project's console view to better inspect and visualize the source dataset." ] }, { @@ -257,9 +257,220 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 104, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
my name is [Linus -> person_name]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + " location United States\n", + " location\n", + "\n", + "\n", + "\n", + " United States\n", + " location\n", + "\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
city is [San Diego -> location] email address is [test@example.com -> email_address] [example.com -> domain_name] [example.com -> hostname]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
location [San Diego -> location]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
phone number [123.453.8920 -> phone_number]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + " location United States\n", + " location\n", + "\n", + "\n", + "\n", + " United States\n", + " location\n", + "\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
my name is [Linus -> person_name]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
city is [San Diego -> location] email address is [test@example.com -> email_address] [example.com -> domain_name] [example.com -> hostname]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
location [San Diego -> location]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
phone number [123.453.8920 -> phone_number]
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "from gretel_helpers.spacy import display_entities\n", "\n", @@ -275,7 +486,7 @@ "source": [ "## Build a transformation pipeline\n", "\n", - "After labeling the dataset, we've identified messages that contain PII, such as names and emails. The final step in this blueprint is to build a transformation pipeline that will replace names and other identifying information with fake representations of the data.\n", + "After labeling the dataset, we've identified chats that contain PII, such as names and emails. The final step in this blueprint is to build a transformation pipeline that will replace names and other identifying information with fake representations of the data.\n", "\n", "We make a point to replace rather than redact sensitive information. This preservation ensures the dataset remains valuable for downstream use cases such as machine learning, where the structure and contents of the data are essential.\n", "\n", From 18af489043386b17db51293c3f62b3452a9f3a54 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Thu, 3 Dec 2020 09:41:21 -0800 Subject: [PATCH 3/9] manifiest + sample data --- gretel/gc-nlp_text_analysis/blueprint.ipynb | 435 +------------------- gretel/gc-nlp_text_analysis/manifest.json | 7 + 2 files changed, 28 insertions(+), 414 deletions(-) create mode 100644 gretel/gc-nlp_text_analysis/manifest.json diff --git a/gretel/gc-nlp_text_analysis/blueprint.ipynb b/gretel/gc-nlp_text_analysis/blueprint.ipynb index ccc76fcb..863fcf44 100644 --- a/gretel/gc-nlp_text_analysis/blueprint.ipynb +++ b/gretel/gc-nlp_text_analysis/blueprint.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 92, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -29,7 +29,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -59,118 +59,18 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "source_df = pd.read_csv(\"https://gretel-public-website.s3.us-west-2.amazonaws.com/blueprints/nlp_text_analysis/chat_logs.csv\")" + "source_df = pd.read_csv(\"https://gretel-public-website.s3.us-west-2.amazonaws.com/blueprints/nlp_text_analysis/chat_logs_sampled.csv\")" ] }, { "cell_type": "code", - "execution_count": 77, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
folderdialogueIDdatefromtotext
0371195.tsv2010-08-27T11:31:00.000ZKB1JWQmjwalkerphone number 123.453.8920
13126125.tsv2008-04-23T14:55:00.000Zbad_imageNaNlocation San Diego
23126125.tsv2008-04-23T14:56:00.000Zbad_imageNaNmy name is Linus
33126125.tsv2008-04-23T14:57:00.000Zlordleemobad_imagelocation United States
4364545.tsv2009-08-01T06:22:00.000ZmechtechNaNcity is San Diego email address is test@exampl...
\n", - "
" - ], - "text/plain": [ - " folder dialogueID date from to \\\n", - "0 3 71195.tsv 2010-08-27T11:31:00.000Z KB1JWQ mjwalker \n", - "1 3 126125.tsv 2008-04-23T14:55:00.000Z bad_image NaN \n", - "2 3 126125.tsv 2008-04-23T14:56:00.000Z bad_image NaN \n", - "3 3 126125.tsv 2008-04-23T14:57:00.000Z lordleemo bad_image \n", - "4 3 64545.tsv 2009-08-01T06:22:00.000Z mechtech NaN \n", - "\n", - " text \n", - "0 phone number 123.453.8920 \n", - "1 location San Diego \n", - "2 my name is Linus \n", - "3 location United States \n", - "4 city is San Diego email address is test@exampl... " - ] - }, - "execution_count": 77, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "source_df.head()" ] @@ -186,7 +86,7 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -202,17 +102,9 @@ }, { "cell_type": "code", - "execution_count": 95, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "6 records [00:00, 15.01 records/s] \n" - ] - } - ], + "outputs": [], "source": [ "project.send_dataframe(source_df, detection_mode=\"all\")" ] @@ -226,20 +118,9 @@ }, { "cell_type": "code", - "execution_count": 91, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'https://console.gretel.cloud/drew-0a1c3'" - ] - }, - "execution_count": 91, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "project.get_console_url()" ] @@ -257,220 +138,9 @@ }, { "cell_type": "code", - "execution_count": 104, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
my name is [Linus -> person_name]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - " location United States\n", - " location\n", - "\n", - "\n", - "\n", - " United States\n", - " location\n", - "\n", - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
city is [San Diego -> location] email address is [test@example.com -> email_address] [example.com -> domain_name] [example.com -> hostname]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
location [San Diego -> location]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
phone number [123.453.8920 -> phone_number]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - " location United States\n", - " location\n", - "\n", - "\n", - "\n", - " United States\n", - " location\n", - "\n", - "
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
my name is [Linus -> person_name]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
city is [San Diego -> location] email address is [test@example.com -> email_address] [example.com -> domain_name] [example.com -> hostname]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
location [San Diego -> location]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
phone number [123.453.8920 -> phone_number]
" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "from gretel_helpers.spacy import display_entities\n", "\n", @@ -495,7 +165,7 @@ }, { "cell_type": "code", - "execution_count": 81, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -516,7 +186,7 @@ }, { "cell_type": "code", - "execution_count": 86, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -539,7 +209,7 @@ }, { "cell_type": "code", - "execution_count": 87, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -547,7 +217,9 @@ " pipeline.transform_record(record)[\"record\"]\n", " for record in \n", " project.iter_records(direction=\"backward\")\n", - "]" + "]\n", + "\n", + "xf_df = pd.DataFrame(xf_records)" ] }, { @@ -559,75 +231,10 @@ }, { "cell_type": "code", - "execution_count": 101, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
text
0location United States
1my name is Darrell Long
2city is San Diego email address is garciajim@y...
3location San Diego
4phone number 532-677-7284
\n", - "
" - ], - "text/plain": [ - " text\n", - "0 location United States\n", - "1 my name is Darrell Long\n", - "2 city is San Diego email address is garciajim@y...\n", - "3 location San Diego\n", - "4 phone number 532-677-7284" - ] - }, - "execution_count": 101, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "xf_df = pd.DataFrame(xf_records)\n", - "\n", "xf_df[[TEXT_FIELD]]" ] }, diff --git a/gretel/gc-nlp_text_analysis/manifest.json b/gretel/gc-nlp_text_analysis/manifest.json new file mode 100644 index 00000000..7ae99e9e --- /dev/null +++ b/gretel/gc-nlp_text_analysis/manifest.json @@ -0,0 +1,7 @@ +{ + "name": "Working Safely with Sensitive Free Text Using Gretel.ai and NLP", + "description": "Label and anonymize free text chat logs.", + "tags": ["ner", "nlp", "transformers"], + "language": "python", + "featured": false, +} \ No newline at end of file From b5f46dfad2d2a84f480e450a048267bf830fb68e Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Thu, 3 Dec 2020 09:45:40 -0800 Subject: [PATCH 4/9] fix mainifest --- gretel/gc-nlp_text_analysis/manifest.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gretel/gc-nlp_text_analysis/manifest.json b/gretel/gc-nlp_text_analysis/manifest.json index 7ae99e9e..601afca5 100644 --- a/gretel/gc-nlp_text_analysis/manifest.json +++ b/gretel/gc-nlp_text_analysis/manifest.json @@ -3,5 +3,5 @@ "description": "Label and anonymize free text chat logs.", "tags": ["ner", "nlp", "transformers"], "language": "python", - "featured": false, + "featured": false } \ No newline at end of file From 7932e0d653de9ce222b826c71b8e4ab3ae16eb95 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Thu, 3 Dec 2020 11:36:33 -0800 Subject: [PATCH 5/9] misc changes --- gretel/gc-nlp_text_analysis/blueprint.ipynb | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/gretel/gc-nlp_text_analysis/blueprint.ipynb b/gretel/gc-nlp_text_analysis/blueprint.ipynb index 863fcf44..0861c2c0 100644 --- a/gretel/gc-nlp_text_analysis/blueprint.ipynb +++ b/gretel/gc-nlp_text_analysis/blueprint.ipynb @@ -6,7 +6,7 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install -Uqq spacy gretel-client" + "!pip install -Uqq spacy gretel-client # we install spacy for their visualization helper, displacy" ] }, { @@ -36,6 +36,8 @@ "import pandas as pd\n", "from gretel_client import get_cloud_client\n", "\n", + "pd.set_option('max_colwidth', None)\n", + "\n", "client = get_cloud_client(prefix=\"api\", api_key=\"prompt\")" ] }, @@ -45,7 +47,7 @@ "metadata": {}, "outputs": [], "source": [ - "client.install_packages()" + "client.install_packages(version=\"dev\")" ] }, { @@ -146,8 +148,8 @@ "\n", "TEXT_FIELD = \"text\"\n", "\n", - "for record in project.iter_records(direction=\"backward\"):\n", - " display_entities(record, text_field)" + "for record in project.iter_records(direction=\"backward\", record_limit=100):\n", + " display_entities(record, TEXT_FIELD)" ] }, { @@ -226,7 +228,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Inspect the transformed version of the dataset. Notice that entities such as names and emails have been replace with fake instances of the entity." + "Inspect the transformed version of the dataset." 
] }, { From f31be791b08e8080c8b64c7562dc400af942c66c Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Thu, 3 Dec 2020 11:44:31 -0800 Subject: [PATCH 6/9] remove dev identifier --- gretel/gc-nlp_text_analysis/blueprint.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gretel/gc-nlp_text_analysis/blueprint.ipynb b/gretel/gc-nlp_text_analysis/blueprint.ipynb index 0861c2c0..b7833168 100644 --- a/gretel/gc-nlp_text_analysis/blueprint.ipynb +++ b/gretel/gc-nlp_text_analysis/blueprint.ipynb @@ -47,7 +47,7 @@ "metadata": {}, "outputs": [], "source": [ - "client.install_packages(version=\"dev\")" + "client.install_packages()" ] }, { From d104e6d8ccc1f6e7db2c56bd2d05763cdf5f5698 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Thu, 3 Dec 2020 13:06:54 -0800 Subject: [PATCH 7/9] Update README.md --- gretel/gc-nlp_text_analysis/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gretel/gc-nlp_text_analysis/README.md b/gretel/gc-nlp_text_analysis/README.md index 6d6cdd90..b2982365 100644 --- a/gretel/gc-nlp_text_analysis/README.md +++ b/gretel/gc-nlp_text_analysis/README.md @@ -1,5 +1,5 @@ -# Working Safely with Sensitive Free Text Using Gretel Cloud and NLP +# Work Safely with Sensitive Free Text Using Gretel -Using Gretel.ai's [NER and NLP features](https://gretel.ai/platform/data-cataloghttps://gretel.ai/platform/data-catalog), we analyze and label chat logs looking for PII and other potentially sensitive information. After labeling the dataset, we build a transformation pipeline that will redact and replace any sensitive strings from chat messages. +Using Gretel.ai's [NER and NLP features](https://gretel.ai/platform/data-catalog), we analyze and label chat logs looking for PII and other potentially sensitive information. After labeling the dataset, we build a transformation pipeline that will redact and replace any sensitive strings from chat messages. -At the end of the notebook we'll have a dataset that is safe to share without compromising a user's personal information. \ No newline at end of file +At the end of the notebook we'll have a dataset that is safe to share without compromising a user's personal information. 
From 814a1f330f357a296dc71ea7d667bc6d294f3fb9 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Thu, 3 Dec 2020 13:15:14 -0800 Subject: [PATCH 8/9] Update manifest.json --- gretel/gc-nlp_text_analysis/manifest.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gretel/gc-nlp_text_analysis/manifest.json b/gretel/gc-nlp_text_analysis/manifest.json index 601afca5..4d51f5fd 100644 --- a/gretel/gc-nlp_text_analysis/manifest.json +++ b/gretel/gc-nlp_text_analysis/manifest.json @@ -1,7 +1,7 @@ { - "name": "Working Safely with Sensitive Free Text Using Gretel.ai and NLP", - "description": "Label and anonymize free text chat logs.", + "name": "Work Safely with Sensitive Free Text Using Gretel", + "description": "Label and anonymize free text chat logs using Gretel NER and NLP pipelines.", "tags": ["ner", "nlp", "transformers"], "language": "python", "featured": false -} \ No newline at end of file +} From 279c5ef10e524a886ea25a6f6f69e62cb6162360 Mon Sep 17 00:00:00 2001 From: Drew Newberry Date: Thu, 3 Dec 2020 13:18:43 -0800 Subject: [PATCH 9/9] update nb title --- gretel/gc-nlp_text_analysis/blueprint.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gretel/gc-nlp_text_analysis/blueprint.ipynb b/gretel/gc-nlp_text_analysis/blueprint.ipynb index b7833168..4e699263 100644 --- a/gretel/gc-nlp_text_analysis/blueprint.ipynb +++ b/gretel/gc-nlp_text_analysis/blueprint.ipynb @@ -13,7 +13,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Working Safely with Sensitive Free Text Using Gretel.ai and NLP\n", + "# Work Safely with Sensitive Free Text Using Gretel\n", "\n", "Using Gretel.ai's [NER and NLP features](https://gretel.ai/platform/data-cataloghttps://gretel.ai/platform/data-catalog), we analyze and label chat logs looking for PII and other potentially sensitive information. After labeling the dataset, we build a transformation pipeline that will redact and replace any sensitive strings from chat messages.\n", "\n",