Revert "Notebook fixed and cleaned (#1726)"
This reverts commit b68acb4.
hongshanli23 authored Nov 11, 2020
1 parent 3bd6ecb commit fd652b5
Showing 1 changed file with 118 additions and 74 deletions.
@@ -34,7 +34,8 @@
"## Setup\n",
"\n",
"Let's start by specifying:\n",
"- The S3 buckets and prefixes that you want to use for saving model data and where training data is located. These should be within the same region as the Notebook Instance, training, and hosting. If you don't specify a bucket, SageMaker SDK will create a default bucket following a pre-defined naming convention in the same region. \n",
"\n",
"- The S3 bucket and prefix that you want to use for training and model data. This should be within the same region as the Notebook Instance, training, and hosting. If you don't specify a bucket, SageMaker SDK will create a default bucket following a pre-defined naming convention in the same region. \n",
"- The IAM role ARN used to give SageMaker access to your data. It can be fetched using the **get_execution_role** method from sagemaker python SDK."
]
},
@@ -54,14 +55,11 @@
"sess = sagemaker.Session()\n",
"\n",
"role = get_execution_role()\n",
"print(role) # This is the role that SageMaker would use to leverage AWS resources (S3, CloudWatch) on your behalf\n",
"\n",
"output_bucket = sess.default_bucket() # Replace with your own bucket name if needed\n",
"print(output_bucket)\n",
"output_prefix = \"sagemaker/DEMO-blazingtext-text8\" # Replace with the prefix under which you want to store the data if needed\n",
"print(role) # This is the role that SageMaker would use to leverage AWS resources (S3, CloudWatch) on your behalf\n",
"\n",
"data_bucket = \"penny-cache-alpha-us-west-2\" # Replace with the bucket where your data is located\n",
"data_prefix = \"1p-notebooks/data/text8\""
"bucket = sess.default_bucket() # Replace with your own bucket name if needed\n",
"print(bucket)\n",
"prefix = 'sagemaker/DEMO-blazingtext-text8' #Replace with the prefix under which you want to store the data if needed"
]
},
{
@@ -70,7 +68,9 @@
"source": [
"### Data Ingestion\n",
"\n",
"BlazingText expects a single preprocessed text file with space separated tokens and each line of the file should contain a single sentence. In this example, let us train the vectors on [text8](http://mattmahoney.net/dc/textdata.html) dataset (100 MB), which is a small (already preprocessed) version of Wikipedia dump. Data is already downloaded from [here](http://mattmahoney.net/dc/text8.zip), uncompressed and stored in a S3 bucket. "
"Next, we download a dataset from the web on which we want to train the word vectors. BlazingText expects a single preprocessed text file with space separated tokens and each line of the file should contain a single sentence.\n",
"\n",
"In this example, let us train the vectors on [text8](http://mattmahoney.net/dc/textdata.html) dataset (100 MB), which is a small (already preprocessed) version of Wikipedia dump. "
]
},
{
@@ -79,9 +79,41 @@
"metadata": {},
"outputs": [],
"source": [
"train_channel = f\"{data_prefix}/train\"\n",
"!wget http://mattmahoney.net/dc/text8.zip -O text8.gz"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Uncompressing\n",
"!gzip -d text8.gz -f"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"After the data downloading and uncompressing is complete, we need to upload it to S3 so that it can be consumed by SageMaker to execute training jobs. We'll use Python SDK to upload these two files to the bucket and prefix location that we have set above."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"train_channel = prefix + '/train'\n",
"\n",
"sess.upload_data(path='text8', bucket=bucket, key_prefix=train_channel)\n",
"\n",
"s3_train_data = f\"s3://{data_bucket}/{train_channel}\""
"s3_train_data = 's3://{}/{}'.format(bucket, train_channel)"
]
},
{
@@ -94,10 +126,12 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"s3_output_location = f\"s3://{output_bucket}/{output_prefix}/output\""
"s3_output_location = 's3://{}/{}/output'.format(bucket, prefix)"
]
},
{
@@ -111,7 +145,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"region_name = boto3.Session().region_name"
@@ -124,7 +160,7 @@
"outputs": [],
"source": [
"container = sagemaker.amazon.amazon_estimator.get_image_uri(region_name, \"blazingtext\", \"latest\")\n",
"print(f\"Using SageMaker BlazingText container: {container} ({region_name})\")"
"print('Using SageMaker BlazingText container: {} ({})'.format(container, region_name))"
]
},
{
@@ -175,20 +211,20 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"bt_model = sagemaker.estimator.Estimator(\n",
" container,\n",
" role,\n",
" train_instance_count=2,\n",
" train_instance_type=\"ml.c4.2xlarge\",\n",
" train_volume_size=5,\n",
" train_max_run=360000,\n",
" input_mode=\"File\",\n",
" output_path=s3_output_location,\n",
" sagemaker_session=sess,\n",
")"
"bt_model = sagemaker.estimator.Estimator(container,\n",
" role, \n",
" train_instance_count=2, \n",
" train_instance_type='ml.c4.2xlarge',\n",
" train_volume_size = 5,\n",
" train_max_run = 360000,\n",
" input_mode= 'File',\n",
" output_path=s3_output_location,\n",
" sagemaker_session=sess)"
]
},
{
@@ -201,22 +237,22 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"bt_model.set_hyperparameters(\n",
" mode=\"batch_skipgram\",\n",
" epochs=5,\n",
" min_count=5,\n",
" sampling_threshold=0.0001,\n",
" learning_rate=0.05,\n",
" window_size=5,\n",
" vector_dim=100,\n",
" negative_samples=5,\n",
" batch_size=11, # = (2*window_size + 1) (Preferred. Used only if mode is batch_skipgram)\n",
" evaluation=True, # Perform similarity evaluation on WS-353 dataset at the end of training\n",
" subwords=False,\n",
") # Subword embedding learning is not supported by batch_skipgram"
"bt_model.set_hyperparameters(mode=\"batch_skipgram\",\n",
" epochs=5,\n",
" min_count=5,\n",
" sampling_threshold=0.0001,\n",
" learning_rate=0.05,\n",
" window_size=5,\n",
" vector_dim=100,\n",
" negative_samples=5,\n",
" batch_size=11, # = (2*window_size + 1) (Preferred. Used only if mode is batch_skipgram)\n",
" evaluation=True,# Perform similarity evaluation on WS-353 dataset at the end of training\n",
" subwords=False) # Subword embedding learning is not supported by batch_skipgram"
]
},
{
@@ -229,13 +265,14 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"train_data = sagemaker.session.s3_input(\n",
" s3_train_data, distribution=\"FullyReplicated\", content_type=\"text/plain\", s3_data_type=\"S3Prefix\"\n",
")\n",
"data_channels = {\"train\": train_data}"
"train_data = sagemaker.session.s3_input(s3_train_data, distribution='FullyReplicated', \n",
" content_type='text/plain', s3_data_type='S3Prefix')\n",
"data_channels = {'train': train_data}"
]
},
{
@@ -270,7 +307,7 @@
"metadata": {},
"outputs": [],
"source": [
"bt_endpoint = bt_model.deploy(initial_instance_count=1, instance_type=\"ml.m4.xlarge\")"
"bt_endpoint = bt_model.deploy(initial_instance_count = 1,instance_type = 'ml.m4.xlarge')"
]
},
{
@@ -291,16 +328,16 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"words = [\"awesome\", \"blazing\"]\n",
"\n",
"payload = {\"instances\": words}\n",
"payload = {\"instances\" : words}\n",
"\n",
"response = bt_endpoint.predict(\n",
" json.dumps(payload), initial_args={\"ContentType\": \"application/json\", \"Accept\": \"application/json\"}\n",
")\n",
"response = bt_endpoint.predict(json.dumps(payload))\n",
"\n",
"vecs = json.loads(response)\n",
"print(vecs)"
@@ -330,13 +367,15 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"s3 = boto3.resource(\"s3\")\n",
"s3 = boto3.resource('s3')\n",
"\n",
"key = bt_model.model_data[bt_model.model_data.find(\"/\", 5) + 1 :]\n",
"s3.Bucket(output_bucket).download_file(key, \"model.tar.gz\")"
"key = bt_model.model_data[bt_model.model_data.find(\"/\", 5)+1:]\n",
"s3.Bucket(bucket).download_file(key, 'model.tar.gz')"
]
},
{
@@ -371,7 +410,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"!cat eval.json"
@@ -387,7 +428,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import numpy as np\n",
@@ -398,7 +441,7 @@
"\n",
"first_line = True\n",
"index_to_word = []\n",
"with open(\"vectors.txt\", \"r\") as f:\n",
"with open(\"vectors.txt\",\"r\") as f:\n",
" for line_num, line in enumerate(f):\n",
" if first_line:\n",
" dim = int(line.strip().split()[1])\n",
@@ -407,7 +450,7 @@
" continue\n",
" line = line.strip()\n",
" word = line.split()[0]\n",
" vec = word_vecs[line_num - 1]\n",
" vec = word_vecs[line_num-1]\n",
" for index, vec_val in enumerate(line.split()[1:]):\n",
" vec[index] = float(vec_val)\n",
" index_to_word.append(word)\n",
@@ -419,12 +462,14 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.manifold import TSNE\n",
"\n",
"tsne = TSNE(perplexity=40, n_components=2, init=\"pca\", n_iter=10000)\n",
"tsne = TSNE(perplexity=40, n_components=2, init='pca', n_iter=10000)\n",
"two_d_embeddings = tsne.fit_transform(word_vecs[:num_points])\n",
"labels = index_to_word[:num_points]"
]
@@ -439,16 +484,14 @@
"%matplotlib inline\n",
"\n",
"def plot(embeddings, labels):\n",
" pylab.figure(figsize=(20, 20))\n",
" pylab.figure(figsize=(20,20))\n",
" for i, label in enumerate(labels):\n",
" x, y = embeddings[i, :]\n",
" x, y = embeddings[i,:]\n",
" pylab.scatter(x, y)\n",
" pylab.annotate(\n",
" label, xy=(x, y), xytext=(5, 2), textcoords=\"offset points\", ha=\"right\", va=\"bottom\"\n",
" )\n",
" pylab.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points',\n",
" ha='right', va='bottom')\n",
" pylab.show()\n",
"\n",
"\n",
"plot(two_d_embeddings, labels)"
]
},
@@ -477,19 +520,20 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"sess.delete_endpoint(bt_endpoint.endpoint)"
]
}
],
"metadata": {
"instance_type": "ml.t3.medium",
"kernelspec": {
"display_name": "Python 3 (Data Science)",
"display_name": "conda_python3",
"language": "python",
"name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-west-2:236514542706:image/datascience-1.0"
"name": "conda_python3"
},
"language_info": {
"codemirror_mode": {
Expand All @@ -501,10 +545,10 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
"version": "3.6.2"
},
"notice": "Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the \"License\"). You may not use this file except in compliance with the License. A copy of the License is located at http://aws.amazon.com/apache2.0/ or in the \"license\" file accompanying this file. This file is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License."
},
"nbformat": 4,
"nbformat_minor": 4
"nbformat_minor": 2
}
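
For readers skimming the revert, the following is a minimal end-to-end sketch of the word2vec workflow this notebook walks through, assembled from the cells visible in the hunks above. It targets the SageMaker Python SDK v1 API the notebook uses (get_image_uri, s3_input, train_* estimator arguments); the bucket and prefix values are illustrative, the exact fit call is an assumption since that cell is collapsed in this diff, and the explicit JSON content type on predict comes from the removed variant of that cell.

import json
import boto3
import sagemaker
from sagemaker import get_execution_role

# Session, execution role, and S3 locations (bucket/prefix are illustrative)
sess = sagemaker.Session()
role = get_execution_role()
bucket = sess.default_bucket()
prefix = "sagemaker/DEMO-blazingtext-text8"
s3_train_data = f"s3://{bucket}/{prefix}/train"
s3_output_location = f"s3://{bucket}/{prefix}/output"

# Resolve the BlazingText container image for the current region (SDK v1-style call)
region_name = boto3.Session().region_name
container = sagemaker.amazon.amazon_estimator.get_image_uri(region_name, "blazingtext", "latest")

# Estimator configured as in the notebook: 2 x ml.c4.2xlarge, File input mode
bt_model = sagemaker.estimator.Estimator(
    container,
    role,
    train_instance_count=2,
    train_instance_type="ml.c4.2xlarge",
    train_volume_size=5,
    train_max_run=360000,
    input_mode="File",
    output_path=s3_output_location,
    sagemaker_session=sess,
)

# batch_skipgram hyperparameters from the notebook
bt_model.set_hyperparameters(
    mode="batch_skipgram",
    epochs=5,
    min_count=5,
    sampling_threshold=0.0001,
    learning_rate=0.05,
    window_size=5,
    vector_dim=100,
    negative_samples=5,
    batch_size=11,    # = 2*window_size + 1; used only by batch_skipgram
    evaluation=True,  # similarity evaluation on WS-353 after training
    subwords=False,   # subword learning is not supported by batch_skipgram
)

# Train on the text8 file uploaded to S3, then deploy a real-time endpoint
train_data = sagemaker.session.s3_input(
    s3_train_data, distribution="FullyReplicated", content_type="text/plain", s3_data_type="S3Prefix"
)
bt_model.fit(inputs={"train": train_data})  # assumed call; the fit cell is not shown in this diff

bt_endpoint = bt_model.deploy(initial_instance_count=1, instance_type="ml.m4.xlarge")

# Query word vectors for a couple of tokens
payload = {"instances": ["awesome", "blazing"]}
response = bt_endpoint.predict(
    json.dumps(payload), initial_args={"ContentType": "application/json", "Accept": "application/json"}
)
vecs = json.loads(response)

# Clean up the endpoint when done
sess.delete_endpoint(bt_endpoint.endpoint)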
