diff --git a/introduction_to_amazon_algorithms/blazingtext_word2vec_text8/blazingtext_word2vec_text8.ipynb b/introduction_to_amazon_algorithms/blazingtext_word2vec_text8/blazingtext_word2vec_text8.ipynb
index c172bd3fa2..d26ef1dd6b 100644
--- a/introduction_to_amazon_algorithms/blazingtext_word2vec_text8/blazingtext_word2vec_text8.ipynb
+++ b/introduction_to_amazon_algorithms/blazingtext_word2vec_text8/blazingtext_word2vec_text8.ipynb
@@ -34,7 +34,8 @@
     "## Setup\n",
     "\n",
     "Let's start by specifying:\n",
-    "- The S3 buckets and prefixes that you want to use for saving model data and where training data is located. These should be within the same region as the Notebook Instance, training, and hosting. If you don't specify a bucket, SageMaker SDK will create a default bucket following a pre-defined naming convention in the same region. \n",
+    "\n",
+    "- The S3 bucket and prefix that you want to use for training and model data. This should be within the same region as the Notebook Instance, training, and hosting. If you don't specify a bucket, the SageMaker SDK will create a default bucket following a pre-defined naming convention in the same region. \n",
     "- The IAM role ARN used to give SageMaker access to your data. It can be fetched using the **get_execution_role** method from sagemaker python SDK."
    ]
   },
@@ -54,14 +55,11 @@
     "sess = sagemaker.Session()\n",
     "\n",
     "role = get_execution_role()\n",
-    "print(role)  # This is the role that SageMaker would use to leverage AWS resources (S3, CloudWatch) on your behalf\n",
-    "\n",
-    "output_bucket = sess.default_bucket()  # Replace with your own bucket name if needed\n",
-    "print(output_bucket)\n",
-    "output_prefix = \"sagemaker/DEMO-blazingtext-text8\"  # Replace with the prefix under which you want to store the data if needed\n",
+    "print(role) # This is the role that SageMaker would use to leverage AWS resources (S3, CloudWatch) on your behalf\n",
     "\n",
-    "data_bucket = \"penny-cache-alpha-us-west-2\"  # Replace with the bucket where your data is located\n",
-    "data_prefix = \"1p-notebooks/data/text8\""
+    "bucket = sess.default_bucket() # Replace with your own bucket name if needed\n",
+    "print(bucket)\n",
+    "prefix = 'sagemaker/DEMO-blazingtext-text8' #Replace with the prefix under which you want to store the data if needed"
    ]
   },
   {
@@ -70,7 +68,9 @@
    "source": [
     "### Data Ingestion\n",
     "\n",
-    "BlazingText expects a single preprocessed text file with space separated tokens and each line of the file should contain a single sentence. In this example, let us train the vectors on [text8](http://mattmahoney.net/dc/textdata.html) dataset (100 MB), which is a small (already preprocessed) version of Wikipedia dump. Data is already downloaded from [here](http://mattmahoney.net/dc/text8.zip), uncompressed and stored in a S3 bucket. "
+    "Next, we download a dataset from the web on which we want to train the word vectors. BlazingText expects a single preprocessed text file with space-separated tokens; each line of the file should contain a single sentence.\n",
+    "\n",
+    "In this example, let us train the vectors on the [text8](http://mattmahoney.net/dc/textdata.html) dataset (100 MB), which is a small (already preprocessed) version of the Wikipedia dump. "
" ] }, { @@ -79,9 +79,41 @@ "metadata": {}, "outputs": [], "source": [ - "train_channel = f\"{data_prefix}/train\"\n", + "!wget http://mattmahoney.net/dc/text8.zip -O text8.gz" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Uncompressing\n", + "!gzip -d text8.gz -f" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After the data downloading and uncompressing is complete, we need to upload it to S3 so that it can be consumed by SageMaker to execute training jobs. We'll use Python SDK to upload these two files to the bucket and prefix location that we have set above." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "train_channel = prefix + '/train'\n", + "\n", + "sess.upload_data(path='text8', bucket=bucket, key_prefix=train_channel)\n", "\n", - "s3_train_data = f\"s3://{data_bucket}/{train_channel}\"" + "s3_train_data = 's3://{}/{}'.format(bucket, train_channel)" ] }, { @@ -94,10 +126,12 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ - "s3_output_location = f\"s3://{output_bucket}/{output_prefix}/output\"" + "s3_output_location = 's3://{}/{}/output'.format(bucket, prefix)" ] }, { @@ -111,7 +145,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "region_name = boto3.Session().region_name" @@ -124,7 +160,7 @@ "outputs": [], "source": [ "container = sagemaker.amazon.amazon_estimator.get_image_uri(region_name, \"blazingtext\", \"latest\")\n", - "print(f\"Using SageMaker BlazingText container: {container} ({region_name})\")" + "print('Using SageMaker BlazingText container: {} ({})'.format(container, region_name))" ] }, { @@ -175,20 +211,20 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ - "bt_model = sagemaker.estimator.Estimator(\n", - " container,\n", - " role,\n", - " train_instance_count=2,\n", - " train_instance_type=\"ml.c4.2xlarge\",\n", - " train_volume_size=5,\n", - " train_max_run=360000,\n", - " input_mode=\"File\",\n", - " output_path=s3_output_location,\n", - " sagemaker_session=sess,\n", - ")" + "bt_model = sagemaker.estimator.Estimator(container,\n", + " role, \n", + " train_instance_count=2, \n", + " train_instance_type='ml.c4.2xlarge',\n", + " train_volume_size = 5,\n", + " train_max_run = 360000,\n", + " input_mode= 'File',\n", + " output_path=s3_output_location,\n", + " sagemaker_session=sess)" ] }, { @@ -201,22 +237,22 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ - "bt_model.set_hyperparameters(\n", - " mode=\"batch_skipgram\",\n", - " epochs=5,\n", - " min_count=5,\n", - " sampling_threshold=0.0001,\n", - " learning_rate=0.05,\n", - " window_size=5,\n", - " vector_dim=100,\n", - " negative_samples=5,\n", - " batch_size=11, # = (2*window_size + 1) (Preferred. 
-    "    evaluation=True,  # Perform similarity evaluation on WS-353 dataset at the end of training\n",
-    "    subwords=False,\n",
-    ")  # Subword embedding learning is not supported by batch_skipgram"
+    "bt_model.set_hyperparameters(mode=\"batch_skipgram\",\n",
+    "                             epochs=5,\n",
+    "                             min_count=5,\n",
+    "                             sampling_threshold=0.0001,\n",
+    "                             learning_rate=0.05,\n",
+    "                             window_size=5,\n",
+    "                             vector_dim=100,\n",
+    "                             negative_samples=5,\n",
+    "                             batch_size=11, # = (2*window_size + 1) (Preferred. Used only if mode is batch_skipgram)\n",
+    "                             evaluation=True,# Perform similarity evaluation on WS-353 dataset at the end of training\n",
+    "                             subwords=False) # Subword embedding learning is not supported by batch_skipgram"
    ]
   },
   {
@@ -229,13 +265,14 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
-    "train_data = sagemaker.session.s3_input(\n",
-    "    s3_train_data, distribution=\"FullyReplicated\", content_type=\"text/plain\", s3_data_type=\"S3Prefix\"\n",
-    ")\n",
-    "data_channels = {\"train\": train_data}"
+    "train_data = sagemaker.session.s3_input(s3_train_data, distribution='FullyReplicated', \n",
+    "                                        content_type='text/plain', s3_data_type='S3Prefix')\n",
+    "data_channels = {'train': train_data}"
    ]
   },
   {
@@ -270,7 +307,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "bt_endpoint = bt_model.deploy(initial_instance_count=1, instance_type=\"ml.m4.xlarge\")"
+    "bt_endpoint = bt_model.deploy(initial_instance_count = 1,instance_type = 'ml.m4.xlarge')"
    ]
   },
   {
@@ -291,16 +328,16 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "words = [\"awesome\", \"blazing\"]\n",
     "\n",
-    "payload = {\"instances\": words}\n",
+    "payload = {\"instances\" : words}\n",
     "\n",
-    "response = bt_endpoint.predict(\n",
-    "    json.dumps(payload), initial_args={\"ContentType\": \"application/json\", \"Accept\": \"application/json\"}\n",
-    ")\n",
+    "response = bt_endpoint.predict(json.dumps(payload))\n",
     "\n",
     "vecs = json.loads(response)\n",
     "print(vecs)"
@@ -330,13 +367,15 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
-    "s3 = boto3.resource(\"s3\")\n",
+    "s3 = boto3.resource('s3')\n",
     "\n",
-    "key = bt_model.model_data[bt_model.model_data.find(\"/\", 5) + 1 :]\n",
-    "s3.Bucket(output_bucket).download_file(key, \"model.tar.gz\")"
+    "key = bt_model.model_data[bt_model.model_data.find(\"/\", 5)+1:]\n",
+    "s3.Bucket(bucket).download_file(key, 'model.tar.gz')"
    ]
   },
   {
@@ -371,7 +410,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "!cat eval.json"
@@ -387,7 +428,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "import numpy as np\n",
@@ -398,7 +441,7 @@
     "\n",
     "first_line = True\n",
     "index_to_word = []\n",
-    "with open(\"vectors.txt\", \"r\") as f:\n",
+    "with open(\"vectors.txt\",\"r\") as f:\n",
     "    for line_num, line in enumerate(f):\n",
     "        if first_line:\n",
     "            dim = int(line.strip().split()[1])\n",
@@ -407,7 +450,7 @@
     "            continue\n",
     "        line = line.strip()\n",
     "        word = line.split()[0]\n",
-    "        vec = word_vecs[line_num - 1]\n",
+    "        vec = word_vecs[line_num-1]\n",
     "        for index, vec_val in enumerate(line.split()[1:]):\n",
     "            vec[index] = float(vec_val)\n",
     "        index_to_word.append(word)\n",
@@ -419,12 +462,14 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "from sklearn.manifold import TSNE\n",
     "\n",
-    "tsne = TSNE(perplexity=40, n_components=2, init=\"pca\", n_iter=10000)\n",
+    "tsne = TSNE(perplexity=40, n_components=2, init='pca', n_iter=10000)\n",
     "two_d_embeddings = tsne.fit_transform(word_vecs[:num_points])\n",
     "labels = index_to_word[:num_points]"
    ]
@@ -439,16 +484,14 @@
     "%matplotlib inline\n",
     "\n",
     "def plot(embeddings, labels):\n",
-    "    pylab.figure(figsize=(20, 20))\n",
+    "    pylab.figure(figsize=(20,20))\n",
     "    for i, label in enumerate(labels):\n",
-    "        x, y = embeddings[i, :]\n",
+    "        x, y = embeddings[i,:]\n",
     "        pylab.scatter(x, y)\n",
-    "        pylab.annotate(\n",
-    "            label, xy=(x, y), xytext=(5, 2), textcoords=\"offset points\", ha=\"right\", va=\"bottom\"\n",
-    "        )\n",
+    "        pylab.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points',\n",
+    "                       ha='right', va='bottom')\n",
     "    pylab.show()\n",
     "\n",
-    "\n",
     "plot(two_d_embeddings, labels)"
    ]
   },
@@ -477,7 +520,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "sess.delete_endpoint(bt_endpoint.endpoint)"
@@ -485,11 +530,10 @@
   }
  ],
  "metadata": {
-  "instance_type": "ml.t3.medium",
   "kernelspec": {
-   "display_name": "Python 3 (Data Science)",
+   "display_name": "conda_python3",
    "language": "python",
-   "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-west-2:236514542706:image/datascience-1.0"
+   "name": "conda_python3"
   },
   "language_info": {
    "codemirror_mode": {
@@ -501,10 +545,10 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.6"
+   "version": "3.6.2"
   },
   "notice": "Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the \"License\"). You may not use this file except in compliance with the License. A copy of the License is located at http://aws.amazon.com/apache2.0/ or in the \"license\" file accompanying this file. This file is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License."
  },
 "nbformat": 4,
- "nbformat_minor": 4
+ "nbformat_minor": 2
 }
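--
Editor's note (not part of the patch): once the vectors.txt parsing cell above has run, a quick
nearest-neighbor lookup is a cheap sanity check on the learned embeddings, complementing the
WS-353 evaluation enabled by `evaluation=True`. This is a minimal sketch, assuming the
`word_vecs` and `index_to_word` variables exactly as defined in that cell; the `nearest()`
helper is illustrative and does not appear in the notebook:

    import numpy as np
    from sklearn.preprocessing import normalize

    # L2-normalize the rows so that cosine similarity reduces to a dot product.
    normed = normalize(word_vecs, axis=1)

    def nearest(word, k=5):
        # The query must be among the num_points most frequent words read above.
        idx = index_to_word.index(word)
        # Score the query vector against every vector loaded from vectors.txt.
        scores = normed.dot(normed[idx])
        # Sort descending; skip position 0, which is the query word itself.
        best = np.argsort(scores)[::-1][1 : k + 1]
        return [(index_to_word[i], float(scores[i])) for i in best]

    print(nearest("man"))

For a reasonably trained batch_skipgram model on text8, the top neighbors should be
semantically related tokens with cosine similarities well above those of random words.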