From ecf4ab542d371fe5131106285c27379971875497 Mon Sep 17 00:00:00 2001
From: neelamkoshiya <neelamkoshiya@gmail.com>
Date: Tue, 20 Sep 2022 09:40:05 -0700
Subject: [PATCH] Add files via upload

Updated per comments from aqyt
---
 .../joined-dataflow/explore_data.ipynb        | 100 +++++++++---------
 .../joined-dataflow/join.flow                 |  56 +---------
 2 files changed, 54 insertions(+), 102 deletions(-)
diff --git a/sagemaker-datawrangler/joined-dataflow/explore_data.ipynb b/sagemaker-datawrangler/joined-dataflow/explore_data.ipynb
index 2cfaec1bec..e29aa727ad 100644
--- a/sagemaker-datawrangler/joined-dataflow/explore_data.ipynb
+++ b/sagemaker-datawrangler/joined-dataflow/explore_data.ipynb
@@ -1,5 +1,21 @@
 {
  "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Explore and Prepare Data for SageMaker Data Wrangler"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "----\n",
+    "## Background\n",
+    "In this notebook, we will download and explore the data that is used to build the SageMaker Data Wrangler flow file for data processing. After running this notebook, you can follow the step-by-step instructions in [README.md](README.md) to build the data processing flow file that prepares the data for the machine learning task."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -23,6 +39,26 @@
     "!mkdir data"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<a id='get-the-data'></a>\n",
+    "\n",
+    "## Prerequisites: Get Data \n",
+    "\n",
+    "----\n",
+    "\n",
+    "Here we will download the music data used in this demo from a public S3 bucket and upload it to the default S3 bucket that was created for you when you initially created a SageMaker Studio workspace. "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "First, we will import the necessary Python libraries and set up the environment."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -57,6 +93,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "# define the functions that will be used to download data\n",
     "def get_data(public_s3_data, to_bucket, sample_data=1):\n",
     "    new_paths = []\n",
     "    for f in public_s3_data:\n",
@@ -82,50 +119,7 @@
     "            \"./data/{}\".format(filename), to_bucket, os.path.join(prefix, \"input\", filename)\n",
     "        )\n",
     "\n",
-    "    return new_paths\n",
-    "\n",
-    "\n",
-    "def update_data_sources(flow_path, tracks_data_source, ratings_data_source):\n",
-    "    with open(flow_path) as flowf:\n",
-    "        flow = json.load(flowf)\n",
-    "\n",
-    "    for node in flow[\"nodes\"]:\n",
-    "        # if the key exists for our s3 endpoint\n",
-    "        try:\n",
-    "            if node[\"parameters\"][\"dataset_definition\"][\"name\"] == \"tracks.csv\":\n",
-    "                # reset the s3 data source for tracks data\n",
-    "                old_source = node[\"parameters\"][\"dataset_definition\"][\"s3ExecutionContext\"][\"s3Uri\"]\n",
-    "                print(\"Changed {} to {}\".format(old_source, tracks_data_source))\n",
-    "                node[\"parameters\"][\"dataset_definition\"][\"s3ExecutionContext\"][\n",
-    "                    \"s3Uri\"\n",
-    "                ] = tracks_data_source\n",
-    "            elif node[\"parameters\"][\"dataset_definition\"][\"name\"] == \"ratings.csv\":\n",
-    "                # reset the s3 data source for ratings data\n",
-    "                old_source = node[\"parameters\"][\"dataset_definition\"][\"s3ExecutionContext\"][\"s3Uri\"]\n",
-    "                print(\"Changed {} to {}\".format(old_source, ratings_data_source))\n",
-    "                node[\"parameters\"][\"dataset_definition\"][\"s3ExecutionContext\"][\n",
-    "                    \"s3Uri\"\n",
-    "                ] = ratings_data_source\n",
-    "        except:\n",
-    "            continue\n",
-    "    # write out the updated json flow file\n",
-    "    with open(flow_path, \"w\") as outfile:\n",
-    "        json.dump(flow, outfile)\n",
-    "\n",
-    "    return flow"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "<a id='get-the-data'></a>\n",
-    "\n",
-    "## Prerequisites: Get Data \n",
-    "\n",
-    "----\n",
-    "\n",
-    "Here we will download the music data from a public S3 bucket that we'll be using for this demo and uploads it to your default S3 bucket that was created for you when you initially created a SageMaker Studio workspace. "
+    "    return new_paths"
    ]
   },
   {
@@ -175,7 +169,9 @@
     "##### [back to top](#00-nb)\n",
     "\n",
     "\n",
-    "----"
+    "----\n",
+    "\n",
+    "In this section, we will perform preliminary data exploration to understand the data."
    ]
   },
   {
@@ -188,6 +184,13 @@
     "ratings = pd.read_csv(\"./data/ratings.csv\")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We use the [pandas DataFrame head function](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.head.html) to view the first five rows in each of the dataframes."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -212,6 +215,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "# print the numbers of unique songs/tracks, users and user rating events\n",
     "print(\"{:,} different songs/tracks\".format(tracks[\"trackId\"].nunique()))\n",
     "print(\"{:,} users\".format(ratings[\"userId\"].nunique()))\n",
     "print(\"{:,} user rating events\".format(ratings[\"ratingEventId\"].nunique()))"
@@ -223,6 +227,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "# plot a bar chart to display the number of tracks per genre to see the distribution\n",
     "tracks.groupby(\"genre\")[\"genre\"].count().plot.bar(title=\"Tracks by Genre\");"
    ]
   },
@@ -232,6 +237,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "# plot the histogram to view the distribution of the number of ratings by user id\n",
     "ratings[[\"ratingEventId\", \"userId\"]].plot.hist(\n",
     "    by=\"userId\", bins=50, title=\"Distribution of # of Ratings by User\"\n",
     ");"
@@ -243,9 +249,7 @@
    "source": [
     "----\n",
     "\n",
-    "# Music Recommender Lab 1: Data Prep using SageMaker Data Wrangler\n",
-    "\n",
-    "After you completed running this notebook, you can follow the steps in the README."
+    "After you have finished running this notebook, you can follow the steps in the README to start building the Data Wrangler flow file."
    ]
   },
   {
diff --git a/sagemaker-datawrangler/joined-dataflow/join.flow b/sagemaker-datawrangler/joined-dataflow/join.flow
index 3eaeb46a4b..4e71fe8f05 100644
--- a/sagemaker-datawrangler/joined-dataflow/join.flow
+++ b/sagemaker-datawrangler/joined-dataflow/join.flow
@@ -17,7 +17,7 @@
           "description": null,
           "s3ExecutionContext": {
             "__typename": "S3ExecutionContext",
-            "s3Uri": "s3://sagemaker-us-east-1-631450739534/music-recommendation-workshop/input/tracks.csv",
+            "s3Uri": "s3://sagemaker-sample-files/datasets/tabular/synthetic-music/tracks.csv",
             "s3ContentType": "csv",
             "s3HasHeader": true,
             "s3FieldDelimiter": ",",
@@ -77,7 +77,7 @@
           "description": null,
           "s3ExecutionContext": {
             "__typename": "S3ExecutionContext",
-            "s3Uri": "s3://sagemaker-us-east-1-631450739534/music-recommendation-workshop/input/ratings.csv",
+            "s3Uri": "s3://sagemaker-sample-files/datasets/tabular/synthetic-music/ratings.csv",
             "s3ContentType": "csv",
             "s3HasHeader": true,
             "s3FieldDelimiter": ",",
@@ -415,58 +415,6 @@
           "name": "default"
         }
       ]
-    },
-    {
-      "node_id": "a50d670e-07a6-4146-8f2f-ddf87d0bfa5d",
-      "type": "DESTINATION",
-      "operator": "sagemaker.spark.s3_destination_0.1",
-      "name": "S3: training",
-      "parameters": {
-        "output_config": {
-          "compression": "none",
-          "output_path": "s3://sagemaker-us-east-1-631450739534/music-recommendation-workshop/input/train/",
-          "output_content_type": "CSV",
-          "delimiter": ","
-        }
-      },
-      "inputs": [
-        {
-          "name": "default",
-          "node_id": "f088dbd3-274c-4335-94ae-d51278cbadb7",
-          "output_name": "default"
-        }
-      ],
-      "outputs": [
-        {
-          "name": "default"
-        }
-      ]
-    },
-    {
-      "node_id": "fe15bf83-02ff-4397-b4e1-cd0e4de5b1fc",
-      "type": "DESTINATION",
-      "operator": "sagemaker.spark.s3_destination_0.1",
-      "name": "S3: test",
-      "parameters": {
-        "output_config": {
-          "compression": "none",
-          "output_path": "s3://sagemaker-us-east-1-631450739534/music-recommendation-workshop/input/test/",
-          "output_content_type": "CSV",
-          "delimiter": ","
-        }
-      },
-      "inputs": [
-        {
-          "name": "default",
-          "node_id": "1d573e54-75bd-4e18-a3f1-04159d564fc2",
-          "output_name": "default"
-        }
-      ],
-      "outputs": [
-        {
-          "name": "default"
-        }
-      ]
     }
   ]
 }
\ No newline at end of file