From ecf4ab542d371fe5131106285c27379971875497 Mon Sep 17 00:00:00 2001 From: neelamkoshiya Date: Tue, 20 Sep 2022 09:40:05 -0700 Subject: [PATCH] Add files via upload Updated per comments from aqyt --- .../joined-dataflow/explore_data.ipynb | 100 +++++++++--------- .../joined-dataflow/join.flow | 56 +--------- 2 files changed, 54 insertions(+), 102 deletions(-) diff --git a/sagemaker-datawrangler/joined-dataflow/explore_data.ipynb b/sagemaker-datawrangler/joined-dataflow/explore_data.ipynb index 2cfaec1bec..e29aa727ad 100644 --- a/sagemaker-datawrangler/joined-dataflow/explore_data.ipynb +++ b/sagemaker-datawrangler/joined-dataflow/explore_data.ipynb @@ -1,5 +1,21 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Explore and Prepare Data for SageMaker DataWrangler" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "----\n", + "## Background\n", + "In this notebook, we will download and explore the data that is used to build the SageMaker DataWrangler flow file for data processing. After running this notebook, you can follow the [README.md](README.md) for the step-by-step instructions to build the data processing flow file to prepare data for the machine learning task." + ] + }, { "cell_type": "code", "execution_count": null, @@ -23,6 +39,26 @@ "!mkdir data" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "## Prerequisites: Get Data \n", + "\n", + "----\n", + "\n", + "Here we will download the music data from a public S3 bucket that we'll be using for this demo and upload it to your default S3 bucket that was created for you when you initially created a SageMaker Studio workspace. 
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, we will import the necessary python libraries and set up the environment" + ] + }, { "cell_type": "code", "execution_count": null, @@ -57,6 +93,7 @@ "metadata": {}, "outputs": [], "source": [ + "# define the functions that will be used to download data\n", "def get_data(public_s3_data, to_bucket, sample_data=1):\n", " new_paths = []\n", " for f in public_s3_data:\n", @@ -82,50 +119,7 @@ " \"./data/{}\".format(filename), to_bucket, os.path.join(prefix, \"input\", filename)\n", " )\n", "\n", - " return new_paths\n", - "\n", - "\n", - "def update_data_sources(flow_path, tracks_data_source, ratings_data_source):\n", - " with open(flow_path) as flowf:\n", - " flow = json.load(flowf)\n", - "\n", - " for node in flow[\"nodes\"]:\n", - " # if the key exists for our s3 endpoint\n", - " try:\n", - " if node[\"parameters\"][\"dataset_definition\"][\"name\"] == \"tracks.csv\":\n", - " # reset the s3 data source for tracks data\n", - " old_source = node[\"parameters\"][\"dataset_definition\"][\"s3ExecutionContext\"][\"s3Uri\"]\n", - " print(\"Changed {} to {}\".format(old_source, tracks_data_source))\n", - " node[\"parameters\"][\"dataset_definition\"][\"s3ExecutionContext\"][\n", - " \"s3Uri\"\n", - " ] = tracks_data_source\n", - " elif node[\"parameters\"][\"dataset_definition\"][\"name\"] == \"ratings.csv\":\n", - " # reset the s3 data source for ratings data\n", - " old_source = node[\"parameters\"][\"dataset_definition\"][\"s3ExecutionContext\"][\"s3Uri\"]\n", - " print(\"Changed {} to {}\".format(old_source, ratings_data_source))\n", - " node[\"parameters\"][\"dataset_definition\"][\"s3ExecutionContext\"][\n", - " \"s3Uri\"\n", - " ] = ratings_data_source\n", - " except:\n", - " continue\n", - " # write out the updated json flow file\n", - " with open(flow_path, \"w\") as outfile:\n", - " json.dump(flow, outfile)\n", - "\n", - " return flow" - ] - }, - { - "cell_type": "markdown", - 
"metadata": {}, - "source": [ - "\n", - "\n", - "## Prerequisites: Get Data \n", - "\n", - "----\n", - "\n", - "Here we will download the music data from a public S3 bucket that we'll be using for this demo and uploads it to your default S3 bucket that was created for you when you initially created a SageMaker Studio workspace. " + " return new_paths" ] }, { @@ -175,7 +169,9 @@ "##### [back to top](#00-nb)\n", "\n", "\n", - "----" + "----\n", + "\n", + "In this section, we will perform preliminary data exploration to understand the data." ] }, { @@ -188,6 +184,13 @@ "ratings = pd.read_csv(\"./data/ratings.csv\")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We use the [pandas DataFrame head function](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.head.html) to view the first five rows in each of the dataframes." + ] + }, { "cell_type": "code", "execution_count": null, @@ -212,6 +215,7 @@ "metadata": {}, "outputs": [], "source": [ + "# print the numbers of unique songs/tracks, users and user rating events\n", "print(\"{:,} different songs/tracks\".format(tracks[\"trackId\"].nunique()))\n", "print(\"{:,} users\".format(ratings[\"userId\"].nunique()))\n", "print(\"{:,} user rating events\".format(ratings[\"ratingEventId\"].nunique()))" @@ -223,6 +227,7 @@ "metadata": {}, "outputs": [], "source": [ + "# plot a bar chart to display the number of tracks per genre to see the distribution\n", "tracks.groupby(\"genre\")[\"genre\"].count().plot.bar(title=\"Tracks by Genre\");" ] }, @@ -232,6 +237,7 @@ "metadata": {}, "outputs": [], "source": [ + "# plot the histogram to view the distribution of the number of ratings by user id\n", "ratings[[\"ratingEventId\", \"userId\"]].plot.hist(\n", " by=\"userId\", bins=50, title=\"Distribution of # of Ratings by User\"\n", ");" @@ -243,9 +249,7 @@ "source": [ "----\n", "\n", - "# Music Recommender Lab 1: Data Prep using SageMaker Data Wrangler\n", - "\n", - "After you completed running this 
notebook, you can follow the steps in the README." + "After you have finished running this notebook, you can follow the steps in the README to start building the DataWrangler flow file." ] }, { diff --git a/sagemaker-datawrangler/joined-dataflow/join.flow b/sagemaker-datawrangler/joined-dataflow/join.flow index 3eaeb46a4b..4e71fe8f05 100644 --- a/sagemaker-datawrangler/joined-dataflow/join.flow +++ b/sagemaker-datawrangler/joined-dataflow/join.flow @@ -17,7 +17,7 @@ "description": null, "s3ExecutionContext": { "__typename": "S3ExecutionContext", - "s3Uri": "s3://sagemaker-us-east-1-631450739534/music-recommendation-workshop/input/tracks.csv", + "s3Uri": "s3://sagemaker-sample-files/datasets/tabular/synthetic-music/tracks.csv", "s3ContentType": "csv", "s3HasHeader": true, "s3FieldDelimiter": ",", @@ -77,7 +77,7 @@ "description": null, "s3ExecutionContext": { "__typename": "S3ExecutionContext", - "s3Uri": "s3://sagemaker-us-east-1-631450739534/music-recommendation-workshop/input/ratings.csv", + "s3Uri": "s3://sagemaker-sample-files/datasets/tabular/synthetic-music/ratings.csv", "s3ContentType": "csv", "s3HasHeader": true, "s3FieldDelimiter": ",", @@ -415,58 +415,6 @@ "name": "default" } ] - }, - { - "node_id": "a50d670e-07a6-4146-8f2f-ddf87d0bfa5d", - "type": "DESTINATION", - "operator": "sagemaker.spark.s3_destination_0.1", - "name": "S3: training", - "parameters": { - "output_config": { - "compression": "none", - "output_path": "s3://sagemaker-us-east-1-631450739534/music-recommendation-workshop/input/train/", - "output_content_type": "CSV", - "delimiter": "," - } - }, - "inputs": [ - { - "name": "default", - "node_id": "f088dbd3-274c-4335-94ae-d51278cbadb7", - "output_name": "default" - } - ], - "outputs": [ - { - "name": "default" - } - ] - }, - { - "node_id": "fe15bf83-02ff-4397-b4e1-cd0e4de5b1fc", - "type": "DESTINATION", - "operator": "sagemaker.spark.s3_destination_0.1", - "name": "S3: test", - "parameters": { - "output_config": { - "compression": "none", - 
"output_path": "s3://sagemaker-us-east-1-631450739534/music-recommendation-workshop/input/test/", - "output_content_type": "CSV", - "delimiter": "," - } - }, - "inputs": [ - { - "name": "default", - "node_id": "1d573e54-75bd-4e18-a3f1-04159d564fc2", - "output_name": "default" - } - ], - "outputs": [ - { - "name": "default" - } - ] } ] } \ No newline at end of file