Fix broken links and cleanup (#3451)
Co-authored-by: A Yacat <[email protected]>
yacatab and A Yacat authored Jun 10, 2022
1 parent da58c29 commit 0373fbc
Showing 4 changed files with 13 additions and 36 deletions.
4 changes: 1 addition & 3 deletions sagemaker-pipeline-multi-model/README.md
@@ -54,7 +54,7 @@ The following IAM roles are required:
- AmazonS3FullAccess
- AWSGlueServiceRole

-[restate-project.ipynb](https://github.com/aws/amazon-sagemaker-samples/blob/main/sagemaker-pipeline-multi-model/restate-project.ipynb) has been tested in a SageMaker notebook using a kernel with Python 3.7 installed. The notebook's IAM role has the following managed policies attached:
+[restate-project.ipynb](restate-project.ipynb) has been tested in a SageMaker notebook using a kernel with Python 3.7 installed. The notebook's IAM role has the following managed policies attached:
- AmazonEC2ContainerRegistryFullAccess
- AmazonS3FullAccess
- AWSGlueServiceNotebookRole
@@ -79,8 +79,6 @@ This SageMaker notebook is attached with an IAM role with the following in-line
}
```

-See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information.
-
## License

This library is licensed under the MIT-0 License. See the LICENSE file.
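
Note: attaching the managed policies listed in the hunk above to the notebook's execution role can be scripted. A minimal boto3 sketch, assuming a hypothetical role name (`restate-notebook-role` is a placeholder; note that AWSGlueServiceNotebookRole lives under the `service-role/` path):

```python
import boto3

iam = boto3.client("iam")

# Placeholder role name; substitute the notebook's actual execution role.
role_name = "restate-notebook-role"

policy_arns = [
    "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryFullAccess",
    "arn:aws:iam::aws:policy/AmazonS3FullAccess",
    "arn:aws:iam::aws:policy/service-role/AWSGlueServiceNotebookRole",
]

for arn in policy_arns:
    iam.attach_role_policy(RoleName=role_name, PolicyArn=arn)
```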
4 changes: 2 additions & 2 deletions sagemaker-pipeline-multi-model/restate-project.ipynb
@@ -13,7 +13,7 @@
"id": "0d3deddf",
"metadata": {},
"source": [
"Before proceeding, please see context of this notebook in [README.md](https://github.com/aws/amazon-sagemaker-samples/blob/main/sagemaker-pipeline-multi-model/README.md). This notebook has been tested in a SageMaker notebook that is using a kernel with Python 3.7 installed, e.g. conda_mxnet_latest_p37. Make sure you have created a SageMaker project outside of this notebook with the name `restate`. Recommendation is to create a SageMaker project using [SageMaker-provide MLOps template for model building, training, and deployment template](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-projects-templates-sm.html#sagemaker-projects-templates-code-commit). Note that this notebook will not create the SageMaker project for you. \n"
"Before proceeding, please see context of this notebook in [README.md](README.md). This notebook has been tested in a SageMaker notebook that is using a kernel with Python 3.7 installed, e.g. conda_mxnet_latest_p37. Make sure you have created a SageMaker project outside of this notebook with the name `restate`. Recommendation is to create a SageMaker project using [SageMaker-provide MLOps template for model building, training, and deployment template](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-projects-templates-sm.html#sagemaker-projects-templates-code-commit). Note that this notebook will not create the SageMaker project for you. \n"
]
},
{
@@ -163,7 +163,7 @@
"id": "d3930b84",
"metadata": {},
"source": [
"At this point, it is assumed that S3 bucket sagemaker-restate-`<AWS ACCOUNT ID>` and the necessary IAM roles are created. For the complete list of prerequisites, please see [README.md](https://github.com/aws/amazon-sagemaker-samples/blob/main/sagemaker-pipeline-multi-model/README.md). \n",
"At this point, it is assumed that S3 bucket sagemaker-restate-`<AWS ACCOUNT ID>` and the necessary IAM roles are created. For the complete list of prerequisites, please see [README.md](README.md). \n",
"\n",
"We move the raw data to S3 bucket sagemaker-restate-`<AWS ACCOUNT ID>`."
]
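
The cell above moves the raw data into the sagemaker-restate-`<AWS ACCOUNT ID>` bucket. A minimal sketch of that upload with boto3 (the local file path and object key are placeholders):

```python
import boto3

# Resolve the account-suffixed bucket name used throughout the sample.
account_id = boto3.client("sts").get_caller_identity()["Account"]
bucket = f"sagemaker-restate-{account_id}"

# "data/raw.csv" and "raw/raw.csv" are hypothetical paths.
boto3.client("s3").upload_file("data/raw.csv", bucket, "raw/raw.csv")
```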
@@ -104,7 +104,7 @@ def get_pipeline(
role=None,
default_bucket=None,
model_package_group_name="restatePackageGroup", # Choose any name
pipeline_name="restate-p-roj2jxb1j0eu", # You can find your pipeline name in the Studio UI (project -> Pipelines -> name)
pipeline_name="restate-p-XXXXXXXXX", # You can find your pipeline name in the Studio UI (project -> Pipelines -> name)
base_job_prefix="restate", # Choose any name
):
"""Gets a SageMaker ML Pipeline instance working with on RE data.
@@ -134,12 +134,15 @@
input_data = ParameterString(
name="InputDataUrl",
default_value=f"", # Change this to point to the s3 location of your raw input data.
# default_value=f"s3://sagemaker-project-p-3xbrq5pwzvlw/export-flow-02-23-52-23-354f2c00/output/pipelines-4y3vbujineuo-DataWranglerProcessi-A5QsWJSsSJ/05ba8da3-2ced-4ecb-aad9-22704674d567/default/part-00000-76487ef2-610f-413c-a489-9158b771ef51-c000TEMP.csv",
# default_value=None
)

-### ATHENA PIPELINE BEGIN
data_sources = []
+# Sagemaker session
+sess = sagemaker_session
+
+# You can configure this with your own bucket name, e.g.
+# bucket = "my-bucket"
+bucket = sess.default_bucket()

data_sources.append(
ProcessingInput(
@@ -151,29 +154,21 @@
athena_dataset_definition=AthenaDatasetDefinition(
catalog="AwsDataCatalog",
database="restate",
-# query_string="SELECT * FROM resvm.russia_3870",
query_string="SELECT * FROM restate.california_10",
output_s3_uri="s3://sagemaker-restate-240964962523/athena/data-wrangler",
output_s3_uri=f"s3://{bucket}/athena/",
output_format="PARQUET",
),
),
)
)

-# Sagemaker session
-sess = sagemaker_session
-
-# You can configure this with your own bucket name, e.g.
-# bucket = "my-bucket"
-bucket = sess.default_bucket()
print(f"Data Wrangler export storage bucket: {bucket}")

# unique flow export ID
flow_export_id = f"{time.strftime('%d-%H-%M-%S', time.gmtime())}-{str(uuid.uuid4())[:8]}"
flow_export_name = f"flow-{flow_export_id}"

# Output name is auto-generated from the select node's ID + output name from the flow file.
# output_name = "dc0bda28-f867-4501-9323-b3d571ae5c35.default"
output_name = "99ae1ec3-dd5f-453c-bfae-721dac423cd7.default"

s3_output_prefix = f"export-{flow_export_name}/output"
@@ -230,9 +225,6 @@
framework="data-wrangler", # we are using the Sagemaker built in xgboost algorithm
region=region,
)
# container_uri = "119527597002.dkr.ecr.ap-southeast-1.amazonaws.com/sagemaker-data-wrangler-container:1.x"
# Pinned Data Wrangler Container URL.
# container_uri_pinned = "119527597002.dkr.ecr.ap-southeast-1.amazonaws.com/sagemaker-data-wrangler-container:1.14.3"

# Processing Job Instance count and instance type.
instance_count = 2
@@ -276,8 +268,6 @@
job_arguments=[f"--output-config '{json.dumps(output_config)}'"],
)

-### ATHENA PIPELINE END
-
# Processing step for feature engineering
# this processor does not have awswrangler installed
sklearn_processor = SKLearnProcessor(
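
The `--output-config` job argument a few lines above serializes a mapping from the flow's output node to a content type. This diff does not show how `output_config` is built; in Data Wrangler export notebooks it typically looks like the sketch below, keyed by the `output_name` set earlier in this file (structure inferred, not confirmed by this commit):

```python
import json

# Keyed by the flow output node ("<node ID>.default"); the value picks the format.
output_config = {
    "99ae1ec3-dd5f-453c-bfae-721dac423cd7.default": {
        "content_type": "CSV",  # PARQUET is the other common choice
    }
}

job_arguments = [f"--output-config '{json.dumps(output_config)}'"]
```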
@@ -500,7 +490,6 @@
processor=xgb_script_eval,
inputs=[
ProcessingInput(
-# source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
source=xgb_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key),
destination="/opt/ml/processing/model",
),
@@ -541,22 +530,19 @@
xgb_eval_metrics = JsonGet(
step=xgb_step_eval,
property_file=xgb_evaluation_report,
-# json_path="binary_classification_metrics.accuracy.value", # This should follow the structure of your report_dict defined in the evaluate.py file.
json_path="regression_metrics.r2s.value", # This should follow the structure of your report_dict defined in the evaluate.py file.
)

dtree_eval_metrics = JsonGet(
step=dtree_step_eval,
property_file=dtree_evaluation_report,
-# json_path="binary_classification_metrics.accuracy.value", # This should follow the structure of your report_dict defined in the evaluate.py file.
json_path="regression_metrics.r2s.value", # This should follow the structure of your report_dict defined in the evaluate.py file.
)

# Register model step that will be conditionally executed
dtree_step_register = RegisterModel(
name="DTreeReg",
estimator=dtree_train,
-# model_data=dtree_step_train.properties.ModelArtifacts.S3ModelArtifacts,
model_data=dtree_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key),
content_types=["text/csv"],
response_types=["text/csv"],
@@ -571,7 +557,6 @@
xgb_step_register = RegisterModel(
name="XGBReg",
estimator=xgb_train,
-# model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
model_data=xgb_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key),
content_types=["text/csv"],
response_types=["text/csv"],
@@ -587,13 +572,11 @@
left=JsonGet(
step=dtree_step_eval,
property_file=dtree_evaluation_report,
-# json_path="binary_classification_metrics.accuracy.value", # This should follow the structure of your report_dict defined in the evaluate.py file.
json_path="regression_metrics.r2s.value", # This should follow the structure of your report_dict defined in the evaluate.py file.
),
right=JsonGet(
step=xgb_step_eval,
property_file=xgb_evaluation_report,
-# json_path="binary_classification_metrics.accuracy.value", # This should follow the structure of your report_dict defined in the evaluate.py file.
json_path="regression_metrics.r2s.value", # This should follow the structure of your report_dict defined in the evaluate.py file.
), # You can change the threshold here
)
@@ -615,7 +598,6 @@
training_instance_type,
model_approval_status,
input_data
-# input_data_wr
],
pipeline_experiment_config=PipelineExperimentConfig(
pipeline_name + "-" + create_date, "restate-{}".format(create_date)
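
For context, `get_pipeline` above returns a `sagemaker.workflow.pipeline.Pipeline`. A hedged driver sketch (the module name, region, and role ARN are assumptions, and the leading parameters of `get_pipeline` are not shown in this diff):

```python
from pipeline import get_pipeline  # module name is an assumption

role_arn = "arn:aws:iam::111122223333:role/SageMakerExecutionRole"  # placeholder

pipe = get_pipeline(
    region="us-east-1",                   # assumed leading parameter
    role=role_arn,
    pipeline_name="restate-p-XXXXXXXXX",  # from the Studio UI, as noted above
)

pipe.upsert(role_arn=role_arn)  # create or update the pipeline definition
execution = pipe.start()        # kick off a run
```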
@@ -15,8 +15,6 @@
"name": "restate-california",
"catalogName": "AwsDataCatalog",
"databaseName": "restate",
"queryString": "SELECT * FROM \"restate\".\"california\"",
"s3OutputLocation": "s3://sagemaker-ap-southeast-1-240964962523/athena/",
"outputFormat": "parquet",
"workgroup": null,
"workgroupOutputLocation": null,
@@ -25,8 +23,7 @@
},
"trained_parameters": {
"query_execution_id": "ca8a88de-b2b8-403b-b841-f3a9c1a750a0",
"ctas_table_name": "sagemaker_tmp_5f97be7f253044558c96b79e9a8fad40",
"ctas_s3_output_location": "s3://sagemaker-ap-southeast-1-240964962523/athena/5f97be7f253044558c96b79e9a8fad40/data/"
"ctas_table_name": "sagemaker_tmp_5f97be7f253044558c96b79e9a8fad40"
},
"inputs": [],
"outputs": [
@@ -123,4 +120,4 @@
]
}
]
-}
+}
