From 33194c3c03439a3a7f06f6cb1e15ff6e509492ec Mon Sep 17 00:00:00 2001
From: Kartik Kalamadi
Date: Thu, 7 May 2020 16:14:55 -0700
Subject: [PATCH 1/5] don't use aws-secret and update readme for sample
 pipelines

---
 .../ground_truth_pipeline_demo/README.md      | 105 +++++++++++---
 .../mini-image-classification-pipeline.py     |   8 +-
 .../mnist-kmeans-sagemaker/README.md          | 105 +++++++++++---
 .../kmeans-hpo-pipeline.py                    |   2 +-
 .../simple_train_pipeline/README.md           | 136 +++++++++++-------
 .../training-pipeline.py                      |   2 +-
 .../titanic-survival-prediction.py            |   6 +-
 7 files changed, 266 insertions(+), 98 deletions(-)

diff --git a/samples/contrib/aws-samples/ground_truth_pipeline_demo/README.md b/samples/contrib/aws-samples/ground_truth_pipeline_demo/README.md
index aca55819e2f..3359aca13a4 100644
--- a/samples/contrib/aws-samples/ground_truth_pipeline_demo/README.md
+++ b/samples/contrib/aws-samples/ground_truth_pipeline_demo/README.md
@@ -34,25 +34,92 @@ client_ID = App client
 
 > Note : Once you start a run on the pipeline you will receive the ground_truth labeling jobs at "Labeling portal sign-in URL" link
 
-## SageMaker permission
-
-In order to run this pipeline, we need to prepare an IAM Role to run Sagemaker jobs. You need this `role_arn` to run a pipeline. Check [here](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html) for details.
-
-This pipeline also uses aws-secret to get access to Sagemaker services, so please make sure you have an `aws-secret` in the kubeflow namespace.
-
-```yaml
-apiVersion: v1
-kind: Secret
-metadata:
-  name: aws-secret
-  namespace: kubeflow
-type: Opaque
-data:
-  AWS_ACCESS_KEY_ID: YOUR_BASE64_ACCESS_KEY
-  AWS_SECRET_ACCESS_KEY: YOUR_BASE64_SECRET_ACCESS
-```
-
-> Note: To get base64 string, try `echo -n $AWS_ACCESS_KEY_ID | base64`
+## IAM Roles
+
+We need two IAM roles to run AWS KFP components. You only have to do this once. (Re-use the Role ARNs if you have done this before)
+
+**Role 1]** For KFP pods to access AWS SageMaker. Here are the steps to create it.
+1. Enable OIDC support on the EKS cluster
+   ```
+   eksctl utils associate-iam-oidc-provider --cluster <cluster_name> \
+       --region <region> --approve
+   ```
+2. Take note of the [OIDC](https://openid.net/connect/) issuer URL. This URL is in the form `oidc.eks.<region>.amazonaws.com/id/<OIDC_ID>`.
+   ```
+   aws eks describe-cluster --name <cluster_name> --query "cluster.identity.oidc.issuer" --output text
+   ```
+3. Create a file named trust.json with the following content.
+   Replace `<OIDC_URL>` with your OIDC issuer URL **(Don’t include https://)** and `<AWS_ACCOUNT_NUMBER>` with your AWS account number.
+   ```
+   {
+     "Version": "2012-10-17",
+     "Statement": [
+       {
+         "Effect": "Allow",
+         "Principal": {
+           "Federated": "arn:aws:iam::<AWS_ACCOUNT_NUMBER>:oidc-provider/<OIDC_URL>"
+         },
+         "Action": "sts:AssumeRoleWithWebIdentity",
+         "Condition": {
+           "StringEquals": {
+             "<OIDC_URL>:aud": "sts.amazonaws.com",
+             "<OIDC_URL>:sub": "system:serviceaccount:kubeflow:pipeline-runner"
+           }
+         }
+       }
+     ]
+   }
+   ```
+4. Create an IAM role using trust.json. Make a note of the ARN returned in the output.
+   ```
+   aws iam create-role --role-name kfp-example-pod-role --assume-role-policy-document file://trust.json
+   aws iam attach-role-policy --role-name kfp-example-pod-role --policy-arn arn:aws:iam::aws:policy/AmazonSageMakerFullAccess
+   aws iam get-role --role-name kfp-example-pod-role --output text --query 'Role.Arn'
+   ```
+5. Edit your pipeline-runner service account.
+   ```
+   kubectl edit -n kubeflow serviceaccount pipeline-runner
+   ```
+   Add `eks.amazonaws.com/role-arn: <role_arn>` to annotations, then save the file.
+   Example:
+   ```
+   apiVersion: v1
+   kind: ServiceAccount
+   metadata:
+     annotations:
+       eks.amazonaws.com/role-arn: <role_arn>
+       kubectl.kubernetes.io/last-applied-configuration: |
+         {"apiVersion":"v1","kind":"ServiceAccount","metadata":{"annotations":{},"labels":{"app":"pipeline-runner","app.kubernetes.io/component":"pipelines-runner","app.kubernetes.io/instance":"pipelines-runner-0.2.0","app.kubernetes.io/managed-by":"kfctl","app.kubernetes.io/name":"pipelines-runner","app.kubernetes.io/part-of":"kubeflow","app.kubernetes.io/version":"0.2.0"},"name":"pipeline-runner","namespace":"kubeflow"}}
+     creationTimestamp: "2020-04-16T05:48:06Z"
+     labels:
+       app: pipeline-runner
+       app.kubernetes.io/component: pipelines-runner
+       app.kubernetes.io/instance: pipelines-runner-0.2.0
+       app.kubernetes.io/managed-by: kfctl
+       app.kubernetes.io/name: pipelines-runner
+       app.kubernetes.io/part-of: kubeflow
+       app.kubernetes.io/version: 0.2.0
+     name: pipeline-runner
+     namespace: kubeflow
+     resourceVersion: "11787"
+     selfLink: /api/v1/namespaces/kubeflow/serviceaccounts/pipeline-runner
+     uid: d86234bd-7fa5-11ea-a8f2-02934be6dc88
+   secrets:
+   - name: pipeline-runner-token-dkjrk
+   ```
+**Role 2]** For the SageMaker job to access S3 buckets and other SageMaker services. This Role ARN is given as an input to the components.
+   ```
+   SAGEMAKER_EXECUTION_ROLE_NAME=kfp-example-sagemaker-execution-role
+
+   TRUST="{ \"Version\": \"2012-10-17\", \"Statement\": [ { \"Effect\": \"Allow\", \"Principal\": { \"Service\": \"sagemaker.amazonaws.com\" }, \"Action\": \"sts:AssumeRole\" } ] }"
+   aws iam create-role --role-name ${SAGEMAKER_EXECUTION_ROLE_NAME} --assume-role-policy-document "$TRUST"
+   aws iam attach-role-policy --role-name ${SAGEMAKER_EXECUTION_ROLE_NAME} --policy-arn arn:aws:iam::aws:policy/AmazonSageMakerFullAccess
+   aws iam attach-role-policy --role-name ${SAGEMAKER_EXECUTION_ROLE_NAME} --policy-arn arn:aws:iam::aws:policy/AmazonS3FullAccess
+
+   aws iam get-role --role-name ${SAGEMAKER_EXECUTION_ROLE_NAME} --output text --query 'Role.Arn'
+
+   # Note down the role ARN, which is of the form
+   arn:aws:iam::<AWS_ACCOUNT_NUMBER>:role/<role_name>
+   ```
 
 ## Compiling the pipeline template
 
diff --git a/samples/contrib/aws-samples/ground_truth_pipeline_demo/mini-image-classification-pipeline.py b/samples/contrib/aws-samples/ground_truth_pipeline_demo/mini-image-classification-pipeline.py
index efabfd7e637..4f7d82d6c91 100644
--- a/samples/contrib/aws-samples/ground_truth_pipeline_demo/mini-image-classification-pipeline.py
+++ b/samples/contrib/aws-samples/ground_truth_pipeline_demo/mini-image-classification-pipeline.py
@@ -73,7 +73,7 @@ def ground_truth_test(region='us-west-2',
         user_pool=user_pool,
         user_groups=user_groups,
         client_id=client_id
-    ).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
+    )
 
     ground_truth_train = sagemaker_gt_op(
         region=region,
@@ -93,7 +93,7 @@ def ground_truth_test(region='us-west-2',
         time_limit=ground_truth_time_limit,
         task_availibility=ground_truth_task_availibility,
         max_concurrent_tasks=ground_truth_max_concurrent_tasks
-    ).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
+    )
 
     ground_truth_validation = sagemaker_gt_op(
         region=region,
@@ -113,7 +113,7 @@ def ground_truth_test(region='us-west-2',
         time_limit=ground_truth_time_limit,
         task_availibility=ground_truth_task_availibility,
         max_concurrent_tasks=ground_truth_max_concurrent_tasks
-    ).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
+    )
 
     channelObj['ChannelName'] = 'train'
     channelObj['DataSource']['S3DataSource']['S3Uri'] = str(ground_truth_train.outputs['output_manifest_location'])
@@ -134,7 +134,7 @@ def ground_truth_test(region='us-west-2',
         max_run_time=training_max_run_time,
         model_artifact_path=training_output_location,
         role=role_arn
-    ).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
+    )
 
 if __name__ == '__main__':
     kfp.compiler.Compiler().compile(ground_truth_test, __file__ + '.zip')

diff --git a/samples/contrib/aws-samples/mnist-kmeans-sagemaker/README.md b/samples/contrib/aws-samples/mnist-kmeans-sagemaker/README.md
index a549062e0e0..289ab2c0a77 100644
--- a/samples/contrib/aws-samples/mnist-kmeans-sagemaker/README.md
+++ b/samples/contrib/aws-samples/mnist-kmeans-sagemaker/README.md
@@ -69,25 +69,92 @@ s3_client.upload_file('valid-data.csv', bucket, input_key)
 ```
 Run this file `python s3_sample_data_creator.py`
 
-## SageMaker permission
-
-In order to run this pipeline, we need to prepare an IAM Role to run Sagemaker jobs. You need this `role_arn` to run a pipeline. Check [here](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html) for details.
-
-This pipeline also uses aws-secret to get access to Sagemaker services, so please make sure you have an `aws-secret` in the kubeflow namespace.
-
-```yaml
-apiVersion: v1
-kind: Secret
-metadata:
-  name: aws-secret
-  namespace: kubeflow
-type: Opaque
-data:
-  AWS_ACCESS_KEY_ID: YOUR_BASE64_ACCESS_KEY
-  AWS_SECRET_ACCESS_KEY: YOUR_BASE64_SECRET_ACCESS
-```
-
-> Note: To get base64 string, try `echo -n $AWS_ACCESS_KEY_ID | base64`
+## IAM Roles
+
+We need two IAM roles to run AWS KFP components. You only have to do this once. (Re-use the Role ARNs if you have done this before)
+
+**Role 1]** For KFP pods to access AWS SageMaker. Here are the steps to create it.
+1. Enable OIDC support on the EKS cluster
+   ```
+   eksctl utils associate-iam-oidc-provider --cluster <cluster_name> \
+       --region <region> --approve
+   ```
+2. Take note of the [OIDC](https://openid.net/connect/) issuer URL. This URL is in the form `oidc.eks.<region>.amazonaws.com/id/<OIDC_ID>`.
+   ```
+   aws eks describe-cluster --name <cluster_name> --query "cluster.identity.oidc.issuer" --output text
+   ```
+3. Create a file named trust.json with the following content.
+   Replace `<OIDC_URL>` with your OIDC issuer URL **(Don’t include https://)** and `<AWS_ACCOUNT_NUMBER>` with your AWS account number.
+   ```
+   {
+     "Version": "2012-10-17",
+     "Statement": [
+       {
+         "Effect": "Allow",
+         "Principal": {
+           "Federated": "arn:aws:iam::<AWS_ACCOUNT_NUMBER>:oidc-provider/<OIDC_URL>"
+         },
+         "Action": "sts:AssumeRoleWithWebIdentity",
+         "Condition": {
+           "StringEquals": {
+             "<OIDC_URL>:aud": "sts.amazonaws.com",
+             "<OIDC_URL>:sub": "system:serviceaccount:kubeflow:pipeline-runner"
+           }
+         }
+       }
+     ]
+   }
+   ```
+4. Create an IAM role using trust.json. Make a note of the ARN returned in the output.
+   ```
+   aws iam create-role --role-name kfp-example-pod-role --assume-role-policy-document file://trust.json
+   aws iam attach-role-policy --role-name kfp-example-pod-role --policy-arn arn:aws:iam::aws:policy/AmazonSageMakerFullAccess
+   aws iam get-role --role-name kfp-example-pod-role --output text --query 'Role.Arn'
+   ```
+5. Edit your pipeline-runner service account.
+   ```
+   kubectl edit -n kubeflow serviceaccount pipeline-runner
+   ```
+   Add `eks.amazonaws.com/role-arn: <role_arn>` to annotations, then save the file.
+   Example:
+   ```
+   apiVersion: v1
+   kind: ServiceAccount
+   metadata:
+     annotations:
+       eks.amazonaws.com/role-arn: <role_arn>
+       kubectl.kubernetes.io/last-applied-configuration: |
+         {"apiVersion":"v1","kind":"ServiceAccount","metadata":{"annotations":{},"labels":{"app":"pipeline-runner","app.kubernetes.io/component":"pipelines-runner","app.kubernetes.io/instance":"pipelines-runner-0.2.0","app.kubernetes.io/managed-by":"kfctl","app.kubernetes.io/name":"pipelines-runner","app.kubernetes.io/part-of":"kubeflow","app.kubernetes.io/version":"0.2.0"},"name":"pipeline-runner","namespace":"kubeflow"}}
+     creationTimestamp: "2020-04-16T05:48:06Z"
+     labels:
+       app: pipeline-runner
+       app.kubernetes.io/component: pipelines-runner
+       app.kubernetes.io/instance: pipelines-runner-0.2.0
+       app.kubernetes.io/managed-by: kfctl
+       app.kubernetes.io/name: pipelines-runner
+       app.kubernetes.io/part-of: kubeflow
+       app.kubernetes.io/version: 0.2.0
+     name: pipeline-runner
+     namespace: kubeflow
+     resourceVersion: "11787"
+     selfLink: /api/v1/namespaces/kubeflow/serviceaccounts/pipeline-runner
+     uid: d86234bd-7fa5-11ea-a8f2-02934be6dc88
+   secrets:
+   - name: pipeline-runner-token-dkjrk
+   ```
+**Role 2]** For the SageMaker job to access S3 buckets and other SageMaker services. This Role ARN is given as an input to the components.
+   ```
+   SAGEMAKER_EXECUTION_ROLE_NAME=kfp-example-sagemaker-execution-role
+
+   TRUST="{ \"Version\": \"2012-10-17\", \"Statement\": [ { \"Effect\": \"Allow\", \"Principal\": { \"Service\": \"sagemaker.amazonaws.com\" }, \"Action\": \"sts:AssumeRole\" } ] }"
+   aws iam create-role --role-name ${SAGEMAKER_EXECUTION_ROLE_NAME} --assume-role-policy-document "$TRUST"
+   aws iam attach-role-policy --role-name ${SAGEMAKER_EXECUTION_ROLE_NAME} --policy-arn arn:aws:iam::aws:policy/AmazonSageMakerFullAccess
+   aws iam attach-role-policy --role-name ${SAGEMAKER_EXECUTION_ROLE_NAME} --policy-arn arn:aws:iam::aws:policy/AmazonS3FullAccess
+
+   aws iam get-role --role-name ${SAGEMAKER_EXECUTION_ROLE_NAME} --output text --query 'Role.Arn'
+
+   # Note down the role ARN, which is of the form
+   arn:aws:iam::<AWS_ACCOUNT_NUMBER>:role/<role_name>
+   ```
 
 ## Compiling the pipeline template
 
diff --git a/samples/contrib/aws-samples/mnist-kmeans-sagemaker/kmeans-hpo-pipeline.py b/samples/contrib/aws-samples/mnist-kmeans-sagemaker/kmeans-hpo-pipeline.py
index c3cf49f14d9..1787ba6d0c6 100644
--- a/samples/contrib/aws-samples/mnist-kmeans-sagemaker/kmeans-hpo-pipeline.py
+++ b/samples/contrib/aws-samples/mnist-kmeans-sagemaker/kmeans-hpo-pipeline.py
@@ -115,7 +115,7 @@ def hpo_test(region='us-west-2',
         checkpoint_config=checkpoint_config,
         tags=tags,
         role=role_arn,
-    ).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
+    )
 
 if __name__ == '__main__':
     kfp.compiler.Compiler().compile(hpo_test, __file__ + '.zip')

diff --git a/samples/contrib/aws-samples/simple_train_pipeline/README.md b/samples/contrib/aws-samples/simple_train_pipeline/README.md
index 812bfbbb483..b875be29e38 100644
--- a/samples/contrib/aws-samples/simple_train_pipeline/README.md
+++ b/samples/contrib/aws-samples/simple_train_pipeline/README.md
@@ -41,57 +41,91 @@ An example pipeline with only [train component](https://github.com/kubeflow/pipe
     boto3.resource('s3').Bucket(bucket).Object(data_key).upload_fileobj(buf)
     ```
     Run this file `python s3_sample_data_creator.py`
-3. Prepare an IAM role with permissions to run SageMaker jobs and access to S3 buckets.
-   - create a new file "trust.json" with following content
-   ```buildoutcfg
-   {
-     "Version": "2012-10-17",
-     "Statement": [
-       {
-         "Sid": "",
-         "Effect": "Allow",
-         "Principal": {
-           "Service": "sagemaker.amazonaws.com"
-         },
-         "Action": "sts:AssumeRole"
-       }
-     ]
-   }
-   ```
-   ```buildoutcfg
-
-   # run these commands to create a role named "SageMakerExecutorKFP" with SageMaker and S3 access
-   aws iam create-role --role-name SageMakerExecutorKFP --assume-role-policy-document file://trust.json
-   aws iam attach-role-policy --policy-arn arn:aws:iam::aws:policy/AmazonSageMakerFullAccess --role-name SageMakerExecutorKFP
-   aws iam attach-role-policy --policy-arn arn:aws:iam::aws:policy/AmazonS3FullAccess --role-name SageMakerExecutorKFP
-
-   # Note down the role ARN
-   aws iam get-role --role-name SageMakerExecutorKFP # | jq .Role.Arn
-   ```
-4. Add 'aws-secret' to your Kubeflow namespace.
-   ```
-   # 1. get aws key and secret in base64 format:
-
-   echo -n "<AWS_ACCESS_KEY_ID>" | base64
-   echo -n "<AWS_SECRET_ACCESS_KEY>" | base64
-
-   # 2. Create new file secret.yaml with following content
-
-   apiVersion: v1
-   kind: Secret
-   metadata:
-     name: aws-secret
-     namespace: kubeflow
-   type: Opaque
-   data:
-     AWS_ACCESS_KEY_ID: <base64 of AWS_ACCESS_KEY_ID>
-     AWS_SECRET_ACCESS_KEY: <base64 of AWS_SECRET_ACCESS_KEY>
-
-   # 3. Now apply to the cluster's kubeflow namespace:
-
-   kubectl -n kubeflow apply -f secret.yaml
-   ```
+3. **Prepare IAM Roles**
+
+   We need two IAM roles to run AWS KFP components. You only have to do this once. (Re-use the Role ARNs if you have done this before)
+
+   **Role 1]** For KFP pods to access AWS SageMaker. Here are the steps to create it.
+   i. Enable OIDC support on the EKS cluster
+      ```
+      eksctl utils associate-iam-oidc-provider --cluster <cluster_name> \
+          --region <region> --approve
+      ```
+   ii. Take note of the [OIDC](https://openid.net/connect/) issuer URL. This URL is in the form `oidc.eks.<region>.amazonaws.com/id/<OIDC_ID>`.
+      ```
+      aws eks describe-cluster --name <cluster_name> --query "cluster.identity.oidc.issuer" --output text
+      ```
+   iii. Create a file named trust.json with the following content.
+      Replace `<OIDC_URL>` with your OIDC issuer URL **(Don’t include https://)** and `<AWS_ACCOUNT_NUMBER>` with your AWS account number.
+      ```
+      {
+        "Version": "2012-10-17",
+        "Statement": [
+          {
+            "Effect": "Allow",
+            "Principal": {
+              "Federated": "arn:aws:iam::<AWS_ACCOUNT_NUMBER>:oidc-provider/<OIDC_URL>"
+            },
+            "Action": "sts:AssumeRoleWithWebIdentity",
+            "Condition": {
+              "StringEquals": {
+                "<OIDC_URL>:aud": "sts.amazonaws.com",
+                "<OIDC_URL>:sub": "system:serviceaccount:kubeflow:pipeline-runner"
+              }
+            }
+          }
+        ]
+      }
+      ```
+   iv. Create an IAM role using trust.json. Make a note of the ARN returned in the output.
+      ```
+      aws iam create-role --role-name kfp-example-pod-role --assume-role-policy-document file://trust.json
+      aws iam attach-role-policy --role-name kfp-example-pod-role --policy-arn arn:aws:iam::aws:policy/AmazonSageMakerFullAccess
+      aws iam get-role --role-name kfp-example-pod-role --output text --query 'Role.Arn'
+      ```
+   v. Edit your pipeline-runner service account.
+      ```
+      kubectl edit -n kubeflow serviceaccount pipeline-runner
+      ```
+      Add `eks.amazonaws.com/role-arn: <role_arn>` to annotations, then save the file.
+      Example:
+      ```
+      apiVersion: v1
+      kind: ServiceAccount
+      metadata:
+        annotations:
+          eks.amazonaws.com/role-arn: <role_arn>
+          kubectl.kubernetes.io/last-applied-configuration: |
+            {"apiVersion":"v1","kind":"ServiceAccount","metadata":{"annotations":{},"labels":{"app":"pipeline-runner","app.kubernetes.io/component":"pipelines-runner","app.kubernetes.io/instance":"pipelines-runner-0.2.0","app.kubernetes.io/managed-by":"kfctl","app.kubernetes.io/name":"pipelines-runner","app.kubernetes.io/part-of":"kubeflow","app.kubernetes.io/version":"0.2.0"},"name":"pipeline-runner","namespace":"kubeflow"}}
+        creationTimestamp: "2020-04-16T05:48:06Z"
+        labels:
+          app: pipeline-runner
+          app.kubernetes.io/component: pipelines-runner
+          app.kubernetes.io/instance: pipelines-runner-0.2.0
+          app.kubernetes.io/managed-by: kfctl
+          app.kubernetes.io/name: pipelines-runner
+          app.kubernetes.io/part-of: kubeflow
+          app.kubernetes.io/version: 0.2.0
+        name: pipeline-runner
+        namespace: kubeflow
+        resourceVersion: "11787"
+        selfLink: /api/v1/namespaces/kubeflow/serviceaccounts/pipeline-runner
+        uid: d86234bd-7fa5-11ea-a8f2-02934be6dc88
+      secrets:
+      - name: pipeline-runner-token-dkjrk
+      ```
+   **Role 2]** For the SageMaker job to access S3 buckets and other SageMaker services. This Role ARN is given as an input to the components.
+      ```
+      SAGEMAKER_EXECUTION_ROLE_NAME=kfp-example-sagemaker-execution-role
+
+      TRUST="{ \"Version\": \"2012-10-17\", \"Statement\": [ { \"Effect\": \"Allow\", \"Principal\": { \"Service\": \"sagemaker.amazonaws.com\" }, \"Action\": \"sts:AssumeRole\" } ] }"
+      aws iam create-role --role-name ${SAGEMAKER_EXECUTION_ROLE_NAME} --assume-role-policy-document "$TRUST"
+      aws iam attach-role-policy --role-name ${SAGEMAKER_EXECUTION_ROLE_NAME} --policy-arn arn:aws:iam::aws:policy/AmazonSageMakerFullAccess
+      aws iam attach-role-policy --role-name ${SAGEMAKER_EXECUTION_ROLE_NAME} --policy-arn arn:aws:iam::aws:policy/AmazonS3FullAccess
+
+      aws iam get-role --role-name ${SAGEMAKER_EXECUTION_ROLE_NAME} --output text --query 'Role.Arn'
+
+      # Note down the role ARN, which is of the form
+      arn:aws:iam::<AWS_ACCOUNT_NUMBER>:role/<role_name>
+      ```
 5. Compile the pipeline:
    `dsl-compile --py training-pipeline.py --output training-pipeline.tar.gz`
 6. In the Kubeflow UI, upload this compiled pipeline specification (the .tar.gz file) and click on create run.
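
> Note : Step 6 starts the run from the Kubeflow UI, but the same run can be started from the KFP SDK. The snippet below is a minimal sketch, assuming the package compiled in step 5, a reachable KFP API endpoint, and that the pipeline parameter is named `role` as in this sample; the host value and run name are illustrative.

```python
import kfp

# Illustrative endpoint; adjust to your own KFP API host
# (e.g. after port-forwarding the ml-pipeline service).
client = kfp.Client(host='http://localhost:8888')

result = client.create_run_from_pipeline_package(
    'training-pipeline.tar.gz',  # produced by dsl-compile in step 5
    arguments={
        # SageMaker execution role ARN from "Role 2]" in step 3
        'role': 'arn:aws:iam::<AWS_ACCOUNT_NUMBER>:role/kfp-example-sagemaker-execution-role',
    },
    run_name='simple-train-pipeline-run',  # hypothetical run name
)
print(result.run_id)
```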
diff --git a/samples/contrib/aws-samples/simple_train_pipeline/training-pipeline.py b/samples/contrib/aws-samples/simple_train_pipeline/training-pipeline.py
index d21320fdf4a..aa308f1c545 100644
--- a/samples/contrib/aws-samples/simple_train_pipeline/training-pipeline.py
+++ b/samples/contrib/aws-samples/simple_train_pipeline/training-pipeline.py
@@ -74,7 +74,7 @@ def training(
         max_wait_time=max_wait_time,
         checkpoint_config=checkpoint_config,
         role=role,
-    ).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
+    )
 
 if __name__ == '__main__':
     kfp.compiler.Compiler().compile(training, __file__ + '.zip')

diff --git a/samples/contrib/aws-samples/titanic-survival-prediction/titanic-survival-prediction.py b/samples/contrib/aws-samples/titanic-survival-prediction/titanic-survival-prediction.py
index 647fda5345a..ccd9fb3f909 100644
--- a/samples/contrib/aws-samples/titanic-survival-prediction/titanic-survival-prediction.py
+++ b/samples/contrib/aws-samples/titanic-survival-prediction/titanic-survival-prediction.py
@@ -32,7 +32,7 @@ def titanic_suvival_prediction(region='us-west-2',
         instance_type=instance_type,
         instance_count=instance_count,
         log_s3_uri=log_s3_uri,
-    ).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
+    )
 
     training_and_prediction = emr_submit_spark_job_op(
         region=region,
@@ -42,13 +42,13 @@ def titanic_suvival_prediction(region='us-west-2',
         main_class=main_class,
         input=input,
         output=output
-    ).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
+    )
 
     delete_cluster = emr_delete_cluster_op(
         region=region,
         jobflow_id=create_cluster.output,
         dependent=training_and_prediction.outputs['job_id']
-    ).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
+    )
 
 if __name__ == '__main__':
     kfp.compiler.Compiler().compile(titanic_suvival_prediction, __file__ + '.zip')

From d6dfa06e2e95b069e2e53f881b1859eaed532f31 Mon Sep 17 00:00:00 2001
From: Kartik Kalamadi
Date: Wed, 13 May 2020 21:50:55 -0700
Subject: [PATCH 2/5] Addressed comments on PR and few more readme changes

---
 components/aws/sagemaker/deploy/README.md     |   2 +-
 components/aws/sagemaker/deploy/src/deploy.py |  12 +-
 .../sagemaker/hyperparameter_tuning/README.md |   2 +-
 .../src/hyperparameter_tuning.py              |   4 +-
 components/aws/sagemaker/train/README.md      |   4 +-
 components/aws/sagemaker/train/src/train.py   |   4 +-
 samples/contrib/aws-samples/README.md         | 209 ++++++++++++++++++
 .../ground_truth_pipeline_demo/README.md      |  91 +-------
 .../mini-image-classification-pipeline.py     |  14 +-
 .../mnist-kmeans-sagemaker/README.md          | 155 +------------
 .../kmeans-hpo-pipeline.py                    |  10 +-
 .../mnist-classification-pipeline.py          |  25 ++-
 .../simple_train_pipeline/README.md           | 133 +----------
 .../training-pipeline.py                      |   8 +-
 .../titanic-survival-prediction.py            |   6 +-
 15 files changed, 272 insertions(+), 407 deletions(-)
 create mode 100644 samples/contrib/aws-samples/README.md

diff --git a/components/aws/sagemaker/deploy/README.md b/components/aws/sagemaker/deploy/README.md
index 95c1c68651b..c69525cea96 100644
--- a/components/aws/sagemaker/deploy/README.md
+++ b/components/aws/sagemaker/deploy/README.md
@@ -31,7 +31,7 @@ Argument | Description | Optional (in pipeline definition
 :--- | :---------- | :---------- | :---------- | :----------| :---------- | :----------|
 model_name_[1, 3] | The name of the model that you want to host. This is the name that you specified when creating the model | No | No | String | | |
 variant_name_[1, 3] | The name of the production variant | Yes | Yes | String | | variant_name_[1, 3] |
-instance_type_[1, 3] | The ML compute instance type | Yes | Yes | String | ml.m4.xlarge, ml.m4.2xlarge, ml.m4.4xlarge, ml.m4.10xlarge, ml.m4.16xlarge, ml.m5.large, ml.m5.xlarge, ml.m5.2xlarge, ml.m5.4xlarge, ml.m5.12xlarge, ml.m5.24xlarge, ml.c4.xlarge, ml.c4.2xlarge, ml.c4.4xlarge, ml.c4.8xlarge, ml.p2.xlarge, ml.p2.8xlarge, ml.p2.16xlarge, ml.p3.2xlarge, ml.p3.8xlarge, ml.p3.16xlarge, ml.c5.xlarge, ml.c5.2xlarge, ml.c5.4xlarge, ml.c5.9xlarge, ml.c5.18xlarge | ml.m4.xlarge |
+instance_type_[1, 3] | The ML compute instance type | Yes | Yes | String | ml.m4.xlarge, ml.m4.2xlarge, ml.m4.4xlarge, ml.m4.10xlarge, ml.m4.16xlarge, ml.m5.large, ml.m5.xlarge, ml.m5.2xlarge, ml.m5.4xlarge, ml.m5.12xlarge, ml.m5.24xlarge, ml.c4.xlarge, ml.c4.2xlarge, ml.c4.4xlarge, ml.c4.8xlarge, ml.p2.xlarge, ml.p2.8xlarge, ml.p2.16xlarge, ml.p3.2xlarge, ml.p3.8xlarge, ml.p3.16xlarge, ml.c5.xlarge, ml.c5.2xlarge, ml.c5.4xlarge, ml.c5.9xlarge, ml.c5.18xlarge [and many more](https://aws.amazon.com/sagemaker/pricing/instance-types/)| ml.m4.xlarge |
 initial_instance_count_[1, 3] | Number of instances to launch initially | Yes | Yes | Integer | ≥ 1 | 1 |
 initial_variant_weight_[1, 3] | Determines initial traffic distribution among all of the models that you specify in the endpoint configuration. The traffic to a production variant is determined by the ratio of the VariantWeight to the sum of all VariantWeight values across all ProductionVariants. | Yes | Yes | Float | Minimum value of 0 | |
 accelerator_type_[1, 3] | The size of the Elastic Inference (EI) instance to use for the production variant | Yes | Yes | String| ml.eia1.medium, ml.eia1.large, ml.eia1.xlarge | |

diff --git a/components/aws/sagemaker/deploy/src/deploy.py b/components/aws/sagemaker/deploy/src/deploy.py
index 519b3f5d0b2..d11eab83990 100644
--- a/components/aws/sagemaker/deploy/src/deploy.py
+++ b/components/aws/sagemaker/deploy/src/deploy.py
@@ -23,25 +23,19 @@ def create_parser():
     parser.add_argument('--variant_name_1', type=str.strip, required=False, help='The name of the production variant.', default='variant-name-1')
     parser.add_argument('--model_name_1', type=str.strip, required=True, help='The model name used for endpoint deployment.')
     parser.add_argument('--initial_instance_count_1', type=_utils.str_to_int, required=False, help='Number of instances to launch initially.', default=1)
-    parser.add_argument('--instance_type_1', choices=['ml.m4.xlarge', 'ml.m4.2xlarge', 'ml.m4.4xlarge', 'ml.m4.10xlarge', 'ml.m4.16xlarge', 'ml.m5.large', 'ml.m5.xlarge', 'ml.m5.2xlarge', 'ml.m5.4xlarge',
-        'ml.m5.12xlarge', 'ml.m5.24xlarge', 'ml.c4.xlarge', 'ml.c4.2xlarge', 'ml.c4.4xlarge', 'ml.c4.8xlarge', 'ml.p2.xlarge', 'ml.p2.8xlarge', 'ml.p2.16xlarge', 'ml.p3.2xlarge', 'ml.p3.8xlarge', 'ml.p3.16xlarge',
-        'ml.c5.xlarge', 'ml.c5.2xlarge', 'ml.c5.4xlarge', 'ml.c5.9xlarge', 'ml.c5.18xlarge', ''], type=str.strip, required=False, help='The ML compute instance type.', default='ml.m4.xlarge')
+    parser.add_argument('--instance_type_1', type=str.strip, required=False, help='The ML compute instance type.')
     parser.add_argument('--initial_variant_weight_1', type=_utils.str_to_float, required=False, help='Determines initial traffic distribution among all of the models that you specify in the endpoint configuration.', default=1.0)
     parser.add_argument('--accelerator_type_1', choices=['ml.eia1.medium', 'ml.eia1.large', 'ml.eia1.xlarge', ''], type=str.strip, required=False, help='The size of the Elastic Inference (EI) instance to use for the production variant.', default='')
     parser.add_argument('--variant_name_2', type=str.strip, required=False, help='The name of the production variant.', default='variant-name-2')
     parser.add_argument('--model_name_2', type=str.strip, required=False, help='The model name used for endpoint deployment.', default='')
     parser.add_argument('--initial_instance_count_2', type=_utils.str_to_int, required=False, help='Number of instances to launch initially.', default=1)
-    parser.add_argument('--instance_type_2', choices=['ml.m4.xlarge', 'ml.m4.2xlarge', 'ml.m4.4xlarge', 'ml.m4.10xlarge', 'ml.m4.16xlarge', 'ml.m5.large', 'ml.m5.xlarge', 'ml.m5.2xlarge', 'ml.m5.4xlarge',
-        'ml.m5.12xlarge', 'ml.m5.24xlarge', 'ml.c4.xlarge', 'ml.c4.2xlarge', 'ml.c4.4xlarge', 'ml.c4.8xlarge', 'ml.p2.xlarge', 'ml.p2.8xlarge', 'ml.p2.16xlarge', 'ml.p3.2xlarge', 'ml.p3.8xlarge', 'ml.p3.16xlarge',
-        'ml.c5.xlarge', 'ml.c5.2xlarge', 'ml.c5.4xlarge', 'ml.c5.9xlarge', 'ml.c5.18xlarge', ''], type=str.strip, required=False, help='The ML compute instance type.', default='ml.m4.xlarge')
+    parser.add_argument('--instance_type_2', type=str.strip, required=False, help='The ML compute instance type.', default='ml.m4.xlarge')
     parser.add_argument('--initial_variant_weight_2', type=_utils.str_to_float, required=False, help='Determines initial traffic distribution among all of the models that you specify in the endpoint configuration.', default=1.0)
     parser.add_argument('--accelerator_type_2', choices=['ml.eia1.medium', 'ml.eia1.large', 'ml.eia1.xlarge', ''], type=str.strip, required=False, help='The size of the Elastic Inference (EI) instance to use for the production variant.', default='')
     parser.add_argument('--variant_name_3', type=str.strip, required=False, help='The name of the production variant.', default='variant-name-3')
     parser.add_argument('--model_name_3', type=str.strip, required=False, help='The model name used for endpoint deployment.', default='')
     parser.add_argument('--initial_instance_count_3', type=_utils.str_to_int, required=False, help='Number of instances to launch initially.', default=1)
-    parser.add_argument('--instance_type_3', choices=['ml.m4.xlarge', 'ml.m4.2xlarge', 'ml.m4.4xlarge', 'ml.m4.10xlarge', 'ml.m4.16xlarge', 'ml.m5.large', 'ml.m5.xlarge', 'ml.m5.2xlarge', 'ml.m5.4xlarge',
-        'ml.m5.12xlarge', 'ml.m5.24xlarge', 'ml.c4.xlarge', 'ml.c4.2xlarge', 'ml.c4.4xlarge', 'ml.c4.8xlarge', 'ml.p2.xlarge', 'ml.p2.8xlarge', 'ml.p2.16xlarge', 'ml.p3.2xlarge', 'ml.p3.8xlarge', 'ml.p3.16xlarge',
-        'ml.c5.xlarge', 'ml.c5.2xlarge', 'ml.c5.4xlarge', 'ml.c5.9xlarge', 'ml.c5.18xlarge', ''], type=str.strip, required=False, help='The ML compute instance type.', default='ml.m4.xlarge')
+    parser.add_argument('--instance_type_3', type=str.strip, required=False, help='The ML compute instance type.', default='ml.m4.xlarge')
     parser.add_argument('--initial_variant_weight_3', type=_utils.str_to_float, required=False, help='Determines initial traffic distribution among all of the models that you specify in the endpoint configuration.', default=1.0)
     parser.add_argument('--accelerator_type_3', choices=['ml.eia1.medium', 'ml.eia1.large', 'ml.eia1.xlarge', ''], type=str.strip, required=False, help='The size of the Elastic Inference (EI) instance to use for the production variant.', default='')
     parser.add_argument('--resource_encryption_key', type=str.strip, required=False, help='The AWS KMS key that Amazon SageMaker uses to encrypt data on the storage volume attached to the ML compute instance(s).', default='')

diff --git a/components/aws/sagemaker/hyperparameter_tuning/README.md b/components/aws/sagemaker/hyperparameter_tuning/README.md
index 9a6a0090063..f7117283c28 100644
--- a/components/aws/sagemaker/hyperparameter_tuning/README.md
+++ b/components/aws/sagemaker/hyperparameter_tuning/README.md
@@ -28,7 +28,7 @@ categorical_parameters | The array of CategoricalParameterRange objects that spe
 channels | A list of dicts specifying the input channels (at least one); refer to [documentation](https://github.com/awsdocs/amazon-sagemaker-developer-guide/blob/master/doc_source/API_Channel.md) for parameters | No | No | List of Dicts | | |
 output_location | The Amazon S3 path where you want Amazon SageMaker to store the results of the transform job | No | No | String | | |
 output_encryption_key | The AWS KMS key that Amazon SageMaker uses to encrypt the model artifacts | Yes | Yes | String | | |
-instance_type | The ML compute instance type | Yes | No | String | ml.m4.xlarge, ml.m4.2xlarge, ml.m4.4xlarge, ml.m4.10xlarge, ml.m4.16xlarge, ml.m5.large, ml.m5.xlarge, ml.m5.2xlarge, ml.m5.4xlarge, ml.m5.12xlarge, ml.m5.24xlarge, ml.c4.xlarge, ml.c4.2xlarge, ml.c4.4xlarge, ml.c4.8xlarge, ml.p2.xlarge, ml.p2.8xlarge, ml.p2.16xlarge, ml.p3.2xlarge, ml.p3.8xlarge, ml.p3.16xlarge, ml.c5.xlarge, ml.c5.2xlarge, ml.c5.4xlarge, ml.c5.9xlarge, ml.c5.18xlarge | ml.m4.xlarge |
+instance_type | The ML compute instance type | Yes | No | String | ml.m4.xlarge, ml.m4.2xlarge, ml.m4.4xlarge, ml.m4.10xlarge, ml.m4.16xlarge, ml.m5.large, ml.m5.xlarge, ml.m5.2xlarge, ml.m5.4xlarge, ml.m5.12xlarge, ml.m5.24xlarge, ml.c4.xlarge, ml.c4.2xlarge, ml.c4.4xlarge, ml.c4.8xlarge, ml.p2.xlarge, ml.p2.8xlarge, ml.p2.16xlarge, ml.p3.2xlarge, ml.p3.8xlarge, ml.p3.16xlarge, ml.c5.xlarge, ml.c5.2xlarge, ml.c5.4xlarge, ml.c5.9xlarge, ml.c5.18xlarge [and many more](https://aws.amazon.com/sagemaker/pricing/instance-types/)| ml.m4.xlarge |
 instance_count | The number of ML compute instances to use in each training job | Yes | Yes | Int | ≥ 1 | 1 |
 volume_size | The size of the ML storage volume that you want to provision in GB | Yes | Yes | Int | ≥ 1 | 30 |
 max_num_jobs | The maximum number of training jobs that a hyperparameter tuning job can launch | No | No | Int | [1, 500] | |

diff --git a/components/aws/sagemaker/hyperparameter_tuning/src/hyperparameter_tuning.py b/components/aws/sagemaker/hyperparameter_tuning/src/hyperparameter_tuning.py
index 68fe2fefa58..e4fc026779e 100644
--- a/components/aws/sagemaker/hyperparameter_tuning/src/hyperparameter_tuning.py
+++ b/components/aws/sagemaker/hyperparameter_tuning/src/hyperparameter_tuning.py
@@ -37,9 +37,7 @@ def create_parser():
     parser.add_argument('--channels', type=_utils.str_to_json_list, required=True, help='A list of dicts specifying the input channels. Must have at least one.')
     parser.add_argument('--output_location', type=str.strip, required=True, help='The Amazon S3 path where you want Amazon SageMaker to store the results of the transform job.')
     parser.add_argument('--output_encryption_key', type=str.strip, required=False, help='The AWS KMS key that Amazon SageMaker uses to encrypt the model artifacts.', default='')
-    parser.add_argument('--instance_type', choices=['ml.m4.xlarge', 'ml.m4.2xlarge', 'ml.m4.4xlarge', 'ml.m4.10xlarge', 'ml.m4.16xlarge', 'ml.m5.large', 'ml.m5.xlarge', 'ml.m5.2xlarge', 'ml.m5.4xlarge',
-        'ml.m5.12xlarge', 'ml.m5.24xlarge', 'ml.c4.xlarge', 'ml.c4.2xlarge', 'ml.c4.4xlarge', 'ml.c4.8xlarge', 'ml.p2.xlarge', 'ml.p2.8xlarge', 'ml.p2.16xlarge', 'ml.p3.2xlarge', 'ml.p3.8xlarge', 'ml.p3.16xlarge',
-        'ml.c5.xlarge', 'ml.c5.2xlarge', 'ml.c5.4xlarge', 'ml.c5.9xlarge', 'ml.c5.18xlarge'], type=str.strip, required=False, help='The ML compute instance type.', default='ml.m4.xlarge')
+    parser.add_argument('--instance_type', type=str.strip, required=False, help='The ML compute instance type.', default='ml.m4.xlarge')
     parser.add_argument('--instance_count', type=_utils.str_to_int, required=False, help='The number of ML compute instances to use in each training job.', default=1)
     parser.add_argument('--volume_size', type=_utils.str_to_int, required=False, help='The size of the ML storage volume that you want to provision.', default=1)
     parser.add_argument('--max_num_jobs', type=_utils.str_to_int, required=True, help='The maximum number of training jobs that a hyperparameter tuning job can launch.')

diff --git a/components/aws/sagemaker/train/README.md b/components/aws/sagemaker/train/README.md
index 5b9e68eeac0..6659e801f15 100644
--- a/components/aws/sagemaker/train/README.md
+++ b/components/aws/sagemaker/train/README.md
@@ -21,7 +21,7 @@ metric_definitions | The dictionary of name-regex pairs specify the metrics that
 training_input_mode | The input mode that the algorithm supports | No | String | File, Pipe | File |
 hyperparameters | Hyperparameters for the selected algorithm | No | Dict | [Depends on Algo](https://docs.aws.amazon.com/sagemaker/latest/dg/k-means-api-config.html)| |
 channels | A list of dicts specifying the input channels (at least one); refer to [documentation](https://github.com/awsdocs/amazon-sagemaker-developer-guide/blob/master/doc_source/API_Channel.md) for parameters | No | No | List of Dicts | | |
-instance_type | The ML compute instance type | Yes | No | String | ml.m4.xlarge, ml.m4.2xlarge, ml.m4.4xlarge, ml.m4.10xlarge, ml.m4.16xlarge, ml.m5.large, ml.m5.xlarge, ml.m5.2xlarge, ml.m5.4xlarge, ml.m5.12xlarge, ml.m5.24xlarge, ml.c4.xlarge, ml.c4.2xlarge, ml.c4.4xlarge, ml.c4.8xlarge, ml.p2.xlarge, ml.p2.8xlarge, ml.p2.16xlarge, ml.p3.2xlarge, ml.p3.8xlarge, ml.p3.16xlarge, ml.c5.xlarge, ml.c5.2xlarge, ml.c5.4xlarge, ml.c5.9xlarge, ml.c5.18xlarge | ml.m4.xlarge |
+instance_type | The ML compute instance type | Yes | No | String | ml.m4.xlarge, ml.m4.2xlarge, ml.m4.4xlarge, ml.m4.10xlarge, ml.m4.16xlarge, ml.m5.large, ml.m5.xlarge, ml.m5.2xlarge, ml.m5.4xlarge, ml.m5.12xlarge, ml.m5.24xlarge, ml.c4.xlarge, ml.c4.2xlarge, ml.c4.4xlarge, ml.c4.8xlarge, ml.p2.xlarge, ml.p2.8xlarge, ml.p2.16xlarge, ml.p3.2xlarge, ml.p3.8xlarge, ml.p3.16xlarge, ml.c5.xlarge, ml.c5.2xlarge, ml.c5.4xlarge, ml.c5.9xlarge, ml.c5.18xlarge [and many more](https://aws.amazon.com/sagemaker/pricing/instance-types/) | ml.m4.xlarge |
 instance_count | The number of ML compute instances to use in each training job | Yes | Int | ≥ 1 | 1 |
 volume_size | The size of the ML storage volume that you want to provision in GB | Yes | Int | ≥ 1 | 30 |
 resource_encryption_key | The AWS KMS key that Amazon SageMaker uses to encrypt data on the storage volume attached to the ML compute instance(s) | Yes | String | | |
@@ -42,7 +42,7 @@ tags | Key-value pairs to categorize AWS resources | Yes | Dict | | {} |
 Stores the Model in the s3 bucket you specified
 
 # Example code
-Simple example pipeline with only Train component : [simple_train_pipeline](https://github.com/kubeflow/pipelines/tree/documents/samples/contrib/aws-samples/simple_train_pipeline)
+Simple example pipeline with only Train component : [simple_train_pipeline](https://github.com/kubeflow/pipelines/tree/master/samples/contrib/aws-samples/simple_train_pipeline)
 
 # Resources
 * [Using Amazon built-in algorithms](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html)

diff --git a/components/aws/sagemaker/train/src/train.py b/components/aws/sagemaker/train/src/train.py
index 448e86341b3..2b67d32b203 100644
--- a/components/aws/sagemaker/train/src/train.py
+++ b/components/aws/sagemaker/train/src/train.py
@@ -27,9 +27,7 @@ def create_parser():
     parser.add_argument('--training_input_mode', choices=['File', 'Pipe'], type=str.strip, help='The input mode that the algorithm supports. File or Pipe.', default='File')
     parser.add_argument('--hyperparameters', type=_utils.str_to_json_dict, help='Dictionary of hyperparameters for the the algorithm.', default='{}')
     parser.add_argument('--channels', type=_utils.str_to_json_list, required=True, help='A list of dicts specifying the input channels. Must have at least one.')
-    parser.add_argument('--instance_type', required=True, choices=['ml.m4.xlarge', 'ml.m4.2xlarge', 'ml.m4.4xlarge', 'ml.m4.10xlarge', 'ml.m4.16xlarge', 'ml.m5.large', 'ml.m5.xlarge', 'ml.m5.2xlarge', 'ml.m5.4xlarge',
-        'ml.m5.12xlarge', 'ml.m5.24xlarge', 'ml.c4.xlarge', 'ml.c4.2xlarge', 'ml.c4.4xlarge', 'ml.c4.8xlarge', 'ml.p2.xlarge', 'ml.p2.8xlarge', 'ml.p2.16xlarge', 'ml.p3.2xlarge', 'ml.p3.8xlarge', 'ml.p3.16xlarge',
-        'ml.c5.xlarge', 'ml.c5.2xlarge', 'ml.c5.4xlarge', 'ml.c5.9xlarge', 'ml.c5.18xlarge'], type=str.strip, help='The ML compute instance type.', default='ml.m4.xlarge')
+    parser.add_argument('--instance_type', required=True, type=str.strip, help='The ML compute instance type.')
     parser.add_argument('--instance_count', required=True, type=_utils.str_to_int, help='The registry path of the Docker image that contains the training algorithm.', default=1)
     parser.add_argument('--volume_size', type=_utils.str_to_int, required=True, help='The size of the ML storage volume that you want to provision.', default=1)
     parser.add_argument('--resource_encryption_key', type=str.strip, required=False, help='The AWS KMS key that Amazon SageMaker uses to encrypt data on the storage volume attached to the ML compute instance(s).', default='')

diff --git a/samples/contrib/aws-samples/README.md b/samples/contrib/aws-samples/README.md
new file mode 100644
index 00000000000..198a1c586c6
--- /dev/null
+++ b/samples/contrib/aws-samples/README.md
@@ -0,0 +1,209 @@
+# Sample AWS SageMaker Kubeflow Pipelines
+
+This folder contains many example pipelines which use [AWS SageMaker Components for KFP](https://github.com/kubeflow/pipelines/tree/master/components/aws/sagemaker). The following sections explain the setup you need to run these pipelines. Once you are done with the setup, [simple_train_pipeline](https://github.com/kubeflow/pipelines/tree/master/samples/contrib/aws-samples/simple_train_pipeline) is a good place to start if you have never used these components before.
+
+
+## Prerequisites
+
+1. You need a cluster with Kubeflow installed on it. [Install Kubeflow on AWS cluster](https://www.kubeflow.org/docs/aws/deploy/install-kubeflow/)
+2. Install the following on your local machine or EC2 instance (these are recommended tools; not all of them are required)
+   1. [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-install.html). If you are using an IAM user, configure your [Access Key ID, Secret Access Key](https://docs.aws.amazon.com/general/latest/gr/aws-sec-cred-types.html#access-keys-and-secret-access-keys) and preferred AWS Region by running:
+      `aws configure`
+   2. [aws-iam-authenticator](https://docs.aws.amazon.com/eks/latest/userguide/install-aws-iam-authenticator.html) version 0.1.31 and above
+   3. [eksctl](https://github.com/weaveworks/eksctl) version above 0.15
+   4. [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/#install-kubectl) version 1.17 and above
+   5. [KFP SDK](https://www.kubeflow.org/docs/pipelines/sdk/install-sdk/#install-the-kubeflow-pipelines-sdk) (installs the dsl-compile and kfp cli)
+   6. [jq](https://stedolan.github.io/jq/download/)
+
+
+## IAM Permissions
+
+You need two IAM Roles/Users
+1. For KFP pods to access AWS SageMaker. Let's call this one **kfp-pod-permissions**
+2. For the SageMaker job to access S3 buckets and other SageMaker services. Let's call this one **kfp-sagemaker-execution-permissions**
+
+### kfp-pod-permissions
+
+**Option 1]** [IAM roles for service accounts](https://docs.aws.amazon.com/eks/latest/userguide/iam-roles-for-service-accounts.html) (this is the recommended way)
+ 1. Enable OIDC support on the EKS cluster
+    ```
+    eksctl utils associate-iam-oidc-provider --cluster <cluster_name> \
+        --region <region> --approve
+    ```
+ 2. Take note of the OIDC issuer URL. This URL is in the form `oidc.eks.<region>.amazonaws.com/id/<OIDC_ID>`.
+    ```
+    aws eks describe-cluster --name <cluster_name> --query "cluster.identity.oidc.issuer" --output text
+    ```
+ 3. Create a file named trust.json with the following content.
+    Replace `OIDC_URL` with your OIDC issuer URL **(Don’t include https://)** and `AWS_ACC_NUM` with your AWS account number.
+    ```
+    # Replace these two with proper values
+    OIDC_URL=""
+    AWS_ACC_NUM=""
+    ```
+    ```
+    # Run this to create trust.json file
+    cat <<EOF > trust.json
+    {
+      "Version": "2012-10-17",
+      "Statement": [
+        {
+          "Effect": "Allow",
+          "Principal": {
+            "Federated": "arn:aws:iam::$AWS_ACC_NUM:oidc-provider/$OIDC_URL"
+          },
+          "Action": "sts:AssumeRoleWithWebIdentity",
+          "Condition": {
+            "StringEquals": {
+              "$OIDC_URL:aud": "sts.amazonaws.com",
+              "$OIDC_URL:sub": "system:serviceaccount:kubeflow:pipeline-runner"
+            }
+          }
+        }
+      ]
+    }
+    EOF
+    ```
+ 4. Create an IAM role using trust.json. Make a note of the ARN returned in the output.
+    ```
+    aws iam create-role --role-name kfp-example-pod-role --assume-role-policy-document file://trust.json
+    aws iam attach-role-policy --role-name kfp-example-pod-role --policy-arn arn:aws:iam::aws:policy/AmazonSageMakerFullAccess
+    aws iam get-role --role-name kfp-example-pod-role --output text --query 'Role.Arn'
+    ```
+ 5. Edit your pipeline-runner service account.
+    ```
+    kubectl edit -n kubeflow serviceaccount pipeline-runner
+    ```
+    Add `eks.amazonaws.com/role-arn: <role_arn>` to annotations, then save the file.
+    Example: **(add only line 5)**
+    ```
+    apiVersion: v1
+    kind: ServiceAccount
+    metadata:
+      annotations:
+        eks.amazonaws.com/role-arn: <role_arn>
+        kubectl.kubernetes.io/last-applied-configuration: |
+          {"apiVersion":"v1","kind":"ServiceAccount","metadata":{"annotations":{},"labels":{"app":"pipeline-runner","app.kubernetes.io/component":"pipelines-runner","app.kubernetes.io/instance":"pipelines-runner-0.2.0","app.kubernetes.io/managed-by":"kfctl","app.kubernetes.io/name":"pipelines-runner","app.kubernetes.io/part-of":"kubeflow","app.kubernetes.io/version":"0.2.0"},"name":"pipeline-runner","namespace":"kubeflow"}}
+      creationTimestamp: "2020-04-16T05:48:06Z"
+      labels:
+        app: pipeline-runner
+        app.kubernetes.io/component: pipelines-runner
+        app.kubernetes.io/instance: pipelines-runner-0.2.0
+        app.kubernetes.io/managed-by: kfctl
+        app.kubernetes.io/name: pipelines-runner
+        app.kubernetes.io/part-of: kubeflow
+        app.kubernetes.io/version: 0.2.0
+      name: pipeline-runner
+      namespace: kubeflow
+      resourceVersion: "11787"
+      selfLink: /api/v1/namespaces/kubeflow/serviceaccounts/pipeline-runner
+      uid: d86234bd-7fa5-11ea-a8f2-02934be6dc88
+    secrets:
+    - name: pipeline-runner-token-dkjrk
+    ```
+**Option 2]** Create an IAM User and store the credentials as an `aws-secret` in the kubernetes cluster. Then use those in the components.
+ 1. Create an IAM User with SageMaker permissions
+    ```
+    aws iam create-user --user-name kfp-example-pod-user
+    aws iam attach-user-policy --user-name kfp-example-pod-user --policy-arn arn:aws:iam::aws:policy/AmazonSageMakerFullAccess
+    aws iam create-access-key --user-name kfp-example-pod-user > /tmp/create_output.json
+    ```
+ 2. Convert the Access key and secret to base64.
+    ```
+    export AWS_ACCESS_KEY_ID_VALUE=$(jq -j .AccessKey.AccessKeyId /tmp/create_output.json | base64)
+    export AWS_SECRET_ACCESS_KEY_VALUE=$(jq -j .AccessKey.SecretAccessKey /tmp/create_output.json | base64)
+    ```
+ 3. Apply them to k8s cluster
+    ```
+    cat <<EOF | kubectl apply -f -
+    apiVersion: v1
+    kind: Secret
+    metadata:
+      name: aws-secret
+      namespace: kubeflow
+    type: Opaque
+    data:
+      AWS_ACCESS_KEY_ID: $AWS_ACCESS_KEY_ID_VALUE
+      AWS_SECRET_ACCESS_KEY: $AWS_SECRET_ACCESS_KEY_VALUE
+    EOF
+    ```

diff --git a/samples/contrib/aws-samples/ground_truth_pipeline_demo/README.md b/samples/contrib/aws-samples/ground_truth_pipeline_demo/README.md
--- a/samples/contrib/aws-samples/ground_truth_pipeline_demo/README.md
+++ b/samples/contrib/aws-samples/ground_truth_pipeline_demo/README.md
 
 > Note : Once you start a run on the pipeline you will receive the ground_truth labeling jobs at "Labeling portal sign-in URL" link
 
-## IAM Roles
-
-We need two IAM roles to run AWS KFP components. You only have to do this once. (Re-use the Role ARNs if you have done this before)
-
-**Role 1]** For KFP pods to access AWS SageMaker. Here are the steps to create it.
-1. Enable OIDC support on the EKS cluster
-   ```
-   eksctl utils associate-iam-oidc-provider --cluster <cluster_name> \
-       --region <region> --approve
-   ```
-2. Take note of the [OIDC](https://openid.net/connect/) issuer URL. This URL is in the form `oidc.eks.<region>.amazonaws.com/id/<OIDC_ID>`.
-   ```
-   aws eks describe-cluster --name <cluster_name> --query "cluster.identity.oidc.issuer" --output text
-   ```
-3. Create a file named trust.json with the following content.
-   Replace `<OIDC_URL>` with your OIDC issuer URL **(Don’t include https://)** and `<AWS_ACCOUNT_NUMBER>` with your AWS account number.
-   ```
-   {
-     "Version": "2012-10-17",
-     "Statement": [
-       {
-         "Effect": "Allow",
-         "Principal": {
-           "Federated": "arn:aws:iam::<AWS_ACCOUNT_NUMBER>:oidc-provider/<OIDC_URL>"
-         },
-         "Action": "sts:AssumeRoleWithWebIdentity",
-         "Condition": {
-           "StringEquals": {
-             "<OIDC_URL>:aud": "sts.amazonaws.com",
-             "<OIDC_URL>:sub": "system:serviceaccount:kubeflow:pipeline-runner"
-           }
-         }
-       }
-     ]
-   }
-   ```
-4. Create an IAM role using trust.json. Make a note of the ARN returned in the output.
-   ```
-   aws iam create-role --role-name kfp-example-pod-role --assume-role-policy-document file://trust.json
-   aws iam attach-role-policy --role-name kfp-example-pod-role --policy-arn arn:aws:iam::aws:policy/AmazonSageMakerFullAccess
-   aws iam get-role --role-name kfp-example-pod-role --output text --query 'Role.Arn'
-   ```
-5. Edit your pipeline-runner service account.
-   ```
-   kubectl edit -n kubeflow serviceaccount pipeline-runner
-   ```
-   Add `eks.amazonaws.com/role-arn: <role_arn>` to annotations, then save the file.
-   Example:
-   ```
-   apiVersion: v1
-   kind: ServiceAccount
-   metadata:
-     annotations:
-       eks.amazonaws.com/role-arn: <role_arn>
-       kubectl.kubernetes.io/last-applied-configuration: |
-         {"apiVersion":"v1","kind":"ServiceAccount","metadata":{"annotations":{},"labels":{"app":"pipeline-runner","app.kubernetes.io/component":"pipelines-runner","app.kubernetes.io/instance":"pipelines-runner-0.2.0","app.kubernetes.io/managed-by":"kfctl","app.kubernetes.io/name":"pipelines-runner","app.kubernetes.io/part-of":"kubeflow","app.kubernetes.io/version":"0.2.0"},"name":"pipeline-runner","namespace":"kubeflow"}}
-     creationTimestamp: "2020-04-16T05:48:06Z"
-     labels:
-       app: pipeline-runner
-       app.kubernetes.io/component: pipelines-runner
-       app.kubernetes.io/instance: pipelines-runner-0.2.0
-       app.kubernetes.io/managed-by: kfctl
-       app.kubernetes.io/name: pipelines-runner
-       app.kubernetes.io/part-of: kubeflow
-       app.kubernetes.io/version: 0.2.0
-     name: pipeline-runner
-     namespace: kubeflow
-     resourceVersion: "11787"
-     selfLink: /api/v1/namespaces/kubeflow/serviceaccounts/pipeline-runner
-     uid: d86234bd-7fa5-11ea-a8f2-02934be6dc88
-   secrets:
-   - name: pipeline-runner-token-dkjrk
-   ```
-**Role 2]** For the SageMaker job to access S3 buckets and other SageMaker services. This Role ARN is given as an input to the components.
-   ```
-   SAGEMAKER_EXECUTION_ROLE_NAME=kfp-example-sagemaker-execution-role
-
-   TRUST="{ \"Version\": \"2012-10-17\", \"Statement\": [ { \"Effect\": \"Allow\", \"Principal\": { \"Service\": \"sagemaker.amazonaws.com\" }, \"Action\": \"sts:AssumeRole\" } ] }"
-   aws iam create-role --role-name ${SAGEMAKER_EXECUTION_ROLE_NAME} --assume-role-policy-document "$TRUST"
-   aws iam attach-role-policy --role-name ${SAGEMAKER_EXECUTION_ROLE_NAME} --policy-arn arn:aws:iam::aws:policy/AmazonSageMakerFullAccess
-   aws iam attach-role-policy --role-name ${SAGEMAKER_EXECUTION_ROLE_NAME} --policy-arn arn:aws:iam::aws:policy/AmazonS3FullAccess
-
-   aws iam get-role --role-name ${SAGEMAKER_EXECUTION_ROLE_NAME} --output text --query 'Role.Arn'
-
-   # Note down the role ARN, which is of the form
-   arn:aws:iam::<AWS_ACCOUNT_NUMBER>:role/<role_name>
-   ```
 
 ## Compiling the pipeline template
 
diff --git a/samples/contrib/aws-samples/ground_truth_pipeline_demo/mini-image-classification-pipeline.py b/samples/contrib/aws-samples/ground_truth_pipeline_demo/mini-image-classification-pipeline.py
index 4f7d82d6c91..3f94bf76495 100644
--- a/samples/contrib/aws-samples/ground_truth_pipeline_demo/mini-image-classification-pipeline.py
+++ b/samples/contrib/aws-samples/ground_truth_pipeline_demo/mini-image-classification-pipeline.py
@@ -1,5 +1,8 @@
 #!/usr/bin/env python3
 
+# Uncomment the apply(use_aws_secret()) below if you are not using OIDC
+# more info : https://github.com/kubeflow/pipelines/tree/master/samples/contrib/aws-samples/README.md
+
 import kfp
 import json
 import copy
@@ -59,7 +62,7 @@ def ground_truth_test(region='us-west-2',
                       training_input_mode='Pipe',
                       training_hyperparameters='{"num_classes": "2", "num_training_samples": "14", "mini_batch_size": "2"}',
                       training_output_location='s3://your-bucket-name/mini-image-classification/training-output',
-                      training_instance_type='ml.p2.xlarge',
+                      training_instance_type='ml.m5.2xlarge',
                       training_instance_count='1',
                       training_volume_size='50',
                       training_max_run_time='3600',
@@ -73,7 +76,7 @@ def ground_truth_test(region='us-west-2',
         user_pool=user_pool,
         user_groups=user_groups,
         client_id=client_id
-    )
+    )#.apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
 
     ground_truth_train = sagemaker_gt_op(
         region=region,
@@ -93,7 +96,7 @@ def ground_truth_test(region='us-west-2',
         time_limit=ground_truth_time_limit,
         task_availibility=ground_truth_task_availibility,
         max_concurrent_tasks=ground_truth_max_concurrent_tasks
-    )
+    )#.apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
 
     ground_truth_validation = sagemaker_gt_op(
         region=region,
@@ -113,7 +116,7 @@ def ground_truth_test(region='us-west-2',
         time_limit=ground_truth_time_limit,
         task_availibility=ground_truth_task_availibility,
         max_concurrent_tasks=ground_truth_max_concurrent_tasks
-    )
+    )#.apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
 
     channelObj['ChannelName'] = 'train'
     channelObj['DataSource']['S3DataSource']['S3Uri'] = str(ground_truth_train.outputs['output_manifest_location'])
@@ -134,7 +137,8 @@ def ground_truth_test(region='us-west-2',
         max_run_time=training_max_run_time,
         model_artifact_path=training_output_location,
         role=role_arn
-    )
+    )#.apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
+
 
 if __name__ == '__main__':
     kfp.compiler.Compiler().compile(ground_truth_test, __file__ + '.zip')

diff --git a/samples/contrib/aws-samples/mnist-kmeans-sagemaker/README.md b/samples/contrib/aws-samples/mnist-kmeans-sagemaker/README.md
index 289ab2c0a77..3c54f9ec436 100644
--- a/samples/contrib/aws-samples/mnist-kmeans-sagemaker/README.md
+++ b/samples/contrib/aws-samples/mnist-kmeans-sagemaker/README.md
@@ -1,160 +1,10 @@
 The `mnist-classification-pipeline.py` sample runs a pipeline to train a classification model using Kmeans with the MNIST dataset on SageMaker.
 The `kmeans-hpo-pipeline.py` is a single-component hyperparameter optimisation pipeline which has default values set to use Kmeans.
 
-If you do not have `train_data`, `test_data`, and `valid_data` you can use the following code to get sample data which
-(This data can be used for both of these pipelines)
-## The sample dataset
+## Prerequisites
 
-This sample is based on the [Train a Model with a Built-in Algorithm and Deploy it](https://docs.aws.amazon.com/sagemaker/latest/dg/ex1.html).
-
-The sample trains and deploy a model based on the [MNIST dataset](http://www.deeplearning.net/tutorial/gettingstarted.html).
-
-
-Create an S3 bucket and use the following python script to copy `train_data`, `test_data`, and `valid_data.csv` to your buckets.
-(create the bucket in `us-west-2` region if you are gonna use default values of the pipeline)
-https://docs.aws.amazon.com/AmazonS3/latest/gsg/CreatingABucket.html
-
-Create a new file named `s3_sample_data_creator.py` with following content :
-```python
-import pickle, gzip, numpy, urllib.request, json
-from urllib.parse import urlparse
-
-# Load the dataset
-urllib.request.urlretrieve("http://deeplearning.net/data/mnist/mnist.pkl.gz", "mnist.pkl.gz")
-with gzip.open('mnist.pkl.gz', 'rb') as f:
-    train_set, valid_set, test_set = pickle.load(f, encoding='latin1')
-
-
-# Upload dataset to S3
-from sagemaker.amazon.common import write_numpy_to_dense_tensor
-import io
-import boto3
-
-###################################################################
-# This is the only thing that you need to change to run this code
-# Give the name of your S3 bucket
-bucket = 'bucket-name'
-
-# If you are gonna use the default values of the pipeline then
-# give a bucket name which is in us-west-2 region
-###################################################################
-
-train_data_key = 'mnist_kmeans_example/train_data'
-test_data_key = 'mnist_kmeans_example/test_data'
-train_data_location = 's3://{}/{}'.format(bucket, train_data_key)
-test_data_location = 's3://{}/{}'.format(bucket, test_data_key)
-print('training data will be uploaded to: {}'.format(train_data_location))
-print('training data will be uploaded to: {}'.format(test_data_location))
-
-# Convert the training data into the format required by the SageMaker KMeans algorithm
-buf = io.BytesIO()
-write_numpy_to_dense_tensor(buf, train_set[0], train_set[1])
-buf.seek(0)
-
-boto3.resource('s3').Bucket(bucket).Object(train_data_key).upload_fileobj(buf)
-
-# Convert the test data into the format required by the SageMaker KMeans algorithm
-write_numpy_to_dense_tensor(buf, test_set[0], test_set[1])
-buf.seek(0)
-
-boto3.resource('s3').Bucket(bucket).Object(test_data_key).upload_fileobj(buf)
-
-# Convert the valid data into the format required by the SageMaker KMeans algorithm
-numpy.savetxt('valid-data.csv', valid_set[0], delimiter=',', fmt='%g')
-s3_client = boto3.client('s3')
-input_key = "{}/valid_data.csv".format("mnist_kmeans_example/input")
-s3_client.upload_file('valid-data.csv', bucket, input_key)
-
-```
-
-Run this file `python s3_sample_data_creator.py`
-## IAM Roles
-
-We need two IAM roles to run AWS KFP components. You only have to do this once. (Re-use the Role ARNs if you have done this before)
-
-**Role 1]** For KFP pods to access AWS SageMaker. Here are the steps to create it.
-1. Enable OIDC support on the EKS cluster
-   ```
-   eksctl utils associate-iam-oidc-provider --cluster <cluster_name> \
-       --region <region> --approve
-   ```
-2. Take note of the [OIDC](https://openid.net/connect/) issuer URL. This URL is in the form `oidc.eks.<region>.amazonaws.com/id/<OIDC_ID>`.
-   ```
-   aws eks describe-cluster --name <cluster_name> --query "cluster.identity.oidc.issuer" --output text
-   ```
-3. Create a file named trust.json with the following content.
-   Replace `<OIDC_URL>` with your OIDC issuer URL **(Don’t include https://)** and `<AWS_ACCOUNT_NUMBER>` with your AWS account number.
-   ```
-   {
-     "Version": "2012-10-17",
-     "Statement": [
-       {
-         "Effect": "Allow",
-         "Principal": {
-           "Federated": "arn:aws:iam::<AWS_ACCOUNT_NUMBER>:oidc-provider/<OIDC_URL>"
-         },
-         "Action": "sts:AssumeRoleWithWebIdentity",
-         "Condition": {
-           "StringEquals": {
-             "<OIDC_URL>:aud": "sts.amazonaws.com",
-             "<OIDC_URL>:sub": "system:serviceaccount:kubeflow:pipeline-runner"
-           }
-         }
-       }
-     ]
-   }
-   ```
-4. Create an IAM role using trust.json. Make a note of the ARN returned in the output.
-   ```
-   aws iam create-role --role-name kfp-example-pod-role --assume-role-policy-document file://trust.json
-   aws iam attach-role-policy --role-name kfp-example-pod-role --policy-arn arn:aws:iam::aws:policy/AmazonSageMakerFullAccess
-   aws iam get-role --role-name kfp-example-pod-role --output text --query 'Role.Arn'
-   ```
-5. Edit your pipeline-runner service account.
-   ```
-   kubectl edit -n kubeflow serviceaccount pipeline-runner
-   ```
-   Add `eks.amazonaws.com/role-arn: <role_arn>` to annotations, then save the file.
-   Example:
-   ```
-   apiVersion: v1
-   kind: ServiceAccount
-   metadata:
-     annotations:
-       eks.amazonaws.com/role-arn: <role_arn>
-       kubectl.kubernetes.io/last-applied-configuration: |
-         {"apiVersion":"v1","kind":"ServiceAccount","metadata":{"annotations":{},"labels":{"app":"pipeline-runner","app.kubernetes.io/component":"pipelines-runner","app.kubernetes.io/instance":"pipelines-runner-0.2.0","app.kubernetes.io/managed-by":"kfctl","app.kubernetes.io/name":"pipelines-runner","app.kubernetes.io/part-of":"kubeflow","app.kubernetes.io/version":"0.2.0"},"name":"pipeline-runner","namespace":"kubeflow"}}
-     creationTimestamp: "2020-04-16T05:48:06Z"
-     labels:
-       app: pipeline-runner
-       app.kubernetes.io/component: pipelines-runner
-       app.kubernetes.io/instance: pipelines-runner-0.2.0
-       app.kubernetes.io/managed-by: kfctl
-       app.kubernetes.io/name: pipelines-runner
-       app.kubernetes.io/part-of: kubeflow
-       app.kubernetes.io/version: 0.2.0
-     name: pipeline-runner
-     namespace: kubeflow
-     resourceVersion: "11787"
-     selfLink: /api/v1/namespaces/kubeflow/serviceaccounts/pipeline-runner
-     uid: d86234bd-7fa5-11ea-a8f2-02934be6dc88
-   secrets:
-   - name: pipeline-runner-token-dkjrk
-   ```
-**Role 2]** For the SageMaker job to access S3 buckets and other SageMaker services. This Role ARN is given as an input to the components.
-   ```
-   SAGEMAKER_EXECUTION_ROLE_NAME=kfp-example-sagemaker-execution-role
-
-   TRUST="{ \"Version\": \"2012-10-17\", \"Statement\": [ { \"Effect\": \"Allow\", \"Principal\": { \"Service\": \"sagemaker.amazonaws.com\" }, \"Action\": \"sts:AssumeRole\" } ] }"
-   aws iam create-role --role-name ${SAGEMAKER_EXECUTION_ROLE_NAME}--assume-role-policy-document "$TRUST"
-   aws iam attach-role-policy --role-name ${SAGEMAKER_EXECUTION_ROLE_NAME} --policy-arn arn:aws:iam::aws:policy/AmazonSageMakerFullAccess
-   aws iam attach-role-policy --role-name ${SAGEMAKER_EXECUTION_ROLE_NAME} --policy-arn arn:aws:iam::aws:policy/AmazonS3FullAccess
-
-   aws iam get-role --role-name ${SAGEMAKER_EXECUTION_ROLE_NAME} --output text --query 'Role.Arn'
-
-   # Note down the role arn which is of the form
-   arn:aws:iam::<account_number>:role/<role_name>
-   ```
+Make sure you have the setup explained in this [README.md](https://github.com/kubeflow/pipelines/blob/master/samples/contrib/aws-samples/README.md)
 
 ## Compiling the pipeline template
 
@@ -165,6 +15,7 @@ Follow the guide to [building a pipeline](https://www.kubeflow.org/docs/guides/p
 dsl-compile --py mnist-classification-pipeline.py --output mnist-classification-pipeline.tar.gz
 ```
 
+
 ## Deploying the pipeline
 
 Open the Kubeflow pipelines UI. Create a new pipeline, and then upload the compiled specification (`.tar.gz` file) as a new pipeline template.
diff --git a/samples/contrib/aws-samples/mnist-kmeans-sagemaker/kmeans-hpo-pipeline.py b/samples/contrib/aws-samples/mnist-kmeans-sagemaker/kmeans-hpo-pipeline.py
index 1787ba6d0c6..6ddcd0ba1a3 100644
--- a/samples/contrib/aws-samples/mnist-kmeans-sagemaker/kmeans-hpo-pipeline.py
+++ b/samples/contrib/aws-samples/mnist-kmeans-sagemaker/kmeans-hpo-pipeline.py
@@ -1,5 +1,8 @@
 #!/usr/bin/env python3
 
+# Uncomment the apply(use_aws_secret()) below if you are not using OIDC
+# more info : https://github.com/kubeflow/pipelines/tree/master/samples/contrib/aws-samples/README.md
+
 import kfp
 import json
 import copy
@@ -39,7 +42,7 @@ name='MNIST HPO test pipeline',
     description='SageMaker hyperparameter tuning job test'
 )
-def hpo_test(region='us-west-2',
+def hpo_test(region='us-east-1',
              hpo_job_name='HPO-kmeans-sample',
              image='',
              algorithm_name='K-Means',
@@ -57,7 +60,7 @@ def hpo_test(region='us-west-2',
              channels=json.dumps(channelObjList),
              output_location='s3://kubeflow-pipeline-data/mnist_kmeans_example/output',
              output_encryption_key='',
-             instance_type='ml.p2.16xlarge',
+             instance_type='ml.m5.2xlarge',
              instance_count='1',
              volume_size='50',
              max_num_jobs='1',
@@ -115,7 +118,7 @@ def hpo_test(region='us-west-2',
         checkpoint_config=checkpoint_config,
         tags=tags,
         role=role_arn,
-    )
+    )#.apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
+
 
 if __name__ == '__main__':
     kfp.compiler.Compiler().compile(hpo_test, __file__ + '.zip')
diff --git a/samples/contrib/aws-samples/mnist-kmeans-sagemaker/mnist-classification-pipeline.py b/samples/contrib/aws-samples/mnist-kmeans-sagemaker/mnist-classification-pipeline.py
index f02d47b817d..5fae21088c0 100644
--- a/samples/contrib/aws-samples/mnist-kmeans-sagemaker/mnist-classification-pipeline.py
+++ b/samples/contrib/aws-samples/mnist-kmeans-sagemaker/mnist-classification-pipeline.py
@@ -1,5 +1,8 @@
 #!/usr/bin/env python3
 
+# Uncomment the apply(use_aws_secret()) below if you are not using OIDC
+# more info : https://github.com/kubeflow/pipelines/tree/master/samples/contrib/aws-samples/README.md
+
 import kfp
 import json
 import copy
@@ -45,8 +48,8 @@ name='MNIST Classification pipeline',
     description='MNIST Classification using KMEANS in SageMaker'
 )
-def mnist_classification(region='us-west-2',
-                         image='174872318107.dkr.ecr.us-west-2.amazonaws.com/kmeans:1',
+def mnist_classification(region='us-east-1',
+                         image='382416733822.dkr.ecr.us-east-1.amazonaws.com/kmeans:1',
                          training_input_mode='File',
                          hpo_strategy='Bayesian',
                          hpo_metric_name='test:msd',
@@ -62,11 +65,11 @@ def mnist_classification(region='us-west-2',
                          hpo_checkpoint_config='{}',
                          output_location='s3://kubeflow-pipeline-data/mnist_kmeans_example/output',
                          output_encryption_key='',
-                         instance_type='ml.p2.16xlarge',
+                         instance_type='ml.m5.2xlarge',
                          instance_count='1',
                          volume_size='50',
-                         hpo_max_num_jobs='9',
-                         hpo_max_parallel_jobs='3',
+                         hpo_max_num_jobs='1',
+                         hpo_max_parallel_jobs='1',
                          max_run_time='3600',
                          endpoint_url='',
                          network_isolation='True',
@@ -75,7 +78,7 @@ def mnist_classification(region='us-west-2',
                          train_spot_instance='False',
                          train_max_wait_time='3600',
                          train_checkpoint_config='{}',
-                         batch_transform_instance_type='ml.m4.xlarge',
+                         batch_transform_instance_type='ml.m5.2xlarge',
                          batch_transform_input='s3://kubeflow-pipeline-data/mnist_kmeans_example/input',
                          batch_transform_data_type='S3Prefix',
                          batch_transform_content_type='text/csv',
@@ -116,7 +119,7 @@ def mnist_classification(region='us-west-2',
         max_wait_time=hpo_max_wait_time,
         checkpoint_config=hpo_checkpoint_config,
         role=role_arn,
-    ).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
+    )#.apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
 
     training = sagemaker_train_op(
         region=region,
@@ -137,7 +140,7 @@ def mnist_classification(region='us-west-2',
         max_wait_time=train_max_wait_time,
         checkpoint_config=train_checkpoint_config,
         role=role_arn,
-    ).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
+    )#.apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
 
     create_model = sagemaker_model_op(
         region=region,
@@ -147,13 +150,13 @@ def mnist_classification(region='us-west-2',
         model_artifact_url=training.outputs['model_artifact_url'],
         network_isolation=network_isolation,
         role=role_arn
-    ).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
+    )#.apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
 
     prediction = sagemaker_deploy_op(
         region=region,
         endpoint_url=endpoint_url,
         model_name_1=create_model.output,
-    ).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
+    )#.apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
 
     batch_transform = sagemaker_batch_transform_op(
         region=region,
@@ -170,7 +173,7 @@ def mnist_classification(region='us-west-2',
         split_type=batch_transform_split_type,
         compression_type=batch_transform_compression_type,
         output_location=batch_transform_ouput
-    ).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
+    )#.apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
 
 if __name__ == '__main__':
     kfp.compiler.Compiler().compile(mnist_classification, __file__ + '.zip')
diff --git a/samples/contrib/aws-samples/simple_train_pipeline/README.md b/samples/contrib/aws-samples/simple_train_pipeline/README.md
index b875be29e38..319653b8e4a 100644
--- a/samples/contrib/aws-samples/simple_train_pipeline/README.md
+++ b/samples/contrib/aws-samples/simple_train_pipeline/README.md
@@ -2,134 +2,17 @@ An example pipeline with only [train component](https://github.com/kubeflow/pipelines/tree/master/components/aws/sagemaker/train).
-# Prerequisites
-1. Install Kubeflow on an EKS cluster in AWS. https://www.kubeflow.org/docs/aws/deploy/install-kubeflow/
-2. Get and store data in S3 buckets. You can get sample data using this code.
-   Create a new file `s3_sample_data_creator.py` with following content :
-   ```buildoutcfg
-   import io
-   import boto3
-   import pickle, gzip, numpy, urllib.request, json
-   from urllib.parse import urlparse
-   from sagemaker.amazon.common import write_numpy_to_dense_tensor
-
-   ###########################################################################################
-   # This is the only thing that you need to change in this code
-   # Give the name of your S3 bucket
-   # To use the example input below give a bucket name which is in us-east-1 region
-   bucket = ''
+## Prerequisites
-   ###########################################################################################
-
-   # Load the dataset
-   urllib.request.urlretrieve("http://deeplearning.net/data/mnist/mnist.pkl.gz", "mnist.pkl.gz")
-   with gzip.open('mnist.pkl.gz', 'rb') as f:
-       train_set, valid_set, test_set = pickle.load(f, encoding='latin1')
+Make sure you have the setup explained in this [README.md](https://github.com/kubeflow/pipelines/blob/master/samples/contrib/aws-samples/README.md)
-   # Upload dataset to S3
-   data_key = 'mnist_kmeans_example/data'
-   data_location = 's3://{}/{}'.format(bucket, data_key)
-   print('Data will be uploaded to: {}'.format(data_location))
-
-   # Convert the training data into the format required by the SageMaker KMeans algorithm
-   buf = io.BytesIO()
-   write_numpy_to_dense_tensor(buf, train_set[0], train_set[1])
-   buf.seek(0)
-
-   boto3.resource('s3').Bucket(bucket).Object(data_key).upload_fileobj(buf)
-   ```
-   Run this file `python s3_sample_data_creator.py`
-3. **Prepare IAM Roles**
-
-   We need two IAM roles to run AWS KFP components. You only have to do this once. (Re-use the Role ARNs if you have done this before)
-
-   **Role 1]** For KFP pods to access AWS Sagemaker. Here are the steps to create it.
-   i. Enable OIDC support on the EKS cluster
-      ```
-      eksctl utils associate-iam-oidc-provider --cluster <cluster_name> \
-      --region <cluster_region> --approve
-      ```
-   ii. Take note of the [OIDC](https://openid.net/connect/) issuer URL. This URL is in the form `oidc.eks.<region>.amazonaws.com/id/<OIDC_ID>` . Note down the URL.
-      ```
-      aws eks describe-cluster --name <cluster_name> --query "cluster.identity.oidc.issuer" --output text
-      ```
-   iii. Create a file named trust.json with the following content.
-      Replace `<OIDC_URL>` with your OIDC issuer URL **(Don’t include https://)** and `<AWS_account_number>` with your AWS account number.
-      ```
-      {
-        "Version": "2012-10-17",
-        "Statement": [
-          {
-            "Effect": "Allow",
-            "Principal": {
-              "Federated": "arn:aws:iam::<AWS_account_number>:oidc-provider/<OIDC_URL>"
-            },
-            "Action": "sts:AssumeRoleWithWebIdentity",
-            "Condition": {
-              "StringEquals": {
-                "<OIDC_URL>:aud": "sts.amazonaws.com",
-                "<OIDC_URL>:sub": "system:serviceaccount:kubeflow:pipeline-runner"
-              }
-            }
-          }
-        ]
-      }
-      ```
-   iv. Create an IAM role using trust.json. Make a note of the ARN returned in the output.
-      ```
-      aws iam create-role --role-name kfp-example-pod-role --assume-role-policy-document file://trust.json
-      aws iam attach-role-policy --role-name kfp-example-pod-role --policy-arn arn:aws:iam::aws:policy/AmazonSageMakerFullAccess
-      aws iam get-role --role-name kfp-example-pod-role --output text --query 'Role.Arn'
-      ```
-   v. Edit your pipeline-runner service account.
-      ```
-      kubectl edit -n kubeflow serviceaccount pipeline-runner
-      ```
-      Add `eks.amazonaws.com/role-arn: <role_arn>` to annotations, then save the file. Example:
-      ```
-      apiVersion: v1
-      kind: ServiceAccount
-      metadata:
-        annotations:
-          eks.amazonaws.com/role-arn: <role_arn>
-          kubectl.kubernetes.io/last-applied-configuration: |
-            {"apiVersion":"v1","kind":"ServiceAccount","metadata":{"annotations":{},"labels":{"app":"pipeline-runner","app.kubernetes.io/component":"pipelines-runner","app.kubernetes.io/instance":"pipelines-runner-0.2.0","app.kubernetes.io/managed-by":"kfctl","app.kubernetes.io/name":"pipelines-runner","app.kubernetes.io/part-of":"kubeflow","app.kubernetes.io/version":"0.2.0"},"name":"pipeline-runner","namespace":"kubeflow"}}
-        creationTimestamp: "2020-04-16T05:48:06Z"
-        labels:
-          app: pipeline-runner
-          app.kubernetes.io/component: pipelines-runner
-          app.kubernetes.io/instance: pipelines-runner-0.2.0
-          app.kubernetes.io/managed-by: kfctl
-          app.kubernetes.io/name: pipelines-runner
-          app.kubernetes.io/part-of: kubeflow
-          app.kubernetes.io/version: 0.2.0
-        name: pipeline-runner
-        namespace: kubeflow
-        resourceVersion: "11787"
-        selfLink: /api/v1/namespaces/kubeflow/serviceaccounts/pipeline-runner
-        uid: d86234bd-7fa5-11ea-a8f2-02934be6dc88
-      secrets:
-      - name: pipeline-runner-token-dkjrk
-      ```
-   **Role 2]** For sagemaker job to access S3 buckets and other Sagemaker services. This Role ARN is given as an input to the components.
-      ```
-      SAGEMAKER_EXECUTION_ROLE_NAME=kfp-example-sagemaker-execution-role
-
-      TRUST="{ \"Version\": \"2012-10-17\", \"Statement\": [ { \"Effect\": \"Allow\", \"Principal\": { \"Service\": \"sagemaker.amazonaws.com\" }, \"Action\": \"sts:AssumeRole\" } ] }"
-      aws iam create-role --role-name ${SAGEMAKER_EXECUTION_ROLE_NAME}--assume-role-policy-document "$TRUST"
-      aws iam attach-role-policy --role-name ${SAGEMAKER_EXECUTION_ROLE_NAME} --policy-arn arn:aws:iam::aws:policy/AmazonSageMakerFullAccess
-      aws iam attach-role-policy --role-name ${SAGEMAKER_EXECUTION_ROLE_NAME} --policy-arn arn:aws:iam::aws:policy/AmazonS3FullAccess
-
-      aws iam get-role --role-name ${SAGEMAKER_EXECUTION_ROLE_NAME} --output text --query 'Role.Arn'
-
-      # Note down the role arn which is of the form
-      arn:aws:iam::<account_number>:role/<role_name>
-5. Compile the pipeline:
+## Steps
+1. Compile the pipeline:
 `dsl-compile --py training-pipeline.py --output training-pipeline.tar.gz`
-6. In the Kubeflow UI, upload this compiled pipeline specification (the .tar.gz file) and click on create run.
-7. Once the pipeline completes, you can see the outputs under 'Output parameters' in the HPO component's Input/Output section.
+2. In the Kubeflow UI, upload this compiled pipeline specification (the .tar.gz file) and click on create run.
+3. Once the pipeline completes, you can see the outputs under 'Output parameters' in the training component's Input/Output section.
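
If you prefer to skip the UI, the compiled package can also be submitted with the KFP SDK. The sketch below is not part of the patched README; it assumes the KFP API is reachable at `localhost:8080` (for example via `kubectl port-forward -n kubeflow svc/ml-pipeline-ui 8080:80`) and that `role` is the pipeline parameter that expects the SageMaker execution Role ARN — adjust both for your cluster.

```python
# Sketch: start a run of the compiled pipeline programmatically.
import kfp

client = kfp.Client(host='http://localhost:8080')  # assumed port-forwarded endpoint
experiment = client.create_experiment(name='simple-train-demo')
run = client.run_pipeline(
    experiment_id=experiment.id,
    job_name='simple-train-run',
    pipeline_package_path='training-pipeline.tar.gz',
    # 'role' matches the parameter name in training-pipeline.py; the ARN is a placeholder.
    params={'role': 'arn:aws:iam::<account_number>:role/<sagemaker_execution_role>'},
)
print('Started run:', run.id)
```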
 Example inputs to this pipeline :
 ```buildoutcfg
@@ -145,7 +28,7 @@ channels : In this JSON, along with other parameters you need to pass the S3 Uri
         "ChannelName": "train",
         "DataSource": {
             "S3DataSource": {
-                "S3Uri": "s3://<your_bucket_name>/mnist_kmeans_example/data",
+                "S3Uri": "s3://<your_bucket_name>/mnist_kmeans_example/train_data",
                 "S3DataType": "S3Prefix",
                 "S3DataDistributionType": "FullyReplicated"
             }
@@ -157,7 +40,7 @@ channels : In this JSON, along with other parameters you need to pass the S3 Uri
         }
     }
 ]
 
-instance_type : ml.p2.xlarge
+instance_type : ml.m5.2xlarge
 instance_count : 1
 volume_size : 50
 max_run_time : 3600
diff --git a/samples/contrib/aws-samples/simple_train_pipeline/training-pipeline.py b/samples/contrib/aws-samples/simple_train_pipeline/training-pipeline.py
index aa308f1c545..2a9f6a0fc80 100644
--- a/samples/contrib/aws-samples/simple_train_pipeline/training-pipeline.py
+++ b/samples/contrib/aws-samples/simple_train_pipeline/training-pipeline.py
@@ -1,5 +1,8 @@
 #!/usr/bin/env python3
 
+# Uncomment the apply(use_aws_secret()) below if you are not using OIDC
+# more info : https://github.com/kubeflow/pipelines/tree/master/samples/contrib/aws-samples/README.md
+
 import kfp
 import json
 import copy
@@ -42,7 +45,7 @@ def training(
     training_input_mode='File',
     hyperparameters='{"k": "10", "feature_dim": "784"}',
     channels=json.dumps(channelObjList),
-    instance_type='ml.p2.xlarge',
+    instance_type='ml.m5.2xlarge',
     instance_count='1',
     volume_size='50',
     max_run_time='3600',
@@ -74,7 +77,7 @@ def training(
         max_wait_time=max_wait_time,
         checkpoint_config=checkpoint_config,
         role=role,
-    )
+    )#.apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
+
 
 if __name__ == '__main__':
     kfp.compiler.Compiler().compile(training, __file__ + '.zip')
diff --git a/samples/contrib/aws-samples/titanic-survival-prediction/titanic-survival-prediction.py b/samples/contrib/aws-samples/titanic-survival-prediction/titanic-survival-prediction.py
index ccd9fb3f909..647fda5345a 100644
--- a/samples/contrib/aws-samples/titanic-survival-prediction/titanic-survival-prediction.py
+++ b/samples/contrib/aws-samples/titanic-survival-prediction/titanic-survival-prediction.py
@@ -32,7 +32,7 @@ def titanic_suvival_prediction(region='us-west-2',
         instance_type=instance_type,
         instance_count=instance_count,
         log_s3_uri=log_s3_uri,
-    )
+    ).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
 
     training_and_prediction = emr_submit_spark_job_op(
         region=region,
@@ -42,13 +42,13 @@ def titanic_suvival_prediction(region='us-west-2',
         main_class=main_class,
         input=input,
         output=output
-    )
+    ).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
 
     delete_cluster = emr_delete_cluster_op(
         region=region,
         jobflow_id=create_cluster.output,
         dependent=training_and_prediction.outputs['job_id']
-    )
+    ).apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
 
 if __name__ == '__main__':
     kfp.compiler.Compiler().compile(titanic_suvival_prediction, __file__ + '.zip')

From cbb168bfc32dc9147bebc62b9372a42f4354f31c Mon Sep 17 00:00:00 2001
From: Kartik Kalamadi
Date: Thu, 14 May 2020 22:26:13 -0700
Subject: [PATCH 3/5] small changes to readme

---
 samples/contrib/aws-samples/README.md | 65 +++++++++++----------------
 1 file changed, 27 insertions(+), 38 deletions(-)

diff --git a/samples/contrib/aws-samples/README.md b/samples/contrib/aws-samples/README.md
index 198a1c586c6..04c927ba886 100644
--- a/samples/contrib/aws-samples/README.md
+++ b/samples/contrib/aws-samples/README.md
@@ -19,13 +19,10 @@ This folder contains many example pipelines which use [AWS SageMaker Components
 
 ## IAM Permissions
 
-You need two IAM Roles/Users
-1. For KFP pods to access AWS SageMaker. Let's call this one **kfp-pod-permissions**
-2. For SageMaker job to access S3 buckets and other SageMaker services. Let's call this one **kfp-sagemaker-execution-permissions**
+To use AWS KFP Components the KFP pods need access to AWS SageMaker.
+There are two ways you can give it access to SageMaker.
 
-### kfp-pod-permissions
-
-**Option 1]** [IAM roles for service account](https://docs.aws.amazon.com/eks/latest/userguide/iam-roles-for-service-accounts.html) (This is the recommended way)
+**Option 1]** (Recommended) [IAM roles for service account](https://docs.aws.amazon.com/eks/latest/userguide/iam-roles-for-service-accounts.html)
  1. Enable OIDC support on the EKS cluster
     ```
     eksctl utils associate-iam-oidc-provider --cluster <cluster_name> \
     --region <cluster_region> --approve
     ```
@@ -101,19 +98,10 @@ You need two IAM Roles/Users
    secrets:
    - name: pipeline-runner-token-dkjrk
    ```
-**option 2]** Create an IAM User and store the credentials as a `aws-secret` in kubernetes cluster. Then use those in the components.
-  1. Create an IAM User with SageMaker permissions
-     ```
-     aws iam create-user --user-name kfp-example-pod-user
-     aws iam attach-user-policy --user-name kfp-example-pod-user --policy-arn arn:aws:iam::aws:policy/AmazonSageMakerFullAccess
-     aws iam create-access-key --user-name kfp-example-pod-user > /tmp/create_output.json
-     ```
-  2. Convert the Access key and secret to base64.
-     ```
-     export AWS_ACCESS_KEY_ID_VALUE=$(jq -j .AccessKey.AccessKeyId /tmp/create_output.json | base64)
-     export AWS_SECRET_ACCESS_KEY_VALUE=$(jq -j .AccessKey.SecretAccessKey /tmp/create_output.json | base64)
-     ```
-  3. Apply them to k8s cluster
+**option 2]** Store the IAM credentials as a `aws-secret` in kubernetes cluster. Then use those in the components.
+  1. You need credentials for an IAM user with SageMakerFullAccess. Apply them to k8s cluster.
+     Replace `AWS_ACCESS_KEY_IN_BASE64` and `AWS_SECRET_ACCESS_IN_BASE64`.
+     > Note: To get base64 string you can do `echo -n $AWS_ACCESS_KEY_ID | base64`
     ```
     cat <<EOF | kubectl apply -f -
     apiVersion: v1
     kind: Secret
     metadata:
       name: aws-secret
       namespace: kubeflow
     type: Opaque
     data:
-      AWS_ACCESS_KEY_ID: $AWS_ACCESS_KEY_ID_VALUE
-      AWS_SECRET_ACCESS_KEY: $AWS_SECRET_ACCESS_KEY_VALUE
+      AWS_ACCESS_KEY_ID: <AWS_ACCESS_KEY_IN_BASE64>
+      AWS_SECRET_ACCESS_KEY: <AWS_SECRET_ACCESS_IN_BASE64>
     EOF
     ```
-  4. Use the credentials in pipeline code by adding this line to each component in your pipeline `.apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))`
+  2. Use the stored `aws-secret` in pipeline code by adding this line to each component in your pipeline `.apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))`
    [Example](https://github.com/kubeflow/pipelines/blob/master/samples/contrib/aws-samples/simple_train_pipeline/training-pipeline.py#L76) (uncomment this line)
-
-### kfp-sagemaker-execution-permissions
-
-Run these commands to create the sagemaker-execution-role.
-Note down the Role ARN. You need to give this Role ARN as input in pipeline.
-```
-TRUST="{ \"Version\": \"2012-10-17\", \"Statement\": [ { \"Effect\": \"Allow\", \"Principal\": { \"Service\": \"sagemaker.amazonaws.com\" }, \"Action\": \"sts:AssumeRole\" } ] }"
-aws iam create-role --role-name kfp-example-sagemaker-execution-role --assume-role-policy-document "$TRUST"
-aws iam attach-role-policy --role-name kfp-example-sagemaker-execution-role --policy-arn arn:aws:iam::aws:policy/AmazonSageMakerFullAccess
-aws iam attach-role-policy --role-name kfp-example-sagemaker-execution-role --policy-arn arn:aws:iam::aws:policy/AmazonS3FullAccess
-aws iam get-role --role-name kfp-example-sagemaker-execution-role --output text --query 'Role.Arn'
+## Inputs to the pipeline
 
-# note down the Role ARN.
-```
-
-
-## Sample Mnist dataset
+### Sample Mnist dataset
 
 Use the following python script to copy train_data, test_data, and valid_data to your bucket.
 [create a bucket](https://docs.aws.amazon.com/AmazonS3/latest/gsg/CreatingABucket.html) in `us-east-1` region if you don't have one already.
@@ -206,4 +180,19 @@ s3_client.upload_file('valid-data.csv', bucket, input_key)
 
 ```
 Run this file `python s3_sample_data_creator.py`
 
+### Role Input
+
+This role is used by SageMaker jobs created by the KFP to access the S3 buckets and other SageMaker resources.
+Run these commands to create the sagemaker-execution-role.
+Note down the Role ARN. You need to give this Role ARN as input in pipeline.
+
+```
+TRUST="{ \"Version\": \"2012-10-17\", \"Statement\": [ { \"Effect\": \"Allow\", \"Principal\": { \"Service\": \"sagemaker.amazonaws.com\" }, \"Action\": \"sts:AssumeRole\" } ] }"
+aws iam create-role --role-name kfp-example-sagemaker-execution-role --assume-role-policy-document "$TRUST"
+aws iam attach-role-policy --role-name kfp-example-sagemaker-execution-role --policy-arn arn:aws:iam::aws:policy/AmazonSageMakerFullAccess
+aws iam attach-role-policy --role-name kfp-example-sagemaker-execution-role --policy-arn arn:aws:iam::aws:policy/AmazonS3FullAccess
+aws iam get-role --role-name kfp-example-sagemaker-execution-role --output text --query 'Role.Arn'
+
+# note down the Role ARN.
+```

From 091a534fd408bc7c758cf3e70977f47fe0150845 Mon Sep 17 00:00:00 2001
From: Kartik Kalamadi
Date: Mon, 18 May 2020 12:01:56 -0700
Subject: [PATCH 4/5] nit change

---
 samples/contrib/aws-samples/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/samples/contrib/aws-samples/README.md b/samples/contrib/aws-samples/README.md
index 04c927ba886..d952c66cdda 100644
--- a/samples/contrib/aws-samples/README.md
+++ b/samples/contrib/aws-samples/README.md
@@ -182,7 +182,7 @@ Run this file `python s3_sample_data_creator.py`
 
 ### Role Input
 
-This role is used by SageMaker jobs created by the KFP to access the S3 buckets and other SageMaker resources.
+This role is used by SageMaker jobs created by the KFP to access the S3 buckets and other AWS resources.
 Run these commands to create the sagemaker-execution-role.
 Note down the Role ARN. You need to give this Role ARN as input in pipeline.
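
The Role Input section above drives everything through the AWS CLI. If you are scripting your setup, the same role can be created with boto3; the sketch below is an editorial illustration (not part of the patch series) that mirrors the create-role/attach-role-policy/get-role commands one-to-one, and assumes your default AWS credentials may create IAM roles.

```python
# Sketch: create the SageMaker execution role with boto3 instead of the AWS CLI.
import json
import boto3

ROLE_NAME = 'kfp-example-sagemaker-execution-role'  # same name the README uses

iam = boto3.client('iam')
trust = {
    "Version": "2012-10-17",
    "Statement": [{
        "Effect": "Allow",
        "Principal": {"Service": "sagemaker.amazonaws.com"},
        "Action": "sts:AssumeRole",
    }],
}

# Equivalent of `aws iam create-role ... --assume-role-policy-document "$TRUST"`
iam.create_role(RoleName=ROLE_NAME, AssumeRolePolicyDocument=json.dumps(trust))

# Equivalent of the two `aws iam attach-role-policy` calls
for policy_arn in ('arn:aws:iam::aws:policy/AmazonSageMakerFullAccess',
                   'arn:aws:iam::aws:policy/AmazonS3FullAccess'):
    iam.attach_role_policy(RoleName=ROLE_NAME, PolicyArn=policy_arn)

# Equivalent of `aws iam get-role ... --query 'Role.Arn'`; note down this ARN.
print(iam.get_role(RoleName=ROLE_NAME)['Role']['Arn'])
```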
From 67015ffd7c7fe91263fd24ba86d15a0b6ea98095 Mon Sep 17 00:00:00 2001
From: Kartik Kalamadi
Date: Wed, 20 May 2020 15:59:40 -0700
Subject: [PATCH 5/5] Address comments

---
 .../batch_transform/src/batch_transform.py |  4 +--
 components/aws/sagemaker/train/README.md   |  4 +--
 components/aws/sagemaker/train/src/train.py |  2 +-
 samples/contrib/aws-samples/README.md      | 31 +++++++++----------
 .../mini-image-classification-pipeline.py  | 11 +++----
 .../kmeans-hpo-pipeline.py                 |  4 +--
 .../mnist-classification-pipeline.py       | 12 +++----
 7 files changed, 29 insertions(+), 39 deletions(-)

diff --git a/components/aws/sagemaker/batch_transform/src/batch_transform.py b/components/aws/sagemaker/batch_transform/src/batch_transform.py
index b658dad730b..bc38b0cd1e7 100644
--- a/components/aws/sagemaker/batch_transform/src/batch_transform.py
+++ b/components/aws/sagemaker/batch_transform/src/batch_transform.py
@@ -44,9 +44,7 @@ def create_parser():
     parser.add_argument('--input_filter', type=str, required=False, help='A JSONPath expression used to select a portion of the input data to pass to the algorithm.', default='')
     parser.add_argument('--output_filter', type=str, required=False, help='A JSONPath expression used to select a portion of the joined dataset to save in the output file for a batch transform job.', default='')
     parser.add_argument('--join_source', choices=['None', 'Input', ''], type=str, required=False, help='Specifies the source of the data to join with the transformed data.', default='None')
-    parser.add_argument('--instance_type', choices=['ml.m4.xlarge', 'ml.m4.2xlarge', 'ml.m4.4xlarge', 'ml.m4.10xlarge', 'ml.m4.16xlarge', 'ml.m5.large', 'ml.m5.xlarge', 'ml.m5.2xlarge', 'ml.m5.4xlarge',
-        'ml.m5.12xlarge', 'ml.m5.24xlarge', 'ml.c4.xlarge', 'ml.c4.2xlarge', 'ml.c4.4xlarge', 'ml.c4.8xlarge', 'ml.p2.xlarge', 'ml.p2.8xlarge', 'ml.p2.16xlarge', 'ml.p3.2xlarge', 'ml.p3.8xlarge', 'ml.p3.16xlarge',
-        'ml.c5.xlarge', 'ml.c5.2xlarge', 'ml.c5.4xlarge', 'ml.c5.9xlarge', 'ml.c5.18xlarge'], type=str, required=True, help='The ML compute instance type for the transform job.', default='ml.m4.xlarge')
+    parser.add_argument('--instance_type', type=str, required=False, help='The ML compute instance type for the transform job.', default='ml.m4.xlarge')
     parser.add_argument('--instance_count', type=int, required=False, help='The number of ML compute instances to use in the transform job.')
     parser.add_argument('--resource_encryption_key', type=str, required=False, help='The AWS KMS key that Amazon SageMaker uses to encrypt data on the storage volume attached to the ML compute instance(s).', default='')
     parser.add_argument('--tags', type=_utils.yaml_or_json_str, required=False, help='An array of key-value pairs, to categorize AWS resources.', default={})
diff --git a/components/aws/sagemaker/train/README.md b/components/aws/sagemaker/train/README.md
index 40271c1a6d8..e8437f6d389 100644
--- a/components/aws/sagemaker/train/README.md
+++ b/components/aws/sagemaker/train/README.md
@@ -20,8 +20,8 @@ algorithm_name | The name of the algorithm resource to use for the hyperparamete
 metric_definitions | The dictionary of name-regex pairs specify the metrics that the algorithm emits | Yes | Dict | | {} |
 put_mode | The input mode that the algorithm supports | No | String | File, Pipe | File |
 hyperparameters | Hyperparameters for the selected algorithm | No | Dict | [Depends on Algo](https://docs.aws.amazon.com/sagemaker/latest/dg/k-means-api-config.html)| |
-channels | A list of dicts specifying the input channels (at least one); refer to [documentation](https://github.com/awsdocs/amazon-sagemaker-developer-guide/blob/master/doc_source/API_Channel.md) for parameters | No | No | List of Dicts | | |
-instance_type | The ML compute instance type | Yes | No | String | ml.m4.xlarge, ml.m4.2xlarge, ml.m4.4xlarge, ml.m4.10xlarge, ml.m4.16xlarge, ml.m5.large, ml.m5.xlarge, ml.m5.2xlarge, ml.m5.4xlarge, ml.m5.12xlarge, ml.m5.24xlarge, ml.c4.xlarge, ml.c4.2xlarge, ml.c4.4xlarge, ml.c4.8xlarge, ml.p2.xlarge, ml.p2.8xlarge, ml.p2.16xlarge, ml.p3.2xlarge, ml.p3.8xlarge, ml.p3.16xlarge, ml.c5.xlarge, ml.c5.2xlarge, ml.c5.4xlarge, ml.c5.9xlarge, ml.c5.18xlarge [and many more](https://aws.amazon.com/sagemaker/pricing/instance-types/) | ml.m4.xlarge |
+channels | A list of dicts specifying the input channels (at least one); refer to [documentation](https://github.com/awsdocs/amazon-sagemaker-developer-guide/blob/master/doc_source/API_Channel.md) for parameters | No | List of Dicts | | |
+instance_type | The ML compute instance type | Yes | String | ml.m4.xlarge, ml.m4.2xlarge, ml.m4.4xlarge, ml.m4.10xlarge, ml.m4.16xlarge, ml.m5.large, ml.m5.xlarge, ml.m5.2xlarge, ml.m5.4xlarge, ml.m5.12xlarge, ml.m5.24xlarge, ml.c4.xlarge, ml.c4.2xlarge, ml.c4.4xlarge, ml.c4.8xlarge, ml.p2.xlarge, ml.p2.8xlarge, ml.p2.16xlarge, ml.p3.2xlarge, ml.p3.8xlarge, ml.p3.16xlarge, ml.c5.xlarge, ml.c5.2xlarge, ml.c5.4xlarge, ml.c5.9xlarge, ml.c5.18xlarge [and many more](https://aws.amazon.com/sagemaker/pricing/instance-types/) | ml.m4.xlarge |
 instance_count | The number of ML compute instances to use in each training job | Yes | Int | ≥ 1 | 1 |
 volume_size | The size of the ML storage volume that you want to provision in GB | Yes | Int | ≥ 1 | 30 |
 resource_encryption_key | The AWS KMS key that Amazon SageMaker uses to encrypt data on the storage volume attached to the ML compute instance(s) | Yes | String | | |
diff --git a/components/aws/sagemaker/train/src/train.py b/components/aws/sagemaker/train/src/train.py
index 4aba9d3b294..1bede9930ac 100644
--- a/components/aws/sagemaker/train/src/train.py
+++ b/components/aws/sagemaker/train/src/train.py
@@ -28,7 +28,7 @@ def create_parser():
     parser.add_argument('--training_input_mode', choices=['File', 'Pipe'], type=str, help='The input mode that the algorithm supports. File or Pipe.', default='File')
     parser.add_argument('--hyperparameters', type=_utils.yaml_or_json_str, help='Dictionary of hyperparameters for the the algorithm.', default={})
     parser.add_argument('--channels', type=_utils.yaml_or_json_str, required=True, help='A list of dicts specifying the input channels. Must have at least one.')
-    parser.add_argument('--instance_type', required=True, type=str, help='The ML compute instance type.', default='ml.m4.xlarge')
+    parser.add_argument('--instance_type', required=False, type=str, help='The ML compute instance type.', default='ml.m4.xlarge')
     parser.add_argument('--instance_count', required=True, type=int, help='The registry path of the Docker image that contains the training algorithm.', default=1)
     parser.add_argument('--volume_size', type=int, required=True, help='The size of the ML storage volume that you want to provision.', default=1)
     parser.add_argument('--resource_encryption_key', type=str, required=False, help='The AWS KMS key that Amazon SageMaker uses to encrypt data on the storage volume attached to the ML compute instance(s).', default='')
diff --git a/samples/contrib/aws-samples/README.md b/samples/contrib/aws-samples/README.md
index d952c66cdda..fc4993231fd 100644
--- a/samples/contrib/aws-samples/README.md
+++ b/samples/contrib/aws-samples/README.md
@@ -1,6 +1,6 @@
 # Sample AWS SageMaker Kubeflow Pipelines
 
-This folder contains many example pipelines which use [AWS SageMaker Components for KFP](https://github.com/kubeflow/pipelines/tree/master/components/aws/sagemaker). Following sections explain the setup you need to run these pipelines. Once you are done with the setup, [simple_train_pipeline](https://github.com/kubeflow/pipelines/tree/master/samples/contrib/aws-samples/simple_train_pipeline) is a good place to start if you have never used these components before.
+This folder contains many example pipelines which use [AWS SageMaker Components for KFP](https://github.com/kubeflow/pipelines/tree/master/components/aws/sagemaker). The following sections explain the setup needed to run these pipelines. Once you are done with the setup, [simple_train_pipeline](https://github.com/kubeflow/pipelines/tree/master/samples/contrib/aws-samples/simple_train_pipeline) is a good place to start if you have never used these components before.
@@ -12,18 +12,18 @@ ...
    `aws configure`
 2. [aws-iam-authenticator](https://docs.aws.amazon.com/eks/latest/userguide/install-aws-iam-authenticator.html) version 0.1.31 and above
 3. [eksctl](https://github.com/weaveworks/eksctl) version above 0.15
- 4. [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/#install-kubectl) version 1.17 and above
+ 4. [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/#install-kubectl) version needs to be within +/- 1 minor version of your k8s version.
 5. [KFP SDK](https://www.kubeflow.org/docs/pipelines/sdk/install-sdk/#install-the-kubeflow-pipelines-sdk) (installs the dsl-compile and kfp cli)
- 6. [jq](https://stedolan.github.io/jq/download/)
 
 ## IAM Permissions
 
-To use AWS KFP Components the KFP pods need access to AWS SageMaker.
-There are two ways you can give it access to SageMaker.
+To use AWS KFP Components the KFP component pods need access to AWS SageMaker.
+There are two ways you can give them access to SageMaker.
+(You need an EKS cluster for Option 1)
 
-**Option 1]** (Recommended) [IAM roles for service account](https://docs.aws.amazon.com/eks/latest/userguide/iam-roles-for-service-accounts.html)
- 1. Enable OIDC support on the EKS cluster
+**Option 1** (Recommended) [IAM roles for service account](https://docs.aws.amazon.com/eks/latest/userguide/iam-roles-for-service-accounts.html).
+ 1. Enable OIDC support on EKS cluster
    ```
    eksctl utils associate-iam-oidc-provider --cluster <cluster_name> \
    --region <cluster_region> --approve
    ```
@@ -38,8 +38,7 @@ There are two ways you can give it access to SageMaker.
    # Replace these two with proper values
    OIDC_URL=""
    AWS_ACC_NUM=""
-   ```
-   ```
+
    # Run this to create trust.json file
    cat <<EOF > trust.json
    {
@@ -79,8 +78,6 @@ There are two ways you can give it access to SageMaker.
    metadata:
      annotations:
        eks.amazonaws.com/role-arn: <role_arn>
-       kubectl.kubernetes.io/last-applied-configuration: |
-         {"apiVersion":"v1","kind":"ServiceAccount","metadata":{"annotations":{},"labels":{"app":"pipeline-runner","app.kubernetes.io/component":"pipelines-runner","app.kubernetes.io/instance":"pipelines-runner-0.2.0","app.kubernetes.io/managed-by":"kfctl","app.kubernetes.io/name":"pipelines-runner","app.kubernetes.io/part-of":"kubeflow","app.kubernetes.io/version":"0.2.0"},"name":"pipeline-runner","namespace":"kubeflow"}}
     creationTimestamp: "2020-04-16T05:48:06Z"
     labels:
       app: pipeline-runner
@@ -98,7 +95,7 @@ There are two ways you can give it access to SageMaker.
    secrets:
    - name: pipeline-runner-token-dkjrk
    ```
-**option 2]** Store the IAM credentials as a `aws-secret` in kubernetes cluster. Then use those in the components.
+**Option 2** Store the IAM credentials as an `aws-secret` in the kubernetes cluster. Then use those in the components.
   1. You need credentials for an IAM user with SageMakerFullAccess. Apply them to k8s cluster.
      Replace `AWS_ACCESS_KEY_IN_BASE64` and `AWS_SECRET_ACCESS_IN_BASE64`.
      > Note: To get base64 string you can do `echo -n $AWS_ACCESS_KEY_ID | base64`
@@ -115,15 +112,17 @@ There are two ways you can give it access to SageMaker.
       AWS_SECRET_ACCESS_KEY: <AWS_SECRET_ACCESS_IN_BASE64>
     EOF
     ```
-  2. Use the stored `aws-secret` in pipeline code by adding this line to each component in your pipeline `.apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))`
-   [Example](https://github.com/kubeflow/pipelines/blob/master/samples/contrib/aws-samples/simple_train_pipeline/training-pipeline.py#L76) (uncomment this line)
+  2. Use the stored `aws-secret` in pipeline code by adding this line to each component in your pipeline `.apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))`
+   [Kubeflow Document](https://www.kubeflow.org/docs/aws/pipeline/)
+   [Example Code](https://github.com/kubeflow/pipelines/blob/master/samples/contrib/aws-samples/simple_train_pipeline/training-pipeline.py#L76) (uncomment this line)
 
 ## Inputs to the pipeline
 
-### Sample Mnist dataset
+### Sample MNIST dataset
 
 Use the following python script to copy train_data, test_data, and valid_data to your bucket.
-[create a bucket](https://docs.aws.amazon.com/AmazonS3/latest/gsg/CreatingABucket.html) in `us-east-1` region if you don't have one already.
+[Create a bucket](https://docs.aws.amazon.com/AmazonS3/latest/gsg/CreatingABucket.html) in `us-east-1` region if you don't have one already.
+For the purposes of this demonstration, all resources will be created in the us-east-1 region.
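
As an aside, bucket creation itself can also be scripted. A minimal boto3 sketch (not part of the patched README; the bucket name below is a placeholder you must change, since S3 names are globally unique):

```python
# Sketch: create the us-east-1 bucket for the sample data programmatically.
import boto3

s3 = boto3.client('s3', region_name='us-east-1')
# us-east-1 is the default region, so no CreateBucketConfiguration is needed here.
s3.create_bucket(Bucket='your-unique-bucket-name')
```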
 Create a new file named s3_sample_data_creator.py with following content :
diff --git a/samples/contrib/aws-samples/ground_truth_pipeline_demo/mini-image-classification-pipeline.py b/samples/contrib/aws-samples/ground_truth_pipeline_demo/mini-image-classification-pipeline.py
index eb24e2462c7..bbd3d33f7c2 100644
--- a/samples/contrib/aws-samples/ground_truth_pipeline_demo/mini-image-classification-pipeline.py
+++ b/samples/contrib/aws-samples/ground_truth_pipeline_demo/mini-image-classification-pipeline.py
@@ -1,8 +1,5 @@
 #!/usr/bin/env python3
 
-# Uncomment the apply(use_aws_secret()) below if you are not using OIDC
-# more info : https://github.com/kubeflow/pipelines/tree/master/samples/contrib/aws-samples/README.md
-
 import kfp
 import json
 import copy
@@ -76,7 +73,7 @@ def ground_truth_test(region='us-west-2',
         user_pool=user_pool,
         user_groups=user_groups,
         client_id=client_id
-    )#.apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
+    )
 
     ground_truth_train = sagemaker_gt_op(
         region=region,
@@ -96,7 +93,7 @@ def ground_truth_test(region='us-west-2',
         time_limit=ground_truth_time_limit,
         task_availibility=ground_truth_task_availibility,
         max_concurrent_tasks=ground_truth_max_concurrent_tasks
-    )#.apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
+    )
 
     ground_truth_validation = sagemaker_gt_op(
         region=region,
@@ -116,7 +113,7 @@ def ground_truth_test(region='us-west-2',
         time_limit=ground_truth_time_limit,
         task_availibility=ground_truth_task_availibility,
         max_concurrent_tasks=ground_truth_max_concurrent_tasks
-    )#.apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
+    )
 
     channelObj['ChannelName'] = 'train'
     channelObj['DataSource']['S3DataSource']['S3Uri'] = str(ground_truth_train.outputs['output_manifest_location'])
@@ -137,7 +134,7 @@ def ground_truth_test(region='us-west-2',
         max_run_time=training_max_run_time,
         model_artifact_path=training_output_location,
         role=role_arn
-    )#.apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
+    )
 
 
 if __name__ == '__main__':
diff --git a/samples/contrib/aws-samples/mnist-kmeans-sagemaker/kmeans-hpo-pipeline.py b/samples/contrib/aws-samples/mnist-kmeans-sagemaker/kmeans-hpo-pipeline.py
index 5849409f201..327845d77cb 100644
--- a/samples/contrib/aws-samples/mnist-kmeans-sagemaker/kmeans-hpo-pipeline.py
+++ b/samples/contrib/aws-samples/mnist-kmeans-sagemaker/kmeans-hpo-pipeline.py
@@ -1,7 +1,5 @@
 #!/usr/bin/env python3
 
-# Uncomment the apply(use_aws_secret()) below if you are not using OIDC
-# more info : https://github.com/kubeflow/pipelines/tree/master/samples/contrib/aws-samples/README.md
 
 import kfp
 import json
@@ -117,7 +115,7 @@ def hpo_test(region='us-east-1',
         checkpoint_config=checkpoint_config,
         tags=tags,
         role=role_arn,
-    )#.apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
+    )
 
 
 if __name__ == '__main__':
diff --git a/samples/contrib/aws-samples/mnist-kmeans-sagemaker/mnist-classification-pipeline.py b/samples/contrib/aws-samples/mnist-kmeans-sagemaker/mnist-classification-pipeline.py
index 4d7fe83d373..ab9a2f2015b 100644
--- a/samples/contrib/aws-samples/mnist-kmeans-sagemaker/mnist-classification-pipeline.py
+++ b/samples/contrib/aws-samples/mnist-kmeans-sagemaker/mnist-classification-pipeline.py
@@ -1,7 +1,5 @@
 #!/usr/bin/env python3
 
-# Uncomment the apply(use_aws_secret()) below if you are not using OIDC
-# more info : https://github.com/kubeflow/pipelines/tree/master/samples/contrib/aws-samples/README.md
 
 import kfp
 import json
@@ -118,7 +116,7 @@ def mnist_classification(region='us-east-1',
         max_wait_time=hpo_max_wait_time,
         checkpoint_config=hpo_checkpoint_config,
         role=role_arn,
-    )#.apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
+    )
 
     training = sagemaker_train_op(
         region=region,
@@ -139,7 +137,7 @@ def mnist_classification(region='us-east-1',
         max_wait_time=train_max_wait_time,
         checkpoint_config=train_checkpoint_config,
         role=role_arn,
-    )#.apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
+    )
 
     create_model = sagemaker_model_op(
         region=region,
@@ -149,13 +147,13 @@ def mnist_classification(region='us-east-1',
         model_artifact_url=training.outputs['model_artifact_url'],
         network_isolation=network_isolation,
         role=role_arn
-    )#.apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
+    )
 
     prediction = sagemaker_deploy_op(
         region=region,
         endpoint_url=endpoint_url,
         model_name_1=create_model.output,
-    )#.apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
+    )
 
     batch_transform = sagemaker_batch_transform_op(
         region=region,
@@ -172,7 +170,7 @@ def mnist_classification(region='us-east-1',
         split_type=batch_transform_split_type,
         compression_type=batch_transform_compression_type,
         output_location=batch_transform_ouput
-    )#.apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))
+    )
 
 if __name__ == '__main__':
     kfp.compiler.Compiler().compile(mnist_classification, __file__ + '.zip')
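
Since this final patch strips the commented-out `use_aws_secret` hints from the samples, here is a self-contained sketch of what re-attaching the secret looks like on a non-OIDC cluster. It is an editorial illustration, not part of the patch series: `use_aws_secret` works on any ContainerOp, so a trivial op stands in for the SageMaker components above.

```python
#!/usr/bin/env python3
# Sketch: re-attach the k8s aws-secret (Option 2 in the shared README)
# to a pipeline step when OIDC/IRSA is not available.
import kfp
from kfp import dsl
from kfp.aws import use_aws_secret


@dsl.pipeline(name='aws-secret-demo', description='Attach aws-secret env vars to a step')
def aws_secret_demo():
    step = dsl.ContainerOp(
        name='show-identity',
        image='amazon/aws-cli',
        command=['aws', 'sts', 'get-caller-identity'],
    )
    # Injects AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY from the 'aws-secret'
    # Kubernetes secret into the step's environment:
    step.apply(use_aws_secret('aws-secret', 'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'))


if __name__ == '__main__':
    kfp.compiler.Compiler().compile(aws_secret_demo, __file__ + '.zip')
```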