From d1b6684cf474b2c847e95ab719e37db12fc187cf Mon Sep 17 00:00:00 2001 From: sitecao Date: Thu, 26 May 2022 19:07:44 +0000 Subject: [PATCH 1/3] add updates to pass account id --- .../data_parallel/bert/DeepLearningExamples | 1 - .../pytorch/data_parallel/bert/Dockerfile | 3 ++- .../pytorch/data_parallel/bert/build_and_push.sh | 11 ++++++----- .../bert/pytorch_smdataparallel_bert_demo.ipynb | 4 ++-- .../pytorch/data_parallel/maskrcnn/Dockerfile | 3 ++- .../data_parallel/maskrcnn/build_and_push.sh | 11 ++++++----- .../pytorch_smdataparallel_maskrcnn_demo.ipynb | 14 +++++--------- 7 files changed, 23 insertions(+), 24 deletions(-) delete mode 160000 training/distributed_training/pytorch/data_parallel/bert/DeepLearningExamples diff --git a/training/distributed_training/pytorch/data_parallel/bert/DeepLearningExamples b/training/distributed_training/pytorch/data_parallel/bert/DeepLearningExamples deleted file mode 160000 index 4fdf2e6673..0000000000 --- a/training/distributed_training/pytorch/data_parallel/bert/DeepLearningExamples +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 4fdf2e6673513f04b25a1d2bb4539c479667cf5d diff --git a/training/distributed_training/pytorch/data_parallel/bert/Dockerfile b/training/distributed_training/pytorch/data_parallel/bert/Dockerfile index a437e1b469..ed0a0b633a 100644 --- a/training/distributed_training/pytorch/data_parallel/bert/Dockerfile +++ b/training/distributed_training/pytorch/data_parallel/bert/Dockerfile @@ -1,6 +1,7 @@ +ARG dlc_account_id ARG region -FROM 763104351884.dkr.ecr.${region}.amazonaws.com/pytorch-training:1.10.2-gpu-py38-cu113-ubuntu20.04-sagemaker +FROM ${dlc_account_id}.dkr.ecr.${region}.amazonaws.com/pytorch-training:1.10.2-gpu-py38-cu113-ubuntu20.04-sagemaker ARG WORK_DIR="apex_build" RUN pip --no-cache-dir --no-cache install h5py boto3 'git+https://github.com/NVIDIA/dllogger' tqdm requests; \ diff --git a/training/distributed_training/pytorch/data_parallel/bert/build_and_push.sh b/training/distributed_training/pytorch/data_parallel/bert/build_and_push.sh index 4dfd8ae224..0c05b77c14 100644 --- a/training/distributed_training/pytorch/data_parallel/bert/build_and_push.sh +++ b/training/distributed_training/pytorch/data_parallel/bert/build_and_push.sh @@ -7,10 +7,11 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" -if [ "$#" -eq 3 ]; then - region=$1 - image=$2 - tag=$3 +if [ "$#" -eq 4 ]; then + dlc_account_id=$1 + region=$2 + image=$3 + tag=$4 else echo "usage: $0 $1 $2 " exit 1 @@ -34,7 +35,7 @@ fi aws ecr get-login-password --region ${region} | docker login --username AWS --password-stdin ${account}.dkr.ecr.${region}.amazonaws.com -docker build ${DIR}/ -t ${image} -f ${DIR}/Dockerfile --build-arg region=${region} +docker build ${DIR}/ -t ${image} -f ${DIR}/Dockerfile --build-arg dlc_account_id=${dlc_account_id} region=${region} docker tag ${image} ${fullname} docker push ${fullname} if [ $? -eq 0 ]; then diff --git a/training/distributed_training/pytorch/data_parallel/bert/pytorch_smdataparallel_bert_demo.ipynb b/training/distributed_training/pytorch/data_parallel/bert/pytorch_smdataparallel_bert_demo.ipynb index bea16e6b1a..212f303f30 100644 --- a/training/distributed_training/pytorch/data_parallel/bert/pytorch_smdataparallel_bert_demo.ipynb +++ b/training/distributed_training/pytorch/data_parallel/bert/pytorch_smdataparallel_bert_demo.ipynb @@ -135,7 +135,7 @@ "outputs": [], "source": [ "%%time\n", - "! chmod +x build_and_push.sh; bash build_and_push.sh {region} {image} {tag}" + "! chmod +x build_and_push.sh; bash build_and_push.sh {account} {region} {image} {tag}" ] }, { @@ -330,7 +330,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.10" + "version": "3.6.13" } }, "nbformat": 4, diff --git a/training/distributed_training/pytorch/data_parallel/maskrcnn/Dockerfile b/training/distributed_training/pytorch/data_parallel/maskrcnn/Dockerfile index 0336c2d09a..6c2a9c7db5 100644 --- a/training/distributed_training/pytorch/data_parallel/maskrcnn/Dockerfile +++ b/training/distributed_training/pytorch/data_parallel/maskrcnn/Dockerfile @@ -1,6 +1,7 @@ +ARG dlc_account_id ARG region -FROM 763104351884.dkr.ecr.${region}.amazonaws.com/pytorch-training:1.10.2-gpu-py38-cu113-ubuntu20.04-sagemaker +FROM ${dlc_account_id}.dkr.ecr.${region}.amazonaws.com/pytorch-training:1.10.2-gpu-py38-cu113-ubuntu20.04-sagemaker RUN pip install ninja yacs cython matplotlib tqdm opencv-python pybind11==2.5.0 'git+https://github.com/NVIDIA/dllogger' diff --git a/training/distributed_training/pytorch/data_parallel/maskrcnn/build_and_push.sh b/training/distributed_training/pytorch/data_parallel/maskrcnn/build_and_push.sh index d61e90516e..bd8378bf16 100644 --- a/training/distributed_training/pytorch/data_parallel/maskrcnn/build_and_push.sh +++ b/training/distributed_training/pytorch/data_parallel/maskrcnn/build_and_push.sh @@ -7,10 +7,11 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" -if [ "$#" -eq 3 ]; then - region=$1 - image=$2 - tag=$3 +if [ "$#" -eq 4 ]; then + dlc_account_id=$1 + region=$2 + image=$3 + tag=$4 else echo "usage: $0 $1 $2 " exit 1 @@ -35,7 +36,7 @@ if [ $? -ne 0 ]; then fi $(aws ecr get-login --no-include-email --region ${region} --registry-ids 763104351884) -docker build ${DIR}/ -t ${image} -f ${DIR}/Dockerfile --build-arg region=${region} +docker build ${DIR}/ -t ${image} -f ${DIR}/Dockerfile --build-arg dlc_account_id=${dlc_account_id} region=${region} docker tag ${image} ${fullname} # Get the login command from ECR and execute it directly diff --git a/training/distributed_training/pytorch/data_parallel/maskrcnn/pytorch_smdataparallel_maskrcnn_demo.ipynb b/training/distributed_training/pytorch/data_parallel/maskrcnn/pytorch_smdataparallel_maskrcnn_demo.ipynb index ce0784ec13..1fc900724a 100644 --- a/training/distributed_training/pytorch/data_parallel/maskrcnn/pytorch_smdataparallel_maskrcnn_demo.ipynb +++ b/training/distributed_training/pytorch/data_parallel/maskrcnn/pytorch_smdataparallel_maskrcnn_demo.ipynb @@ -137,7 +137,7 @@ "outputs": [], "source": [ "%%time\n", - "! chmod +x build_and_push.sh; bash build_and_push.sh {region} {image} {tag}" + "! chmod +x build_and_push.sh; bash build_and_push.sh {account} {region} {image} {tag}" ] }, { @@ -299,13 +299,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3.7.4 64-bit ('base': conda)", - "metadata": { - "interpreter": { - "hash": "dca0ade3e726a953b501b15e8e990130d2b7799f14cfd9f4271676035ebe5511" - } - }, - "name": "python3" + "display_name": "conda_amazonei_pytorch_latest_p36", + "language": "python", + "name": "conda_amazonei_pytorch_latest_p36" }, "language_info": { "codemirror_mode": { @@ -317,7 +313,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.4" + "version": "3.6.13" } }, "nbformat": 4, From 4f73ba133758ecfe97f76c9644e9bf442d8368d3 Mon Sep 17 00:00:00 2001 From: sitecao Date: Thu, 26 May 2022 19:32:13 +0000 Subject: [PATCH 2/3] notebook changes --- .../data_parallel/bert/build_and_push.sh | 24 ++++++++++++------- .../pytorch_smdataparallel_bert_demo.ipynb | 23 ++++++++++++++++-- .../data_parallel/maskrcnn/build_and_push.sh | 16 ++++++++----- ...pytorch_smdataparallel_maskrcnn_demo.ipynb | 13 ++++++++-- 4 files changed, 58 insertions(+), 18 deletions(-) mode change 100644 => 100755 training/distributed_training/pytorch/data_parallel/bert/build_and_push.sh diff --git a/training/distributed_training/pytorch/data_parallel/bert/build_and_push.sh b/training/distributed_training/pytorch/data_parallel/bert/build_and_push.sh old mode 100644 new mode 100755 index 0c05b77c14..e8ccf3115f --- a/training/distributed_training/pytorch/data_parallel/bert/build_and_push.sh +++ b/training/distributed_training/pytorch/data_parallel/bert/build_and_push.sh @@ -7,40 +7,48 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +# Change the active directory to the one that contains the training script +cd ${DIR}/SMDDP-Examples/pytorch/image_classification + if [ "$#" -eq 4 ]; then dlc_account_id=$1 region=$2 image=$3 tag=$4 else - echo "usage: $0 $1 $2 " + echo "usage: $0 $1 $2 $3 " exit 1 fi + # Get the account number associated with the current IAM credentials account=$(aws sts get-caller-identity --query Account --output text) + if [ $? -ne 0 ] then exit 255 fi + fullname="${account}.dkr.ecr.${region}.amazonaws.com/${image}:${tag}" + # If the repository doesn't exist in ECR, create it. aws ecr describe-repositories --region ${region} --repository-names "${image}" > /dev/null 2>&1 if [ $? -ne 0 ]; then + echo "creating ECR repository : ${fullname} " aws ecr create-repository --region ${region} --repository-name "${image}" > /dev/null fi -# Build the docker image locally with the image name and then push it to ECR -# with the full name. -# login ECR for the current account - -aws ecr get-login-password --region ${region} | docker login --username AWS --password-stdin ${account}.dkr.ecr.${region}.amazonaws.com -docker build ${DIR}/ -t ${image} -f ${DIR}/Dockerfile --build-arg dlc_account_id=${dlc_account_id} region=${region} +aws ecr get-login-password --region ${region} \ +| docker login --username AWS --password-stdin ${account}.dkr.ecr.${region}.amazonaws.com +docker build . -t ${image} -f ${DIR}/Dockerfile --build-arg dlc_account_id=${dlc_account_id} --build-arg region=${region} docker tag ${image} ${fullname} + +aws ecr get-login-password --region ${region} \ +| docker login --username AWS --password-stdin ${account}.dkr.ecr.${region}.amazonaws.com docker push ${fullname} if [ $? -eq 0 ]; then echo "Amazon ECR URI: ${fullname}" else echo "Error: Image build and push failed" exit 1 -fi +fi \ No newline at end of file diff --git a/training/distributed_training/pytorch/data_parallel/bert/pytorch_smdataparallel_bert_demo.ipynb b/training/distributed_training/pytorch/data_parallel/bert/pytorch_smdataparallel_bert_demo.ipynb index 212f303f30..250fee671d 100644 --- a/training/distributed_training/pytorch/data_parallel/bert/pytorch_smdataparallel_bert_demo.ipynb +++ b/training/distributed_training/pytorch/data_parallel/bert/pytorch_smdataparallel_bert_demo.ipynb @@ -128,6 +128,25 @@ "!pygmentize ./build_and_push.sh" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dlc_account_id = 763104351884 # By default, set the account ID used for most regions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {dlc_account_id}.dkr.ecr.{region}.amazonaws.com\n", + "! chmod +x build_and_push.sh; bash build_and_push.sh {dlc_account_id} {region} {image} {tag}" + ] + }, { "cell_type": "code", "execution_count": null, @@ -316,9 +335,9 @@ ], "metadata": { "kernelspec": { - "display_name": "conda_pytorch_p36", + "display_name": "conda_amazonei_pytorch_latest_p36", "language": "python", - "name": "conda_pytorch_p36" + "name": "conda_amazonei_pytorch_latest_p36" }, "language_info": { "codemirror_mode": { diff --git a/training/distributed_training/pytorch/data_parallel/maskrcnn/build_and_push.sh b/training/distributed_training/pytorch/data_parallel/maskrcnn/build_and_push.sh index bd8378bf16..e8ccf3115f 100644 --- a/training/distributed_training/pytorch/data_parallel/maskrcnn/build_and_push.sh +++ b/training/distributed_training/pytorch/data_parallel/maskrcnn/build_and_push.sh @@ -7,13 +7,16 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +# Change the active directory to the one that contains the training script +cd ${DIR}/SMDDP-Examples/pytorch/image_classification + if [ "$#" -eq 4 ]; then dlc_account_id=$1 region=$2 image=$3 tag=$4 else - echo "usage: $0 $1 $2 " + echo "usage: $0 $1 $2 $3 " exit 1 fi @@ -35,16 +38,17 @@ if [ $? -ne 0 ]; then aws ecr create-repository --region ${region} --repository-name "${image}" > /dev/null fi -$(aws ecr get-login --no-include-email --region ${region} --registry-ids 763104351884) -docker build ${DIR}/ -t ${image} -f ${DIR}/Dockerfile --build-arg dlc_account_id=${dlc_account_id} region=${region} +aws ecr get-login-password --region ${region} \ +| docker login --username AWS --password-stdin ${account}.dkr.ecr.${region}.amazonaws.com +docker build . -t ${image} -f ${DIR}/Dockerfile --build-arg dlc_account_id=${dlc_account_id} --build-arg region=${region} docker tag ${image} ${fullname} -# Get the login command from ECR and execute it directly -$(aws ecr get-login --region ${region} --no-include-email) +aws ecr get-login-password --region ${region} \ +| docker login --username AWS --password-stdin ${account}.dkr.ecr.${region}.amazonaws.com docker push ${fullname} if [ $? -eq 0 ]; then echo "Amazon ECR URI: ${fullname}" else echo "Error: Image build and push failed" exit 1 -fi +fi \ No newline at end of file diff --git a/training/distributed_training/pytorch/data_parallel/maskrcnn/pytorch_smdataparallel_maskrcnn_demo.ipynb b/training/distributed_training/pytorch/data_parallel/maskrcnn/pytorch_smdataparallel_maskrcnn_demo.ipynb index 1fc900724a..2e264b2008 100644 --- a/training/distributed_training/pytorch/data_parallel/maskrcnn/pytorch_smdataparallel_maskrcnn_demo.ipynb +++ b/training/distributed_training/pytorch/data_parallel/maskrcnn/pytorch_smdataparallel_maskrcnn_demo.ipynb @@ -136,8 +136,17 @@ "metadata": {}, "outputs": [], "source": [ - "%%time\n", - "! chmod +x build_and_push.sh; bash build_and_push.sh {account} {region} {image} {tag}" + "dlc_account_id = 763104351884 # By default, set the account ID used for most regions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {dlc_account_id}.dkr.ecr.{region}.amazonaws.com\n", + "! chmod +x build_and_push.sh; bash build_and_push.sh {dlc_account_id} {region} {image} {tag}" ] }, { From 52ed8e819130ca10b8bfacc353d06a85e532b37f Mon Sep 17 00:00:00 2001 From: sitecao Date: Fri, 27 May 2022 01:45:14 +0000 Subject: [PATCH 3/3] some clean ups --- .../pytorch/data_parallel/bert/build_and_push.sh | 8 ++------ .../bert/pytorch_smdataparallel_bert_demo.ipynb | 10 ---------- .../pytorch/data_parallel/maskrcnn/build_and_push.sh | 8 ++------ 3 files changed, 4 insertions(+), 22 deletions(-) diff --git a/training/distributed_training/pytorch/data_parallel/bert/build_and_push.sh b/training/distributed_training/pytorch/data_parallel/bert/build_and_push.sh index e8ccf3115f..22b34cbc0e 100755 --- a/training/distributed_training/pytorch/data_parallel/bert/build_and_push.sh +++ b/training/distributed_training/pytorch/data_parallel/bert/build_and_push.sh @@ -7,9 +7,6 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" -# Change the active directory to the one that contains the training script -cd ${DIR}/SMDDP-Examples/pytorch/image_classification - if [ "$#" -eq 4 ]; then dlc_account_id=$1 region=$2 @@ -40,12 +37,11 @@ fi aws ecr get-login-password --region ${region} \ | docker login --username AWS --password-stdin ${account}.dkr.ecr.${region}.amazonaws.com + docker build . -t ${image} -f ${DIR}/Dockerfile --build-arg dlc_account_id=${dlc_account_id} --build-arg region=${region} docker tag ${image} ${fullname} - -aws ecr get-login-password --region ${region} \ -| docker login --username AWS --password-stdin ${account}.dkr.ecr.${region}.amazonaws.com docker push ${fullname} + if [ $? -eq 0 ]; then echo "Amazon ECR URI: ${fullname}" else diff --git a/training/distributed_training/pytorch/data_parallel/bert/pytorch_smdataparallel_bert_demo.ipynb b/training/distributed_training/pytorch/data_parallel/bert/pytorch_smdataparallel_bert_demo.ipynb index 250fee671d..67ba4071e7 100644 --- a/training/distributed_training/pytorch/data_parallel/bert/pytorch_smdataparallel_bert_demo.ipynb +++ b/training/distributed_training/pytorch/data_parallel/bert/pytorch_smdataparallel_bert_demo.ipynb @@ -147,16 +147,6 @@ "! chmod +x build_and_push.sh; bash build_and_push.sh {dlc_account_id} {region} {image} {tag}" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "! chmod +x build_and_push.sh; bash build_and_push.sh {account} {region} {image} {tag}" - ] - }, { "cell_type": "markdown", "metadata": {}, diff --git a/training/distributed_training/pytorch/data_parallel/maskrcnn/build_and_push.sh b/training/distributed_training/pytorch/data_parallel/maskrcnn/build_and_push.sh index e8ccf3115f..22b34cbc0e 100644 --- a/training/distributed_training/pytorch/data_parallel/maskrcnn/build_and_push.sh +++ b/training/distributed_training/pytorch/data_parallel/maskrcnn/build_and_push.sh @@ -7,9 +7,6 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" -# Change the active directory to the one that contains the training script -cd ${DIR}/SMDDP-Examples/pytorch/image_classification - if [ "$#" -eq 4 ]; then dlc_account_id=$1 region=$2 @@ -40,12 +37,11 @@ fi aws ecr get-login-password --region ${region} \ | docker login --username AWS --password-stdin ${account}.dkr.ecr.${region}.amazonaws.com + docker build . -t ${image} -f ${DIR}/Dockerfile --build-arg dlc_account_id=${dlc_account_id} --build-arg region=${region} docker tag ${image} ${fullname} - -aws ecr get-login-password --region ${region} \ -| docker login --username AWS --password-stdin ${account}.dkr.ecr.${region}.amazonaws.com docker push ${fullname} + if [ $? -eq 0 ]; then echo "Amazon ECR URI: ${fullname}" else