Skip to content

Commit

Permalink
Fix smddp notebook account number (#3431)
Browse files Browse the repository at this point in the history
* add updates to pass account id

* notebook changes

* some cleanups

Co-authored-by: atqy <[email protected]>
  • Loading branch information
sitecao and atqy authored Jun 7, 2022
1 parent 1927b11 commit 4e181db
Show file tree
Hide file tree
Showing 7 changed files with 61 additions and 40 deletions.
Submodule DeepLearningExamples deleted from 4fdf2e
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
ARG dlc_account_id
ARG region

FROM 763104351884.dkr.ecr.${region}.amazonaws.com/pytorch-training:1.10.2-gpu-py38-cu113-ubuntu20.04-sagemaker
FROM ${dlc_account_id}.dkr.ecr.${region}.amazonaws.com/pytorch-training:1.10.2-gpu-py38-cu113-ubuntu20.04-sagemaker

ARG WORK_DIR="apex_build"
RUN pip --no-cache-dir --no-cache install h5py boto3 'git+https://github.com/NVIDIA/dllogger' tqdm requests; \
Expand Down
27 changes: 16 additions & 11 deletions training/distributed_training/pytorch/data_parallel/bert/build_and_push.sh
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -7,39 +7,44 @@

DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

if [ "$#" -eq 3 ]; then
region=$1
image=$2
tag=$3
if [ "$#" -eq 4 ]; then
dlc_account_id=$1
region=$2
image=$3
tag=$4
else
echo "usage: $0 <aws-region> $1 <image-repo> $2 <image-tag>"
echo "usage: $0 <dlc-account-id> $1 <aws-region> $2 <image-repo> $3 <image-tag>"
exit 1
fi

# Get the account number associated with the current IAM credentials
account=$(aws sts get-caller-identity --query Account --output text)

if [ $? -ne 0 ]
then
exit 255
fi


fullname="${account}.dkr.ecr.${region}.amazonaws.com/${image}:${tag}"

# If the repository doesn't exist in ECR, create it.
aws ecr describe-repositories --region ${region} --repository-names "${image}" > /dev/null 2>&1
if [ $? -ne 0 ]; then
echo "creating ECR repository : ${fullname} "
aws ecr create-repository --region ${region} --repository-name "${image}" > /dev/null
fi
# Build the docker image locally with the image name and then push it to ECR
# with the full name.
# Log in to ECR for the current account.

aws ecr get-login-password --region ${region} | docker login --username AWS --password-stdin ${account}.dkr.ecr.${region}.amazonaws.com
aws ecr get-login-password --region ${region} \
| docker login --username AWS --password-stdin ${account}.dkr.ecr.${region}.amazonaws.com

docker build ${DIR}/ -t ${image} -f ${DIR}/Dockerfile --build-arg region=${region}
docker build . -t ${image} -f ${DIR}/Dockerfile --build-arg dlc_account_id=${dlc_account_id} --build-arg region=${region}
docker tag ${image} ${fullname}
docker push ${fullname}

if [ $? -eq 0 ]; then
echo "Amazon ECR URI: ${fullname}"
else
echo "Error: Image build and push failed"
exit 1
fi
fi
Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,17 @@
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"! chmod +x build_and_push.sh; bash build_and_push.sh {region} {image} {tag}"
"dlc_account_id = 763104351884 # By default, set the account ID used for most regions"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"! aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {dlc_account_id}.dkr.ecr.{region}.amazonaws.com\n",
"! chmod +x build_and_push.sh; bash build_and_push.sh {dlc_account_id} {region} {image} {tag}"
]
},
{
Expand Down Expand Up @@ -316,9 +325,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "conda_pytorch_p36",
"display_name": "conda_amazonei_pytorch_latest_p36",
"language": "python",
"name": "conda_pytorch_p36"
"name": "conda_amazonei_pytorch_latest_p36"
},
"language_info": {
"codemirror_mode": {
Expand All @@ -330,7 +339,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.10"
"version": "3.6.13"
}
},
"nbformat": 4,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
ARG dlc_account_id
ARG region

FROM 763104351884.dkr.ecr.${region}.amazonaws.com/pytorch-training:1.10.2-gpu-py38-cu113-ubuntu20.04-sagemaker
FROM ${dlc_account_id}.dkr.ecr.${region}.amazonaws.com/pytorch-training:1.10.2-gpu-py38-cu113-ubuntu20.04-sagemaker

RUN pip install ninja yacs cython matplotlib tqdm opencv-python pybind11==2.5.0 'git+https://github.com/NVIDIA/dllogger'

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,13 @@

DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

if [ "$#" -eq 3 ]; then
region=$1
image=$2
tag=$3
if [ "$#" -eq 4 ]; then
dlc_account_id=$1
region=$2
image=$3
tag=$4
else
echo "usage: $0 <aws-region> $1 <image-repo> $2 <image-tag>"
echo "usage: $0 <dlc-account-id> $1 <aws-region> $2 <image-repo> $3 <image-tag>"
exit 1
fi

Expand All @@ -34,16 +35,16 @@ if [ $? -ne 0 ]; then
aws ecr create-repository --region ${region} --repository-name "${image}" > /dev/null
fi

$(aws ecr get-login --no-include-email --region ${region} --registry-ids 763104351884)
docker build ${DIR}/ -t ${image} -f ${DIR}/Dockerfile --build-arg region=${region}
docker tag ${image} ${fullname}
aws ecr get-login-password --region ${region} \
| docker login --username AWS --password-stdin ${account}.dkr.ecr.${region}.amazonaws.com

# Get the login command from ECR and execute it directly
$(aws ecr get-login --region ${region} --no-include-email)
docker build . -t ${image} -f ${DIR}/Dockerfile --build-arg dlc_account_id=${dlc_account_id} --build-arg region=${region}
docker tag ${image} ${fullname}
docker push ${fullname}

if [ $? -eq 0 ]; then
echo "Amazon ECR URI: ${fullname}"
else
echo "Error: Image build and push failed"
exit 1
fi
fi
Original file line number Diff line number Diff line change
Expand Up @@ -136,8 +136,17 @@
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"! chmod +x build_and_push.sh; bash build_and_push.sh {region} {image} {tag}"
"dlc_account_id = 763104351884 # By default, set the account ID used for most regions"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"! aws ecr get-login-password --region {region} | docker login --username AWS --password-stdin {dlc_account_id}.dkr.ecr.{region}.amazonaws.com\n",
"! chmod +x build_and_push.sh; bash build_and_push.sh {dlc_account_id} {region} {image} {tag}"
]
},
{
Expand Down Expand Up @@ -299,13 +308,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.7.4 64-bit ('base': conda)",
"metadata": {
"interpreter": {
"hash": "dca0ade3e726a953b501b15e8e990130d2b7799f14cfd9f4271676035ebe5511"
}
},
"name": "python3"
"display_name": "conda_amazonei_pytorch_latest_p36",
"language": "python",
"name": "conda_amazonei_pytorch_latest_p36"
},
"language_info": {
"codemirror_mode": {
Expand All @@ -317,7 +322,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
"version": "3.6.13"
}
},
"nbformat": 4,
Expand Down

0 comments on commit 4e181db

Please sign in to comment.