[Examples] AWS Neuron Accelerator Example. (#4020)
* [Examples] AWS Neuron Accelerator Example.

* add example

* auto calculate tp size & use ubuntu 2204

* add mix acc example

* fix

* rename
cblmemo authored Sep 30, 2024
1 parent 8dd0031 commit e437e96
Showing 4 changed files with 163 additions and 17 deletions.
62 changes: 62 additions & 0 deletions examples/aws-neuron/inferentia.yaml
@@ -0,0 +1,62 @@
resources:
  accelerators: Inferentia:6
  disk_size: 512
  ports: 9000

envs:
  MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
  HF_TOKEN: # fill

setup: |
  # Install transformers-neuronx and its dependencies.
  sudo apt-get install -y python3.10-venv g++
  python3.10 -m venv aws_neuron_venv_pytorch
  source aws_neuron_venv_pytorch/bin/activate
  pip install ipykernel
  python3.10 -m ipykernel install --user --name aws_neuron_venv_pytorch --display-name "Python (torch-neuronx)"
  pip install jupyter notebook
  pip install environment_kernels
  python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
  python -m pip install wget
  python -m pip install awscli
  python -m pip install --upgrade neuronx-cc==2.* --pre torch-neuronx==2.1.* torchvision transformers-neuronx
  # Install the latest version of triton.
  # Reference: https://github.com/vllm-project/vllm/issues/6987
  pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple triton-nightly
  # Install vLLM from source. Avoid using the dir name 'vllm' due to an import conflict.
  # Reference: https://github.com/vllm-project/vllm/issues/1814#issuecomment-1837122930
  git clone https://github.com/vllm-project/vllm.git vllm_repo
  cd vllm_repo
  pip install -U -r requirements-neuron.txt
  VLLM_TARGET_DEVICE="neuron" pip install -e .
  python -c "import huggingface_hub; huggingface_hub.login('${HF_TOKEN}')"
  sudo apt update
  sudo apt install -y numactl

run: |
  source aws_neuron_venv_pytorch/bin/activate
  # Calculate the tensor parallel size. vLLM requires the tensor parallel size
  # to be a factor of the number of attention heads, which is 32 for the model.
  # Here we calculate the largest power of 2 that is less than or equal to the
  # number of accelerators per node.
  TENSOR_PARALLEL_SIZE=1
  while [ $(($TENSOR_PARALLEL_SIZE * 2)) -le $SKYPILOT_NUM_GPUS_PER_NODE ]; do
    TENSOR_PARALLEL_SIZE=$(($TENSOR_PARALLEL_SIZE * 2))
  done
  # Export so the vLLM server process inherits these settings.
  export NEURON_RT_VISIBLE_CORES="0-$(($TENSOR_PARALLEL_SIZE - 1))"
  export OMP_NUM_THREADS=$SKYPILOT_NUM_GPUS_PER_NODE
  export MASTER_PORT=12355
  export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/ubuntu/miniconda3/lib"
  numactl --cpunodebind=0 --membind=0 \
    python3 -m vllm.entrypoints.openai.api_server \
    --device neuron \
    --model $MODEL_NAME \
    --tensor-parallel-size $TENSOR_PARALLEL_SIZE \
    --max-num-seqs 16 \
    --max-model-len 32 \
    --block-size 32 \
    --port 9000
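
To try this example, launch it with SkyPilot and query the OpenAI-compatible endpoint once the server is up. A minimal usage sketch, assuming the file is saved as examples/aws-neuron/inferentia.yaml, `sky` is installed with AWS credentials configured, and the cluster name `neuron-demo` is arbitrary. With `Inferentia:6`, the loop above yields TENSOR_PARALLEL_SIZE=4 (the largest power of 2 not exceeding 6, which also divides the model's 32 attention heads), and `--max-model-len 32` caps prompt plus completion at 32 tokens, so keep requests short:

    sky launch -c neuron-demo examples/aws-neuron/inferentia.yaml --env HF_TOKEN=<your-token>

    # Query the vLLM OpenAI-compatible server on the opened port.
    IP=$(sky status --ip neuron-demo)
    curl http://$IP:9000/v1/completions \
        -H 'Content-Type: application/json' \
        -d '{"model": "meta-llama/Meta-Llama-3-8B-Instruct", "prompt": "Hello", "max_tokens": 16}'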
74 changes: 74 additions & 0 deletions examples/aws-neuron/mix-accelerator.yaml
@@ -0,0 +1,74 @@
resources:
  accelerators: {A100:1, Inferentia:6}
  disk_size: 512
  ports: 9000

envs:
  MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
  HF_TOKEN: # fill

setup: |
  if command -v nvidia-smi; then
    pip install vllm==0.4.2
    pip install flash-attn==2.5.9.post1
  else
    # Install transformers-neuronx and its dependencies.
    sudo apt-get install -y python3.10-venv g++
    python3.10 -m venv aws_neuron_venv_pytorch
    source aws_neuron_venv_pytorch/bin/activate
    pip install ipykernel
    python3.10 -m ipykernel install --user --name aws_neuron_venv_pytorch --display-name "Python (torch-neuronx)"
    pip install jupyter notebook
    pip install environment_kernels
    python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
    python -m pip install wget
    python -m pip install awscli
    python -m pip install --upgrade neuronx-cc==2.* --pre torch-neuronx==2.1.* torchvision transformers-neuronx
    # Install the latest version of triton.
    # Reference: https://github.com/vllm-project/vllm/issues/6987
    pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple triton-nightly
    # Install vLLM from source. Avoid using the dir name 'vllm' due to an import conflict.
    # Reference: https://github.com/vllm-project/vllm/issues/1814#issuecomment-1837122930
    git clone https://github.com/vllm-project/vllm.git vllm_repo
    cd vllm_repo
    pip install -U -r requirements-neuron.txt
    VLLM_TARGET_DEVICE="neuron" pip install -e .
    python -c "import huggingface_hub; huggingface_hub.login('${HF_TOKEN}')"
    sudo apt update
    sudo apt install -y numactl
  fi

run: |
  if command -v nvidia-smi; then
    TENSOR_PARALLEL_SIZE=$SKYPILOT_NUM_GPUS_PER_NODE
    PREFIX=""
    DEVICE="cuda"
  else
    source aws_neuron_venv_pytorch/bin/activate
    # Calculate the tensor parallel size. vLLM requires the tensor parallel size
    # to be a factor of the number of attention heads, which is 32 for the model.
    # Here we calculate the largest power of 2 that is less than or equal to the
    # number of accelerators per node.
    TENSOR_PARALLEL_SIZE=1
    while [ $(($TENSOR_PARALLEL_SIZE * 2)) -le $SKYPILOT_NUM_GPUS_PER_NODE ]; do
      TENSOR_PARALLEL_SIZE=$(($TENSOR_PARALLEL_SIZE * 2))
    done
    # Export so the vLLM server process inherits these settings.
    export NEURON_RT_VISIBLE_CORES="0-$(($TENSOR_PARALLEL_SIZE - 1))"
    export OMP_NUM_THREADS=$SKYPILOT_NUM_GPUS_PER_NODE
    export MASTER_PORT=12355
    export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/ubuntu/miniconda3/lib"
    PREFIX="numactl --cpunodebind=0 --membind=0"
    DEVICE="neuron"
  fi
  $PREFIX python3 -m vllm.entrypoints.openai.api_server \
    --device $DEVICE \
    --model $MODEL_NAME \
    --tensor-parallel-size $TENSOR_PARALLEL_SIZE \
    --max-num-seqs 16 \
    --max-model-len 32 \
    --block-size 32 \
    --port 9000
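
The accelerator set `{A100:1, Inferentia:6}` lets SkyPilot provision whichever candidate it can get, taking availability and cost into account, and the `command -v nvidia-smi` checks then select the matching setup and vLLM device at runtime. Launching works the same as before; a sketch with an arbitrary cluster name:

    sky launch -c mix-demo examples/aws-neuron/mix-accelerator.yaml --env HF_TOKEN=<your-token>

Either way the server listens on port 9000, so the same curl request as above works regardless of which accelerator was provisioned.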
3 changes: 3 additions & 0 deletions sky/clouds/aws.py
@@ -225,6 +225,9 @@ def _get_default_ami(cls, region_name: str, instance_type: str) -> str:
             if acc_name == 'K80':
                 image_id = service_catalog.get_image_id_from_tag(
                     'skypilot:k80-ubuntu-2004', region_name, clouds='aws')
+            if acc_name in ['Trainium', 'Inferentia']:
+                image_id = service_catalog.get_image_id_from_tag(
+                    'skypilot:neuron-ubuntu-2204', region_name, clouds='aws')
         if image_id is not None:
             return image_id
         # Raise ResourcesUnavailableError to make sure the failover in
41 changes: 24 additions & 17 deletions sky/clouds/service_catalog/data_fetchers/fetch_aws.py
@@ -379,26 +379,33 @@ def get_all_regions_instance_types_df(regions: Set[str]) -> 'pd.DataFrame':
 #
 # Deep Learning AMI GPU PyTorch 1.10.0 (Ubuntu 18.04) 20211208
 # Nvidia driver: 470.57.02, CUDA Version: 11.4
-_GPU_UBUNTU_DATE_PYTORCH = [
-    ('gpu', '20.04', '20231103', '2.1.0'),
-    ('gpu', '18.04', '20221114', '1.10.0'),
-    ('k80', '20.04', '20211208', '1.10.0'),
-    ('k80', '18.04', '20211208', '1.10.0'),
+#
+# Neuron (Inferentia / Trainium):
+# https://aws.amazon.com/releasenotes/aws-deep-learning-ami-base-neuron-ubuntu-20-04/  # pylint: disable=line-too-long
+# Deep Learning Base Neuron AMI (Ubuntu 20.04) 20240923
+# TODO(tian): find out the driver version.
+# Neuron driver:
+_GPU_DESC_UBUNTU_DATE = [
+    ('gpu', 'AMI GPU PyTorch 2.1.0', '20.04', '20231103'),
+    ('gpu', 'AMI GPU PyTorch 1.10.0', '18.04', '20221114'),
+    ('k80', 'AMI GPU PyTorch 1.10.0', '20.04', '20211208'),
+    ('k80', 'AMI GPU PyTorch 1.10.0', '18.04', '20211208'),
+    ('neuron', 'Base Neuron AMI', '22.04', '20240923'),
 ]


-def _fetch_image_id(region: str, ubuntu_version: str, creation_date: str,
-                    pytorch_version: str) -> Optional[str]:
+def _fetch_image_id(region: str, description: str, ubuntu_version: str,
+                    creation_date: str) -> Optional[str]:
     try:
         image = subprocess.check_output(f"""\
 aws ec2 describe-images --region {region} --owners amazon \\
-    --filters 'Name=name,Values="Deep Learning AMI GPU PyTorch {pytorch_version} (Ubuntu {ubuntu_version}) {creation_date}"' \\
+    --filters 'Name=name,Values="Deep Learning {description} (Ubuntu {ubuntu_version}) {creation_date}"' \\
     'Name=state,Values=available' --query 'Images[:1].ImageId' --output text
 """,
                                         shell=True)
     except subprocess.CalledProcessError as e:
-        print(f'Failed {region}, {ubuntu_version}, {creation_date}. '
-              'Trying next date.')
+        print(f'Failed {region}, {description}, {ubuntu_version}, '
+              f'{creation_date}. Trying next date.')
         print(f'{type(e)}: {e}')
         image_id = None
     else:
@@ -407,21 +414,21 @@ def _fetch_image_id(region: str, ubuntu_version: str, creation_date: str,
     return image_id


-def _get_image_row(
-        region: str, gpu: str, ubuntu_version: str, date: str,
-        pytorch_version) -> Tuple[str, str, str, str, Optional[str], str]:
-    print(f'Getting image for {region}, {ubuntu_version}, {gpu}')
-    image_id = _fetch_image_id(region, ubuntu_version, date, pytorch_version)
+def _get_image_row(region: str, gpu: str, description: str, ubuntu_version: str,
+                   date: str) -> Tuple[str, str, str, str, Optional[str], str]:
+    print(f'Getting image for {region}, {description}, {ubuntu_version}, {gpu}')
+    image_id = _fetch_image_id(region, description, ubuntu_version, date)
     if image_id is None:
         # not found
-        print(f'Failed to find image for {region}, {ubuntu_version}, {gpu}')
+        print(f'Failed to find image for {region}, {description}, '
+              f'{ubuntu_version}, {gpu}')
     tag = f'skypilot:{gpu}-ubuntu-{ubuntu_version.replace(".", "")}'
     return tag, region, 'ubuntu', ubuntu_version, image_id, date


 def get_all_regions_images_df(regions: Set[str]) -> 'pd.DataFrame':
     image_metas = [
-        (r, *i) for r, i in itertools.product(regions, _GPU_UBUNTU_DATE_PYTORCH)
+        (r, *i) for r, i in itertools.product(regions, _GPU_DESC_UBUNTU_DATE)
     ]
     with mp_pool.Pool() as pool:
         results = pool.starmap(_get_image_row, image_metas)
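
For the new ('neuron', 'Base Neuron AMI', '22.04', '20240923') row, the f-string in _fetch_image_id expands to a query like the following (us-east-1 is an arbitrary illustrative region):

    aws ec2 describe-images --region us-east-1 --owners amazon \
        --filters 'Name=name,Values="Deep Learning Base Neuron AMI (Ubuntu 22.04) 20240923"' \
        'Name=state,Values=available' --query 'Images[:1].ImageId' --output text

On success this prints a single AMI ID, which _get_image_row records under the tag skypilot:neuron-ubuntu-2204, the same tag the new branch in sky/clouds/aws.py looks up.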
