[Examples] AWS Neuron Accelerator Example. (#4020)

* [Examples] AWS Neuron Accelerator Example. * add example * auto calculate tp size & use ubuntu 2204 * add mix acc example * fix * rename
skypilot-org · Sep 30, 2024 · e437e96 · e437e96
1 parent 8dd0031
commit e437e96
Show file tree

Hide file tree

Showing 4 changed files with 163 additions and 17 deletions.
diff --git a/examples/aws-neuron/inferentia.yaml b/examples/aws-neuron/inferentia.yaml
@@ -0,0 +1,62 @@
+resources:
+  accelerators: Inferentia:6
+  disk_size: 512
+  ports: 9000
+
+envs:
+  MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
+  HF_TOKEN: # Fill in your Hugging Face access token (used by huggingface_hub.login in setup).
+
+setup: |
+  # Install transformers-neuronx and its dependencies
+  sudo apt-get install -y python3.10-venv g++
+  python3.10 -m venv aws_neuron_venv_pytorch
+  source aws_neuron_venv_pytorch/bin/activate
+  pip install ipykernel
+  python3.10 -m ipykernel install --user --name aws_neuron_venv_pytorch --display-name "Python (torch-neuronx)"
+  pip install jupyter notebook
+  pip install environment_kernels
+  python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
+  python -m pip install wget
+  python -m pip install awscli
+  python -m pip install --upgrade neuronx-cc==2.* --pre torch-neuronx==2.1.* torchvision transformers-neuronx
+
+  # Install latest version of triton.
+  # Reference: https://github.com/vllm-project/vllm/issues/6987
+  pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple triton-nightly
+
+  # Install vLLM from source. Avoid using dir name 'vllm' due to import conflict.
+  # Reference: https://github.com/vllm-project/vllm/issues/1814#issuecomment-1837122930
+  git clone https://github.com/vllm-project/vllm.git vllm_repo
+  cd vllm_repo
+  pip install -U -r requirements-neuron.txt
+  VLLM_TARGET_DEVICE="neuron" pip install -e .
+
+  python -c "import huggingface_hub; huggingface_hub.login('${HF_TOKEN}')"
+
+  sudo apt update
+  sudo apt install -y numactl
+
+run: |
+  source aws_neuron_venv_pytorch/bin/activate
+  # Calculate the tensor parallel size. vLLM requires the tensor parallel size
+  # to be a factor of the number of attention heads, which is 32 for the model.
+  # Here we calculate the largest power of 2 that is less than or equal to the
+  # number of accelerators per node (given by SKYPILOT_NUM_GPUS_PER_NODE).
+  TENSOR_PARALLEL_SIZE=1
+  while [ $(($TENSOR_PARALLEL_SIZE * 2)) -le $SKYPILOT_NUM_GPUS_PER_NODE ]; do
+    TENSOR_PARALLEL_SIZE=$(($TENSOR_PARALLEL_SIZE * 2))
+  done
+  NEURON_RT_VISIBLE_CORES="0-$(($TENSOR_PARALLEL_SIZE - 1))"
+  OMP_NUM_THREADS=$SKYPILOT_NUM_GPUS_PER_NODE
+  MASTER_PORT=12355
+  LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/ubuntu/miniconda3/lib"
+  numactl --cpunodebind=0 --membind=0 \
+    python3 -m vllm.entrypoints.openai.api_server \
+      --device neuron \
+      --model $MODEL_NAME \
+      --tensor-parallel-size $TENSOR_PARALLEL_SIZE \
+      --max-num-seqs 16 \
+      --max-model-len 32 \
+      --block-size 32 \
+      --port 9000
diff --git a/examples/aws-neuron/mix-accelerator.yaml b/examples/aws-neuron/mix-accelerator.yaml
@@ -0,0 +1,74 @@
+resources:
+  accelerators: {A100:1, Inferentia:6}
+  disk_size: 512
+  ports: 9000
+
+envs:
+  MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
+  HF_TOKEN: # Fill in your Hugging Face access token (used by huggingface_hub.login in setup).
+
+setup: |
+  if command -v nvidia-smi; then
+    pip install vllm==0.4.2
+    pip install flash-attn==2.5.9.post1
+  else
+    # Install transformers-neuronx and its dependencies
+    sudo apt-get install -y python3.10-venv g++
+    python3.10 -m venv aws_neuron_venv_pytorch
+    source aws_neuron_venv_pytorch/bin/activate
+    pip install ipykernel
+    python3.10 -m ipykernel install --user --name aws_neuron_venv_pytorch --display-name "Python (torch-neuronx)"
+    pip install jupyter notebook
+    pip install environment_kernels
+    python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
+    python -m pip install wget
+    python -m pip install awscli
+    python -m pip install --upgrade neuronx-cc==2.* --pre torch-neuronx==2.1.* torchvision transformers-neuronx
+
+    # Install latest version of triton.
+    # Reference: https://github.com/vllm-project/vllm/issues/6987
+    pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple triton-nightly
+
+    # Install vLLM from source. Avoid using dir name 'vllm' due to import conflict.
+    # Reference: https://github.com/vllm-project/vllm/issues/1814#issuecomment-1837122930
+    git clone https://github.com/vllm-project/vllm.git vllm_repo
+    cd vllm_repo
+    pip install -U -r requirements-neuron.txt
+    VLLM_TARGET_DEVICE="neuron" pip install -e .
+
+    python -c "import huggingface_hub; huggingface_hub.login('${HF_TOKEN}')"
+
+    sudo apt update
+    sudo apt install -y numactl
+  fi
+
+run: |
+  if command -v nvidia-smi; then
+    TENSOR_PARALLEL_SIZE=$SKYPILOT_NUM_GPUS_PER_NODE
+    PREFIX=""
+    DEVICE="cuda"
+  else
+    source aws_neuron_venv_pytorch/bin/activate
+    # Calculate the tensor parallel size. vLLM requires the tensor parallel size
+    # to be a factor of the number of attention heads, which is 32 for the model.
+    # Here we calculate the largest power of 2 that is less than or equal to the
+    # number of accelerators per node (given by SKYPILOT_NUM_GPUS_PER_NODE).
+    TENSOR_PARALLEL_SIZE=1
+    while [ $(($TENSOR_PARALLEL_SIZE * 2)) -le $SKYPILOT_NUM_GPUS_PER_NODE ]; do
+      TENSOR_PARALLEL_SIZE=$(($TENSOR_PARALLEL_SIZE * 2))
+    done
+    NEURON_RT_VISIBLE_CORES="0-$(($TENSOR_PARALLEL_SIZE - 1))"
+    OMP_NUM_THREADS=$SKYPILOT_NUM_GPUS_PER_NODE
+    MASTER_PORT=12355
+    LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/ubuntu/miniconda3/lib"
+    PREFIX="numactl --cpunodebind=0 --membind=0"
+    DEVICE="neuron"
+  fi
+  $PREFIX python3 -m vllm.entrypoints.openai.api_server \
+    --device $DEVICE \
+    --model $MODEL_NAME \
+    --tensor-parallel-size $TENSOR_PARALLEL_SIZE \
+    --max-num-seqs 16 \
+    --max-model-len 32 \
+    --block-size 32 \
+    --port 9000
diff --git a/sky/clouds/aws.py b/sky/clouds/aws.py
@@ -225,6 +225,9 @@ def _get_default_ami(cls, region_name: str, instance_type: str) -> str:
             if acc_name == 'K80':
                 image_id = service_catalog.get_image_id_from_tag(
                     'skypilot:k80-ubuntu-2004', region_name, clouds='aws')
+            if acc_name in ['Trainium', 'Inferentia']:
+                image_id = service_catalog.get_image_id_from_tag(
+                    'skypilot:neuron-ubuntu-2204', region_name, clouds='aws')
         if image_id is not None:
             return image_id
         # Raise ResourcesUnavailableError to make sure the failover in

diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_aws.py b/sky/clouds/service_catalog/data_fetchers/fetch_aws.py
@@ -379,26 +379,33 @@ def get_all_regions_instance_types_df(regions: Set[str]) -> 'pd.DataFrame':
 #
 # Deep Learning AMI GPU PyTorch 1.10.0 (Ubuntu 18.04) 20211208
 #   Nvidia driver: 470.57.02, CUDA Version: 11.4
-_GPU_UBUNTU_DATE_PYTORCH = [
-    ('gpu', '20.04', '20231103', '2.1.0'),
-    ('gpu', '18.04', '20221114', '1.10.0'),
-    ('k80', '20.04', '20211208', '1.10.0'),
-    ('k80', '18.04', '20211208', '1.10.0'),
+#
+# Neuron (Inferentia / Trainium):
+# https://aws.amazon.com/releasenotes/aws-deep-learning-ami-base-neuron-ubuntu-20-04/  # pylint: disable=line-too-long
+# Deep Learning Base Neuron AMI (Ubuntu 22.04) 20240923
+# TODO(tian): find out the driver version.
+#   Neuron driver:
+_GPU_DESC_UBUNTU_DATE = [
+    ('gpu', 'AMI GPU PyTorch 2.1.0', '20.04', '20231103'),
+    ('gpu', 'AMI GPU PyTorch 1.10.0', '18.04', '20221114'),
+    ('k80', 'AMI GPU PyTorch 1.10.0', '20.04', '20211208'),
+    ('k80', 'AMI GPU PyTorch 1.10.0', '18.04', '20211208'),
+    ('neuron', 'Base Neuron AMI', '22.04', '20240923'),
 ]
 
 
-def _fetch_image_id(region: str, ubuntu_version: str, creation_date: str,
-                    pytorch_version: str) -> Optional[str]:
+def _fetch_image_id(region: str, description: str, ubuntu_version: str,
+                    creation_date: str) -> Optional[str]:
     try:
         image = subprocess.check_output(f"""\
             aws ec2 describe-images --region {region} --owners amazon \\
-                --filters 'Name=name,Values="Deep Learning AMI GPU PyTorch {pytorch_version} (Ubuntu {ubuntu_version}) {creation_date}"' \\
+                --filters 'Name=name,Values="Deep Learning {description} (Ubuntu {ubuntu_version}) {creation_date}"' \\
                     'Name=state,Values=available' --query 'Images[:1].ImageId' --output text
             """,
                                         shell=True)
     except subprocess.CalledProcessError as e:
-        print(f'Failed {region}, {ubuntu_version}, {creation_date}. '
-              'Trying next date.')
+        print(f'Failed {region}, {description}, {ubuntu_version}, '
+              f'{creation_date}. Trying next date.')
         print(f'{type(e)}: {e}')
         image_id = None
     else:
@@ -407,21 +414,21 @@ def _fetch_image_id(region: str, ubuntu_version: str, creation_date: str,
     return image_id
 
 
-def _get_image_row(
-        region: str, gpu: str, ubuntu_version: str, date: str,
-        pytorch_version) -> Tuple[str, str, str, str, Optional[str], str]:
-    print(f'Getting image for {region}, {ubuntu_version}, {gpu}')
-    image_id = _fetch_image_id(region, ubuntu_version, date, pytorch_version)
+def _get_image_row(region: str, gpu: str, description: str, ubuntu_version: str,
+                   date: str) -> Tuple[str, str, str, str, Optional[str], str]:
+    print(f'Getting image for {region}, {description}, {ubuntu_version}, {gpu}')
+    image_id = _fetch_image_id(region, description, ubuntu_version, date)
     if image_id is None:
         # not found
-        print(f'Failed to find image for {region}, {ubuntu_version}, {gpu}')
+        print(f'Failed to find image for {region}, {description}, '
+              f'{ubuntu_version}, {gpu}')
     tag = f'skypilot:{gpu}-ubuntu-{ubuntu_version.replace(".", "")}'
     return tag, region, 'ubuntu', ubuntu_version, image_id, date
 
 
 def get_all_regions_images_df(regions: Set[str]) -> 'pd.DataFrame':
     image_metas = [
-        (r, *i) for r, i in itertools.product(regions, _GPU_UBUNTU_DATE_PYTORCH)
+        (r, *i) for r, i in itertools.product(regions, _GPU_DESC_UBUNTU_DATE)
     ]
     with mp_pool.Pool() as pool:
         results = pool.starmap(_get_image_row, image_metas)