diff --git a/examples/aws-neuron/inferentia.yaml b/examples/aws-neuron/inferentia.yaml
new file mode 100644
index 00000000000..0d0773b3d09
--- /dev/null
+++ b/examples/aws-neuron/inferentia.yaml
@@ -0,0 +1,62 @@
+resources:
+  accelerators: Inferentia:6
+  disk_size: 512
+  ports: 9000
+
+envs:
+  MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
+  HF_TOKEN: # fill in your Hugging Face token
+
+setup: |
+  # Install transformers-neuronx and its dependencies.
+  sudo apt-get install -y python3.10-venv g++
+  python3.10 -m venv aws_neuron_venv_pytorch
+  source aws_neuron_venv_pytorch/bin/activate
+  pip install ipykernel
+  python3.10 -m ipykernel install --user --name aws_neuron_venv_pytorch --display-name "Python (torch-neuronx)"
+  pip install jupyter notebook
+  pip install environment_kernels
+  python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
+  python -m pip install wget
+  python -m pip install awscli
+  python -m pip install --upgrade neuronx-cc==2.* --pre torch-neuronx==2.1.* torchvision transformers-neuronx
+
+  # Install the latest version of Triton.
+  # Reference: https://github.com/vllm-project/vllm/issues/6987
+  pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple triton-nightly
+
+  # Install vLLM from source. Avoid using the dir name 'vllm' due to an import conflict.
+  # Reference: https://github.com/vllm-project/vllm/issues/1814#issuecomment-1837122930
+  git clone https://github.com/vllm-project/vllm.git vllm_repo
+  cd vllm_repo
+  pip install -U -r requirements-neuron.txt
+  VLLM_TARGET_DEVICE="neuron" pip install -e .
+
+  python -c "import huggingface_hub; huggingface_hub.login('${HF_TOKEN}')"
+
+  sudo apt update
+  sudo apt install -y numactl
+
+run: |
+  source aws_neuron_venv_pytorch/bin/activate
+  # Calculate the tensor parallel size. vLLM requires the tensor parallel size
+  # to be a factor of the number of attention heads, which is 32 for this model.
+  # Here we compute the largest power of 2 that is less than or equal to the
+  # number of GPUs per node.
+  TENSOR_PARALLEL_SIZE=1
+  while [ $(($TENSOR_PARALLEL_SIZE * 2)) -le $SKYPILOT_NUM_GPUS_PER_NODE ]; do
+    TENSOR_PARALLEL_SIZE=$(($TENSOR_PARALLEL_SIZE * 2))
+  done
+  # Export so the vLLM server process inherits these settings.
+  export NEURON_RT_VISIBLE_CORES="0-$(($TENSOR_PARALLEL_SIZE - 1))"
+  export OMP_NUM_THREADS=$SKYPILOT_NUM_GPUS_PER_NODE
+  export MASTER_PORT=12355
+  export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/ubuntu/miniconda3/lib"
+  numactl --cpunodebind=0 --membind=0 \
+    python3 -m vllm.entrypoints.openai.api_server \
+    --device neuron \
+    --model $MODEL_NAME \
+    --tensor-parallel-size $TENSOR_PARALLEL_SIZE \
+    --max-num-seqs 16 \
+    --max-model-len 32 \
+    --block-size 32 \
+    --port 9000
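
The while-loop in the run section above does integer arithmetic that is easy to misread in bash. As a cross-check, here is a minimal Python sketch (not part of the diff) of the same computation, where `num_devices` stands in for `SKYPILOT_NUM_GPUS_PER_NODE`:

```python
def tensor_parallel_size(num_devices: int) -> int:
    """Largest power of 2 that is <= num_devices (assumes num_devices >= 1)."""
    size = 1
    while size * 2 <= num_devices:
        size *= 2
    return size

# With `accelerators: Inferentia:6` as in the resources above, the loop
# settles on 4, which divides the model's 32 attention heads as vLLM requires.
assert tensor_parallel_size(6) == 4
```
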
diff --git a/examples/aws-neuron/mix-accelerator.yaml b/examples/aws-neuron/mix-accelerator.yaml
new file mode 100644
index 00000000000..fc452a06804
--- /dev/null
+++ b/examples/aws-neuron/mix-accelerator.yaml
@@ -0,0 +1,74 @@
+resources:
+  accelerators: {A100:1, Inferentia:6}
+  disk_size: 512
+  ports: 9000
+
+envs:
+  MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
+  HF_TOKEN: # fill in your Hugging Face token
+
+setup: |
+  if command -v nvidia-smi; then
+    pip install vllm==0.4.2
+    pip install flash-attn==2.5.9.post1
+  else
+    # Install transformers-neuronx and its dependencies.
+    sudo apt-get install -y python3.10-venv g++
+    python3.10 -m venv aws_neuron_venv_pytorch
+    source aws_neuron_venv_pytorch/bin/activate
+    pip install ipykernel
+    python3.10 -m ipykernel install --user --name aws_neuron_venv_pytorch --display-name "Python (torch-neuronx)"
+    pip install jupyter notebook
+    pip install environment_kernels
+    python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
+    python -m pip install wget
+    python -m pip install awscli
+    python -m pip install --upgrade neuronx-cc==2.* --pre torch-neuronx==2.1.* torchvision transformers-neuronx
+
+    # Install the latest version of Triton.
+    # Reference: https://github.com/vllm-project/vllm/issues/6987
+    pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple triton-nightly
+
+    # Install vLLM from source. Avoid using the dir name 'vllm' due to an import conflict.
+    # Reference: https://github.com/vllm-project/vllm/issues/1814#issuecomment-1837122930
+    git clone https://github.com/vllm-project/vllm.git vllm_repo
+    cd vllm_repo
+    pip install -U -r requirements-neuron.txt
+    VLLM_TARGET_DEVICE="neuron" pip install -e .
+
+    python -c "import huggingface_hub; huggingface_hub.login('${HF_TOKEN}')"
+
+    sudo apt update
+    sudo apt install -y numactl
+  fi
+
+run: |
+  if command -v nvidia-smi; then
+    TENSOR_PARALLEL_SIZE=$SKYPILOT_NUM_GPUS_PER_NODE
+    PREFIX=""
+    DEVICE="cuda"
+  else
+    source aws_neuron_venv_pytorch/bin/activate
+    # Calculate the tensor parallel size. vLLM requires the tensor parallel size
+    # to be a factor of the number of attention heads, which is 32 for this model.
+    # Here we compute the largest power of 2 that is less than or equal to the
+    # number of GPUs per node.
+    TENSOR_PARALLEL_SIZE=1
+    while [ $(($TENSOR_PARALLEL_SIZE * 2)) -le $SKYPILOT_NUM_GPUS_PER_NODE ]; do
+      TENSOR_PARALLEL_SIZE=$(($TENSOR_PARALLEL_SIZE * 2))
+    done
+    # Export so the vLLM server process inherits these settings.
+    export NEURON_RT_VISIBLE_CORES="0-$(($TENSOR_PARALLEL_SIZE - 1))"
+    export OMP_NUM_THREADS=$SKYPILOT_NUM_GPUS_PER_NODE
+    export MASTER_PORT=12355
+    export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/ubuntu/miniconda3/lib"
+    PREFIX="numactl --cpunodebind=0 --membind=0"
+    DEVICE="neuron"
+  fi
+  $PREFIX python3 -m vllm.entrypoints.openai.api_server \
+    --device $DEVICE \
+    --model $MODEL_NAME \
+    --tensor-parallel-size $TENSOR_PARALLEL_SIZE \
+    --max-num-seqs 16 \
+    --max-model-len 32 \
+    --block-size 32 \
+    --port 9000
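
Whichever branch runs, both variants serve the OpenAI-compatible API on port 9000, so a client does not need to know which accelerator SkyPilot provisioned. A minimal client sketch, assuming the cluster is up and `HEAD_IP` is a placeholder for the head node's IP:

```python
import requests

HEAD_IP = '<head-ip>'  # placeholder: fill in your cluster's head node IP
resp = requests.post(
    f'http://{HEAD_IP}:9000/v1/chat/completions',
    json={
        'model': 'meta-llama/Meta-Llama-3-8B-Instruct',
        'messages': [{'role': 'user', 'content': 'Hi'}],
        # Keep the request tiny: these examples serve with --max-model-len 32.
        'max_tokens': 8,
    },
    timeout=60,
)
print(resp.json()['choices'][0]['message']['content'])
```
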
diff --git a/sky/clouds/aws.py b/sky/clouds/aws.py
index 4ca57d75420..be1ecce0350 100644
--- a/sky/clouds/aws.py
+++ b/sky/clouds/aws.py
@@ -225,6 +225,9 @@ def _get_default_ami(cls, region_name: str, instance_type: str) -> str:
         if acc_name == 'K80':
             image_id = service_catalog.get_image_id_from_tag(
                 'skypilot:k80-ubuntu-2004', region_name, clouds='aws')
+        if acc_name in ['Trainium', 'Inferentia']:
+            image_id = service_catalog.get_image_id_from_tag(
+                'skypilot:neuron-ubuntu-2204', region_name, clouds='aws')
         if image_id is not None:
             return image_id
         # Raise ResourcesUnavailableError to make sure the failover in
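
The mapping added above is easiest to see in isolation. Below is a simplified sketch, not the real `_get_default_ami` (which also consults the service catalog per region and handles failover); the generic fallback tag is an assumption for illustration only:

```python
def ami_tag_for_accelerator(acc_name: str) -> str:
    """Map an accelerator name to a SkyPilot AMI catalog tag (sketch)."""
    if acc_name == 'K80':
        return 'skypilot:k80-ubuntu-2004'
    if acc_name in ['Trainium', 'Inferentia']:
        # New in this diff: Neuron devices get the Ubuntu 22.04 Neuron AMI.
        return 'skypilot:neuron-ubuntu-2204'
    return 'skypilot:gpu-ubuntu-2004'  # assumed generic default for other GPUs
```
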
diff --git a/sky/clouds/service_catalog/data_fetchers/fetch_aws.py b/sky/clouds/service_catalog/data_fetchers/fetch_aws.py
index 1e1d6e98c03..e0e5ffa21a1 100644
--- a/sky/clouds/service_catalog/data_fetchers/fetch_aws.py
+++ b/sky/clouds/service_catalog/data_fetchers/fetch_aws.py
@@ -379,26 +379,33 @@ def get_all_regions_instance_types_df(regions: Set[str]) -> 'pd.DataFrame':
 #
 # Deep Learning AMI GPU PyTorch 1.10.0 (Ubuntu 18.04) 20211208
 # Nvidia driver: 470.57.02, CUDA Version: 11.4
-_GPU_UBUNTU_DATE_PYTORCH = [
-    ('gpu', '20.04', '20231103', '2.1.0'),
-    ('gpu', '18.04', '20221114', '1.10.0'),
-    ('k80', '20.04', '20211208', '1.10.0'),
-    ('k80', '18.04', '20211208', '1.10.0'),
+#
+# Neuron (Inferentia / Trainium):
+# https://aws.amazon.com/releasenotes/aws-deep-learning-ami-base-neuron-ubuntu-20-04/ # pylint: disable=line-too-long
+# Deep Learning Base Neuron AMI (Ubuntu 22.04) 20240923
+# TODO(tian): find out the driver version.
+# Neuron driver:
+_GPU_DESC_UBUNTU_DATE = [
+    ('gpu', 'AMI GPU PyTorch 2.1.0', '20.04', '20231103'),
+    ('gpu', 'AMI GPU PyTorch 1.10.0', '18.04', '20221114'),
+    ('k80', 'AMI GPU PyTorch 1.10.0', '20.04', '20211208'),
+    ('k80', 'AMI GPU PyTorch 1.10.0', '18.04', '20211208'),
+    ('neuron', 'Base Neuron AMI', '22.04', '20240923'),
 ]
 
 
-def _fetch_image_id(region: str, ubuntu_version: str, creation_date: str,
-                    pytorch_version: str) -> Optional[str]:
+def _fetch_image_id(region: str, description: str, ubuntu_version: str,
+                    creation_date: str) -> Optional[str]:
     try:
         image = subprocess.check_output(f"""\
 aws ec2 describe-images --region {region} --owners amazon \\
- --filters 'Name=name,Values="Deep Learning AMI GPU PyTorch {pytorch_version} (Ubuntu {ubuntu_version}) {creation_date}"' \\
+ --filters 'Name=name,Values="Deep Learning {description} (Ubuntu {ubuntu_version}) {creation_date}"' \\
 'Name=state,Values=available' --query 'Images[:1].ImageId' --output text
 """,
                                         shell=True)
     except subprocess.CalledProcessError as e:
-        print(f'Failed {region}, {ubuntu_version}, {creation_date}. '
-              'Trying next date.')
+        print(f'Failed {region}, {description}, {ubuntu_version}, '
+              f'{creation_date}. Trying next date.')
         print(f'{type(e)}: {e}')
         image_id = None
     else:
@@ -407,21 +414,21 @@ def _fetch_image_id(region: str, ubuntu_version: str, creation_date: str,
     return image_id
 
 
-def _get_image_row(
-        region: str, gpu: str, ubuntu_version: str, date: str,
-        pytorch_version) -> Tuple[str, str, str, str, Optional[str], str]:
-    print(f'Getting image for {region}, {ubuntu_version}, {gpu}')
-    image_id = _fetch_image_id(region, ubuntu_version, date, pytorch_version)
+def _get_image_row(
+        region: str, gpu: str, description: str, ubuntu_version: str,
+        date: str) -> Tuple[str, str, str, str, Optional[str], str]:
+    print(f'Getting image for {region}, {description}, {ubuntu_version}, '
+          f'{gpu}')
+    image_id = _fetch_image_id(region, description, ubuntu_version, date)
     if image_id is None:
         # not found
-        print(f'Failed to find image for {region}, {ubuntu_version}, {gpu}')
+        print(f'Failed to find image for {region}, {description}, '
+              f'{ubuntu_version}, {gpu}')
     tag = f'skypilot:{gpu}-ubuntu-{ubuntu_version.replace(".", "")}'
     return tag, region, 'ubuntu', ubuntu_version, image_id, date
 
 
 def get_all_regions_images_df(regions: Set[str]) -> 'pd.DataFrame':
     image_metas = [
         (r, *i) for r, i in itertools.product(regions, _GPU_DESC_UBUNTU_DATE)
     ]
     with mp_pool.Pool() as pool:
         results = pool.starmap(_get_image_row, image_metas)
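
To see how one tuple in `_GPU_DESC_UBUNTU_DATE` flows through the new code paths, here is a small self-contained sketch (not part of the diff) that mirrors the name filter built in `_fetch_image_id` and the catalog tag built in `_get_image_row`:

```python
gpu, description, ubuntu_version, date = ('neuron', 'Base Neuron AMI',
                                          '22.04', '20240923')

# AMI name filter handed to `aws ec2 describe-images` (see _fetch_image_id):
ami_name = f'Deep Learning {description} (Ubuntu {ubuntu_version}) {date}'
assert ami_name == 'Deep Learning Base Neuron AMI (Ubuntu 22.04) 20240923'

# Catalog tag emitted by _get_image_row; this is exactly what _get_default_ami
# in sky/clouds/aws.py looks up for Trainium/Inferentia:
tag = f'skypilot:{gpu}-ubuntu-{ubuntu_version.replace(".", "")}'
assert tag == 'skypilot:neuron-ubuntu-2204'
```
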