From 3cc3ab64246af6b5bc90fbc525e90762c10ce11b Mon Sep 17 00:00:00 2001 From: Daniel King Date: Thu, 15 Feb 2024 17:27:38 -0800 Subject: [PATCH 1/2] update it --- .github/workflows/pr-cpu.yaml | 4 ---- .github/workflows/pr-gpu.yaml | 10 ---------- README.md | 12 ++++++------ mcli/mcli-1b-max-seq-len-8k.yaml | 4 ++-- mcli/mcli-1b.yaml | 4 ++-- mcli/mcli-benchmark-mpt.yaml | 4 ++-- mcli/mcli-convert-composer-to-hf.yaml | 2 +- mcli/mcli-hf-eval.yaml | 4 ++-- mcli/mcli-hf-generate.yaml | 4 ++-- mcli/mcli-llama2-finetune.yaml | 4 ++-- mcli/mcli-openai-eval.yaml | 4 ++-- mcli/mcli-pretokenize-oci-upload.yaml | 2 +- setup.py | 2 +- 13 files changed, 23 insertions(+), 37 deletions(-) diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index 8ebf7de053..0bba0fadb9 100644 --- a/.github/workflows/pr-cpu.yaml +++ b/.github/workflows/pr-cpu.yaml @@ -19,10 +19,6 @@ jobs: strategy: matrix: include: - - name: "cpu-2.1.0" - container: mosaicml/pytorch:2.1.0_cpu-python3.10-ubuntu20.04 - markers: "not gpu" - pytest_command: "coverage run -m pytest" - name: "cpu-2.2.0" container: mosaicml/pytorch:2.2.0_cpu-python3.11-ubuntu20.04 markers: "not gpu" diff --git a/.github/workflows/pr-gpu.yaml b/.github/workflows/pr-gpu.yaml index f7dbd16b2c..05ba590342 100644 --- a/.github/workflows/pr-gpu.yaml +++ b/.github/workflows/pr-gpu.yaml @@ -19,16 +19,6 @@ jobs: strategy: matrix: include: - - name: "gpu-2.1.0" - container: mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04 - markers: "gpu" - pytest_command: "coverage run -m pytest" - deps_group: "all" - - name: "gpu-2.1.0-flash2" - container: mosaicml/llm-foundry:2.1.0_cu121_flash2-latest - markers: "gpu" - pytest_command: "coverage run -m pytest" - deps_group: "all-flash2" - name: "gpu-2.2.0" container: mosaicml/pytorch:2.2.0_cu121-python3.11-ubuntu20.04 markers: "gpu" diff --git a/README.md b/README.md index 6668476fd4..94a95daa77 100644 --- a/README.md +++ b/README.md @@ -92,14 +92,14 @@ Something missing? Contribute with a PR! # Hardware and Software Requirements -This codebase has been tested with PyTorch 2.1 with NVIDIA A100s and H100s. +This codebase has been tested with PyTorch 2.2 with NVIDIA A100s and H100s. This codebase may also work on systems with other devices, such as consumer NVIDIA cards and AMD cards, but we are not actively testing these systems. If you have success/failure using LLM Foundry on other systems, please let us know in a Github issue and we will update the support matrix! | Device | Torch Version | Cuda Version | Status | | -------------- | ------------- | ------------ | ---------------------------- | -| A100-40GB/80GB | 2.1.0 | 12.1 | :white_check_mark: Supported | -| H100-80GB | 2.1.0 | 12.1 | :white_check_mark: Supported | +| A100-40GB/80GB | 2.2.0 | 12.1 | :white_check_mark: Supported | +| H100-80GB | 2.2.0 | 12.1 | :white_check_mark: Supported | ## MosaicML Docker Images We highly recommend using our prebuilt Docker images. You can find them here: https://hub.docker.com/orgs/mosaicml/repositories. @@ -113,9 +113,9 @@ You can select a specific commit hash such as `mosaicml/llm-foundry:1.13.1_cu117 | Docker Image | Torch Version | Cuda Version | LLM Foundry dependencies installed? | | ------------------------------------------------------ | ------------- | ----------------- | ----------------------------------- | -| `mosaicml/pytorch:2.1.2_cu121-python3.10-ubuntu20.04` | 2.1.2 | 12.1 (Infiniband) | No | -| `mosaicml/llm-foundry:2.1.2_cu121_flash2-latest` | 2.1.2 | 12.1 (Infiniband) | Yes | -| `mosaicml/llm-foundry:2.1.2_cu121_flash2_aws-latest` | 2.1.2 | 12.1 (EFA) | Yes | +| `mosaicml/pytorch:2.2.0_cu121-python3.11-ubuntu20.04` | 2.2.0 | 12.1 (Infiniband) | No | +| `mosaicml/llm-foundry:2.2.0_cu121_flash2-latest` | 2.2.0 | 12.1 (Infiniband) | Yes | +| `mosaicml/llm-foundry:2.2.0_cu121_flash2_aws-latest` | 2.2.0 | 12.1 (EFA) | Yes | # Installation diff --git a/mcli/mcli-1b-max-seq-len-8k.yaml b/mcli/mcli-1b-max-seq-len-8k.yaml index 5c6b38be6d..3963b4a8d4 100644 --- a/mcli/mcli-1b-max-seq-len-8k.yaml +++ b/mcli/mcli-1b-max-seq-len-8k.yaml @@ -3,7 +3,7 @@ integrations: git_repo: mosaicml/llm-foundry git_branch: v0.5.0 # git_commit: # OR use your commit hash - pip_install: -e .[gpu] + pip_install: -e .[gpu-flash2] ssh_clone: false # Should be true if using a private repo # We are fetching, converting, and training on the 'val' split @@ -17,7 +17,7 @@ command: | --out_root ./my-copy-c4 --splits train_small val_small \ --concat_tokens 8192 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>' composer train/train.py /mnt/config/parameters.yaml -image: mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04 +image: mosaicml/llm-foundry:2.2.0_cu121_flash2-latest name: mpt-1b-ctx-8k-gpus-8 compute: diff --git a/mcli/mcli-1b.yaml b/mcli/mcli-1b.yaml index f8e782ad84..96965ddcb8 100644 --- a/mcli/mcli-1b.yaml +++ b/mcli/mcli-1b.yaml @@ -3,7 +3,7 @@ integrations: git_repo: mosaicml/llm-foundry git_branch: v0.5.0 # git_commit: # OR use your commit hash - pip_install: -e .[gpu] + pip_install: -e .[gpu-flash2] ssh_clone: false # Should be true if using a private repo # We are fetching, converting, and training on the 'val' split @@ -21,7 +21,7 @@ command: | eval_loader.dataset.split=val_small \ max_duration=100ba \ eval_interval=0 -image: mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04 +image: mosaicml/llm-foundry:2.2.0_cu121_flash2-latest name: mpt-1b-gpus-8 compute: diff --git a/mcli/mcli-benchmark-mpt.yaml b/mcli/mcli-benchmark-mpt.yaml index ae799abb4c..104157cf27 100644 --- a/mcli/mcli-benchmark-mpt.yaml +++ b/mcli/mcli-benchmark-mpt.yaml @@ -6,14 +6,14 @@ compute: # cluster: TODO # Name of the cluster to use for this run # gpu_type: a100_80gb # Type of GPU to use. We use a100_80gb in our experiments -image: mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04 +image: mosaicml/llm-foundry:2.2.0_cu121_flash2-latest integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry git_branch: v0.5.0 # git_commit: # OR use your commit hash - pip_install: ".[gpu]" + pip_install: ".[gpu-flash2]" command: | cd llm-foundry/scripts/inference/benchmarking diff --git a/mcli/mcli-convert-composer-to-hf.yaml b/mcli/mcli-convert-composer-to-hf.yaml index 5904cad522..d62648c042 100644 --- a/mcli/mcli-convert-composer-to-hf.yaml +++ b/mcli/mcli-convert-composer-to-hf.yaml @@ -13,7 +13,7 @@ command: | --hf_output_path s3://bucket/folder/hf/ \ --output_precision bf16 \ -image: mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04 +image: mosaicml/llm-foundry:2.2.0_cu121_flash2-latest name: convert-composer-hf compute: diff --git a/mcli/mcli-hf-eval.yaml b/mcli/mcli-hf-eval.yaml index 92faf8233b..0249cd2495 100644 --- a/mcli/mcli-hf-eval.yaml +++ b/mcli/mcli-hf-eval.yaml @@ -3,7 +3,7 @@ integrations: git_repo: mosaicml/llm-foundry git_branch: v0.5.0 # git_commit: # OR use your commit hash - pip_install: -e ".[gpu]" + pip_install: -e ".[gpu-flash2]" ssh_clone: false # Should be true if using a private repo command: | @@ -16,7 +16,7 @@ gpu_num: 8 # gpu_type: # cluster: # replace with your cluster here! -image: mosaicml/llm-foundry:2.1.0_cu121_flash2-latest +image: mosaicml/llm-foundry:2.2.0_cu121_flash2-latest # The below is injected as a YAML file: /mnt/config/parameters.yaml parameters: diff --git a/mcli/mcli-hf-generate.yaml b/mcli/mcli-hf-generate.yaml index 6f38db07b3..b155d362fb 100644 --- a/mcli/mcli-hf-generate.yaml +++ b/mcli/mcli-hf-generate.yaml @@ -3,7 +3,7 @@ integrations: git_repo: mosaicml/llm-foundry git_branch: v0.5.0 # git_commit: # OR use your commit hash - pip_install: -e .[gpu] + pip_install: -e .[gpu-flash2] ssh_clone: false # Should be true if using a private repo command: | @@ -35,7 +35,7 @@ command: | "Here's a quick recipe for baking chocolate chip cookies: Start by" \ "The best 5 cities to visit in Europe are" -image: mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04 +image: mosaicml/llm-foundry:2.2.0_cu121_flash2-latest name: hf-generate compute: diff --git a/mcli/mcli-llama2-finetune.yaml b/mcli/mcli-llama2-finetune.yaml index 110c92889a..763a4f8355 100644 --- a/mcli/mcli-llama2-finetune.yaml +++ b/mcli/mcli-llama2-finetune.yaml @@ -3,13 +3,13 @@ integrations: git_repo: mosaicml/llm-foundry git_branch: v0.5.0 # git_commit: # OR use your commit hash - pip_install: -e .[gpu] + pip_install: -e .[gpu-flash2] ssh_clone: false # Should be true if using a private repo command: | cd llm-foundry/scripts composer train/train.py /mnt/config/parameters.yaml -image: mosaicml/llm-foundry:1.13.1_cu117-latest +image: mosaicml/llm-foundry:2.2.0_cu121_flash2-latest name: llama2-finetune compute: diff --git a/mcli/mcli-openai-eval.yaml b/mcli/mcli-openai-eval.yaml index 5461c59d49..b2536c4a1a 100644 --- a/mcli/mcli-openai-eval.yaml +++ b/mcli/mcli-openai-eval.yaml @@ -3,7 +3,7 @@ integrations: git_repo: mosaicml/llm-foundry git_branch: v0.5.0 # git_commit: # OR use your commit hash - pip_install: -e ".[gpu,openai]" + pip_install: -e ".[gpu-flash2,openai]" ssh_clone: false # Should be true if using a private repo command: | @@ -16,7 +16,7 @@ gpu_num: # gpu_type: # cluster: # replace with your cluster here! -image: mosaicml/llm-foundry:2.1.0_cu121_flash2-latest +image: mosaicml/llm-foundry:2.2.0_cu121_flash2-latest # The below is injected as a YAML file: /mnt/config/parameters.yaml parameters: diff --git a/mcli/mcli-pretokenize-oci-upload.yaml b/mcli/mcli-pretokenize-oci-upload.yaml index c9c7ad9f5e..5de4c447a6 100644 --- a/mcli/mcli-pretokenize-oci-upload.yaml +++ b/mcli/mcli-pretokenize-oci-upload.yaml @@ -1,5 +1,5 @@ name: c4-2k-pre-tokenized -image: mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04 +image: mosaicml/llm-foundry:2.2.0_cu121_flash2-latest compute: gpus: 8 # Number of GPUs to use diff --git a/setup.py b/setup.py index 4dc771a7b4..fefa5f550a 100644 --- a/setup.py +++ b/setup.py @@ -55,7 +55,7 @@ 'accelerate>=0.25,<0.26', # for HF inference `device_map` 'transformers>=4.37,<4.38', 'mosaicml-streaming>=0.7.4,<0.8', - 'torch>=2.1,<2.3', + 'torch>=2.2,<2.3', 'datasets>=2.16,<2.17', 'fsspec==2023.6.0', # newer version results in a bug in datasets that duplicates data 'sentencepiece==0.1.97', From 78d91eb9cf5a2c46417ef412ce04712257b187a4 Mon Sep 17 00:00:00 2001 From: Daniel King Date: Thu, 15 Feb 2024 17:30:28 -0800 Subject: [PATCH 2/2] fix --- mcli/mcli-1b-eval.yaml | 2 +- mcli/mcli-1b-max-seq-len-8k.yaml | 2 +- mcli/mcli-1b.yaml | 2 +- mcli/mcli-benchmark-mpt.yaml | 2 +- mcli/mcli-convert-composer-to-hf.yaml | 2 +- mcli/mcli-hf-eval.yaml | 2 +- mcli/mcli-hf-generate.yaml | 2 +- mcli/mcli-llama2-finetune.yaml | 2 +- mcli/mcli-openai-eval.yaml | 2 +- mcli/mcli-pretokenize-oci-upload.yaml | 2 +- 10 files changed, 10 insertions(+), 10 deletions(-) diff --git a/mcli/mcli-1b-eval.yaml b/mcli/mcli-1b-eval.yaml index a8b5ed0112..9ae77af6ca 100644 --- a/mcli/mcli-1b-eval.yaml +++ b/mcli/mcli-1b-eval.yaml @@ -9,7 +9,7 @@ integrations: command: | cd llm-foundry/scripts/ composer eval/eval.py /mnt/config/parameters.yaml -image: mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04 +image: mosaicml/llm-foundry:2.1.0_cu121_flash2-latest name: mpt-1b-eval compute: diff --git a/mcli/mcli-1b-max-seq-len-8k.yaml b/mcli/mcli-1b-max-seq-len-8k.yaml index 3963b4a8d4..e413c3bf81 100644 --- a/mcli/mcli-1b-max-seq-len-8k.yaml +++ b/mcli/mcli-1b-max-seq-len-8k.yaml @@ -17,7 +17,7 @@ command: | --out_root ./my-copy-c4 --splits train_small val_small \ --concat_tokens 8192 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>' composer train/train.py /mnt/config/parameters.yaml -image: mosaicml/llm-foundry:2.2.0_cu121_flash2-latest +image: mosaicml/llm-foundry:2.1.0_cu121_flash2-latest name: mpt-1b-ctx-8k-gpus-8 compute: diff --git a/mcli/mcli-1b.yaml b/mcli/mcli-1b.yaml index 96965ddcb8..3713d29cc9 100644 --- a/mcli/mcli-1b.yaml +++ b/mcli/mcli-1b.yaml @@ -21,7 +21,7 @@ command: | eval_loader.dataset.split=val_small \ max_duration=100ba \ eval_interval=0 -image: mosaicml/llm-foundry:2.2.0_cu121_flash2-latest +image: mosaicml/llm-foundry:2.1.0_cu121_flash2-latest name: mpt-1b-gpus-8 compute: diff --git a/mcli/mcli-benchmark-mpt.yaml b/mcli/mcli-benchmark-mpt.yaml index 104157cf27..cb8adcac00 100644 --- a/mcli/mcli-benchmark-mpt.yaml +++ b/mcli/mcli-benchmark-mpt.yaml @@ -6,7 +6,7 @@ compute: # cluster: TODO # Name of the cluster to use for this run # gpu_type: a100_80gb # Type of GPU to use. We use a100_80gb in our experiments -image: mosaicml/llm-foundry:2.2.0_cu121_flash2-latest +image: mosaicml/llm-foundry:2.1.0_cu121_flash2-latest integrations: - integration_type: git_repo diff --git a/mcli/mcli-convert-composer-to-hf.yaml b/mcli/mcli-convert-composer-to-hf.yaml index d62648c042..8ef894bf85 100644 --- a/mcli/mcli-convert-composer-to-hf.yaml +++ b/mcli/mcli-convert-composer-to-hf.yaml @@ -13,7 +13,7 @@ command: | --hf_output_path s3://bucket/folder/hf/ \ --output_precision bf16 \ -image: mosaicml/llm-foundry:2.2.0_cu121_flash2-latest +image: mosaicml/llm-foundry:2.1.0_cu121_flash2-latest name: convert-composer-hf compute: diff --git a/mcli/mcli-hf-eval.yaml b/mcli/mcli-hf-eval.yaml index 0249cd2495..6800319df2 100644 --- a/mcli/mcli-hf-eval.yaml +++ b/mcli/mcli-hf-eval.yaml @@ -16,7 +16,7 @@ gpu_num: 8 # gpu_type: # cluster: # replace with your cluster here! -image: mosaicml/llm-foundry:2.2.0_cu121_flash2-latest +image: mosaicml/llm-foundry:2.1.0_cu121_flash2-latest # The below is injected as a YAML file: /mnt/config/parameters.yaml parameters: diff --git a/mcli/mcli-hf-generate.yaml b/mcli/mcli-hf-generate.yaml index b155d362fb..6880564a06 100644 --- a/mcli/mcli-hf-generate.yaml +++ b/mcli/mcli-hf-generate.yaml @@ -35,7 +35,7 @@ command: | "Here's a quick recipe for baking chocolate chip cookies: Start by" \ "The best 5 cities to visit in Europe are" -image: mosaicml/llm-foundry:2.2.0_cu121_flash2-latest +image: mosaicml/llm-foundry:2.1.0_cu121_flash2-latest name: hf-generate compute: diff --git a/mcli/mcli-llama2-finetune.yaml b/mcli/mcli-llama2-finetune.yaml index 763a4f8355..36de709aed 100644 --- a/mcli/mcli-llama2-finetune.yaml +++ b/mcli/mcli-llama2-finetune.yaml @@ -9,7 +9,7 @@ integrations: command: | cd llm-foundry/scripts composer train/train.py /mnt/config/parameters.yaml -image: mosaicml/llm-foundry:2.2.0_cu121_flash2-latest +image: mosaicml/llm-foundry:2.1.0_cu121_flash2-latest name: llama2-finetune compute: diff --git a/mcli/mcli-openai-eval.yaml b/mcli/mcli-openai-eval.yaml index b2536c4a1a..38844a76cf 100644 --- a/mcli/mcli-openai-eval.yaml +++ b/mcli/mcli-openai-eval.yaml @@ -16,7 +16,7 @@ gpu_num: # gpu_type: # cluster: # replace with your cluster here! -image: mosaicml/llm-foundry:2.2.0_cu121_flash2-latest +image: mosaicml/llm-foundry:2.1.0_cu121_flash2-latest # The below is injected as a YAML file: /mnt/config/parameters.yaml parameters: diff --git a/mcli/mcli-pretokenize-oci-upload.yaml b/mcli/mcli-pretokenize-oci-upload.yaml index 5de4c447a6..4a4781cea3 100644 --- a/mcli/mcli-pretokenize-oci-upload.yaml +++ b/mcli/mcli-pretokenize-oci-upload.yaml @@ -1,5 +1,5 @@ name: c4-2k-pre-tokenized -image: mosaicml/llm-foundry:2.2.0_cu121_flash2-latest +image: mosaicml/llm-foundry:2.1.0_cu121_flash2-latest compute: gpus: 8 # Number of GPUs to use