diff --git a/.circleci/real_config.yml b/.circleci/real_config.yml index b75d5621e78..e68393b1d12 100644 --- a/.circleci/real_config.yml +++ b/.circleci/real_config.yml @@ -277,7 +277,7 @@ commands: - when: condition: <> steps: - - run: docker pull determinedai/pytorch-ngc-dev:8c90e80 + - run: docker pull determinedai/pytorch-ngc-dev:0e43056 login-docker: parameters: @@ -2401,7 +2401,7 @@ jobs: test-unit-harness-gpu-tf: docker: - - image: determinedai/tensorflow-ngc-dev:8c90e80 + - image: determinedai/tensorflow-ngc-dev:0e43056 resource_class: determined-ai/container-runner-gpu steps: - run: mkdir -p ~/.ssh && ssh-keyscan github.com >> ~/.ssh/known_hosts @@ -2428,7 +2428,7 @@ jobs: test-unit-harness-pytorch2-gpu: docker: - - image: determinedai/pytorch-ngc-dev:8c90e80 + - image: determinedai/pytorch-ngc-dev:0e43056 resource_class: determined-ai/container-runner-gpu steps: - run: mkdir -p ~/.ssh && ssh-keyscan github.com >> ~/.ssh/known_hosts @@ -2455,7 +2455,7 @@ jobs: test-unit-harness-pytorch2-cpu: docker: - - image: determinedai/pytorch-ngc-dev:8c90e80 + - image: determinedai/pytorch-ngc-dev:0e43056 steps: - run: mkdir -p ~/.ssh && ssh-keyscan github.com >> ~/.ssh/known_hosts - checkout @@ -2481,7 +2481,7 @@ jobs: test-unit-harness-gpu-parallel: docker: - - image: determinedai/pytorch-ngc-dev:8c90e80 + - image: determinedai/pytorch-ngc-dev:0e43056 resource_class: determined-ai/container-runner-multi-gpu steps: - run: mkdir -p ~/.ssh && ssh-keyscan github.com >> ~/.ssh/known_hosts @@ -2508,7 +2508,7 @@ jobs: test-unit-harness-gpu-deepspeed: docker: - - image: determinedai/pytorch-ngc-dev:8c90e80 + - image: determinedai/pytorch-ngc-dev:0e43056 resource_class: determined-ai/container-runner-gpu steps: - run: mkdir -p ~/.ssh && ssh-keyscan github.com >> ~/.ssh/known_hosts @@ -3747,7 +3747,7 @@ jobs: type: string default: "1" environment-image: - default: determinedai/pytorch-ngc-dev:8c90e80 + default: determinedai/pytorch-ngc-dev:0e43056 type: string accel-node-taints: type: string diff --git a/.circleci/scripts/pull_image_daemonset.yaml b/.circleci/scripts/pull_image_daemonset.yaml index 7fba163370e..bb0c9a25eeb 100644 --- a/.circleci/scripts/pull_image_daemonset.yaml +++ b/.circleci/scripts/pull_image_daemonset.yaml @@ -13,7 +13,7 @@ spec: spec: containers: - name: pull-docker-daemonset - image: determinedai/pytorch-ngc-dev:8c90e80 + image: determinedai/pytorch-ngc-dev:0e43056 command: ["/bin/bash"] args: ["echo", "test"] resources: diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 38fdfa6d491..f17e2a0a8af 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -35,6 +35,7 @@ /e2e_tests/tests/requirements.txt @determined-ai/model-dev /e2e_tests/tests/experiment @determined-ai/model-dev /e2e_tests/tests/nightly @determined-ai/model-dev +/e2e_tests/tests/environment @determined-ai/model-dev # Backend owns some e2e tests. /e2e_tests/tests/cluster @determined-ai/backend diff --git a/docs/model-dev-guide/prepare-container/custom-env.rst b/docs/model-dev-guide/prepare-container/custom-env.rst index c425a60d43f..c255e5574ac 100644 --- a/docs/model-dev-guide/prepare-container/custom-env.rst +++ b/docs/model-dev-guide/prepare-container/custom-env.rst @@ -114,9 +114,9 @@ Default Images - - Environment - File Name - - CPUs - - ``determinedai/pytorch-ngc-dev:8c90e80`` + - ``determinedai/pytorch-ngc-dev:0e43056`` - - NVIDIA GPUs - - ``determinedai/pytorch-ngc-dev:8c90e80`` + - ``determinedai/pytorch-ngc-dev:0e43056`` - - AMD GPUs - ``determinedai/environments:rocm-5.0-pytorch-1.10-tf-2.7-rocm-0.26.4`` @@ -155,7 +155,7 @@ Example Dockerfile that installs custom ``conda``-, ``pip``-, and ``apt``-based .. code:: bash # Determined Image - FROM determinedai/tensorflow-ngc-dev:8c90e80 + FROM determinedai/tensorflow-ngc-dev:0e43056 # Custom Configuration RUN apt-get update && \ @@ -216,7 +216,7 @@ environments using :ref:`custom images `: .. code:: bash # Determined Image - FROM determinedai/pytorch-ngc-dev:8c90e80 + FROM determinedai/pytorch-ngc-dev:0e43056 # Create a virtual environment RUN conda create -n myenv python=3.8 diff --git a/docs/model-dev-guide/prepare-container/tensorflow-support.rst b/docs/model-dev-guide/prepare-container/tensorflow-support.rst index 33457a9e55f..ad67093e7d7 100644 --- a/docs/model-dev-guide/prepare-container/tensorflow-support.rst +++ b/docs/model-dev-guide/prepare-container/tensorflow-support.rst @@ -20,7 +20,7 @@ Determined supports both TensorFlow 1 and 2. The version of TensorFlow used for experiment is controlled by the configured container image. Determined provides prebuilt Docker images that include TensorFlow 2+, 1.15, and 2.8, respectively: -- ``determinedai/tensorflow-ngc-dev:8c90e80`` +- ``determinedai/tensorflow-ngc-dev:0e43056`` - ``determinedai/environments:cuda-10.2-pytorch-1.7-tf-1.15-gpu-0.21.2`` - ``determinedai/environments:cuda-11.2-tf-2.8-gpu-0.29.1`` diff --git a/docs/reference/deploy/helm-config-reference.rst b/docs/reference/deploy/helm-config-reference.rst index c422436b56f..f52f4d4f42f 100644 --- a/docs/reference/deploy/helm-config-reference.rst +++ b/docs/reference/deploy/helm-config-reference.rst @@ -197,13 +197,13 @@ - ``cpuImage``: Sets the default Docker image for all non-GPU tasks. If a Docker image is specified in the :ref:`experiment config ` this default is overriden. - Defaults to: ``determinedai/pytorch-ngc-dev:8c90e80``. + Defaults to: ``determinedai/pytorch-ngc-dev:0e43056``. - ``startupHook``: An optional inline script that will be executed as part of task set up. - ``gpuImage``: Sets the default Docker image for all GPU tasks. If a Docker image is specified in the :ref:`experiment config ` this default is overriden. Defaults - to: ``determinedai/pytorch-ngc-dev:8c90e80``. + to: ``determinedai/pytorch-ngc-dev:0e43056``. - ``logPolicies``: Sets log policies for trials. For details, visit :ref:`log_policies `. diff --git a/docs/reference/deploy/master-config-reference.rst b/docs/reference/deploy/master-config-reference.rst index a3fd58d06cb..8e30d76d7d0 100644 --- a/docs/reference/deploy/master-config-reference.rst +++ b/docs/reference/deploy/master-config-reference.rst @@ -89,12 +89,12 @@ configure different container images for NVIDIA GPU tasks using the ``cuda`` key Determined 0.17.6), CPU tasks using ``cpu`` key, and ROCm (AMD GPU) tasks using the ``rocm`` key. Default values: -- ``determinedai/pytorch-ngc-dev:8c90e80`` for NVIDIA GPUs and for CPUs. +- ``determinedai/pytorch-ngc-dev:0e43056`` for NVIDIA GPUs and for CPUs. - ``determinedai/environments:rocm-5.0-pytorch-1.10-tf-2.7-rocm-0.26.4`` for ROCm. For TensorFlow users, we provide an image that must be referenced in the experiment configuration: -- ``determinedai/tensorflow-ngc-dev:8c90e80`` for NVIDIA GPUs and for CPUs. +- ``determinedai/tensorflow-ngc-dev:0e43056`` for NVIDIA GPUs and for CPUs. ``environment_variables`` ========================= diff --git a/docs/reference/experiment-config-reference.rst b/docs/reference/experiment-config-reference.rst index 6a2be869c87..8787f006534 100644 --- a/docs/reference/experiment-config-reference.rst +++ b/docs/reference/experiment-config-reference.rst @@ -1333,12 +1333,12 @@ Optional. The Docker image to use when executing the workload. This image must b container images for NVIDIA GPU tasks using ``cuda`` key (``gpu`` prior to 0.17.6), CPU tasks using ``cpu`` key, and ROCm (AMD GPU) tasks using ``rocm`` key. Default values: -- ``determinedai/pytorch-ngc-dev:8c90e80`` for NVIDIA GPUs and for CPUs. +- ``determinedai/pytorch-ngc-dev:0e43056`` for NVIDIA GPUs and for CPUs. - ``determinedai/environments:rocm-5.0-pytorch-1.10-tf-2.7-rocm-0.26.4`` for ROCm. For TensorFlow users, we provide an image that must be referenced in the experiment configuration: -- ``determinedai/tensorflow-ngc-dev:8c90e80`` for NVIDIA GPUs and for CPUs. +- ``determinedai/tensorflow-ngc-dev:0e43056`` for NVIDIA GPUs and for CPUs. When the cluster is configured with :ref:`resource_manager.type: slurm ` and ``container_run_type: singularity``, images are executed using diff --git a/docs/reference/job-config-reference.rst b/docs/reference/job-config-reference.rst index 1f104af5a41..6efa0c92ce5 100644 --- a/docs/reference/job-config-reference.rst +++ b/docs/reference/job-config-reference.rst @@ -45,13 +45,13 @@ The following configuration settings are supported: different container images for NVIDIA GPU tasks using ``cuda`` key (``gpu`` prior to 0.17.6), CPU tasks using ``cpu`` key, and ROCm (AMD GPU) tasks using ``rocm`` key. Default values: - - ``determinedai/pytorch-ngc-dev:8c90e80`` for NVIDIA GPUs and for CPUs. + - ``determinedai/pytorch-ngc-dev:0e43056`` for NVIDIA GPUs and for CPUs. - ``determinedai/environments:rocm-5.0-pytorch-1.10-tf-2.7-rocm-0.26.4`` for ROCm. For TensorFlow users, we provide an image that must be referenced in the experiment configuration: - - ``determinedai/tensorflow-ngc-dev:8c90e80`` for NVIDIA GPUs and for CPUs. + - ``determinedai/tensorflow-ngc-dev:0e43056`` for NVIDIA GPUs and for CPUs. - ``force_pull_image``: Forcibly pull the image from the Docker registry and bypass the Docker cache. Defaults to ``false``. diff --git a/docs/setup-cluster/deploy-cluster/slurm/singularity.rst b/docs/setup-cluster/deploy-cluster/slurm/singularity.rst index a380a2bf0f6..1ebae5767cb 100644 --- a/docs/setup-cluster/deploy-cluster/slurm/singularity.rst +++ b/docs/setup-cluster/deploy-cluster/slurm/singularity.rst @@ -30,9 +30,9 @@ by default in this version of Determined are described below. - - Environment - File Name - - CPUs - - ``determinedai/pytorch-ngc-dev:8c90e80`` + - ``determinedai/pytorch-ngc-dev:0e43056`` - - NVIDIA GPUs - - ``determinedai/pytorch-ngc-dev:8c90e80`` + - ``determinedai/pytorch-ngc-dev:0e43056`` - - AMD GPUs - ``determinedai/environments:rocm-5.0-pytorch-1.10-tf-2.7-rocm-622d512`` diff --git a/docs/setup-cluster/gcp/install-gcp.rst b/docs/setup-cluster/gcp/install-gcp.rst index ca03a376153..1a04fcd868f 100644 --- a/docs/setup-cluster/gcp/install-gcp.rst +++ b/docs/setup-cluster/gcp/install-gcp.rst @@ -406,5 +406,5 @@ This command line will spin up a cluster of up to 2 A100s in the ``us-central1-c --compute-agent-instance-type a2-highgpu-1g --gpu-num 1 \ --gpu-type nvidia-tesla-a100 \ --region us-central1 --zone us-central1-c \ - --gpu-env-image determinedai/pytorch-ngc-dev:8c90e80 \ - --cpu-env-image determinedai/pytorch-ngc-dev:8c90e80 + --gpu-env-image determinedai/pytorch-ngc-dev:0e43056 \ + --cpu-env-image determinedai/pytorch-ngc-dev:0e43056 diff --git a/docs/setup-cluster/slurm/singularity.rst b/docs/setup-cluster/slurm/singularity.rst index 19225182bde..62586b774d4 100644 --- a/docs/setup-cluster/slurm/singularity.rst +++ b/docs/setup-cluster/slurm/singularity.rst @@ -30,9 +30,9 @@ by default in this version of Determined are described below. - - Environment - File Name - - CPUs - - ``determinedai/pytorch-ngc-dev:8c90e80`` + - ``determinedai/pytorch-ngc-dev:0e43056`` - - NVIDIA GPUs - - ``determinedai/pytorch-ngc-dev:8c90e80`` + - ``determinedai/pytorch-ngc-dev:0e43056`` - - AMD GPUs - ``determinedai/environments:rocm-5.0-pytorch-1.10-tf-2.7-rocm-622d512`` diff --git a/docs/setup-cluster/slurm/slurm-requirements.rst b/docs/setup-cluster/slurm/slurm-requirements.rst index 9ffa61e876a..6b2502a851c 100644 --- a/docs/setup-cluster/slurm/slurm-requirements.rst +++ b/docs/setup-cluster/slurm/slurm-requirements.rst @@ -510,7 +510,7 @@ platform. There may be additional per-user configuration that is required. .. code:: bash - image=determinedai/pytorch-ngc-dev:8c90e80 + image=determinedai/pytorch-ngc-dev:0e43056 cd /shared/enroot/images enroot import docker://$image enroot create /shared/enroot/images/${image//[\/:]/\+}.sqsh diff --git a/e2e_tests/tests/config.py b/e2e_tests/tests/config.py index bc83070be68..93b306a9957 100644 --- a/e2e_tests/tests/config.py +++ b/e2e_tests/tests/config.py @@ -14,12 +14,12 @@ MAX_TRIAL_BUILD_SECS = 90 -DEFAULT_TF2_CPU_IMAGE = "determinedai/tensorflow-ngc-dev:8c90e80" -DEFAULT_TF2_GPU_IMAGE = "determinedai/tensorflow-ngc-dev:8c90e80" -DEFAULT_PT_CPU_IMAGE = "determinedai/pytorch-tensorflow-cpu-dev:8c90e80" -DEFAULT_PT_GPU_IMAGE = "determinedai/pytorch-tensorflow-cuda-dev:8c90e80" -DEFAULT_PT2_CPU_IMAGE = "determinedai/pytorch-ngc-dev:8c90e80" -DEFAULT_PT2_GPU_IMAGE = "determinedai/pytorch-ngc-dev:8c90e80" +DEFAULT_TF2_CPU_IMAGE = "determinedai/tensorflow-ngc-dev:0e43056" +DEFAULT_TF2_GPU_IMAGE = "determinedai/tensorflow-ngc-dev:0e43056" +DEFAULT_PT_CPU_IMAGE = "determinedai/pytorch-tensorflow-cpu-dev:0e43056" +DEFAULT_PT_GPU_IMAGE = "determinedai/pytorch-tensorflow-cuda-dev:0e43056" +DEFAULT_PT2_CPU_IMAGE = "determinedai/pytorch-ngc-dev:0e43056" +DEFAULT_PT2_GPU_IMAGE = "determinedai/pytorch-ngc-dev:0e43056" TF2_CPU_IMAGE = os.environ.get("TF2_CPU_IMAGE") or DEFAULT_TF2_CPU_IMAGE TF2_GPU_IMAGE = os.environ.get("TF2_GPU_IMAGE") or DEFAULT_TF2_GPU_IMAGE diff --git a/e2e_tests/tests/environment/test_nvidia_driver.py b/e2e_tests/tests/environment/test_nvidia_driver.py new file mode 100644 index 00000000000..71848410766 --- /dev/null +++ b/e2e_tests/tests/environment/test_nvidia_driver.py @@ -0,0 +1,37 @@ +import re + +import pytest + +from tests import api_utils +from tests import command as cmd + + +@pytest.mark.e2e_gpu +@pytest.mark.gpu_required +def test_nvidia_drivers_version_matching() -> None: + sess = api_utils.user_session() + + with cmd.interactive_command(sess, ["shell", "start"]) as shell: + shell.stdin.write(b"nvidia-smi\n") + shell.stdin.write(b"nv-fabricmanager -v\n") + # Exit the shell, so we can read output below until EOF instead of timeout + shell.stdin.write(b"exit\n") + shell.stdin.close() + + lines = "" + for line in shell.stdout: + lines += line + + m = re.search(r"Driver Version: ([\d.]+)", lines) + if not m: + pytest.fail(f"Did not find Nvidia driver version in shell output.\n {lines}\n") + driver_version = m.group(1) + + m = re.search(r"Fabric Manager version is[\s:]*([\d.]+)", lines) + if not m: + pytest.fail(f"Did not find fabric manager version in shell output.\n {lines}\n") + fabric_manager_version = m.group(1) + + assert ( + driver_version == fabric_manager_version + ), f"nvidia driver {driver_version} doesn't match fabric manager {fabric_manager_version}" diff --git a/e2e_tests/tests/fixtures/ports-proxy/config.yaml b/e2e_tests/tests/fixtures/ports-proxy/config.yaml index 503a208bc2c..e104ad0f737 100644 --- a/e2e_tests/tests/fixtures/ports-proxy/config.yaml +++ b/e2e_tests/tests/fixtures/ports-proxy/config.yaml @@ -23,7 +23,7 @@ max_restarts: 0 # Hardcode the image because the new image has a bug. TODO fix this when the image bug is fixed. environment: - image: determinedai/pytorch-tensorflow-cpu-dev:8c90e80 + image: determinedai/pytorch-tensorflow-cpu-dev:0e43056 proxy_ports: - proxy_port: 8000 proxy_tcp: false diff --git a/examples/computer_vision/iris_tf_keras/adaptive.yaml b/examples/computer_vision/iris_tf_keras/adaptive.yaml index 7392b354d64..64ede8131b4 100644 --- a/examples/computer_vision/iris_tf_keras/adaptive.yaml +++ b/examples/computer_vision/iris_tf_keras/adaptive.yaml @@ -4,8 +4,8 @@ data: test_url: http://download.tensorflow.org/data/iris_test.csv environment: image: - cpu: determinedai/tensorflow-ngc-dev:8c90e80 - gpu: determinedai/tensorflow-ngc-dev:8c90e80 + cpu: determinedai/tensorflow-ngc-dev:0e43056 + gpu: determinedai/tensorflow-ngc-dev:0e43056 hyperparameters: learning_rate: type: log diff --git a/examples/computer_vision/iris_tf_keras/const.yaml b/examples/computer_vision/iris_tf_keras/const.yaml index 4b27dbd5778..3a4660fee70 100644 --- a/examples/computer_vision/iris_tf_keras/const.yaml +++ b/examples/computer_vision/iris_tf_keras/const.yaml @@ -4,8 +4,8 @@ data: test_url: http://download.tensorflow.org/data/iris_test.csv environment: image: - cpu: determinedai/tensorflow-ngc-dev:8c90e80 - gpu: determinedai/tensorflow-ngc-dev:8c90e80 + cpu: determinedai/tensorflow-ngc-dev:0e43056 + gpu: determinedai/tensorflow-ngc-dev:0e43056 hyperparameters: learning_rate: 1.0e-4 learning_rate_decay: 1.0e-6 diff --git a/examples/computer_vision/iris_tf_keras/distributed.yaml b/examples/computer_vision/iris_tf_keras/distributed.yaml index fc77b9a0cb9..39223b576d8 100644 --- a/examples/computer_vision/iris_tf_keras/distributed.yaml +++ b/examples/computer_vision/iris_tf_keras/distributed.yaml @@ -4,8 +4,8 @@ data: test_url: http://download.tensorflow.org/data/iris_test.csv environment: image: - cpu: determinedai/tensorflow-ngc-dev:8c90e80 - gpu: determinedai/tensorflow-ngc-dev:8c90e80 + cpu: determinedai/tensorflow-ngc-dev:0e43056 + gpu: determinedai/tensorflow-ngc-dev:0e43056 hyperparameters: learning_rate: 1.0e-4 learning_rate_decay: 1.0e-6 diff --git a/examples/computer_vision/iris_tf_keras/iris_tf_keras_cancelable.yaml b/examples/computer_vision/iris_tf_keras/iris_tf_keras_cancelable.yaml index b18ebbf9e0c..7ccdf2600ab 100644 --- a/examples/computer_vision/iris_tf_keras/iris_tf_keras_cancelable.yaml +++ b/examples/computer_vision/iris_tf_keras/iris_tf_keras_cancelable.yaml @@ -4,8 +4,8 @@ data: test_url: http://download.tensorflow.org/data/iris_test.csv environment: image: - cpu: determinedai/tensorflow-ngc-dev:8c90e80 - gpu: determinedai/tensorflow-ngc-dev:8c90e80 + cpu: determinedai/tensorflow-ngc-dev:0e43056 + gpu: determinedai/tensorflow-ngc-dev:0e43056 resources: slots_per_trial: 8 resource_pool: defq_GPU_cancelable diff --git a/examples/computer_vision/iris_tf_keras/iris_tf_keras_high_priority.yaml b/examples/computer_vision/iris_tf_keras/iris_tf_keras_high_priority.yaml index 10c21ae78b0..49cd69cf868 100644 --- a/examples/computer_vision/iris_tf_keras/iris_tf_keras_high_priority.yaml +++ b/examples/computer_vision/iris_tf_keras/iris_tf_keras_high_priority.yaml @@ -4,8 +4,8 @@ data: test_url: http://download.tensorflow.org/data/iris_test.csv environment: image: - cpu: determinedai/tensorflow-ngc-dev:8c90e80 - gpu: determinedai/tensorflow-ngc-dev:8c90e80 + cpu: determinedai/tensorflow-ngc-dev:0e43056 + gpu: determinedai/tensorflow-ngc-dev:0e43056 resources: slots_per_trial: 8 resource_pool: defq_GPU_hipri diff --git a/examples/deepspeed_autotune/torchvision/core_api/deepspeed.yaml b/examples/deepspeed_autotune/torchvision/core_api/deepspeed.yaml index b6fae94346a..9f2fb93c953 100644 --- a/examples/deepspeed_autotune/torchvision/core_api/deepspeed.yaml +++ b/examples/deepspeed_autotune/torchvision/core_api/deepspeed.yaml @@ -2,7 +2,7 @@ name: torchvision dsat core_api max_restarts: 0 environment: image: - gpu: determinedai/pytorch-ngc-dev:8c90e80 + gpu: determinedai/pytorch-ngc-dev:0e43056 resources: slots_per_trial: 2 shm_size: 4294967296 # 4 GiB. diff --git a/examples/deepspeed_autotune/torchvision/deepspeed_trial/deepspeed.yaml b/examples/deepspeed_autotune/torchvision/deepspeed_trial/deepspeed.yaml index 34c9bfe015f..dfb7af55416 100644 --- a/examples/deepspeed_autotune/torchvision/deepspeed_trial/deepspeed.yaml +++ b/examples/deepspeed_autotune/torchvision/deepspeed_trial/deepspeed.yaml @@ -2,7 +2,7 @@ name: torchvision dsat deepspeed_trial max_restarts: 0 environment: image: - gpu: determinedai/pytorch-ngc-dev:8c90e80 + gpu: determinedai/pytorch-ngc-dev:0e43056 resources: slots_per_trial: 2 shm_size: 4294967296 # 4 GiB. diff --git a/examples/hf_trainer_api/hf_image_classification/deepspeed.yaml b/examples/hf_trainer_api/hf_image_classification/deepspeed.yaml index 62854bc22e9..b9fca49c608 100644 --- a/examples/hf_trainer_api/hf_image_classification/deepspeed.yaml +++ b/examples/hf_trainer_api/hf_image_classification/deepspeed.yaml @@ -6,7 +6,7 @@ environment: # You may need to modify this to match your network configuration. - NCCL_SOCKET_IFNAME=ens,eth,ib image: - gpu: determinedai/pytorch-ngc-dev:8c90e80 + gpu: determinedai/pytorch-ngc-dev:0e43056 resources: slots_per_trial: 2 searcher: diff --git a/examples/hf_trainer_api/hf_language_modeling/deepspeed.yaml b/examples/hf_trainer_api/hf_language_modeling/deepspeed.yaml index 983fbd94e18..cee2a137fb7 100644 --- a/examples/hf_trainer_api/hf_language_modeling/deepspeed.yaml +++ b/examples/hf_trainer_api/hf_language_modeling/deepspeed.yaml @@ -6,7 +6,7 @@ environment: # You may need to modify this to match your network configuration. - NCCL_SOCKET_IFNAME=ens,eth,ib image: - gpu: determinedai/pytorch-ngc-dev:8c90e80 + gpu: determinedai/pytorch-ngc-dev:0e43056 resources: slots_per_trial: 2 searcher: diff --git a/harness/determined/deploy/aws/templates/efs.yaml b/harness/determined/deploy/aws/templates/efs.yaml index 4f20901e5ca..73fff6a23a0 100644 --- a/harness/determined/deploy/aws/templates/efs.yaml +++ b/harness/determined/deploy/aws/templates/efs.yaml @@ -3,35 +3,35 @@ Mappings: RegionMap: ap-northeast-1: Master: ami-00910ef9457f0df47 - Agent: ami-0b9655c3686ad290f + Agent: ami-0d5ce4472d6286746 # TODO(DET-4258) Uncomment these when we fully support all P3 regions. # ap-northeast-2: # Master: ami-035e3e44dc41db6a2 - # Agent: ami-0dc620552c1aaa2cf + # Agent: ami-0090f68a647f10126 # ap-southeast-1: # Master: ami-0fd1ee6c8b656f020 - # Agent: ami-0adbb66c690fafe37 + # Agent: ami-048be732b76a4679e # ap-southeast-2: # Master: ami-0b62ecd3babd1c548 - # Agent: ami-0a453138b8d55c36d + # Agent: ami-05c05ba492caa1c68 eu-central-1: Master: ami-0abbe417ed83c0b29 - Agent: ami-09284ff11565b3ae3 + Agent: ami-0ebd39eab325463c0 eu-west-1: Master: ami-0e3f7dd2dc743e48a - Agent: ami-037b373c0075ea120 + Agent: ami-05c7e44456501d01d # eu-west-2: # Master: ami-0d78429fb6af30994 - # Agent: ami-0a523360a75ece477 + # Agent: ami-0d3b65b9d8e18b354 us-east-1: Master: ami-0172070f66a8ebe63 - Agent: ami-0f8ed0567336433e2 + Agent: ami-0b25b64346732d0b5 us-east-2: Master: ami-0bafa3699418551cd - Agent: ami-09578b4c5ea532f24 + Agent: ami-07517c67a90714250 us-west-2: Master: ami-0ceeab680f529cc36 - Agent: ami-0c8ad935d75f2f73d + Agent: ami-083f8147aeeba1eb2 Parameters: VpcCIDR: diff --git a/harness/determined/deploy/aws/templates/fsx.yaml b/harness/determined/deploy/aws/templates/fsx.yaml index 50130844b57..71ff5a4d524 100644 --- a/harness/determined/deploy/aws/templates/fsx.yaml +++ b/harness/determined/deploy/aws/templates/fsx.yaml @@ -3,35 +3,35 @@ Mappings: RegionMap: ap-northeast-1: Master: ami-00910ef9457f0df47 - Agent: ami-0b9655c3686ad290f + Agent: ami-0d5ce4472d6286746 # TODO(DET-4258) Uncomment these when we fully support all P3 regions. # ap-northeast-2: # Master: ami-035e3e44dc41db6a2 - # Agent: ami-0dc620552c1aaa2cf + # Agent: ami-0090f68a647f10126 # ap-southeast-1: # Master: ami-0fd1ee6c8b656f020 - # Agent: ami-0adbb66c690fafe37 + # Agent: ami-048be732b76a4679e # ap-southeast-2: # Master: ami-0b62ecd3babd1c548 - # Agent: ami-0a453138b8d55c36d + # Agent: ami-05c05ba492caa1c68 eu-central-1: Master: ami-0abbe417ed83c0b29 - Agent: ami-09284ff11565b3ae3 + Agent: ami-0ebd39eab325463c0 eu-west-1: Master: ami-0e3f7dd2dc743e48a - Agent: ami-037b373c0075ea120 + Agent: ami-05c7e44456501d01d # eu-west-2: # Master: ami-0d78429fb6af30994 - # Agent: ami-0a523360a75ece477 + # Agent: ami-0d3b65b9d8e18b354 us-east-1: Master: ami-0172070f66a8ebe63 - Agent: ami-0f8ed0567336433e2 + Agent: ami-0b25b64346732d0b5 us-east-2: Master: ami-0bafa3699418551cd - Agent: ami-09578b4c5ea532f24 + Agent: ami-07517c67a90714250 us-west-2: Master: ami-0ceeab680f529cc36 - Agent: ami-0c8ad935d75f2f73d + Agent: ami-083f8147aeeba1eb2 Parameters: VpcCIDR: diff --git a/harness/determined/deploy/aws/templates/govcloud.yaml b/harness/determined/deploy/aws/templates/govcloud.yaml index 65d9f143a8a..acd2524db54 100644 --- a/harness/determined/deploy/aws/templates/govcloud.yaml +++ b/harness/determined/deploy/aws/templates/govcloud.yaml @@ -5,10 +5,10 @@ Mappings: RegionMap: us-gov-east-1: Master: ami-04ef693ebcf519dc3 - Agent: ami-09a56ff763262c365 + Agent: ami-0cc63f942a6c1e5fd us-gov-west-1: Master: ami-08bd15d820a3c087e - Agent: ami-0e7d4dc4ebb742216 + Agent: ami-0707563a683a19ed2 Parameters: Keypair: Type: AWS::EC2::KeyPair::KeyName diff --git a/harness/determined/deploy/aws/templates/lore.yaml b/harness/determined/deploy/aws/templates/lore.yaml index bc9acfe1191..c9dad6a8d0e 100644 --- a/harness/determined/deploy/aws/templates/lore.yaml +++ b/harness/determined/deploy/aws/templates/lore.yaml @@ -3,35 +3,35 @@ Mappings: RegionMap: ap-northeast-1: Master: ami-00910ef9457f0df47 - Agent: ami-0b9655c3686ad290f + Agent: ami-0d5ce4472d6286746 # TODO(DET-4258) Uncomment these when we fully support all P3 regions. # ap-northeast-2: # Master: ami-035e3e44dc41db6a2 - # Agent: ami-0dc620552c1aaa2cf + # Agent: ami-0090f68a647f10126 # ap-southeast-1: # Master: ami-0fd1ee6c8b656f020 - # Agent: ami-0adbb66c690fafe37 + # Agent: ami-048be732b76a4679e # ap-southeast-2: # Master: ami-0b62ecd3babd1c548 - # Agent: ami-0a453138b8d55c36d + # Agent: ami-05c05ba492caa1c68 eu-central-1: Master: ami-0abbe417ed83c0b29 - Agent: ami-09284ff11565b3ae3 + Agent: ami-0ebd39eab325463c0 eu-west-1: Master: ami-0e3f7dd2dc743e48a - Agent: ami-037b373c0075ea120 + Agent: ami-05c7e44456501d01d # eu-west-2: # Master: ami-0d78429fb6af30994 - # Agent: ami-0a523360a75ece477 + # Agent: ami-0d3b65b9d8e18b354 us-east-1: Master: ami-0172070f66a8ebe63 - Agent: ami-0f8ed0567336433e2 + Agent: ami-0b25b64346732d0b5 us-east-2: Master: ami-0bafa3699418551cd - Agent: ami-09578b4c5ea532f24 + Agent: ami-07517c67a90714250 us-west-2: Master: ami-0ceeab680f529cc36 - Agent: ami-0c8ad935d75f2f73d + Agent: ami-083f8147aeeba1eb2 Parameters: VpcCIDR: diff --git a/harness/determined/deploy/aws/templates/secure.yaml b/harness/determined/deploy/aws/templates/secure.yaml index 322575cae63..6d42acc5ad9 100644 --- a/harness/determined/deploy/aws/templates/secure.yaml +++ b/harness/determined/deploy/aws/templates/secure.yaml @@ -4,44 +4,44 @@ Mappings: RegionMap: ap-northeast-1: Master: ami-00910ef9457f0df47 - Agent: ami-0b9655c3686ad290f + Agent: ami-0d5ce4472d6286746 Bastion: ami-00910ef9457f0df47 # TODO(DET-4258) Uncomment these when we fully support all P3 regions. # ap-northeast-2: # Master: ami-035e3e44dc41db6a2 - # Agent: ami-0dc620552c1aaa2cf + # Agent: ami-0090f68a647f10126 # Bastion: ami-035e3e44dc41db6a2 # ap-southeast-1: # Master: ami-0fd1ee6c8b656f020 - # Agent: ami-0adbb66c690fafe37 + # Agent: ami-048be732b76a4679e # Bastion: ami-0fd1ee6c8b656f020 # ap-southeast-2: # Master: ami-0b62ecd3babd1c548 - # Agent: ami-0a453138b8d55c36d + # Agent: ami-05c05ba492caa1c68 # Bastion: ami-0b62ecd3babd1c548 eu-central-1: Master: ami-0abbe417ed83c0b29 - Agent: ami-09284ff11565b3ae3 + Agent: ami-0ebd39eab325463c0 Bastion: ami-0abbe417ed83c0b29 eu-west-1: Master: ami-0e3f7dd2dc743e48a - Agent: ami-037b373c0075ea120 + Agent: ami-05c7e44456501d01d Bastion: ami-0e3f7dd2dc743e48a # eu-west-2: # Master: ami-0d78429fb6af30994 - # Agent: ami-0a523360a75ece477 + # Agent: ami-0d3b65b9d8e18b354 # Bastion: ami-0d78429fb6af30994 us-east-1: Master: ami-0172070f66a8ebe63 - Agent: ami-0f8ed0567336433e2 + Agent: ami-0b25b64346732d0b5 Bastion: ami-0172070f66a8ebe63 us-east-2: Master: ami-0bafa3699418551cd - Agent: ami-09578b4c5ea532f24 + Agent: ami-07517c67a90714250 Bastion: ami-0bafa3699418551cd us-west-2: Master: ami-0ceeab680f529cc36 - Agent: ami-0c8ad935d75f2f73d + Agent: ami-083f8147aeeba1eb2 Bastion: ami-0ceeab680f529cc36 Parameters: diff --git a/harness/determined/deploy/aws/templates/simple-rds.yaml b/harness/determined/deploy/aws/templates/simple-rds.yaml index 4acbf4f8b48..68962ade415 100644 --- a/harness/determined/deploy/aws/templates/simple-rds.yaml +++ b/harness/determined/deploy/aws/templates/simple-rds.yaml @@ -5,35 +5,35 @@ Mappings: RegionMap: ap-northeast-1: Master: ami-00910ef9457f0df47 - Agent: ami-0b9655c3686ad290f + Agent: ami-0d5ce4472d6286746 # TODO(DET-4258) Uncomment these when we fully support all P3 regions. # ap-northeast-2: # Master: ami-035e3e44dc41db6a2 - # Agent: ami-0dc620552c1aaa2cf + # Agent: ami-0090f68a647f10126 # ap-southeast-1: # Master: ami-0fd1ee6c8b656f020 - # Agent: ami-0adbb66c690fafe37 + # Agent: ami-048be732b76a4679e # ap-southeast-2: # Master: ami-0b62ecd3babd1c548 - # Agent: ami-0a453138b8d55c36d + # Agent: ami-05c05ba492caa1c68 eu-central-1: Master: ami-0abbe417ed83c0b29 - Agent: ami-09284ff11565b3ae3 + Agent: ami-0ebd39eab325463c0 eu-west-1: Master: ami-0e3f7dd2dc743e48a - Agent: ami-037b373c0075ea120 + Agent: ami-05c7e44456501d01d # eu-west-2: # Master: ami-0d78429fb6af30994 - # Agent: ami-0a523360a75ece477 + # Agent: ami-0d3b65b9d8e18b354 us-east-1: Master: ami-0172070f66a8ebe63 - Agent: ami-0f8ed0567336433e2 + Agent: ami-0b25b64346732d0b5 us-east-2: Master: ami-0bafa3699418551cd - Agent: ami-09578b4c5ea532f24 + Agent: ami-07517c67a90714250 us-west-2: Master: ami-0ceeab680f529cc36 - Agent: ami-0c8ad935d75f2f73d + Agent: ami-083f8147aeeba1eb2 Parameters: Keypair: diff --git a/harness/determined/deploy/aws/templates/simple.yaml b/harness/determined/deploy/aws/templates/simple.yaml index 6db59bea89b..3ceab98646d 100644 --- a/harness/determined/deploy/aws/templates/simple.yaml +++ b/harness/determined/deploy/aws/templates/simple.yaml @@ -5,35 +5,35 @@ Mappings: RegionMap: ap-northeast-1: Master: ami-00910ef9457f0df47 - Agent: ami-0b9655c3686ad290f + Agent: ami-0d5ce4472d6286746 # TODO(DET-4258) Uncomment these when we fully support all P3 regions. # ap-northeast-2: # Master: ami-035e3e44dc41db6a2 - # Agent: ami-0dc620552c1aaa2cf + # Agent: ami-0090f68a647f10126 # ap-southeast-1: # Master: ami-0fd1ee6c8b656f020 - # Agent: ami-0adbb66c690fafe37 + # Agent: ami-048be732b76a4679e # ap-southeast-2: # Master: ami-0b62ecd3babd1c548 - # Agent: ami-0a453138b8d55c36d + # Agent: ami-05c05ba492caa1c68 eu-central-1: Master: ami-0abbe417ed83c0b29 - Agent: ami-09284ff11565b3ae3 + Agent: ami-0ebd39eab325463c0 eu-west-1: Master: ami-0e3f7dd2dc743e48a - Agent: ami-037b373c0075ea120 + Agent: ami-05c7e44456501d01d # eu-west-2: # Master: ami-0d78429fb6af30994 - # Agent: ami-0a523360a75ece477 + # Agent: ami-0d3b65b9d8e18b354 us-east-1: Master: ami-0172070f66a8ebe63 - Agent: ami-0f8ed0567336433e2 + Agent: ami-0b25b64346732d0b5 us-east-2: Master: ami-0bafa3699418551cd - Agent: ami-09578b4c5ea532f24 + Agent: ami-07517c67a90714250 us-west-2: Master: ami-0ceeab680f529cc36 - Agent: ami-0c8ad935d75f2f73d + Agent: ami-083f8147aeeba1eb2 Parameters: Keypair: diff --git a/harness/determined/deploy/gcp/constants.py b/harness/determined/deploy/gcp/constants.py index 0e838eb408f..3dbbf206269 100644 --- a/harness/determined/deploy/gcp/constants.py +++ b/harness/determined/deploy/gcp/constants.py @@ -4,7 +4,7 @@ class defaults: DB_PASSWORD = "postgres" BOOT_DISK_SIZE = 200 BOOT_DISK_TYPE = "pd-standard" - ENVIRONMENT_IMAGE = "det-environments-8c90e80" + ENVIRONMENT_IMAGE = "det-environments-0e43056" GPU_NUM = 4 GPU_TYPE = "nvidia-tesla-t4" MASTER_INSTANCE_TYPE = "n1-standard-2" diff --git a/harness/tests/experiment/fixtures/ancient-checkpoints/0.17.6-keras/metadata.json b/harness/tests/experiment/fixtures/ancient-checkpoints/0.17.6-keras/metadata.json index 5af2e963b36..c884211ef8d 100644 --- a/harness/tests/experiment/fixtures/ancient-checkpoints/0.17.6-keras/metadata.json +++ b/harness/tests/experiment/fixtures/ancient-checkpoints/0.17.6-keras/metadata.json @@ -39,8 +39,8 @@ }, "force_pull_image": false, "image": { - "cpu": "determinedai/tensorflow-ngc-dev:8c90e80", - "cuda": "determinedai/tensorflow-ngc-dev:8c90e80", + "cpu": "determinedai/tensorflow-ngc-dev:0e43056", + "cuda": "determinedai/tensorflow-ngc-dev:0e43056", "rocm": "determinedai/environments:rocm-5.0-pytorch-1.10-tf-2.7-rocm-622d512" }, "pod_spec": null, diff --git a/harness/tests/experiment/fixtures/ancient-checkpoints/0.17.6-pytorch/metadata.json b/harness/tests/experiment/fixtures/ancient-checkpoints/0.17.6-pytorch/metadata.json index 9549bb22e6e..2e8dd411697 100644 --- a/harness/tests/experiment/fixtures/ancient-checkpoints/0.17.6-pytorch/metadata.json +++ b/harness/tests/experiment/fixtures/ancient-checkpoints/0.17.6-pytorch/metadata.json @@ -38,8 +38,8 @@ }, "force_pull_image": false, "image": { - "cpu": "determinedai/tensorflow-ngc-dev:8c90e80", - "cuda": "determinedai/tensorflow-ngc-dev:8c90e80", + "cpu": "determinedai/tensorflow-ngc-dev:0e43056", + "cuda": "determinedai/tensorflow-ngc-dev:0e43056", "rocm": "determinedai/environments:rocm-5.0-pytorch-1.10-tf-2.7-rocm-622d512" }, "pod_spec": null, diff --git a/harness/tests/fixtures/checkpoint.json b/harness/tests/fixtures/checkpoint.json index 08456227aad..40f68912411 100644 --- a/harness/tests/fixtures/checkpoint.json +++ b/harness/tests/fixtures/checkpoint.json @@ -69,8 +69,8 @@ }, "force_pull_image":false, "image":{ - "cpu":"determinedai/pytorch-ngc-dev:8c90e80", - "cuda":"determinedai/pytorch-ngc-dev:8c90e80", + "cpu":"determinedai/pytorch-ngc-dev:0e43056", + "cuda":"determinedai/pytorch-ngc-dev:0e43056", "rocm":"determinedai/environments:rocm-5.0-pytorch-1.10-tf-2.7-rocm-622d512" }, "pod_spec":null, diff --git a/helm/charts/determined/values.yaml b/helm/charts/determined/values.yaml index 2de4d97c32a..7ce68b82af5 100644 --- a/helm/charts/determined/values.yaml +++ b/helm/charts/determined/values.yaml @@ -31,8 +31,8 @@ defaultImages: kubeSchedulerPreemption: "determinedai/kube-scheduler:0.17.0" # default images for CPU and GPU environments - cpuImage: "determinedai/pytorch-ngc-dev:8c90e80" - gpuImage: "determinedai/pytorch-ngc-dev:8c90e80" + cpuImage: "determinedai/pytorch-ngc-dev:0e43056" + gpuImage: "determinedai/pytorch-ngc-dev:0e43056" # Install Determined enterprise edition. enterpriseEdition: false diff --git a/master/internal/config/provconfig/aws_config.go b/master/internal/config/provconfig/aws_config.go index c7810bb6089..87796032157 100644 --- a/master/internal/config/provconfig/aws_config.go +++ b/master/internal/config/provconfig/aws_config.go @@ -50,16 +50,16 @@ type AWSClusterConfig struct { } var defaultAWSImageID = map[string]string{ - "ap-northeast-1": "ami-0b9655c3686ad290f", - "ap-northeast-2": "ami-0dc620552c1aaa2cf", - "ap-southeast-1": "ami-0adbb66c690fafe37", - "ap-southeast-2": "ami-0a453138b8d55c36d", - "us-east-2": "ami-09578b4c5ea532f24", - "us-east-1": "ami-0f8ed0567336433e2", - "us-west-2": "ami-0c8ad935d75f2f73d", - "eu-central-1": "ami-09284ff11565b3ae3", - "eu-west-2": "ami-0a523360a75ece477", - "eu-west-1": "ami-037b373c0075ea120", + "ap-northeast-1": "ami-0d5ce4472d6286746", + "ap-northeast-2": "ami-0090f68a647f10126", + "ap-southeast-1": "ami-048be732b76a4679e", + "ap-southeast-2": "ami-05c05ba492caa1c68", + "us-east-2": "ami-07517c67a90714250", + "us-east-1": "ami-0b25b64346732d0b5", + "us-west-2": "ami-083f8147aeeba1eb2", + "eu-central-1": "ami-0ebd39eab325463c0", + "eu-west-2": "ami-0d3b65b9d8e18b354", + "eu-west-1": "ami-05c7e44456501d01d", } var defaultAWSClusterConfig = AWSClusterConfig{ diff --git a/master/internal/config/provconfig/gcp_config.go b/master/internal/config/provconfig/gcp_config.go index b6a8ed2db7d..5f6db4b2712 100644 --- a/master/internal/config/provconfig/gcp_config.go +++ b/master/internal/config/provconfig/gcp_config.go @@ -56,7 +56,7 @@ type GCPClusterConfig struct { func DefaultGCPClusterConfig() *GCPClusterConfig { return &GCPClusterConfig{ BootDiskSize: 200, - BootDiskSourceImage: "projects/determined-ai/global/images/det-environments-8c90e80", + BootDiskSourceImage: "projects/determined-ai/global/images/det-environments-0e43056", LabelKey: "managed-by", InstanceType: gceInstanceType{ MachineType: "n1-standard-32", diff --git a/master/pkg/schemas/expconf/const.go b/master/pkg/schemas/expconf/const.go index 16cb7115d55..5d210527b43 100644 --- a/master/pkg/schemas/expconf/const.go +++ b/master/pkg/schemas/expconf/const.go @@ -8,7 +8,7 @@ const ( // Default task environment docker image names. const ( - CPUImage = "determinedai/pytorch-ngc-dev:8c90e80" - CUDAImage = "determinedai/pytorch-ngc-dev:8c90e80" + CPUImage = "determinedai/pytorch-ngc-dev:0e43056" + CUDAImage = "determinedai/pytorch-ngc-dev:0e43056" ROCMImage = "determinedai/environments:rocm-5.0-pytorch-1.10-tf-2.7-rocm-622d512" ) diff --git a/model_hub/Makefile b/model_hub/Makefile index 584bc81db1f..4a543ca5ba8 100644 --- a/model_hub/Makefile +++ b/model_hub/Makefile @@ -5,7 +5,7 @@ SHORT_GIT_HASH := $(shell git rev-parse --short HEAD) ARTIFACTS_DIR := /tmp/artifacts # Model-hub library environments will be built on top of the default GPU and CPU images in master/pkg/model/defaults.go -DEFAULT_GPU_IMAGE := determinedai/pytorch-tensorflow-cuda-dev:8c90e80 +DEFAULT_GPU_IMAGE := determinedai/pytorch-tensorflow-cuda-dev:0e43056 ############REMINDER############ # When bumping third-party library versions, remember to bump versions in diff --git a/schemas/test_cases/v0/experiment.yaml b/schemas/test_cases/v0/experiment.yaml index d6c414d967a..9491c026c58 100644 --- a/schemas/test_cases/v0/experiment.yaml +++ b/schemas/test_cases/v0/experiment.yaml @@ -47,8 +47,8 @@ environment_variables: {} force_pull_image: false image: - cpu: determinedai/pytorch-ngc-dev:8c90e80 - cuda: determinedai/pytorch-ngc-dev:8c90e80 + cpu: determinedai/pytorch-ngc-dev:0e43056 + cuda: determinedai/pytorch-ngc-dev:0e43056 rocm: determinedai/environments:rocm-5.0-pytorch-1.10-tf-2.7-rocm-622d512 pod_spec: null ports: diff --git a/tools/scripts/bumpenvs.yaml b/tools/scripts/bumpenvs.yaml index 9d22f1ca10a..5ba0a89772c 100644 --- a/tools/scripts/bumpenvs.yaml +++ b/tools/scripts/bumpenvs.yaml @@ -1,20 +1,20 @@ -ap_northeast_1_agent_ami: {new: ami-0b9655c3686ad290f, old: ami-0f6a577ce354fd4f8} +ap_northeast_1_agent_ami: {new: ami-0d5ce4472d6286746, old: ami-0b9655c3686ad290f} ap_northeast_1_bastion_ami: {new: ami-00910ef9457f0df47, old: ami-0c7cb70d3eb61492b} ap_northeast_1_master_ami: {new: ami-00910ef9457f0df47, old: ami-0c7cb70d3eb61492b} -ap_northeast_2_agent_ami: {new: ami-0dc620552c1aaa2cf, old: ami-0eb300cb6581ff816} +ap_northeast_2_agent_ami: {new: ami-0090f68a647f10126, old: ami-0dc620552c1aaa2cf} ap_northeast_2_bastion_ami: {new: ami-035e3e44dc41db6a2, old: ami-003bb1772f36a39a3} ap_northeast_2_master_ami: {new: ami-035e3e44dc41db6a2, old: ami-003bb1772f36a39a3} -ap_southeast_1_agent_ami: {new: ami-0adbb66c690fafe37, old: ami-07e8f240df433bbdc} +ap_southeast_1_agent_ami: {new: ami-048be732b76a4679e, old: ami-0adbb66c690fafe37} ap_southeast_1_bastion_ami: {new: ami-0fd1ee6c8b656f020, old: ami-09f03fa5572692399} ap_southeast_1_master_ami: {new: ami-0fd1ee6c8b656f020, old: ami-09f03fa5572692399} -ap_southeast_2_agent_ami: {new: ami-0a453138b8d55c36d, old: ami-03c758b2777e9ba93} +ap_southeast_2_agent_ami: {new: ami-05c05ba492caa1c68, old: ami-0a453138b8d55c36d} ap_southeast_2_bastion_ami: {new: ami-0b62ecd3babd1c548, old: ami-06139e5e22cc2f7b1} ap_southeast_2_master_ami: {new: ami-0b62ecd3babd1c548, old: ami-06139e5e22cc2f7b1} deepspeed_0_hashed: {new: determinedai/environments:cuda-11.3-pytorch-1.10-deepspeed-0.8.3-gpu-748dda4, old: determinedai/environments:cuda-11.3-pytorch-1.10-deepspeed-0.8.3-gpu-079eb6d} deepspeed_0_versioned: {new: determinedai/environments:cuda-11.3-pytorch-1.10-deepspeed-0.8.3-gpu-0.31.1, old: determinedai/environments:cuda-11.3-pytorch-1.10-deepspeed-0.8.3-gpu-0.30.1} -deepspeed_gpt_neox_0_hashed: {new: determinedai/deepspeed-cuda-gpt-neox:8c90e80, old: determinedai/deepspeed-cuda-gpt-neox:e960eae} +deepspeed_gpt_neox_0_hashed: {new: determinedai/deepspeed-cuda-gpt-neox:0e43056, old: determinedai/deepspeed-cuda-gpt-neox:8c90e80} deepspeed_gpu_0_hashed: {new: determinedai/environments:cuda-11.3-pytorch-1.10-deepspeed-0.8.3-gpu-2196775, old: determinedai/environments:cuda-11.3-pytorch-1.10-deepspeed-0.8.3-gpu-f66cbce} deepspeed_gpu_0_versioned: {new: determinedai/environments:cuda-11.3-pytorch-1.10-deepspeed-0.8.3-gpu-0.29.1, @@ -23,16 +23,16 @@ deepspeed_gpu_1_hashed: {new: determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-deepspeed-0.7.0-gpu-mpi-9119094} deepspeed_gpu_1_versioned: {new: determinedai/environments:cuda-11.3-pytorch-1.10-tf-2.8-deepspeed-0.7.0-gpu-mpi-0.19.1} -eu_central_1_agent_ami: {new: ami-09284ff11565b3ae3, old: ami-0c8b1567095263195} +eu_central_1_agent_ami: {new: ami-0ebd39eab325463c0, old: ami-09284ff11565b3ae3} eu_central_1_bastion_ami: {new: ami-0abbe417ed83c0b29, old: ami-0b81e95bb0a06ea8c} eu_central_1_master_ami: {new: ami-0abbe417ed83c0b29, old: ami-0b81e95bb0a06ea8c} -eu_west_1_agent_ami: {new: ami-037b373c0075ea120, old: ami-0b1f4aa99e3062a6a} +eu_west_1_agent_ami: {new: ami-05c7e44456501d01d, old: ami-037b373c0075ea120} eu_west_1_bastion_ami: {new: ami-0e3f7dd2dc743e48a, old: ami-029cfca952b331b52} eu_west_1_master_ami: {new: ami-0e3f7dd2dc743e48a, old: ami-029cfca952b331b52} -eu_west_2_agent_ami: {new: ami-0a523360a75ece477, old: ami-0fe3e19feb3fca8cb} +eu_west_2_agent_ami: {new: ami-0d3b65b9d8e18b354, old: ami-0a523360a75ece477} eu_west_2_bastion_ami: {new: ami-0d78429fb6af30994, old: ami-035469b606478d63d} eu_west_2_master_ami: {new: ami-0d78429fb6af30994, old: ami-035469b606478d63d} -gcp_env: {new: det-environments-8c90e80, old: det-environments-e960eae} +gcp_env: {new: det-environments-0e43056, old: det-environments-8c90e80} gpt_neox_deepspeed_0_hashed: {new: determinedai/environments:cuda-11.3-pytorch-1.10-gpt-neox-deepspeed-gpu-748dda4, old: determinedai/environments:cuda-11.3-pytorch-1.10-gpt-neox-deepspeed-gpu-079eb6d} gpt_neox_deepspeed_0_versioned: {new: determinedai/environments:cuda-11.3-pytorch-1.10-gpt-neox-deepspeed-gpu-0.31.1, @@ -81,8 +81,8 @@ pytorch10_tf27_rocm50_0_hashed: {new: determinedai/environments:rocm-5.0-pytorch old: determinedai/environments:rocm-5.0-pytorch-1.10-tf-2.7-rocm-bf9480b} pytorch10_tf27_rocm50_0_versioned: {new: determinedai/environments:rocm-5.0-pytorch-1.10-tf-2.7-rocm-0.26.4, old: determinedai/environments-dev:rocm-5.0-pytorch-1.10-tf-2.7-rocm-0.26.4} -pytorch13_tf210_rocm56_0_hashed: {new: determinedai/environments:rocm-5.6-pytorch-1.3-tf-2.10-rocm-ompi-8c90e80, - old: determinedai/environments:rocm-5.6-pytorch-1.3-tf-2.10-rocm-ompi-e960eae} +pytorch13_tf210_rocm56_0_hashed: {new: determinedai/environments:rocm-5.6-pytorch-1.3-tf-2.10-rocm-ompi-0e43056, + old: determinedai/environments:rocm-5.6-pytorch-1.3-tf-2.10-rocm-ompi-8c90e80} pytorch13_tf210_rocm56_0_versioned: {new: determinedai/environments:rocm-5.6-pytorch-1.3-tf-2.10-rocm-ompi-0.33.1, old: determinedai/environments:rocm-5.6-pytorch-1.3-tf-2.10-rocm-ompi-0.31.2} pytorch13_tf210_rocm56_1_hashed: {new: determinedai/environments:rocm-5.6-pytorch-1.3-tf-2.10-rocm-ompi-2196775, @@ -95,29 +95,29 @@ pytorch19_tf25_rocm_0_versioned: {new: determinedai/environments:rocm-5.0-pytorc old: determinedai/environments:rocm-4.2-pytorch-1.9-tf-2.5-rocm-0.18.5} pytorch19_tf25_rocm_1_hashed: {new: determinedai/environments:rocm-5.0-pytorch-1.10-tf-2.7-rocm-096d730} pytorch19_tf25_rocm_1_versioned: {new: determinedai/environments:rocm-5.0-pytorch-1.10-tf-2.7-rocm-0.19.4} -pytorch20_tf210_rocm56_0_hashed: {new: determinedai/environments:rocm-5.6-pytorch-2.0-tf-2.10-rocm-ompi-8c90e80, - old: determinedai/environments:rocm-5.6-pytorch-2.0-tf-2.10-rocm-ompi-e960eae} +pytorch20_tf210_rocm56_0_hashed: {new: determinedai/environments:rocm-5.6-pytorch-2.0-tf-2.10-rocm-ompi-0e43056, + old: determinedai/environments:rocm-5.6-pytorch-2.0-tf-2.10-rocm-ompi-8c90e80} pytorch20_tf210_rocm56_0_versioned: {new: determinedai/environments:rocm-5.6-pytorch-2.0-tf-2.10-rocm-ompi-0.33.1, old: determinedai/environments:rocm-5.6-pytorch-2.0-tf-2.10-rocm-ompi-0.31.2} pytorch20_tf210_rocm56_1_hashed: {new: determinedai/environments:rocm-5.6-pytorch-2.0-tf-2.10-rocm-ompi-2196775, old: determinedai/environments:rocm-5.6-pytorch-2.0-tf-2.10-rocm-ompi-f66cbce} pytorch20_tf210_rocm56_1_versioned: {new: determinedai/environments:rocm-5.6-pytorch-2.0-tf-2.10-rocm-ompi-0.29.1, old: determinedai/environments:rocm-5.6-pytorch-2.0-tf-2.10-rocm-ompi-0.27.1} -pytorch_cpu_0_hashed: {new: determinedai/pytorch-cpu-dev:8c90e80, old: determinedai/pytorch-cpu-dev:e960eae} -pytorch_cpu_1_hashed: {new: determinedai/pytorch-cpu-hpc-dev:8c90e80, old: determinedai/pytorch-cpu-hpc-dev:e960eae} -pytorch_cuda_0_hashed: {new: determinedai/pytorch-cuda-dev:8c90e80, old: determinedai/pytorch-cuda-dev:e960eae} -pytorch_cuda_1_hashed: {new: determinedai/pytorch-cuda-hpc-dev:8c90e80, old: determinedai/pytorch-cuda-hpc-dev:e960eae} -pytorch_ngc_hashed: {new: determinedai/pytorch-ngc-dev:8c90e80, old: determinedai/pytorch-ngc-dev:e960eae} -pytorch_ngc_hpc_hashed: {new: determinedai/pytorch-ngc-hpc-dev:8c90e80, old: determinedai/pytorch-ngc-hpc-dev:e960eae} -tensorflow_cpu_0_hashed: {new: determinedai/pytorch-tensorflow-cpu-dev:8c90e80, old: determinedai/pytorch-tensorflow-cpu-dev:e960eae} -tensorflow_cpu_1_hashed: {new: determinedai/pytorch-tensorflow-cpu-hpc-dev:8c90e80, - old: determinedai/pytorch-tensorflow-cpu-hpc-dev:e960eae} -tensorflow_cuda_0_hashed: {new: determinedai/pytorch-tensorflow-cuda-dev:8c90e80, - old: determinedai/pytorch-tensorflow-cuda-dev:e960eae} -tensorflow_cuda_1_hashed: {new: determinedai/pytorch-tensorflow-cuda-hpc-dev:8c90e80, - old: determinedai/pytorch-tensorflow-cuda-hpc-dev:e960eae} -tensorflow_ngc_hashed: {new: determinedai/tensorflow-ngc-dev:8c90e80, old: determinedai/tensorflow-ngc-dev:e960eae} -tensorflow_ngc_hpc_hashed: {new: determinedai/tensorflow-ngc-hpc-dev:8c90e80, old: determinedai/tensorflow-ngc-hpc-dev:e960eae} +pytorch_cpu_0_hashed: {new: determinedai/pytorch-cpu-dev:0e43056, old: determinedai/pytorch-cpu-dev:8c90e80} +pytorch_cpu_1_hashed: {new: determinedai/pytorch-cpu-hpc-dev:0e43056, old: determinedai/pytorch-cpu-hpc-dev:8c90e80} +pytorch_cuda_0_hashed: {new: determinedai/pytorch-cuda-dev:0e43056, old: determinedai/pytorch-cuda-dev:8c90e80} +pytorch_cuda_1_hashed: {new: determinedai/pytorch-cuda-hpc-dev:0e43056, old: determinedai/pytorch-cuda-hpc-dev:8c90e80} +pytorch_ngc_hashed: {new: determinedai/pytorch-ngc-dev:0e43056, old: determinedai/pytorch-ngc-dev:8c90e80} +pytorch_ngc_hpc_hashed: {new: determinedai/pytorch-ngc-hpc-dev:0e43056, old: determinedai/pytorch-ngc-hpc-dev:8c90e80} +tensorflow_cpu_0_hashed: {new: determinedai/pytorch-tensorflow-cpu-dev:0e43056, old: determinedai/pytorch-tensorflow-cpu-dev:8c90e80} +tensorflow_cpu_1_hashed: {new: determinedai/pytorch-tensorflow-cpu-hpc-dev:0e43056, + old: determinedai/pytorch-tensorflow-cpu-hpc-dev:8c90e80} +tensorflow_cuda_0_hashed: {new: determinedai/pytorch-tensorflow-cuda-dev:0e43056, + old: determinedai/pytorch-tensorflow-cuda-dev:8c90e80} +tensorflow_cuda_1_hashed: {new: determinedai/pytorch-tensorflow-cuda-hpc-dev:0e43056, + old: determinedai/pytorch-tensorflow-cuda-hpc-dev:8c90e80} +tensorflow_ngc_hashed: {new: determinedai/tensorflow-ngc-dev:0e43056, old: determinedai/tensorflow-ngc-dev:8c90e80} +tensorflow_ngc_hpc_hashed: {new: determinedai/tensorflow-ngc-hpc-dev:0e43056, old: determinedai/tensorflow-ngc-hpc-dev:8c90e80} tf24_cpu_0_hashed: {new: determinedai/environments:py-3.8-pytorch-1.9-tf-2.4-cpu-24586f0, old: determinedai/environments-dev:py-3.8-pytorch-1.9-tf-2.4-cpu-1c769fb} tf24_cpu_0_versioned: {new: determinedai/environments:py-3.8-pytorch-1.9-tf-2.4-cpu-0.19.10, @@ -208,16 +208,16 @@ tf2_gpu_1_hashed: {new: determinedai/environments:cuda-11.3-pytorch-1.12-tf-2.11 old: determinedai/environments:cuda-11.3-pytorch-1.12-tf-2.11-gpu-mpi-079eb6d} tf2_gpu_1_versioned: {new: determinedai/environments:cuda-11.3-pytorch-1.12-tf-2.11-gpu-mpi-ofi-0.31.1, old: determinedai/environments:cuda-11.3-pytorch-1.12-tf-2.11-gpu-mpi-0.30.1} -us_east_1_agent_ami: {new: ami-0f8ed0567336433e2, old: ami-0b421f60378fea445} +us_east_1_agent_ami: {new: ami-0b25b64346732d0b5, old: ami-0f8ed0567336433e2} us_east_1_bastion_ami: {new: ami-0172070f66a8ebe63, old: ami-0b93ce03dcbcb10f6} us_east_1_master_ami: {new: ami-0172070f66a8ebe63, old: ami-0b93ce03dcbcb10f6} -us_east_2_agent_ami: {new: ami-09578b4c5ea532f24, old: ami-0ca910d9ffdd24901} +us_east_2_agent_ami: {new: ami-07517c67a90714250, old: ami-09578b4c5ea532f24} us_east_2_bastion_ami: {new: ami-0bafa3699418551cd, old: ami-0cbea92f2377277a4} us_east_2_master_ami: {new: ami-0bafa3699418551cd, old: ami-0cbea92f2377277a4} -us_gov_east_1_agent_ami: {new: ami-09a56ff763262c365, old: ami-0562ab12436f4ff71} +us_gov_east_1_agent_ami: {new: ami-0cc63f942a6c1e5fd, old: ami-09a56ff763262c365} us_gov_east_1_master_ami: {new: ami-04ef693ebcf519dc3, old: ami-01d71f6009765d511} -us_gov_west_1_agent_ami: {new: ami-0e7d4dc4ebb742216, old: ami-0267f0e85bebf36d3} +us_gov_west_1_agent_ami: {new: ami-0707563a683a19ed2, old: ami-0e7d4dc4ebb742216} us_gov_west_1_master_ami: {new: ami-08bd15d820a3c087e, old: ami-0b64b04df085adbf1} -us_west_2_agent_ami: {new: ami-0c8ad935d75f2f73d, old: ami-02857ee434c5595e9} +us_west_2_agent_ami: {new: ami-083f8147aeeba1eb2, old: ami-0c8ad935d75f2f73d} us_west_2_bastion_ami: {new: ami-0ceeab680f529cc36, old: ami-0d31d7c9fc9503726} us_west_2_master_ami: {new: ami-0ceeab680f529cc36, old: ami-0d31d7c9fc9503726} diff --git a/tools/scripts/environments-target.txt b/tools/scripts/environments-target.txt index 4c5da9db191..ae1aefd0383 100644 --- a/tools/scripts/environments-target.txt +++ b/tools/scripts/environments-target.txt @@ -1 +1 @@ -8c90e80 +0e43056 diff --git a/webui/react/src/fixtures/responses/experiment-details/non-scalar-metrics-4078.json b/webui/react/src/fixtures/responses/experiment-details/non-scalar-metrics-4078.json index 4fe434ee949..0ef8ad15fbd 100644 --- a/webui/react/src/fixtures/responses/experiment-details/non-scalar-metrics-4078.json +++ b/webui/react/src/fixtures/responses/experiment-details/non-scalar-metrics-4078.json @@ -32,8 +32,8 @@ "name": "Fork of Fork of mnist_tp_to_estimator_const", "environment": { "image": { - "cpu": "determinedai/pytorch-ngc-dev:8c90e80", - "gpu": "determinedai/pytorch-ngc-dev:8c90e80" + "cpu": "determinedai/pytorch-ngc-dev:0e43056", + "gpu": "determinedai/pytorch-ngc-dev:0e43056" }, "ports": null, "pod_spec": null, diff --git a/webui/react/src/fixtures/responses/experiment-details/set-a.json b/webui/react/src/fixtures/responses/experiment-details/set-a.json index 2b1a03209bd..19aed4419fe 100644 --- a/webui/react/src/fixtures/responses/experiment-details/set-a.json +++ b/webui/react/src/fixtures/responses/experiment-details/set-a.json @@ -694,8 +694,8 @@ "environment_variables": {}, "force_pull_image": false, "image": { - "cpu": "determinedai/pytorch-ngc-dev:8c90e80", - "gpu": "determinedai/pytorch-ngc-dev:8c90e80" + "cpu": "determinedai/pytorch-ngc-dev:0e43056", + "gpu": "determinedai/pytorch-ngc-dev:0e43056" }, "pod_spec": null, "ports": null @@ -838,8 +838,8 @@ "environment_variables": {}, "force_pull_image": false, "image": { - "cpu": "determinedai/tensorflow-ngc-dev:8c90e80", - "gpu": "determinedai/tensorflow-ngc-dev:8c90e80" + "cpu": "determinedai/tensorflow-ngc-dev:0e43056", + "gpu": "determinedai/tensorflow-ngc-dev:0e43056" }, "pod_spec": { "metadata": { diff --git a/webui/react/src/fixtures/responses/trial-details/old-trial-config-noop-adaptive.json b/webui/react/src/fixtures/responses/trial-details/old-trial-config-noop-adaptive.json index 87aefcda8b0..ab343b7b21d 100644 --- a/webui/react/src/fixtures/responses/trial-details/old-trial-config-noop-adaptive.json +++ b/webui/react/src/fixtures/responses/trial-details/old-trial-config-noop-adaptive.json @@ -30,8 +30,8 @@ "name": "noop_adaptive", "environment": { "image": { - "cpu": "determinedai/pytorch-ngc-dev:8c90e80", - "gpu": "determinedai/pytorch-ngc-dev:8c90e80" + "cpu": "determinedai/pytorch-ngc-dev:0e43056", + "gpu": "determinedai/pytorch-ngc-dev:0e43056" }, "ports": null, "force_pull_image": false,